# Bag of Words Meets Bags of Popcorn

In [1]:
import pandas as pd

In [2]:
# Load the labelled training reviews from the tab-separated file; the cells
# below use its 'review' and 'sentiment' columns.
# NOTE(review): the preview of df['review'] further down shows a stray backslash
# and double quote in row 1, which suggests backslash-escaped quotes in the file
# are not unescaped by the default quoting rules — confirm whether a
# quoting/escapechar option should be set here.
df = pd.read_csv(
    'labeledTrainData.tsv',
    sep="\t",
)

In [3]:
# Strip HTML markup from every training review, keeping only its text content.
from bs4 import BeautifulSoup

df['review'] = df['review'].apply(
    lambda raw_review: BeautifulSoup(raw_review, "lxml").get_text()
)

### NLTK download
If you haven't used NLTK on this machine before, you'll first need to download the resources this notebook relies on, i.e.

```
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")  # required by WordNetLemmatizer
```

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# A token is a maximal run of ASCII letters; everything else acts as a separator.
tokenizer = RegexpTokenizer('[A-Za-z]+')

# Lemmatizer applied to each token that tokenize() keeps.
wordnet_lemmatizer = WordNetLemmatizer()

In [5]:
def tokenize(text):
    """Lazily yield the lemmatized word tokens of ``text``.

    The text is split with the module-level ``tokenizer``. Tokens of one or
    two letters are dropped; each remaining token is passed through
    ``wordnet_lemmatizer.lemmatize`` before being yielded.
    """
    yield from (
        wordnet_lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(text)
        if len(word) > 2
    )

In [6]:
# Preview the first five reviews after HTML stripping.
df['review'].head(5)

0    With all this stuff going down at the moment w...
1    \The Classic War of the Worlds\" by Timothy Hi...
2    The film starts with a manager (Nicholas Bell)...
3    It must be assumed that those who praised this...
4    Superbly trashy and wondrously unpretentious 8...
Name: review, dtype: object

In [7]:
# Build the English stop-word list handed to the vectorizer.
from nltk.corpus import stopwords

# CountVectorizer removes stop words *after* calling the custom tokenizer, so the
# list must contain the tokens that tokenize() actually emits. Lemmatization
# rewrites some stop words (e.g. "was" -> "wa", "has" -> "ha"); with the raw NLTK
# list those rewritten forms are not filtered and end up as features.
# Keep the original words and add their tokenized/lemmatized forms.
nltk_stop_words = stopwords.words("english")
stop_words = sorted(
    set(nltk_stop_words)
    | {token for word in nltk_stop_words for token in tokenize(word)}
)

In [8]:
# Bag-of-words vectorizer: counts the tokens produced by tokenize(), removes the
# stop words, and limits the vocabulary to the 5000 most frequent terms.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(tokenizer=tokenize, stop_words=stop_words, max_features=5000)

In [10]:
# Learn the vocabulary from the training reviews, then convert the sparse
# document-term counts into a dense array (one row per review).
train_counts_sparse = cv.fit_transform(df['review'])
train_data_features = train_counts_sparse.toarray()
print("train data features", train_data_features.shape)

train data features (25000, 5000)


### This is our model - RandomForestClassifier

In [11]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is randomized (bootstrap samples and per-split feature
# subsets). Without a fixed random_state every "Restart & Run All" trains a
# different model and produces a different submission, so the seed is pinned.
# n_jobs=-1 builds the trees on all available cores; it only affects speed.
forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Train on the bag-of-words counts against the sentiment labels.
forest = forest.fit(train_data_features, df['sentiment'])

In [12]:
# Load the test reviews and strip their HTML markup in the same way as for the
# training reviews.
test = pd.read_csv(
    'testData.tsv',
    sep="\t",
)
test['review'] = test['review'].apply(
    lambda raw_review: BeautifulSoup(raw_review, "lxml").get_text()
)


In [13]:
# Count terms in the test reviews using the vocabulary already learned from the
# training set (transform only, the vectorizer is not refitted), then densify.
test_counts_sparse = cv.transform(test['review'])
test_data_features = test_counts_sparse.toarray()
print("test data features ", test_data_features.shape)

test data features  (25000, 5000)


In [14]:
# Per-review predicted probability taken from column 1 of predict_proba
# (presumably the sentiment == 1 class — confirm against forest.classes_).
class_probabilities = forest.predict_proba(test_data_features)
result = class_probabilities[:, 1]

In [15]:
# Assemble the submission table (test id plus predicted probability) and write
# it as CSV without the index. quoting=3 is csv.QUOTE_NONE, so no field is
# wrapped in quotes.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("Bag_of_Words_model_2.csv", quoting=3, index=False)

### This gives 0.92 on the leaderboard