In [1]:
import pandas as pd
# Load the labelled training reviews: tab-separated, column names on the first row.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside reviews stay literal text.
train = pd.read_csv(
    '../Dataset/word2vec-nlp/labeledTrainData.tsv',
    header=0,
    delimiter='\t',
    quoting=3,
)

In [7]:
# preprocessing steps
from bs4 import BeautifulSoup    
import re
import nltk
from nltk.corpus import stopwords
# Add the project-local NLTK data directory to NLTK's search path so corpora
# stored there can be loaded. The membership check keeps the cell idempotent:
# re-running it no longer appends the same directory again.
NLTK_DATA_DIR = "../Dataset/nltk_data/"
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review, stops=None):
    """Turn one raw review into a single string of cleaned, lower-case words.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case, split on whitespace, and drop stop words.

    Parameters
    ----------
    raw_review : str
        The review text, possibly containing HTML markup.
    stops : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and reused across calls instead of being rebuilt per review.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if nothing remains).
    """
    if stops is None:
        stops = _english_stopwords()

    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed and emits a warning, so results can vary by machine.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # Anything that is not an ASCII letter (digits, punctuation, ...) becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()

    return " ".join(w for w in words if w not in stops)

In [9]:
# preprocess all data
import time
# Clean every training review and report how long the full pass takes.
num_reviews = train["review"].size

# perf_counter() is a monotonic clock meant for measuring elapsed time, so the
# reported duration cannot be skewed by system clock adjustments.
start_time = time.perf_counter()

# Iterate over the column's values directly rather than looking each one up by
# label (train["review"][i]); that lookup is slower and only works when the
# index is exactly 0..n-1.
clean_train_reviews = [review_to_words(review) for review in train["review"]]

print(" ---%s seconds---" % (time.perf_counter()-start_time))

 ---16.44306755065918 seconds---


In [10]:
# creating bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag-of-words encoder. The reviews were already cleaned upstream, so
# no tokenizer, preprocessor or stop-word list is supplied here; the vocabulary
# is capped at 5000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Fit the vocabulary on the cleaned training reviews (a list of strings) and
# convert the resulting count matrix to a dense array in one expression.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

In [16]:
# Sanity check: show the bag-of-words feature row at position 10 of the training array.
print(train_data_features[10])

[0 0 0 ... 0 0 0]


In [17]:
# random forest
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees. random_state pins the randomness used while
# building the trees, so re-running the notebook yields the same model and the
# same predictions.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on the bag-of-words features against the "sentiment" column of the
# training frame. fit() returns the estimator; assigning it also keeps the cell
# from echoing the estimator repr.
forest = forest.fit(train_data_features, train["sentiment"])

In [19]:
# Load the test reviews with the same options as the training file:
# tab-separated, header on the first row, quoting=3 (csv.QUOTE_NONE).
test = pd.read_csv(
    "../Dataset/word2vec-nlp/testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

num_reviews = len(test["review"])

print("Cleaning and parsing the test set movie reviews...\n")
# Iterate over the column's values directly instead of indexing by label
# (test["review"][i]), which is slower and assumes the index is exactly 0..n-1.
clean_test_reviews = [review_to_words(review) for review in test["review"]]

Cleaning and parsing the test set movie reviews...



In [20]:
# Encode the cleaned test reviews with the already-fitted vectorizer
# (transform only, no refit) and convert the result to a dense array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Predict a sentiment label for each test review with the trained forest.
result = forest.predict(test_data_features)

# Pair every test "id" with its predicted "sentiment".
output = pd.DataFrame({"id": test["id"], "sentiment": result})

# Write the predictions as CSV without the index column. quoting=3
# (csv.QUOTE_NONE) means no quote characters are added around the values.
output.to_csv("Bag_of_Words_model_RF.csv", index=False, quoting=3)