In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv('labeledTrainData.tsv',
                    header=0,
                    delimiter='\t',
                    quoting=3) # ignore double quotes

In [3]:
print(train.shape)
print(train.columns.values)
#print(train["review"][0])

(25000, 3)
['id' 'sentiment' 'review']


In [4]:
example1 = BeautifulSoup(train['review'][0]) # remove html tags
print(example1.get_text())

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 mi

In [5]:
# remove punctuation and numbers

letters_only = re.sub('[^a-zA-Z]', # search
                      ' ', # replace
                      example1.get_text())
print(letters_only)    

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    mi

In [6]:
lower_case = letters_only.lower() # convert to lower case
words = lower_case.split() # split into words

In [7]:
nltk.download('stopwords') # downnload stop-words

[nltk_data] Downloading package stopwords to /home/nadiia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# remove stop words

words = [w for w in words if not w in stopwords.words('english')]
print(words)

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 

In [10]:
def review_to_words(raw_review):
    '''
    Converts a raw review to a string of words
    '''
    # remove html tags
    review_text = BeautifulSoup(raw_review).get_text()
    
    # remove non-letters
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    
    # lower case and tokenize
    words = letters_only.lower().split()
    
    # convert stop words to a set for faster searching
    stops = set(stopwords.words('english'))
    
    # remove stop words
    meaningful_words = [w for w in words if not w in stops]
    
    # join words into space-separated string
    return(' '.join(meaningful_words))

In [11]:
clean_review = review_to_words(train['review'][0])
print(clean_review)

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [12]:
# Get number of reviews in train
num_reviews = train['review'].size

clean_train_reviews = []

In [13]:
print('Cleaning and parsing the training set movie reviews...\n')

for i in range(num_reviews):
    if ((i + 1) % 1000 == 0):
        print('Review {0} in {1}'.format(i + 1, num_reviews))
    clean_train_reviews.append(review_to_words(train['review'][i]))

Cleaning and parsing the training set movie reviews...

Review 1000 in 25000
Review 2000 in 25000
Review 3000 in 25000
Review 4000 in 25000
Review 5000 in 25000
Review 6000 in 25000
Review 7000 in 25000
Review 8000 in 25000
Review 9000 in 25000
Review 10000 in 25000
Review 11000 in 25000
Review 12000 in 25000
Review 13000 in 25000
Review 14000 in 25000
Review 15000 in 25000
Review 16000 in 25000
Review 17000 in 25000
Review 18000 in 25000
Review 19000 in 25000
Review 20000 in 25000
Review 21000 in 25000
Review 22000 in 25000
Review 23000 in 25000
Review 24000 in 25000
Review 25000 in 25000


In [14]:
print('Creating the bag of words...\n')

# Sklearn bag-of-words tool
vectorizer = CountVectorizer(analyzer='word',
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=None,
                            max_features=5000)

# Fit model and learn vocabulary from list of string
# Tranform train data into feature vectors

train_data_features = vectorizer.fit_transform(clean_train_reviews)

Creating the bag of words...



In [15]:
print(type(train_data_features))
print(train_data_features.shape)
display(train_data_features)

<class 'scipy.sparse.csr.csr_matrix'>
(25000, 5000)


<25000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1975048 stored elements in Compressed Sparse Row format>

In [16]:
# Convert features matrix to array
train_data_features = train_data_features.toarray()

In [17]:
# Look at vocabulary
vocabulary = vectorizer.get_feature_names()
print(type(vocabulary))
#print(vocabulary)

<class 'list'>


In [18]:
# Sum up counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# Occurence of each word in train data
#for tag, count in zip(vocabulary, dist):
#    print(count, tag)

In [19]:
# Initialize model, fit on bag of words as features and train labels

forest = RandomForestClassifier(n_estimators = 100)
forest.fit(train_data_features, train['sentiment'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
test = pd.read_csv('testData.tsv',
                   header=0,
                   delimiter='\t',
                   quoting=3 )

In [22]:
print(test.shape)

(25000, 2)


In [23]:
num_reviews = len(test['review'])
clean_test_reviews = [] 

In [24]:
print('Cleaning and parsing the test set movie reviews...\n')

for i in range(num_reviews):
    if((i + 1) % 1000 == 0):
        print('Review {0} of {1}\n'.format(i + 1, num_reviews))
    clean_review = review_to_words(test['review'][i])
    clean_test_reviews.append(clean_review)

Cleaning and parsing the test set movie reviews...

Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



In [25]:
# Bag of words features for test set
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

In [26]:
result = forest.predict(test_data_features)

In [27]:
output = pd.DataFrame({'id': test['id'],
                       'sentiment': result})
output.to_csv('Bag_of_Words_model.csv', index=False, quoting=3)