In [105]:
# Colab-only helpers: `drive` mounts Google Drive, `files` is used at the end of the
# notebook to download the results CSV
from google.colab import drive
from google.colab import files
# Mount Google Drive at /drive; the CSV paths read later in the notebook live under it
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [107]:
# Import packages
import os
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that stopwords.words("english") reads later on
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Read data
# Both files are tab-separated UTF-8 text stored in the same Drive folder; keeping the
# folder in one constant means a relocated dataset needs a single edit.
DATA_DIR = '/drive/My Drive/FTMLE - Tonga/Data'
train = pd.read_csv(os.path.join(DATA_DIR, 'movie_review.csv'), encoding='utf-8', sep='\t')
test = pd.read_csv(os.path.join(DATA_DIR, 'movie_review_evaluation.csv'), encoding='utf-8', sep='\t')

In [0]:
# Preview the first five rows of the training frame
train.head(n=5)

In [0]:
# Preview the first five rows of the evaluation frame
test.head(n=5)

In [112]:
# Report (rows, columns) for both frames, one message per line
print(
    'The dimension of train data is {}.'.format(train.shape),
    'The dimension of test data is {}.'.format(test.shape),
    sep='\n',
)

The dimension of train data is (22500, 3).
The dimension of test data is (2500, 2).


###Removing HTML Markup

In [0]:
# Parse each raw review with the built-in HTML parser; the text is extracted from the
# resulting BeautifulSoup objects in the next step
train['review_bs'] = train['review'].map(
    lambda raw_html: BeautifulSoup(raw_html, 'html.parser'))
# train.review_bs[0].get_text()

###Dealing with Punctuation, Numbers and Stopwords

In [0]:
# Strip the markup, then blank out every character that is not an ASCII letter
train['review_letters_only'] = (
    train['review_bs']
    .map(lambda soup: soup.get_text())
    .map(lambda text: re.sub(r'[^a-zA-Z]', ' ', text))
)
# train['review_letters_only'][0]

In [0]:
# Lower-case each review and split it on whitespace into a list of words
train['review_words'] = train['review_letters_only'].str.lower().str.split()
# train['review_words'][0]

In [80]:
# Build the stop-word set once so each membership test is a cheap set lookup
set_of_stopwords = set(stopwords.words("english"))
train['review_meaningful_words'] = train['review_words'].apply(
    lambda words: [w for w in words if w not in set_of_stopwords])

# Sanity check on the first review only: how many words did the filter drop?
# The message says so explicitly, because the count is not a total over the data set.
num_removed = len(train['review_words'][0]) - len(train['review_meaningful_words'][0])
print('For the first review entry, the number of stop words removed is {0}.'.format(num_removed))

For the first review entry, the number of stop words removed is 218.


###Stemming and Lemmatization (accuracy was higher without stemming and lemmatization, so neither is applied)

In [0]:
# Optional normalisation step, left disabled: stem each kept word with Porter, then
# lemmatize the stemmed words with WordNet. To enable it, uncomment the lines below and
# build `review_cleaned` from `review_stemmed` instead of `review_meaningful_words`.
# porter_stemmer = PorterStemmer()
# wordnet_lemmatizer = WordNetLemmatizer()

# train['review_stemmed'] = train['review_meaningful_words'].apply(
#     lambda x: [porter_stemmer.stem(w) for w in x])
# (reads `review_stemmed`: `review_cleaned` does not exist yet at this point in the notebook)
# train['review_stemmed'] = train['review_stemmed'].apply(
#     lambda x: [wordnet_lemmatizer.lemmatize(w) for w in x])

In [0]:
# Re-assemble each word list into one space-separated string for the vectorizers
# train['review_cleaned'] = train['review_stemmed'].apply(lambda x: ' '.join(x)) # uncomment if using stemming
train['review_cleaned'] = train['review_meaningful_words'].map(' '.join) # comment if using stemming

In [0]:
# Discard the raw and intermediate text columns now that `review_cleaned` exists.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to re-run after the columns have already been removed.
train = train.drop(
    columns=['review', 'review_bs', 'review_letters_only', 'review_words', 'review_meaningful_words'],
    errors='ignore')

In [84]:
# Show the cleaned text of the row labelled 0 as a spot check
print(train.loc[0, 'review_cleaned'])

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

###Creating Features from a Bag of Words

In [0]:
# Word-level bag of words capped at 5000 features; preprocessing, tokenising and
# stop-word handling are left unset because the text was already cleaned earlier
vectorizer = CountVectorizer(
    analyzer="word",
    preprocessor=None,
    tokenizer=None,
    stop_words=None,
    max_features=5000,
)

In [0]:
# Learn the vocabulary from the cleaned training reviews and materialise the resulting
# count matrix as a dense array
train_data_features = vectorizer.fit_transform(train['review_cleaned'].tolist()).toarray()

In [87]:
# Report the (rows, columns) shape of the dense feature matrix
print('The dimension of train_data_features is {}.'.format(np.shape(train_data_features)))

The dimension of train_data_features is (22500, 5000).


In [0]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); prefer the new accessor and fall back on older installs.
# .tolist() keeps `vocab` a plain list of str, as the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

In [0]:
def clean_reviews(reviews, remove_stopwords=False, stem=False):
    """
    Clean raw review strings for vectorisation.

    Each review is stripped of HTML markup, reduced to ASCII letters,
    lower-cased and split into words; the words are optionally filtered and
    normalised, then joined back into one space-separated string.

    reviews: an iterable of raw review strings
    remove_stopwords: whether to drop English stop words (False by default)
    stem: whether to Porter-stem and then WordNet-lemmatize each word
          (False by default)
    output: a list of cleaned review strings, one per input review, in order

    NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus; the cells
    visible in this notebook only download 'stopwords' - confirm before using
    stem=True.
    """
    # Build the optional helpers once, outside the per-review loop, and only
    # when requested so the default call has no NLTK dependency.
    stop_set = set(stopwords.words("english")) if remove_stopwords else None
    if stem:
        porter_stemmer = PorterStemmer()
        wordnet_lemmatizer = WordNetLemmatizer()

    clean_review = []
    for review in reviews:
        # 1. Remove HTML
        text = BeautifulSoup(review, 'html.parser').get_text()

        # 2. Remove non-letters
        text = re.sub(r"[^a-zA-Z]", " ", text)

        # 3. Convert words to lower case and split them
        words = text.lower().split()

        # 4. Optionally remove stop words. `words` is reused for every stage,
        #    so skipping a stage can no longer leave a later one reading an
        #    unbound variable.
        if remove_stopwords:
            words = [w for w in words if w not in stop_set]

        # 5. Optionally stem, then lemmatize, each word
        if stem:
            words = [wordnet_lemmatizer.lemmatize(porter_stemmer.stem(w)) for w in words]

        # 6. Join the words to a single string
        clean_review.append(' '.join(words))

    # Always a list (never a lazy map), so the result can be iterated repeatedly
    return clean_review

In [0]:
# Clean the evaluation reviews with stop-word removal switched on, then encode them
# with the already-fitted vectorizer and convert the result to a dense numpy array
clean_test_reviews = clean_reviews(test['review'].tolist(), remove_stopwords=True)
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

###Build models with train set

In [0]:
# Prepare the data
# Hold out 20% of the labelled reviews for validation. The split is seeded (same seed as
# the classifiers in this notebook) so the reported metrics are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train['review_cleaned'], train['sentiment'], test_size=0.2, random_state=0)

In [0]:
# Preview the first five held-out reviews
X_test.head(n=5)

In [0]:
# Bag-of-words encoder with default settings: learn the vocabulary on the training
# split and encode that split as term counts in one step
count_vectorizer = CountVectorizer()
X_train_bag = count_vectorizer.fit_transform(raw_documents=X_train)

###TfIDF

In [0]:
# NOTE(review): this rebinds `vectorizer`, which until now referred to the 5000-feature
# CountVectorizer that produced train_data_features / test_data_features. Re-running the
# earlier test-set cell after this one would use the TF-IDF vocabulary instead. A distinct
# name (e.g. tfidf_vectorizer) would avoid that, but a later cell reads this name.
vectorizer = TfidfVectorizer()

# Fit TF-IDF on the training split and encode that split as weighted term vectors
X_train_tfidf = vectorizer.fit_transform(X_train)

###Random Forest

TF-IDF performance

In [95]:
# Random forest on TF-IDF features: train on the training split
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
rfc.fit(X_train_tfidf, y_train)

# Encode the held-out reviews with the current `vectorizer` and predict their labels
X_test_tfidf = vectorizer.transform(X_test)
y_pred = rfc.predict(X_test_tfidf)

# Confusion matrix first, then the per-class precision/recall/F1 report
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

[[1907  332]
 [ 318 1943]]
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      2239
           1       0.85      0.86      0.86      2261

    accuracy                           0.86      4500
   macro avg       0.86      0.86      0.86      4500
weighted avg       0.86      0.86      0.86      4500



Bag of words performance

In [96]:
# Random forest on bag-of-words counts: train on the training split
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
rfc.fit(X_train_bag, y_train)

# Encode the held-out reviews with the fitted count vectorizer and predict their labels
X_test_bow = count_vectorizer.transform(X_test)
y_pred = rfc.predict(X_test_bow)

# Confusion matrix first, then the per-class precision/recall/F1 report
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

[[1926  313]
 [ 328 1933]]
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      2239
           1       0.86      0.85      0.86      2261

    accuracy                           0.86      4500
   macro avg       0.86      0.86      0.86      4500
weighted avg       0.86      0.86      0.86      4500



###K-nearest neighbor

TF-IDF performance

In [97]:
# 2-nearest-neighbour classifier with distance-weighted votes, on TF-IDF features
knn = KNeighborsClassifier(n_neighbors=2, weights='distance')
knn.fit(X_train_tfidf, y_train)

# Score the held-out split using the TF-IDF matrix built in the random forest cell
y_pred = knn.predict(X_test_tfidf)
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

[[1563  676]
 [ 380 1881]]
              precision    recall  f1-score   support

           0       0.80      0.70      0.75      2239
           1       0.74      0.83      0.78      2261

    accuracy                           0.77      4500
   macro avg       0.77      0.77      0.76      4500
weighted avg       0.77      0.77      0.76      4500



Bag of words performance

In [98]:
# 2-nearest-neighbour classifier with distance-weighted votes, on bag-of-words counts
knn = KNeighborsClassifier(n_neighbors=2, weights='distance')
knn.fit(X_train_bag, y_train)

# Score the held-out split using the count matrix built in the random forest cell
y_pred = knn.predict(X_test_bow)
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep='\n')

[[1437  802]
 [1010 1251]]
              precision    recall  f1-score   support

           0       0.59      0.64      0.61      2239
           1       0.61      0.55      0.58      2261

    accuracy                           0.60      4500
   macro avg       0.60      0.60      0.60      4500
weighted avg       0.60      0.60      0.60      4500



Based on the accuracy results above, the random forest model is chosen to predict labels for the test set.

###Time to test it out!

In [0]:
# Refit the existing random forest on the full bag-of-words training matrix,
# with the sentiment column as labels
rfc.fit(train_data_features, train['sentiment'])

# Predict a sentiment label for every evaluation review and pair it with its id
result = rfc.predict(test_data_features)
output = pd.DataFrame({'id': test['id'], 'sentiment': result})

# Write the predictions as CSV without the index; quoting=3 is csv.QUOTE_NONE
output.to_csv('rfc_results.csv', index=False, quoting=3)

# Hand the file to the browser through the Colab download helper
files.download('rfc_results.csv')
print('Wrote results complete')