In [1]:
# Imports
import os
import random
import numpy as np
import pandas as pd
import re
import seaborn as sns
import scipy
import contractions
import matplotlib.pyplot as plt
%matplotlib inline

import spacy
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Creating dataset using Pandas

def load_imdb_sentiment_analysis_dataset(data_path, seed = 32):
    imdb_data_path = os.path.join(data_path, 'aclImdb')

    # Load data from training folder
    texts = []
    labels = []
    for category in ['pos', 'neg']:
        train_path = os.path.join(imdb_data_path, 'train', category)
        for fname in sorted(os.listdir(train_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(train_path, fname)) as f:
                    texts.append(f.read())
                labels.append(0 if category == 'neg' else 1)

    # Load data from test folder
    for category in ['pos', 'neg']:
        test_path = os.path.join(imdb_data_path, 'test', category)
        for fname in sorted(os.listdir(test_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(test_path, fname)) as f:
                    texts.append(f.read())
                labels.append(0 if category == 'neg' else 1)
    
    random.seed(seed)
    random.shuffle(texts)
    random.seed(seed)
    random.shuffle(labels)
    
    list_of_dict = []
    for i in range(0, len(texts)):
        temp_dict = {}
        temp_dict['texts'] = texts[i]
        temp_dict['labels'] = labels[i]
        list_of_dict.append(temp_dict)
    dataf = pd.DataFrame.from_dict(list_of_dict)    
    
    return dataf

df = load_imdb_sentiment_analysis_dataset('/Users/kushagraseth/Documents/Repositories/acl-imdb-nlp')
print(df.tail())
x = df['texts']
y = df['labels']

                                                   texts  labels
49995  Pepe le Moko, played by Charles Boyer, is some...       0
49996  I went to see "Evening" because of the cast. I...       0
49997  This was a very good movie and is absolutely u...       1
49998  As a Dane I'm proud of the handful of good Dan...       0
49999  Bette Midler is indescribable in this concert....       1


In [5]:
# Tokenization
def tokenize(text):
    return [i.text for i in tokenizer(text)]

# Text Lemmatization
def lemmatization(text):
    doc = nlp(text)
    temp_list = []
    for token in doc:
        temp_list.append(token)
    return ' '.join(map(str, temp_list))

# Stop Word Removal
def stopword_removal(text):
    temp_list = []
    for item in tokenize(text):
        if item not in stopwords:
            temp_list.append(item)
    return ' '.join(temp_list)

In [6]:
# Pre-Process Text
def preprocess_text(text):
    # Removing all HTML Tags
    text = re.sub(r'<.*?>', '', text)
    # Removing links
    text = re.sub(r'http\S+', '', text)
    # Remove Text Contractions
    text = contractions.fix(text)
    # Removing special characters and numbers
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
#     # Lemmatization
#     text = lemmatization(text)
#     # Stop word Removal
#     text = stopword_removal(text)
    # Removing single characters
    text = re.sub(r's+[a-zA-Z]s+', '', text)
    # Replacing multi-spaces by a single space
    text = re.sub(r'\s+', ' ', text)
    return text

In [7]:
x_corpus = []
for idx, row in x.items():
    sentence = preprocess_text(row)
    x_corpus.append(sentence)
print(x_corpus)    

Paranoid Park is about Alex a 16 year old skater who cau the death even if accidentally of a security guard As the movie goes on Alex deals with little issues like his parent s divorce and the sexual heat of his girlfriend Jennifer played by Taylor Momsen who you can currently see in Gossip Girl while he is consumed by guilt I had seen just one movie by Gus Van Sant so far Elephant and I can assure you that he kept all his manneri while doing this one Once again he cro time lines Once again he u weird filming techniques like the never ending shots of people walking maybe you like these filming techniques I know I do not The plot feels unsatisfactory to me its almost to simple to be explored for such a long time The feeling I get same with Elephant is that Gus Van Sant tries too hard to make an artistic movie which cau the movie itself to loose substance Also Gabe Nevins was pretty bad as Alex there is a difference between looking alienated or detached and looking like a robot If you ar

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



This is a truly awful film What they have done is taken a TV show which was never aimed at young children given it the George Lucas treatment i e ruined it by kiddifying it to appeal to the younger audience OK so the Thunderbirds TV show was not exactly the most cerebral of shows in fact it was pretty formulaic but it was always enjoyable to watch especially when the models got blown up and the voice cast was not too bad This suffers from bad casting bad acting with the notable exceptions of Sophia Myles as Lady Penelope Ron Cook as Parker who seem to be the only cast members to have a clue about how their characters should be played after this travesty I would not let Frakes direct traffic The whole point of Thunderbirds was that it was about the whole Tracy family how they worked as a team preventing diters or coming to the rescue of those involved in diters Avoid this rubbish like the plague I only give it 1 out of 10 because a zero rating is not supported 
Not having seen the film 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



From the second the music swelled second one of the movie and it was movie hack tripe I knew I was in for a very long ride Horrendously clich d I laughed a lot and knew how the plot ended WELL before the ending they did not use Louisbourg particularly well and the costuming and hair were kind of awful My particular favourite makeup moment is that the only way they age Depardieu as far as I could see was by putting a straight hair wig on him instead of wavy I could go on about the ridiculous unsuitability of the music for a long time the movie could be improved massively by an 18th century score ETA AH it is that horrible moviemusic guy Patrick Doyle who is responsible for the score say no More He should NOT be allowed near historical movies he should stick to 20th century settings The visit to the notable people portion was also hilarious particularly his little visit to Madame Pompadour who was not particularly convincingly played I thought the only actor who appeared grounded in the 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
# Splitting the dataset into the Training set and Validation set
np.random.seed(32)
x_train, x_test, y_train, y_test = train_test_split(x_corpus, 
                                                    y, 
                                                    random_state = 0, 
                                                    test_size = 0.1, 
                                                    shuffle = True)

In [9]:
# Fit Tf-Idf Vectorizer to Data set
vectorizer = TfidfVectorizer(ngram_range = (1, 2), 
                                 min_df = 2)
x_train_vector = vectorizer.fit_transform(x_train)
x_test_vector = vectorizer.transform(x_test)

In [None]:
def classifier_score(clf):
    clf.fit(x_train_vector, y_train)
    score = clf.score(x_test_vector, y_test)
    y_pred = clf.predict(x_test_vector)
    cr = classification_report(y_test, y_pred)
    return score, cr

# Logistic Regression
clf_lr = classifier_score(LogisticRegression(random_state = 32))
print('Logistic Regression Score: ', clf_lr[0])
print('Logistic Regression Classification Report: \n', clf_lr[1])

# SVM
clf_svm = classifier_score(svm.SVC(kernel = 'linear'))
print('SVM Score: ', clf_svm[0])
print('SVM Classification Report: \n', clf_svm[1])

# Naive Bayes
clf_nb = classifier_score(BernoulliNB(alpha = 1 , binarize = 0.0 , fit_prior = False , class_prior = None))
print('Naive Bayes Score: ', clf_nb[0])
print('Naive Bayes Classification Report: \n', clf_nb[1])

# Decision Tree
clf_dt = classifier_score(DecisionTreeClassifier(random_state = 32))
print('Decision Tree Score: ', clf_dt[0])
print('Decision Tree Classification Report: \n', clf_dt[1])

# Random Forest
clf_rf = classifier_score(RandomForestClassifier(random_state = 32))
print('Random Forest Score: ', clf_rf[0])
print('Random Forest Classification Report: \n', clf_rf[1])

Logistic Regression Score:  0.9064
Logistic Regression Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.89      0.91      2511
           1       0.90      0.92      0.91      2489

    accuracy                           0.91      5000
   macro avg       0.91      0.91      0.91      5000
weighted avg       0.91      0.91      0.91      5000



In [None]:
# Line Plot Accuracy Comparison
accuracy_list = [clf_lr[0], clf_svm[0], clf_nb[0], clf_dt[0], clf_rf[0]]
model_list = ['Logistic Regression', 'SVM', 'Naive Bayes', 'Decision Tree', 'Random Forest']

plt.style.use('ggplot')
plt.plot(model_list, accuracy_list, color="green", linewidth = '3', marker = 'o')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Accuracy Comparison', color = 'black')
plt.legend(shadow = True, labelcolor = 'black')
plt.show()  