# Multiclass Sentiment Analysis Using IMDB Reviews

**Authors:** Joseph Babel, Cameron Harte

**Description:**
We build several models using different classification algorithms in conjunction with n-gram and tf-idf features to find the best combination for classifying IMDB reviews into multiple sentiment classes.

**Task Distribution:**

Joseph Babel: Preprocess data, train and test SGD classifier, train and test SVM classifier, build confusion matrix with better prediction metrics

Cameron Harte: Build csv files for train and test data, build ngram models, train and test NB classifier, train and test ME classifier

**Required Modules:**
Python 3.8, scikit-learn 0.24.1, nltk 3.5

## Download (Only If Missing Stopwords File)

In [1]:
import nltk
# Fetch the NLTK 'stopwords' package; preprocessing later reads it
# through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Imports

In [1]:
# FILE I/O
import csv  # generate csv files
import os  # operating system functions

# DATA PREPROCESSING
import re  # regex
from nltk.corpus import stopwords # stopwords
from nltk.stem.porter import * # 

# FEATURE EXTRACTOR
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# CLASSIFIERS
from sklearn.linear_model import SGDClassifier # stochastic gradient descent
from sklearn.naive_bayes import MultinomialNB # naive bayes

# METRICS
from sklearn.metrics import accuracy_score

# UTILITY
import math

## Create CSV file
This creates the train and test CSV files. Each row holds a row number, the raw review text, and its classification label (0 = mostly negative through 4 = mostly positive).

In [2]:
def create_csv_file(filename, train):
    """Write a CSV of labeled reviews gathered from the ``Labeled-Data`` tree.

    Every ``.txt`` file in the five sentiment folders under
    ``Labeled-Data/Train/`` (or ``Labeled-Data/Test/``) becomes one row of
    ``row_number, text, classification``. Folder paths are relative to the
    current working directory.

    Args:
        filename: Path of the CSV file to create; an existing file is
            overwritten.
        train: Truthy to read the ``Train`` split, falsy to read ``Test``.

    Raises:
        FileNotFoundError: If one of the sentiment folders does not exist.
    """
    split = "Train" if train else "Test"

    # (folder name, label) pairs; classes are written in label order 0..4,
    # which also determines the running row number.
    class_folders = (
        ("Mostly-Negative", 0),
        ("Slightly-Negative", 1),
        ("Neutral", 2),
        ("Slightly-Positive", 3),
        ("Mostly-Positive", 4),
    )

    count = 0

    with open(filename, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(['row_number', 'text', 'classification'])
        for folder, classification in class_folders:
            directory = f"Labeled-Data/{split}/{folder}/"
            # os.listdir returns names in arbitrary order; sorting makes the
            # row numbering reproducible between runs and machines.
            for name in sorted(os.listdir(directory)):
                if not name.endswith(".txt"):
                    continue
                # `with` closes the review file even if reading it fails.
                with open(directory + name, "r") as review_file:
                    data = review_file.read()
                # The review is deliberately wrapped in literal double quotes
                # (on top of the csv module's own quoting) so the file content
                # stays the same as before for existing readers.
                writer.writerow([count, f'"{data}"', classification])
                count += 1

In [3]:
# Build the training CSV from Labeled-Data/Train/ and the test CSV from
# Labeled-Data/Test/.
create_csv_file("imdb_train.csv", train = True)
create_csv_file("imdb_test.csv", train = False)

## Create List of Reviews and Classifications
`review_text_train`, `review_text_test` - hold the raw review text that is later converted into n-grams.

`y_train`, `y_test` - hold the classification labels used to train and to test each model.

In [4]:
def create_reviews_list(filename):
    """Read review texts and labels from a CSV file.

    The first row is treated as a header and skipped. For every following
    row the second column is taken as the review text and the third column
    as its classification.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(review_text, classification)`` tuple of two parallel lists.
        Both contain strings exactly as stored in the CSV; labels are not
        converted to integers here.

    Raises:
        IndexError: If a data row has fewer than three columns.
    """
    review_text = []
    classification = []
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded in quoted review text are returned unchanged instead of being
    # rewritten by universal-newline translation.
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header; the default tolerates an empty file
        for row in reader:
            review_text.append(row[1])
            classification.append(row[2])
    return review_text, classification

In [5]:
# Load review texts and their (string) labels for both splits.
review_text_train, y_train = create_reviews_list("imdb_train.csv")
review_text_test, y_test = create_reviews_list("imdb_test.csv")

# The CSV stores labels as text; turn them into integers for the classifiers.
y_train = [int(label) for label in y_train]
y_test = [int(label) for label in y_test]

## Preprocess Data
1) Remove html tags

2) Remove special characters

3) Convert to lowercase

4) Remove stopwords

5) Apply stemming


In [6]:
def preprocess_data(review_text):
    """Clean every review in ``review_text`` in place and return the list.

    Each review goes through, in order: removal of ``<...>`` tags, removal
    of every character that is not an ASCII letter, a period or a space,
    lowercasing, removal of English stop words, and Porter stemming of the
    remaining whitespace-separated words.

    Args:
        review_text: List of review strings. The list is modified in place.

    Returns:
        The same list object, now holding the cleaned reviews.
    """
    stemmer = PorterStemmer()
    # Build the stop-word set once. The original rebuilt it for every single
    # word, which dominated the running time without changing the result.
    stop_words = set(stopwords.words('english'))
    for index, row in enumerate(review_text):
        # NOTE(review): tags are replaced by the empty string, so words on
        # either side of a tag such as <br /> are joined - confirm intended.
        row = re.sub(r'<.*?>', '', row)  # remove html tags
        row = re.sub(r'[^a-zA-Z. ]', '', row)  # remove special characters
        row = row.lower()  # convert to lowercase
        # Drop stop words, then stem what is left.
        review_text[index] = " ".join(
            stemmer.stem(word) for word in row.split() if word not in stop_words
        )
    return review_text

In [7]:
review_text_train=preprocess_data(review_text_train)
review_text_test=preprocess_data(review_text_test)

## Convert Review Text Into N-Grams

In [8]:
def text_to_ngram(review_text_train, review_text_test, ngram_range, tfidf):
    """Turn train and test review text into word n-gram feature matrices.

    Args:
        review_text_train: Training reviews; the vocabulary is learned from
            these only.
        review_text_test: Test reviews; transformed with the vocabulary
            learned from the training reviews.
        ngram_range: ``(min_n, max_n)`` tuple giving the lower and upper
            bound of the n-gram sizes to extract.
        tfidf: Truthy to weight features with tf-idf (``TfidfVectorizer``),
            falsy to use raw counts (``CountVectorizer``).

    Returns:
        ``(x_train, x_test)`` document-term matrices.
    """
    if tfidf:
        # use_idf=True enables inverse-document-frequency re-weighting.
        vectorizer = TfidfVectorizer(use_idf=True, analyzer='word', ngram_range=ngram_range)
    else:
        vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range)

    # Fit on the training data only, so the test data never influences the
    # vocabulary (or the idf weights).
    x_train = vectorizer.fit_transform(review_text_train)
    x_test = vectorizer.transform(review_text_test)
    return x_train, x_test
                                    

## SGD Classifier (Stochastic Gradient Descent)

In [9]:
def sgd_classifier(review_text_train, y_train, review_text_test, ngram_range, tfidf):
    """Train an SGD classifier on n-gram features and predict the test labels.

    The reviews are vectorized with ``text_to_ngram`` using the given
    ``ngram_range`` and ``tfidf`` setting. The model uses hinge loss with
    an L1 penalty.

    Returns:
        The labels predicted for ``review_text_test``.
    """
    features_train, features_test = text_to_ngram(
        review_text_train, review_text_test, ngram_range, tfidf
    )
    # fit() returns the fitted estimator, so training can be chained.
    model = SGDClassifier(loss="hinge", penalty="l1").fit(features_train, y_train)
    return model.predict(features_test)

## SVM Classifier (Support Vector Machine)

In [10]:
def svm_classifier(review_text_train, y_train, review_text_test, ngram_range, tfidf):
    """Build n-gram features for an SVM classifier (model not implemented).

    TODO: only the feature extraction is present; no classifier is trained
    and the function returns None instead of predictions.
    """
    features_train, features_test = text_to_ngram(
        review_text_train, review_text_test, ngram_range, tfidf
    )
    return None

## NB Classifier (Naive Bayes)

In [11]:
def nb_classifier(review_text_train, y_train, review_text_test, ngram_range, tfidf):
    """Train a multinomial naive Bayes model and predict the test labels.

    The reviews are vectorized with ``text_to_ngram`` using the given
    ``ngram_range`` and ``tfidf`` setting.

    Returns:
        The labels predicted for ``review_text_test``.
    """
    features_train, features_test = text_to_ngram(
        review_text_train, review_text_test, ngram_range, tfidf
    )
    # fit() returns the fitted estimator, so training can be chained.
    model = MultinomialNB().fit(features_train, y_train)
    return model.predict(features_test)

## ME Classifier (Maximum Entropy)

In [12]:
def me_classifier(review_text_train, y_train, review_text_test, ngram_range, tfidf):
    """Build n-gram features for a maximum entropy classifier (model not implemented).

    TODO: only the feature extraction is present; no classifier is trained
    and the function returns None instead of predictions.
    """
    features_train, features_test = text_to_ngram(
        review_text_train, review_text_test, ngram_range, tfidf
    )
    return None

# Predictions
## SGD Classifier

In [37]:
# Raw-count features, one run per n-gram range.
y_pred_unigram = sgd_classifier(review_text_train, y_train, review_text_test, (1,1), False)
y_pred_bigram = sgd_classifier(review_text_train, y_train, review_text_test, (2,2), False)
y_pred_trigram = sgd_classifier(review_text_train, y_train, review_text_test, (3,3), False)
y_pred_unigram_bigram = sgd_classifier(review_text_train, y_train, review_text_test, (1,2), False)
y_pred_bigram_trigram = sgd_classifier(review_text_train, y_train, review_text_test, (2,3), False)
y_pred_unigram_bigram_trigram = sgd_classifier(review_text_train, y_train, review_text_test, (1,3), False)

# Same n-gram ranges with tf-idf weighting.
y_pred_unigram_tfidf = sgd_classifier(review_text_train, y_train, review_text_test, (1,1), True)
y_pred_bigram_tfidf = sgd_classifier(review_text_train, y_train, review_text_test, (2,2), True)
y_pred_trigram_tfidf = sgd_classifier(review_text_train, y_train, review_text_test, (3,3), True)
y_pred_unigram_bigram_tfidf = sgd_classifier(review_text_train, y_train, review_text_test, (1,2), True)
y_pred_bigram_trigram_tfidf = sgd_classifier(review_text_train, y_train, review_text_test, (2,3), True)
y_pred_unigram_bigram_trigram_tfidf = sgd_classifier(review_text_train, y_train, review_text_test, (1,3), True)

# Report test-set accuracy for every feature configuration.
print("SGD Classifier Accuracy Scores:")
print(f"Unigram:\t\t\t\t{accuracy_score(y_test, y_pred_unigram)}")
print(f"Bigram:\t\t\t\t\t{accuracy_score(y_test, y_pred_bigram)}")
print(f"Trigram:\t\t\t\t{accuracy_score(y_test, y_pred_trigram)}")
print(f"Unigram + Bigram:\t\t\t{accuracy_score(y_test, y_pred_unigram_bigram)}")
print(f"Bigram + Trigram:\t\t\t{accuracy_score(y_test, y_pred_bigram_trigram)}")
print(f"Unigram + Bigram + Trigram:\t\t{accuracy_score(y_test, y_pred_unigram_bigram_trigram)}\n")

print(f"Unigram w/ tf-idf:\t\t\t{accuracy_score(y_test, y_pred_unigram_tfidf)}")
print(f"Bigram w/ tf-idf:\t\t\t{accuracy_score(y_test, y_pred_bigram_tfidf)}")
print(f"Trigram w/ tf-idf:\t\t\t{accuracy_score(y_test, y_pred_trigram_tfidf)}")
print(f"Unigram + Bigram w/ tf-idf:\t\t{accuracy_score(y_test, y_pred_unigram_bigram_tfidf)}")
print(f"Bigram + Trigram w/ tf-idf:\t\t{accuracy_score(y_test, y_pred_bigram_trigram_tfidf)}")
print(f"Unigram + Bigram + Trigram w/ tf-idf:\t{accuracy_score(y_test, y_pred_unigram_bigram_trigram_tfidf)}\n")

SGD Classifier Accuracy Scores:
Unigram:				0.31666666666666665
Bigram:					0.31
Trigram:				0.27
Unigram + Bigram:			0.31
Bigram + Trigram:			0.30333333333333334
Unigram + Bigram + Trigram:		0.33

Unigram w/ tf-idf:			0.3
Bigram w/ tf-idf:			0.3
Trigram w/ tf-idf:			0.24333333333333335
Unigram + Bigram w/ tf-idf:		0.36333333333333334
Bigram + Trigram w/ tf-idf:		0.31666666666666665
Unigram + Bigram + Trigram w/ tf-idf:	0.36



## NB Classifier

In [38]:
# Raw-count features, one run per n-gram range.
y_pred_unigram = nb_classifier(review_text_train, y_train, review_text_test, (1,1), False)
y_pred_bigram = nb_classifier(review_text_train, y_train, review_text_test, (2,2), False)
y_pred_trigram = nb_classifier(review_text_train, y_train, review_text_test, (3,3), False)
y_pred_unigram_bigram = nb_classifier(review_text_train, y_train, review_text_test, (1,2), False)
y_pred_bigram_trigram = nb_classifier(review_text_train, y_train, review_text_test, (2,3), False)
y_pred_unigram_bigram_trigram = nb_classifier(review_text_train, y_train, review_text_test, (1,3), False)

# Same n-gram ranges with tf-idf weighting.
y_pred_unigram_tfidf = nb_classifier(review_text_train, y_train, review_text_test, (1,1), True)
y_pred_bigram_tfidf = nb_classifier(review_text_train, y_train, review_text_test, (2,2), True)
y_pred_trigram_tfidf = nb_classifier(review_text_train, y_train, review_text_test, (3,3), True)
y_pred_unigram_bigram_tfidf = nb_classifier(review_text_train, y_train, review_text_test, (1,2), True)
y_pred_bigram_trigram_tfidf = nb_classifier(review_text_train, y_train, review_text_test, (2,3), True)
y_pred_unigram_bigram_trigram_tfidf = nb_classifier(review_text_train, y_train, review_text_test, (1,3), True)

# Report test-set accuracy for every feature configuration.
print("NB Classifier Accuracy Scores:")
print(f"Unigram:\t\t\t\t{accuracy_score(y_test, y_pred_unigram)}")
print(f"Bigram:\t\t\t\t\t{accuracy_score(y_test, y_pred_bigram)}")
print(f"Trigram:\t\t\t\t{accuracy_score(y_test, y_pred_trigram)}")
print(f"Unigram + Bigram:\t\t\t{accuracy_score(y_test, y_pred_unigram_bigram)}")
print(f"Bigram + Trigram:\t\t\t{accuracy_score(y_test, y_pred_bigram_trigram)}")
print(f"Unigram + Bigram + Trigram:\t\t{accuracy_score(y_test, y_pred_unigram_bigram_trigram)}\n")

print(f"Unigram w/ tf-idf:\t\t\t{accuracy_score(y_test, y_pred_unigram_tfidf)}")
print(f"Bigram w/ tf-idf:\t\t\t{accuracy_score(y_test, y_pred_bigram_tfidf)}")
print(f"Trigram w/ tf-idf:\t\t\t{accuracy_score(y_test, y_pred_trigram_tfidf)}")
print(f"Unigram + Bigram w/ tf-idf:\t\t{accuracy_score(y_test, y_pred_unigram_bigram_tfidf)}")
print(f"Bigram + Trigram w/ tf-idf:\t\t{accuracy_score(y_test, y_pred_bigram_trigram_tfidf)}")
print(f"Unigram + Bigram + Trigram w/ tf-idf:\t{accuracy_score(y_test, y_pred_unigram_bigram_trigram_tfidf)}\n")

NB Classifier Accuracy Scores:
Unigram:				0.36333333333333334
Bigram:					0.3566666666666667
Trigram:				0.23666666666666666
Unigram + Bigram:			0.3566666666666667
Bigram + Trigram:			0.35
Unigram + Bigram + Trigram:		0.36333333333333334

Unigram w/ tf-idf:			0.35333333333333333
Bigram w/ tf-idf:			0.3466666666666667
Trigram w/ tf-idf:			0.23666666666666666
Unigram + Bigram w/ tf-idf:		0.36333333333333334
Bigram + Trigram w/ tf-idf:		0.3433333333333333
Unigram + Bigram + Trigram w/ tf-idf:	0.36333333333333334

