In [1]:
import sklearn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
import sklearn.metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import numpy as np
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")

# Load the tab-separated, header-less splits of both corpora.
# In every file column 0 is the review text and column 1 is the rating/sentiment.
imdb_training_data = pd.read_csv('IMDB-train.txt', sep="\t", header=None)
imdb_validation_data = pd.read_csv('IMDB-valid.txt', sep="\t", header=None)
imdb_test_data = pd.read_csv('IMDB-test.txt', sep="\t", header=None)

yelp_training_data = pd.read_csv('yelp-train.txt', sep="\t", header=None)
yelp_validation_data = pd.read_csv('yelp-valid.txt', sep="\t", header=None)
yelp_test_data = pd.read_csv('yelp-test.txt', sep="\t", header=None)


def _reviews_and_labels(frame):
    """Split a loaded data frame into parallel review and label lists.

    Each '<br /><br />' occurrence in a review is replaced by a single
    space; the labels are copied out of column 1 unchanged.
    """
    reviews = [text.replace('<br /><br />', ' ') for text in frame[0]]
    labels = list(frame[1])
    return reviews, labels


imdb_train_reviews, imdb_train_labels = _reviews_and_labels(imdb_training_data)
imdb_valid_reviews, imdb_valid_labels = _reviews_and_labels(imdb_validation_data)
imdb_test_reviews, imdb_test_labels = _reviews_and_labels(imdb_test_data)

yelp_train_reviews, yelp_train_labels = _reviews_and_labels(yelp_training_data)
yelp_valid_reviews, yelp_valid_labels = _reviews_and_labels(yelp_validation_data)
yelp_test_reviews, yelp_test_labels = _reviews_and_labels(yelp_test_data)

# The lemmatizer expects WordNet part-of-speech constants, whereas NLTK's
# tagger emits its own tag strings, so the tags are translated here.
def get_wordnet_pos(treebank_pos):
    """Return the WordNet part-of-speech constant for an NLTK POS tag.

    The tag is matched by prefix: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_pos.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are lemmatized as nouns.
    return wordnet.NOUN

def sentence_tokenize(sentence, lem = WordNetLemmatizer()):
    """Tokenize a text and return the lemma of every token, in order.

    The text is split with ``nltk.word_tokenize``, tagged with
    ``nltk.pos_tag``, and each token is lemmatized using the WordNet
    part of speech derived from its tag.

    ``lem`` is created once, when the function is defined, and reused on
    every call unless the caller passes a lemmatizer explicitly.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for word, tag in tagged:
        lemmas.append(lem.lemmatize(word, pos=get_wordnet_pos(tag)))
    return lemmas

# Settings common to both vectorizers: raw-text input, the lemmatizing
# tokenizer defined above, and a vocabulary capped at 10000 features.
_shared_vectorizer_options = {
    "input": "content",
    "tokenizer": sentence_tokenize,
    "max_features": 10000,
}

# Single-token (bag-of-words) features.
tfidf_vectorizer = TfidfVectorizer(**_shared_vectorizer_options)

# Single tokens plus adjacent token pairs (unigram + bigram features).
tuple_tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    **_shared_vectorizer_options
)

def tfidf_vectorize(sentences, ngram=False):
    """Transform ``sentences`` with one of the module-level vectorizers.

    Uses ``tuple_tfidf_vectorizer`` when ``ngram`` is truthy and
    ``tfidf_vectorizer`` otherwise. The chosen vectorizer must already
    have been fitted. The input is wrapped in ``tqdm_notebook`` so a
    progress bar is shown while transforming.

    Returns whatever the vectorizer's ``transform`` returns.
    """
    vectorizer = tuple_tfidf_vectorizer if ngram else tfidf_vectorizer
    return vectorizer.transform(tqdm_notebook(sentences))


# ---- IMDB -------------------------------------------------------------
# Both vectorizers learn their vocabulary from the IMDB training and
# validation reviews together; the test reviews are only transformed.
imdb_fit_corpus = imdb_train_reviews + imdb_valid_reviews
tfidf_vectorizer.fit(tqdm_notebook(imdb_fit_corpus))
tuple_tfidf_vectorizer.fit(tqdm_notebook(imdb_fit_corpus))

# Bag-of-words features, converted to dense arrays (later cells join the
# train and validation matrices with np.concatenate).
vectorized_train_data_imdb_bow = tfidf_vectorize(imdb_train_reviews).toarray()
vectorized_valid_data_imdb_bow = tfidf_vectorize(imdb_valid_reviews).toarray()
vectorized_test_data_imdb_bow = tfidf_vectorize(imdb_test_reviews).toarray()

# Unigram + bigram features, also dense.
vectorized_train_data_imdb_bigram = tfidf_vectorize(imdb_train_reviews, ngram=True).toarray()
vectorized_valid_data_imdb_bigram = tfidf_vectorize(imdb_valid_reviews, ngram=True).toarray()
vectorized_test_data_imdb_bigram = tfidf_vectorize(imdb_test_reviews, ngram=True).toarray()

# ---- Yelp -------------------------------------------------------------
# The same two vectorizer objects are re-fitted on the Yelp reviews. This
# is safe only because every IMDB matrix has already been produced above.
yelp_fit_corpus = yelp_train_reviews + yelp_valid_reviews
tfidf_vectorizer.fit(tqdm_notebook(yelp_fit_corpus))
tuple_tfidf_vectorizer.fit(tqdm_notebook(yelp_fit_corpus))

vectorized_train_data_yelp_bow = tfidf_vectorize(yelp_train_reviews).toarray()
vectorized_valid_data_yelp_bow = tfidf_vectorize(yelp_valid_reviews).toarray()
vectorized_test_data_yelp_bow = tfidf_vectorize(yelp_test_reviews).toarray()

vectorized_train_data_yelp_bigram = tfidf_vectorize(yelp_train_reviews, ngram=True).toarray()
vectorized_valid_data_yelp_bigram = tfidf_vectorize(yelp_valid_reviews, ngram=True).toarray()
vectorized_test_data_yelp_bigram = tfidf_vectorize(yelp_test_reviews, ngram=True).toarray()

# Fixed train/validation splits for the grid searches. The fold labels line
# up with a "training rows followed by validation rows" stacking: -1 for
# each training row and 0 for each validation row (see scikit-learn's
# PredefinedSplit for the meaning of these values).
ps_imdb = PredefinedSplit([-1] * len(imdb_train_reviews) + [0] * len(imdb_valid_reviews))
ps_yelp = PredefinedSplit([-1] * len(yelp_train_reviews) + [0] * len(yelp_valid_reviews))

HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=15000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=15000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




In [2]:
# Tune Multinomial Naive Bayes on both datasets (IMDB and Yelp), once with
# bag-of-words features and once with bigram features.
parameters = {"alpha": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0]}

# One tuple per experiment, in run order:
# (best-params message, accuracy message, predefined split,
#  train X, validation X, test X, train y, validation y, test y)
nb_experiments = [
    (  # Bag of words IMDB
        "Best params for Bag of words Multinomial Naive Bayes (IMDB):",
        'Optimal accuracy of Bag of Words Multinomial Naive Bayes on IMDB dataset:',
        ps_imdb,
        vectorized_train_data_imdb_bow, vectorized_valid_data_imdb_bow, vectorized_test_data_imdb_bow,
        imdb_train_labels, imdb_valid_labels, imdb_test_labels,
    ),
    (  # Bigram IMDB
        "Best params for Bigram Multinomial Naive Bayes (IMDB):",
        'Optimal accuracy of Bigram Multinomial Naive Bayes on IMDB dataset:',
        ps_imdb,
        vectorized_train_data_imdb_bigram, vectorized_valid_data_imdb_bigram, vectorized_test_data_imdb_bigram,
        imdb_train_labels, imdb_valid_labels, imdb_test_labels,
    ),
    (  # Bag of words Yelp
        "Best params for Bag of words Multinomial Naive Bayes (Yelp):",
        'Optimal accuracy of Bag of Words Multinomial Naive Bayes on Yelp dataset:',
        ps_yelp,
        vectorized_train_data_yelp_bow, vectorized_valid_data_yelp_bow, vectorized_test_data_yelp_bow,
        yelp_train_labels, yelp_valid_labels, yelp_test_labels,
    ),
    (  # Bigram Yelp
        "Best params for Bigram Multinomial Naive Bayes (Yelp):",
        'Optimal accuracy of Bigram Multinomial Naive Bayes on Yelp dataset:',
        ps_yelp,
        vectorized_train_data_yelp_bigram, vectorized_valid_data_yelp_bigram, vectorized_test_data_yelp_bigram,
        yelp_train_labels, yelp_valid_labels, yelp_test_labels,
    ),
]

for (nb_params_msg, nb_accuracy_msg, nb_split,
     nb_train_x, nb_valid_x, nb_test_x,
     nb_train_y, nb_valid_y, nb_test_y) in nb_experiments:
    clf = MultinomialNB()
    grid = GridSearchCV(clf, parameters, cv=nb_split)
    # Rows are stacked train-then-validation to match the predefined split.
    grid.fit(np.concatenate((nb_train_x, nb_valid_x)), nb_train_y + nb_valid_y)
    print(nb_params_msg, grid.best_params_)
    print(nb_accuracy_msg, grid.score(nb_test_x, nb_test_y))

Best params for Bag of words Multinomial Naive Bayes (IMDB): {'alpha': 0.1}
Optimal accuracy of Bag of Words Multinomial Naive Bayes on IMDB dataset: 0.82936
Best params for Bigram Multinomial Naive Bayes (IMDB): {'alpha': 0.0001}
Optimal accuracy of Bigram Multinomial Naive Bayes on IMDB dataset: 0.85812
Best params for Bag of words Multinomial Naive Bayes (Yelp): {'alpha': 0.1}
Optimal accuracy of Bag of Words Multinomial Naive Bayes on Yelp dataset: 0.464
Best params for Bigram Multinomial Naive Bayes (Yelp): {'alpha': 0.1}
Optimal accuracy of Bigram Multinomial Naive Bayes on Yelp dataset: 0.523


In [3]:
# Tune a linear SVM on both datasets (IMDB and Yelp), once with
# bag-of-words features and once with bigram features.
parameters = {'C': np.linspace(0.001, 10, 10), 'tol': np.linspace(1e-9, 1e-5, 5)}

# One tuple per experiment, in run order:
# (best-params message, accuracy message, predefined split,
#  train X, validation X, test X, train y, validation y, test y)
svm_experiments = [
    (  # Bag of words IMDB
        "Best params for Bag of words Linear SVM (IMDB):",
        'Optimal accuracy of Bag of Words Linear SVM on IMDB dataset:',
        ps_imdb,
        vectorized_train_data_imdb_bow, vectorized_valid_data_imdb_bow, vectorized_test_data_imdb_bow,
        imdb_train_labels, imdb_valid_labels, imdb_test_labels,
    ),
    (  # Bigram IMDB
        "Best params for Bigram Linear SVM (IMDB):",
        'Optimal accuracy of Bigram Linear SVM on IMDB dataset:',
        ps_imdb,
        vectorized_train_data_imdb_bigram, vectorized_valid_data_imdb_bigram, vectorized_test_data_imdb_bigram,
        imdb_train_labels, imdb_valid_labels, imdb_test_labels,
    ),
    (  # Bag of words Yelp
        "Best params for Bag of words Linear SVM (Yelp):",
        'Optimal accuracy of Bag of Words Linear SVM on Yelp dataset:',
        ps_yelp,
        vectorized_train_data_yelp_bow, vectorized_valid_data_yelp_bow, vectorized_test_data_yelp_bow,
        yelp_train_labels, yelp_valid_labels, yelp_test_labels,
    ),
    (  # Bigram Yelp (the double space in the first message is kept as-is)
        "Best params for Bigram Linear SVM  (Yelp):",
        'Optimal accuracy of Bigram Linear SVM on Yelp dataset:',
        ps_yelp,
        vectorized_train_data_yelp_bigram, vectorized_valid_data_yelp_bigram, vectorized_test_data_yelp_bigram,
        yelp_train_labels, yelp_valid_labels, yelp_test_labels,
    ),
]

for (svm_params_msg, svm_accuracy_msg, svm_split,
     svm_train_x, svm_valid_x, svm_test_x,
     svm_train_y, svm_valid_y, svm_test_y) in svm_experiments:
    clf = LinearSVC()
    grid = GridSearchCV(clf, parameters, cv=svm_split)
    # Rows are stacked train-then-validation to match the predefined split.
    grid.fit(np.concatenate((svm_train_x, svm_valid_x)), svm_train_y + svm_valid_y)
    print(svm_params_msg, grid.best_params_)
    print(svm_accuracy_msg, grid.score(svm_test_x, svm_test_y))

Best params for Bag of words Linear SVM (IMDB): {'C': 1.1119999999999999, 'tol': 1e-09}
Optimal accuracy of Bag of Words Linear SVM on IMDB dataset: 0.873
Best params for Bigram Linear SVM (IMDB): {'C': 1.1119999999999999, 'tol': 1e-09}
Optimal accuracy of Bigram Linear SVM on IMDB dataset: 0.88756
Best params for Bag of words Linear SVM (Yelp): {'C': 1.1119999999999999, 'tol': 1e-09}
Optimal accuracy of Bag of Words Linear SVM on Yelp dataset: 0.4915
Best params for Bigram Linear SVM  (Yelp): {'C': 1.1119999999999999, 'tol': 1e-09}
Optimal accuracy of Bigram Linear SVM on Yelp dataset: 0.512


In [4]:
# Tune a decision tree on both datasets (IMDB and Yelp), once with
# bag-of-words features and once with bigram features.
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': range(15, 26),
}

# One tuple per experiment, in run order:
# (best-params message, accuracy message, predefined split,
#  train X, validation X, test X, train y, validation y, test y)
tree_experiments = [
    (  # Bag of words IMDB
        "Best params for Bag of words Decision Trees Classifier (IMDB):",
        'Optimal accuracy of Bag of Words Decision Trees Classifier on IMDB dataset:',
        ps_imdb,
        vectorized_train_data_imdb_bow, vectorized_valid_data_imdb_bow, vectorized_test_data_imdb_bow,
        imdb_train_labels, imdb_valid_labels, imdb_test_labels,
    ),
    (  # Bigram IMDB
        "Best params for Bigram Decision Trees Classifier (IMDB):",
        'Optimal accuracy of Bigram Decision Trees Classifier on IMDB dataset:',
        ps_imdb,
        vectorized_train_data_imdb_bigram, vectorized_valid_data_imdb_bigram, vectorized_test_data_imdb_bigram,
        imdb_train_labels, imdb_valid_labels, imdb_test_labels,
    ),
    (  # Bag of words Yelp
        "Best params for Bag of words Decision Trees Classifier (Yelp):",
        'Optimal accuracy of Bag of Words Decision Trees Classifier on Yelp dataset:',
        ps_yelp,
        vectorized_train_data_yelp_bow, vectorized_valid_data_yelp_bow, vectorized_test_data_yelp_bow,
        yelp_train_labels, yelp_valid_labels, yelp_test_labels,
    ),
    (  # Bigram Yelp
        "Best params for Bigram Decision Trees Classifier (Yelp):",
        'Optimal accuracy of Bigram Decision Trees Classifier on Yelp dataset:',
        ps_yelp,
        vectorized_train_data_yelp_bigram, vectorized_valid_data_yelp_bigram, vectorized_test_data_yelp_bigram,
        yelp_train_labels, yelp_valid_labels, yelp_test_labels,
    ),
]

for (tree_params_msg, tree_accuracy_msg, tree_split,
     tree_train_x, tree_valid_x, tree_test_x,
     tree_train_y, tree_valid_y, tree_test_y) in tree_experiments:
    clf = DecisionTreeClassifier()
    grid = GridSearchCV(clf, parameters, cv=tree_split)
    # Rows are stacked train-then-validation to match the predefined split.
    grid.fit(np.concatenate((tree_train_x, tree_valid_x)), tree_train_y + tree_valid_y)
    print(tree_params_msg, grid.best_params_)
    print(tree_accuracy_msg, grid.score(tree_test_x, tree_test_y))

Best params for Bag of words Decision Trees Classifier (IMDB): {'criterion': 'gini', 'max_depth': 22, 'splitter': 'random'}
Optimal accuracy of Bag of Words Decision Trees Classifier on IMDB dataset: 0.73932
Best params for Bigram Decision Trees Classifier (IMDB): {'criterion': 'gini', 'max_depth': 19, 'splitter': 'random'}
Optimal accuracy of Bigram Decision Trees Classifier on IMDB dataset: 0.7284
Best params for Bag of words Decision Trees Classifier (Yelp): {'criterion': 'gini', 'max_depth': 16, 'splitter': 'random'}
Optimal accuracy of Bag of Words Decision Trees Classifier on Yelp dataset: 0.359
Best params for Bigram Decision Trees Classifier (Yelp): {'criterion': 'gini', 'max_depth': 19, 'splitter': 'random'}
Optimal accuracy of Bigram Decision Trees Classifier on Yelp dataset: 0.3985
