In [7]:
import sklearn
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
import sklearn.metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import warnings


# Silence every warning raised by the libraries for the rest of the session.
warnings.filterwarnings("ignore")


def _reviews_and_labels(frame):
    """Split a raw data frame into (cleaned review texts, labels).

    Column 0 holds the review text and column 1 the rating/sentiment label.
    Every '<br /><br />' marker inside a review is replaced by one space.
    """
    reviews = [text.replace('<br /><br />', ' ') for text in frame[0]]
    labels = list(frame[1])
    return reviews, labels


# The datasets are header-less, tab-separated files:
# first column is the review, second column is the rating/sentiment.
imdb_training_data = pd.read_csv('IMDB-train.txt', sep="\t", header=None)
imdb_validation_data = pd.read_csv('IMDB-valid.txt', sep="\t", header=None)
imdb_test_data = pd.read_csv('IMDB-test.txt', sep="\t", header=None)

# yelp_training_data = pd.read_csv('yelp-train.txt', sep = "\t", header = None)
# yelp_validation_data = pd.read_csv('yelp-valid.txt', sep = "\t", header = None)
# yelp_test_data = pd.read_csv('yelp-test.txt', sep = "\t", header = None)

imdb_train_reviews, imdb_train_labels = _reviews_and_labels(imdb_training_data)
imdb_valid_reviews, imdb_valid_labels = _reviews_and_labels(imdb_validation_data)
imdb_test_reviews, imdb_test_labels = _reviews_and_labels(imdb_test_data)

# Bag-of-words features with the vocabulary capped at 10000 terms.
# The vocabulary is learned from the training reviews only; the validation
# and test reviews are projected onto that same vocabulary.
countVectorizer = CountVectorizer(max_features=10000)

# The sparse count matrices are converted to dense arrays because the
# later cells stack them with np.concatenate.
vectorized_train_data = countVectorizer.fit_transform(imdb_train_reviews).toarray()
vectorized_valid_data = countVectorizer.transform(imdb_valid_reviews).toarray()
vectorized_test_data = countVectorizer.transform(imdb_test_reviews).toarray()


# Predefined split for cross validation on the stacked train + validation data:
# -1 marks a row that is never held out (training rows),
#  0 marks a row belonging to the single held-out fold (validation rows).
ps = PredefinedSplit(
    [-1] * len(vectorized_train_data) + [0] * len(vectorized_valid_data)
)


In [42]:
# Hyper-parameter search for Multinomial Naive Bayes on IMDB.
# Only the smoothing strength `alpha` is tuned.
parameters = {"alpha": [1e-4, 0.01, 0.1, 1.0, 2.0, 10.0]}
clf = MultinomialNB()

# Stack train and validation rows; `ps` tells the search which rows form
# the held-out fold, so each candidate is fitted on train and scored on valid.
X_train_valid = np.concatenate((vectorized_train_data, vectorized_valid_data), axis=0)
y_train_valid = imdb_train_labels + imdb_valid_labels

grid_mnb = GridSearchCV(estimator=clf, param_grid=parameters, cv=ps)
grid_mnb.fit(X_train_valid, y_train_valid)
print("Best params for Multinomial Naive Bayes:", grid_mnb.best_params_)

# score() evaluates the refitted best estimator on the untouched test set.
mnb_test_accuracy = grid_mnb.score(vectorized_test_data, imdb_test_labels)
print('Optimal accuracy of Multinomial Naive Bayes on IMDB dataset:', mnb_test_accuracy)

Best params for Multinomial Naive Bayes: {'alpha': 0.01}
Optimal accuracy of Multinomial Naive Bayes on IMDB dataset: 0.81696


In [47]:
# Hyper-parameter search for Linear SVM on IMDB.
# Grid: 10 values of C x 5 tolerances x 10 iteration caps.
parameters = {
    'C': np.linspace(0.001, 10, 10),
    'tol': np.linspace(1e-9, 1e-5, 5),
    "max_iter": range(1000, 10001, 1000),
}

clf = LinearSVC()

# Stack train and validation rows; `ps` marks the validation rows as the
# single held-out fold used to score each candidate.
X_train_valid = np.concatenate((vectorized_train_data, vectorized_valid_data), axis=0)
y_train_valid = imdb_train_labels + imdb_valid_labels

grid_linear_svc = GridSearchCV(estimator=clf, param_grid=parameters, cv=ps)
grid_linear_svc.fit(X_train_valid, y_train_valid)
print("Best params for Linear SVM:", grid_linear_svc.best_params_)

# score() evaluates the refitted best estimator on the untouched test set.
linear_svc_test_accuracy = grid_linear_svc.score(vectorized_test_data, imdb_test_labels)
print('Optimal accuracy of Linear SVC on IMDB dataset:', linear_svc_test_accuracy)





Best estimator for Linear SVM: {'C': 0.001, 'tol': 1e-09}
Optimal accuracy of Linear SVC on IMDB dataset: 0.87152


In [9]:
# Hypertune for decision trees on imdb
# Grid: split criterion x splitter strategy x maximum depth (15..25).
parameters = {'criterion':['gini', 'entropy'], 'splitter':['best', 'random'],'max_depth':range(15, 26, 1)}
# A fixed random_state makes the search reproducible: the grid includes
# splitter='random', so an unseeded tree would select different "best"
# parameters and report a different accuracy on every run.
clf = DecisionTreeClassifier(random_state=0)
grid_trees = GridSearchCV(clf, parameters, cv = ps)
# Train and validation rows are stacked; `ps` marks the validation rows as
# the single held-out fold used to score each candidate.
grid_trees.fit(np.concatenate((vectorized_train_data, vectorized_valid_data)), imdb_train_labels + imdb_valid_labels)

print("Best params for Decision Trees Classifier:", grid_trees.best_params_)

# score() evaluates the refitted best estimator on the untouched test set.
print('Optimal accuracy of Decision Trees Classifier on IMDB dataset:', grid_trees.score(vectorized_test_data, imdb_test_labels))

Best estimator for Decision Trees Classifier: {'criterion': 'gini', 'max_depth': 15.0, 'splitter': 'random'}
Optimal accuracy of Decision Trees Classifier on IMDB dataset: 0.72936


In [None]:
# Hypertune for Random Forests on imdb
# Grid: split criterion x number of trees x features considered per split.
# 'sqrt' replaces the former 'auto' value: for classifiers 'auto' meant
# sqrt(n_features), and newer scikit-learn releases no longer accept 'auto'.
parameters = {'criterion':['gini','entropy'],
          'n_estimators':[10,20,50,100],
          'max_features':['sqrt', 'log2', 100, 200]}

# The estimator must be a RandomForestClassifier: a DecisionTreeClassifier
# has no `n_estimators` parameter, so the grid search would fail with an
# invalid-parameter error. A fixed random_state keeps the run reproducible.
clf = RandomForestClassifier(random_state=0)
grid_forest = GridSearchCV(clf, parameters, cv = ps)
# Train and validation rows are stacked; `ps` marks the validation rows as
# the single held-out fold used to score each candidate.
grid_forest.fit(np.concatenate((vectorized_train_data, vectorized_valid_data)), imdb_train_labels + imdb_valid_labels)

print("Best params for Random Forests Classifier:", grid_forest.best_params_)

# score() evaluates the refitted best estimator on the untouched test set.
print('Optimal accuracy of Random Forests Classifier on IMDB dataset:', grid_forest.score(vectorized_test_data, imdb_test_labels))