In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import numpy as np
import csv
from nltk.util import ngrams
from sklearn.naive_bayes import MultinomialNB
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import classification_report

filepath1 = "/Users/mgh/MGH/CE/Waterloo/MSCI 641 NLP/Text-Analytics/Assignment2/test_pos.csv"
with open(filepath1, 'r') as f:
    reader = csv.reader(f)
    test_pos_list = list(reader)
filepath2 = "/Users/mgh/MGH/CE/Waterloo/MSCI 641 NLP/Text-Analytics/Assignment2/test_neg.csv"
with open(filepath2, 'r') as f:
    reader = csv.reader(f)
    test_neg_list = list(reader)
filepath3 = "/Users/mgh/MGH/CE/Waterloo/MSCI 641 NLP/Text-Analytics/Assignment2/train_pos.csv"
with open(filepath3, 'r') as f:
    reader = csv.reader(f)
    train_pos_list = list(reader)
filepath4 = "/Users/mgh/MGH/CE/Waterloo/MSCI 641 NLP/Text-Analytics/Assignment2/train_neg.csv"
with open(filepath4, 'r') as f:
    reader = csv.reader(f)
    train_neg_list = list(reader)
filepath5 = "/Users/mgh/MGH/CE/Waterloo/MSCI 641 NLP/Text-Analytics/Assignment2/val_pos.csv"
with open(filepath5, 'r') as f:
    reader = csv.reader(f)
    val_pos_list = list(reader)
filepath6 = "/Users/mgh/MGH/CE/Waterloo/MSCI 641 NLP/Text-Analytics/Assignment2/val_neg.csv"
with open(filepath6, 'r') as f:
    reader = csv.reader(f)
    val_neg_list = list(reader)


In [2]:
def remove_bracket(file):  
    for j in range(len(file)):
        file[j][0] = file[j][0].replace("['","'")
        file[j][-1] = file[j][-1].replace("']","'")
    return file

In [3]:
def join_doc(listfile):
    for i in range(len(listfile)):
        listfile[i]="".join(listfile[i])
    return listfile

In [4]:
test_neg_list,test_pos_list,val_neg_list,val_pos_list,train_neg_list,train_pos_list=remove_bracket(test_neg_list),remove_bracket(test_pos_list),remove_bracket(val_neg_list),remove_bracket(val_pos_list),remove_bracket(train_neg_list),remove_bracket(train_pos_list)


In [5]:
# merge the positive and negative reviews
train_list = train_neg_list+train_pos_list
val_list = val_neg_list+val_pos_list
test_list = test_neg_list+test_pos_list
train_label = ['neg']*len(train_neg_list)+['pos']*len(train_pos_list)
val_label = ['neg']*len(val_neg_list)+['pos']*len(val_pos_list)
test_label = ['neg']*len(test_neg_list)+['pos']*len(test_pos_list)

In [6]:
train_list_uni, val_list_uni,test_list_uni= join_doc(train_list), join_doc(val_list), join_doc(test_list)
train_list_bi, val_list_bi,test_list_bi= join_doc(train_list), join_doc(val_list), join_doc(test_list)
train_list_mix, val_list_mix,test_list_mix= join_doc(train_list), join_doc(val_list), join_doc(test_list)


In [7]:
def bag_of_words(train_file, val_file, test_file, ngrams):
    vectorizer = CountVectorizer(ngram_range=ngrams)
    train_file = vectorizer.fit_transform(train_file)
    val_file= vectorizer.transform(val_file)
    test_file = vectorizer.transform(test_file)
    feature_list = vectorizer.get_feature_names()
    return train_file, val_file, test_file, feature_list

In [8]:
def gridsearchMNB(X_train,X_test,y_train,y_test):    
    tuned_parameters ={'alpha': list(range(20))}
    clf=GridSearchCV(MultinomialNB(),tuned_parameters,scoring="accuracy")
    clf.fit(X_train,y_train)
    print("Parameter tuned: alpha")
    print("Search space: alpha = ", list(range(20)))
    #print(clf.cv_results_['params'])
    print("Best parameters set found:",clf.best_params_)

    print("Optimized accuracy on validation set:",clf.score(X_test,y_test))
    print("Detailed classification report:")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    return clf.best_params_['alpha']

In [9]:
def model_accuracy(X_train,X_val,X_test,y_train,y_val,y_test,ngrams):
    
    X_train,X_val,X_test,feature_list = bag_of_words(X_train,X_val,X_test,ngrams)
    best_alpha = gridsearchMNB(X_train,X_val,y_train,y_val)
    nb = MultinomialNB(alpha=best_alpha)
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    print("**************************************")
    print("Model accuracy: ",metrics.accuracy_score(y_test, y_pred))
    print("Confusion matrix: ")
    print(metrics.confusion_matrix(y_test, y_pred))
    print("Detailed classification report:")
    print(classification_report(y_test, y_pred))

In [10]:
text_features = {'unigrams':(1,1),'bigrams':(2,2),'unigrams+bigrams':(1,2)}

In [None]:
for i in text_features:
    print("-------------------------------------------------------")
    print("Text features: ",i)
    model_accuracy(join_doc(train_list), join_doc(val_list), join_doc(test_list),train_label,val_label,test_label,text_features[i])

-------------------------------------------------------
Text features:  unigrams
Parameter tuned: alpha
Search space: alpha =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Best parameters set found: {'alpha': 1}
Optimized accuracy on validation set: 0.8091001238878252
Detailed classification report:
              precision    recall  f1-score   support

         neg       0.82      0.80      0.81     39946
         pos       0.80      0.82      0.81     39965

    accuracy                           0.81     79911
   macro avg       0.81      0.81      0.81     79911
weighted avg       0.81      0.81      0.81     79911

**************************************
Model accuracy:  0.8064746220842928
Confusion matrix: 
[[31861  8085]
 [ 7380 32586]]
Detailed classification report:
              precision    recall  f1-score   support

         neg       0.81      0.80      0.80     39946
         pos       0.80      0.82      0.81     39966

    accuracy             

In [10]:
model_accuracy(train_list_uni, val_list_uni,test_list_uni,train_label,val_label,test_label,text_features['unigrams'])

-------------------------------------------------------
Parameter tuned: alpha
Search space: alpha =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Best parameters set found: {'alpha': 1}
Optimized accuracy on validation set: 0.8091001238878252
Detailed classification report:
              precision    recall  f1-score   support

         neg       0.82      0.80      0.81     39946
         pos       0.80      0.82      0.81     39965

    accuracy                           0.81     79911
   macro avg       0.81      0.81      0.81     79911
weighted avg       0.81      0.81      0.81     79911

**************************************
Model accuracy:  0.8064746220842928
Confusion matrix: 
[[31861  8085]
 [ 7380 32586]]
Detailed classification report:
              precision    recall  f1-score   support

         neg       0.81      0.80      0.80     39946
         pos       0.80      0.82      0.81     39966

    accuracy                           0.81     79

In [None]:
train_list_uni, val_list_uni, test_list_uni,uni_feature = bag_of_words(train_list_uni, val_list_uni, test_list_uni)
print("Unigram feature list: ",uni_feature[:20])

gridsearchMNB(train_list_uni, val_list_uni,)




In [11]:
from sklearn.model_selection import GridSearchCV


X_train,X_test,y_train,y_test=train_list_uni,val_list_uni,train_label,val_label

tuned_parameters ={'alpha': list(range(20))}
clf=GridSearchCV(MultinomialNB(),tuned_parameters,scoring="accuracy")
clf.fit(X_train,y_train)

print(clf.cv_results_['params'])
print("Best parameters set found:",clf.best_params_)

print("Optimized accuracy:",clf.score(X_test,y_test))
print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
    

[{'alpha': 0}, {'alpha': 1}, {'alpha': 2}, {'alpha': 3}, {'alpha': 4}, {'alpha': 5}, {'alpha': 6}, {'alpha': 7}, {'alpha': 8}, {'alpha': 9}, {'alpha': 10}, {'alpha': 11}, {'alpha': 12}, {'alpha': 13}, {'alpha': 14}, {'alpha': 15}, {'alpha': 16}, {'alpha': 17}, {'alpha': 18}, {'alpha': 19}]
Best parameters set found: {'alpha': 1}
Optimized accuracy: 0.8091001238878252
Detailed classification report:
              precision    recall  f1-score   support

         neg       0.82      0.80      0.81     39946
         pos       0.80      0.82      0.81     39965

    accuracy                           0.81     79911
   macro avg       0.81      0.81      0.81     79911
weighted avg       0.81      0.81      0.81     79911



In [12]:
nb = MultinomialNB(alpha=clf.best_params_['alpha'])
nb.fit(train_list_uni, train_label)
test_pred = nb.predict(test_list_uni)
print(metrics.accuracy_score(test_label, test_pred))
print(metrics.confusion_matrix(test_label, test_pred))
print(classification_report(test_label, test_pred))

In [20]:
vectorizer = CountVectorizer(ngram_range=(2,2))
train_list_bi = vectorizer.fit_transform(train_list_bi)
val_list_bi = vectorizer.transform(val_list_bi)
test_list_bi = vectorizer.transform(test_list_bi)

In [22]:
bigram_feature = vectorizer.get_feature_names()
print(bigram_feature[:50000])



In [23]:
X_train,X_test,y_train,y_test=train_list_bi,val_list_bi,train_label,val_label

tuned_parameters ={'alpha': list(range(20))}
clf=GridSearchCV(MultinomialNB(),tuned_parameters,scoring="accuracy")
clf.fit(X_train,y_train)

print(clf.cv_results_['params'])
print("Best parameters set found:",clf.best_params_)

print("Optimized accuracy:",clf.score(X_test,y_test))
print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

[{'alpha': 0}, {'alpha': 1}, {'alpha': 2}, {'alpha': 3}, {'alpha': 4}, {'alpha': 5}, {'alpha': 6}, {'alpha': 7}, {'alpha': 8}, {'alpha': 9}, {'alpha': 10}, {'alpha': 11}, {'alpha': 12}, {'alpha': 13}, {'alpha': 14}, {'alpha': 15}, {'alpha': 16}, {'alpha': 17}, {'alpha': 18}, {'alpha': 19}]
Best parameters set found: {'alpha': 1}
Optimized accuracy: 0.825318166460187
Detailed classification report:
              precision    recall  f1-score   support

         neg       0.84      0.80      0.82     39946
         pos       0.81      0.85      0.83     39965

    accuracy                           0.83     79911
   macro avg       0.83      0.83      0.83     79911
weighted avg       0.83      0.83      0.83     79911



In [24]:
nb = MultinomialNB(alpha=clf.best_params_['alpha'])
nb.fit(train_list_bi, train_label)
test_pred = nb.predict(test_list_bi)
print(metrics.accuracy_score(test_label, test_pred))
print(metrics.confusion_matrix(test_label, test_pred))
print(classification_report(test_label, test_pred))

0.8242691961157274
[[31872  8074]
 [ 5969 33997]]
              precision    recall  f1-score   support

         neg       0.84      0.80      0.82     39946
         pos       0.81      0.85      0.83     39966

    accuracy                           0.82     79912
   macro avg       0.83      0.82      0.82     79912
weighted avg       0.83      0.82      0.82     79912



In [25]:
train_list_mix, val_list_mix,test_list_mix= join_doc(train_list), join_doc(val_list), join_doc(test_list)

In [26]:
vectorizer = CountVectorizer(ngram_range=(1,2))
train_list_mix = vectorizer.fit_transform(train_list_mix)
val_list_mix = vectorizer.transform(val_list_mix)
test_list_mix = vectorizer.transform(test_list_mix)

In [27]:
mixgram_feature = vectorizer.get_feature_names()
print(mixgram_feature[:50000])



In [28]:
X_train,X_test,y_train,y_test=train_list_mix,val_list_mix,train_label,val_label

tuned_parameters ={'alpha': list(range(20))}
clf=GridSearchCV(MultinomialNB(),tuned_parameters,scoring="accuracy")
clf.fit(X_train,y_train)

print(clf.cv_results_['params'])
print("Best parameters set found:",clf.best_params_)

print("Optimized accuracy:",clf.score(X_test,y_test))
print("Detailed classification report:")
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))

[{'alpha': 0}, {'alpha': 1}, {'alpha': 2}, {'alpha': 3}, {'alpha': 4}, {'alpha': 5}, {'alpha': 6}, {'alpha': 7}, {'alpha': 8}, {'alpha': 9}, {'alpha': 10}, {'alpha': 11}, {'alpha': 12}, {'alpha': 13}, {'alpha': 14}, {'alpha': 15}, {'alpha': 16}, {'alpha': 17}, {'alpha': 18}, {'alpha': 19}]
Best parameters set found: {'alpha': 1}
Optimized accuracy: 0.8324761296942849
Detailed classification report:
              precision    recall  f1-score   support

         neg       0.85      0.81      0.83     39946
         pos       0.82      0.85      0.84     39965

    accuracy                           0.83     79911
   macro avg       0.83      0.83      0.83     79911
weighted avg       0.83      0.83      0.83     79911



In [30]:
nb = MultinomialNB(alpha=clf.best_params_['alpha'])
nb.fit(train_list_mix, train_label)
test_pred = nb.predict(test_list_mix)
print(metrics.accuracy_score(test_label, test_pred))
print(metrics.confusion_matrix(test_label, test_pred))
print(classification_report(test_label, test_pred))

0.8314395835418961
[[32512  7434]
 [ 6036 33930]]
              precision    recall  f1-score   support

         neg       0.84      0.81      0.83     39946
         pos       0.82      0.85      0.83     39966

    accuracy                           0.83     79912
   macro avg       0.83      0.83      0.83     79912
weighted avg       0.83      0.83      0.83     79912



In [3]:
def bigrams_trans(listfile):
    temp_list=[]
    for i in range(len(listfile)):
        temp_list.append(list(ngrams(listfile[i],2)))
    return listfile

In [7]:
# produce the bigram test features
train_list_bi, val_list_bi, test_list_bi = bigrams_trans(train_list),bigrams_trans(val_list),bigrams_trans(test_list)
print(train_list_bi[:5])

[["'this'", " 'is'", " 'just'", " 'one'", " 'of'", " 'those'", " 'toys'", " 'that'", " 'i'", " 'found'", " 'annoying'", " 'but'", " 'my'", " 'little'", " 'girl'", " 'seemed'", " 'to'", " 'love'", " '.'"], ["'most'", " 'of'", " 'our'", " 'knives'", " 'were'", " 'too'", " 'thick'", " 'at'", " 'the'", " 'handle'", " 'or'", " 'too'", " 'wide'", " 'at'", " 'the'", " 'blade'", " 'to'", " 'sit'", " 'down'", " 'inside'", " 'it'", " 'fully'", " '.'"], ["'like'", " 'close'", " 'to'", " 'two'", " 'weeks'", " 'to'", " 'even'", " 'leave'", " 'the'", " 'facility'", " '.'"], ["'this'", " 'dove'", " 'product'", " 'is'", " 'advertised'", " 'as'", " 'a'", " 'skin'", " 'nourishing'", " 'wash'", " 'with'", " 'a'", " 'new'", " 'nutriummoisture'", " 'technology'", " '.'"], ["'like'", " 'i'", " 'said'", " 'get'", " 'what'", " 'you'", " 'pay'", " 'for'", " 'these'", " 'didn'", ' "\'"', " 't'", " 'even'", " 'make'", " 'it'", " 'to'", " 'the'", " 'woods'", " '.'"]]


In [None]:
train_list = np.asarray(train_list)
val_list = np.asarray(val_list)
test_list = np.asarray(test_list)

train_list_bi = np.asarray(train_list_bi)
val_list_bi = np.asarray(val_list_bi)
test_list_bi = np.asarray(test_list_bi)

train_label = np.asarray(train_label)
val_label = np.asarray(val_label)
test_label = np.asarray(test_label)

In [None]:
print(len(train_label),len(train_list))

In [None]:
from scipy.sparse import csr_matrix
indptr = [0]
indices = []
data = []
vocabulary = {}
for d in train_list[:300000]:
    for term in d:
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    indptr.append(len(indices))
train_matrix = csr_matrix((data, indices, indptr), dtype=int).toarray()

In [None]:
from scipy.stats import randint
print(randint.rvs(0,1000,size=100))

In [None]:
type(train_list)

In [None]:
from sklearn.feature_extraction import DictVectorizer
vect = DictVectorizer()
vect.fit_transform(train_list).toarray()

In [None]:
X_train,X_test,y_train,y_test=train_list,val_list,train_label,val_label
clf = MultinomialNB(alpha=1)

X_train,X_test,y_train,y_test=train_list,val_list,train_label,val_label
clf.fit(X_train,y_train)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
def test_RandomizedSearchCV():

    '''
    Use RandomizedSearchCV and LogisticRegression, to improve C, multi_class.
    :return:  None
    '''
    X_train,X_test,y_train,y_test=train_list,val_list,train_label,val_label
    y_train=MultiLabelBinarizer().fit_transform(y_train)
    y_test=MultiLabelBinarizer().fit_transform(y_test)
    
    tuned_parameters ={'alpha': randint.rvs(0,10,size=5)}
    clf=RandomizedSearchCV(MultinomialNB(),tuned_parameters,scoring="accuracy")
    clf.fit(X_train,y_train)
    print("Best parameters set found:",clf.best_params_)
    print("Randomized Grid scores:")
    for params, mean_score, scores in clf.grid_scores_:
             print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() * 2, params))

    print("Optimized Score:",clf.score(X_test,y_test))
    print("Detailed classification report:")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))

In [None]:
test_RandomizedSearchCV()

In [None]:
print([1:10])