# Machine Learning Research: Sentiment Analysis with Different Models on Different Datasets

Import needed packages

In [40]:
import nltk
from nltk.tag import pos_tag
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import twitter_samples
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re,string, random
import pandas as pd
import pymongo
import numpy as np
pd.set_option('display.max_columns', 20)
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import random as rn
from text_processor import*

## Now Test Models on Twitter Sentiment 140 Sample

Get data of Sentiment 140 sample from MongoDB

In [2]:
# Connect to the local MongoDB server and open two cursors over the
# Sentiment140 sample: a seeded random 20,000-row test split and the
# complementary training split (selected via the "sequence_no" field).
remote_db="localhost"
remote_link = "mongodb://" + remote_db + ":27017/"
my_client = pymongo.MongoClient(remote_link)
try:
    info = my_client.server_info()  # Forces a call, so an unreachable server fails here.
    my_db=my_client.get_database('Twitter_Sentiment140')
    my_tb=my_db['Sentiment140']
    np.random.seed(10)  # fixed seed so the same sequence numbers are drawn on every run
    # Draw 20,000 distinct sequence numbers from 1..100000 for the test split.
    random_test_set_seq=np.random.choice(range(1,100001), 20000, replace=False).tolist()
    raw_test_data=my_tb.find({"sequence_no": {'$in': random_test_set_seq}})
    raw_train_data=my_tb.find({"sequence_no": {'$nin': random_test_set_seq}})
    print("Success in accessing the database")
except pymongo.errors.ServerSelectionTimeoutError:
    # Referenced through the already-imported pymongo package: the bare name
    # ServerSelectionTimeoutError is not imported explicitly in this notebook,
    # so the handler itself could raise NameError when the server is down.
    # NOTE(review): the cursors stay undefined on this path, so later cells will fail.
    print("Database is down.")


Success in accessing the database


Use lists/matrices to hold all training and test data (preparing inputs for the models)

In [3]:
# Drain both Mongo cursors into parallel lists: label, pre-tokenized text and
# raw text share the same position within each split.
train_label, train_token_list_matrix, train_raw_text = [], [], []
test_label, test_token_list_matrix, test_raw_text = [], [], []
for cursor, labels, token_lists, raw_texts in (
    (raw_train_data, train_label, train_token_list_matrix, train_raw_text),
    (raw_test_data, test_label, test_token_list_matrix, test_raw_text),
):
    for document in cursor:
        labels.append(document['label'])
        token_lists.append(document['tokenized_text'])
        raw_texts.append(document['text'])

For the Naive Bayes model, the input training and test data have to be processed further

In [4]:
# NLTK's NaiveBayesClassifier consumes {feature: value} dicts. Each tweet becomes
# a bag of tokens ({token: True}); training rows are paired with a string label.
train_dataset_for_NB = []
for position, tokens in enumerate(train_token_list_matrix):
    features = {token: True for token in tokens}
    sentiment = "Negative" if train_label[position] == -1 else "Positive"
    train_dataset_for_NB.append((features, sentiment))

# Test rows carry features only; their labels stay in test_label.
test_dataset_for_NB = [
    {token: True for token in tokens} for tokens in test_token_list_matrix
]

    

Train the Naive Bayes Model

In [5]:
# Fit on the first 80,000 (features, label) pairs of the training split.
nb_training_examples = train_dataset_for_NB[:80000]
NB_classifier = NaiveBayesClassifier.train(nb_training_examples)

Analyze the performance of NB_classifier on test dataset

In [6]:
# Map the classifier's string output back to the numeric labels (-1 / 1) and
# score it against the held-out Twitter labels.
NB_result = [
    -1 if NB_classifier.classify(features) == 'Negative' else 1
    for features in test_dataset_for_NB
]
nb_accuracy = accuracy_score(test_label, NB_result)
nb_confusion = confusion_matrix(test_label, NB_result, labels=[-1, 1])
print("The accuracy score is: ", nb_accuracy)
print("Confusion Matrix is: ", nb_confusion)

The accuracy score is:  0.74925
Confusion Matrix is:  [[8378 1634]
 [3381 6607]]


Analyze the performance of NB_classifier on training dataset

In [7]:
# Same evaluation on the 80,000 examples the classifier was fitted on
# (each entry is a (features, label) pair, so only the features are classified).
NB_result = [
    -1 if NB_classifier.classify(features) == 'Negative' else 1
    for features, _ in train_dataset_for_NB[:80000]
]
nb_accuracy = accuracy_score(train_label[:80000], NB_result)
nb_confusion = confusion_matrix(train_label[:80000], NB_result, labels=[-1, 1])
print("The accuracy score is: ", nb_accuracy)
print("Confusion Matrix is: ", nb_confusion)

The accuracy score is:  0.8395375
Confusion Matrix is:  [[36249  3589]
 [ 9248 30914]]


Now test VADER's performance on the test dataset. Note that VADER is rule-based; it needs no training data

In [8]:
# Score each raw tweet with VADER and bucket the compound score into
# negative (-1), neutral (0) or positive (1).
# The cutoffs follow VADER's documented convention: compound <= -0.05 is
# negative and compound >= 0.05 is positive. The original used a strict
# "> 0.05", which labelled a score of exactly 0.05 as neutral.
sia = SentimentIntensityAnalyzer()
vader_result=[]
for each_tweet in test_raw_text:
    score=sia.polarity_scores(each_tweet)['compound']
    if score <= -0.05:
        vader_result.append(-1)
    elif score >= 0.05:
        vader_result.append(1)
    else:
        vader_result.append(0)
# A neutral (0) prediction is never equal to a -1/1 label, so every neutral
# prediction counts as an error in the accuracy below.
print("The accuracy score is: ",accuracy_score(test_label,vader_result))
print("Confusion Matrix is: ",confusion_matrix(test_label,vader_result,labels=[-1,0,1]))

The accuracy score is:  0.51845
Confusion Matrix is:  [[4227 2636 3149]
 [   0    0    0]
 [1003 2843 6142]]


Introduce Doc2Vec

In [9]:
# Load the pre-trained Doc2Vec model used to embed token lists for SVM, RF and the neural network.
# NOTE(review): `pickle` and `TaggedDocument` are not referenced anywhere in the visible notebook.
import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# NOTE(review): despite the ".pickle" suffix the file is read through gensim's own
# Doc2Vec.load; gensim persistence is pickle-based, so only load trusted files — confirm provenance.
doc2vec_model= Doc2Vec.load("Doc2Vec/doc2vec_model.pickle")

For SVM, RF and the neural network, the training and test data have to be further processed with Doc2Vec

In [10]:
# Embed every token list with the loaded Doc2Vec model, training rows first
# and test rows second.
train_dataset_vectorize = [
    doc2vec_model.infer_vector(tokens) for tokens in train_token_list_matrix
]
test_dataset_vectorize = [
    doc2vec_model.infer_vector(tokens) for tokens in test_token_list_matrix
]

Train Support Vector Machine Model

In [11]:
# RBF-kernel SVM fitted on the first 80,000 Doc2Vec vectors and their labels.
svm_train_features = train_dataset_vectorize[:80000]
svm_train_targets = train_label[:80000]
svm_model = SVC(kernel='rbf', C=1, random_state=10)
svm_model.fit(svm_train_features, svm_train_targets)


SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)

Measure SVM's performance on test dataset

In [12]:
# Score the Twitter-trained SVM on the held-out Twitter vectors.
predicted_result = svm_model.predict(test_dataset_vectorize)
svm_accuracy = accuracy_score(test_label, predicted_result)
svm_confusion = confusion_matrix(test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", svm_accuracy)
print("Confusion Matrix is: ", svm_confusion)

The accuracy score is:  0.69365
Confusion Matrix is:  [[6614 3398]
 [2729 7259]]


Measure SVM's performance on training dataset

In [13]:
# Score the SVM on the same 80,000 vectors it was fitted on.
svm_train_targets = train_label[:80000]
predicted_result = svm_model.predict(train_dataset_vectorize[:80000])
svm_accuracy = accuracy_score(svm_train_targets, predicted_result)
svm_confusion = confusion_matrix(svm_train_targets, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", svm_accuracy)
print("Confusion Matrix is: ", svm_confusion)

The accuracy score is:  0.7943125
Confusion Matrix is:  [[30262  9576]
 [ 6879 33283]]


Train Random Forest Model

In [69]:
# 400-tree random forest (Gini, depth capped at 10) fitted on the first
# 80,000 Doc2Vec vectors, using all available cores.
rf_model = RandomForestClassifier(
    n_estimators=400,
    criterion='gini',
    max_depth=10,
    random_state=10,
    n_jobs=-1,
)
rf_model.fit(train_dataset_vectorize[:80000], train_label[:80000])


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=10, verbose=0,
                       warm_start=False)

Measure RF's performance on test dataset

In [70]:
# Score the Twitter-trained random forest on the held-out Twitter vectors.
predicted_result = rf_model.predict(test_dataset_vectorize)
rf_accuracy = accuracy_score(test_label, predicted_result)
rf_confusion = confusion_matrix(test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", rf_accuracy)
print("Confusion Matrix is: ", rf_confusion)

The accuracy score is:  0.65325
Confusion Matrix is:  [[6267 3745]
 [3190 6798]]


Measure RF's performance on training dataset

In [16]:
# Score the random forest on the same 80,000 vectors it was fitted on.
rf_train_targets = train_label[:80000]
predicted_result = rf_model.predict(train_dataset_vectorize[:80000])
rf_accuracy = accuracy_score(rf_train_targets, predicted_result)
rf_confusion = confusion_matrix(rf_train_targets, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", rf_accuracy)
print("Confusion Matrix is: ", rf_confusion)

The accuracy score is:  0.8376375
Confusion Matrix is:  [[31851  7987]
 [ 5002 35160]]


Introduce Neural Network (Keras+Tensorflow)

In [132]:
# Keras/TensorFlow setup: import the backend and choose the compute device
# through CUDA environment variables.
import os
import tensorflow as tf
from keras import backend as K
# Enumerate CUDA devices by PCI bus id.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Exactly one of the two device sections below should be active.
# NOTE(review): these variables are set after TensorFlow has been imported;
# confirm they still take effect in the installed TensorFlow version.
# --- CPU option (disabled): hides all GPUs and seeds numpy / random /
# --- PYTHONHASHSEED / TensorFlow for repeatable runs.
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 
# np.random.seed(10)
# rn.seed(10)
# os.environ['PYTHONHASHSEED']=str(10)
# config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=12,inter_op_parallelism_threads=12,device_count = {'CPU':12})
# tf.random.set_seed(10)
# sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=config)
# tf.compat.v1.keras.backend.set_session(sess)
# --- GPU option (active)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only device 0, i.e. the first GPU detected
# ---------------------------------------
# Author's note: no random seed is set on the GPU path, so the neural-network
# results below are not expected to be exactly repeatable.
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras import optimizers
from keras import regularizers

In [139]:
def neural_network_model(hidden_layer, hidden_unit, opt, input_dim=200,
                         loss='mean_squared_logarithmic_error'):
    """Build and compile a fully connected binary classifier.

    The network is a stack of ``hidden_layer`` ReLU Dense layers of width
    ``hidden_unit`` followed by a single sigmoid output unit.

    Args:
        hidden_layer: Number of hidden layers. The first one is always
            created, so values below 1 still yield one hidden layer.
        hidden_unit: Number of units in every hidden layer.
        opt: Optimizer passed straight to ``model.compile`` (name or instance).
        input_dim: Length of each input vector. Defaults to 200, the value
            previously hard-coded here (presumably the Doc2Vec vector size —
            confirm against the loaded model).
        loss: Loss passed to ``model.compile``. Defaults to the loss the
            notebook has always used, so existing callers are unaffected.

    Returns:
        The compiled Keras ``Sequential`` model, tracking the 'accuracy' metric.
    """
    model = Sequential()
    # Input layer fused with the first hidden layer.
    model.add(Dense(hidden_unit, input_dim=input_dim, activation='relu'))
    # The line above already contributes one hidden layer.
    for _ in range(hidden_layer - 1):
        model.add(Dense(hidden_unit, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # output layer
    model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
    return model

In [163]:
# Wrap the builder in the scikit-learn adapter (3 hidden layers of 256 units,
# Adam, 10 epochs, batches of 32) and fit on the first 80,000 Twitter vectors.
nn_model = KerasClassifier(
    build_fn=neural_network_model,
    hidden_layer=3,
    hidden_unit=256,
    opt='adam',
    epochs=10,
    batch_size=32,
)
nn_train_features = np.array(train_dataset_vectorize[:80000])
nn_train_targets = np.array(train_label[:80000])
nn_model.fit(nn_train_features, nn_train_targets)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1aac0e54fd0>

Measure Neural Network's performance on test dataset

In [164]:
# Predict on the held-out Twitter vectors and flatten the predictions to a
# 1-D array before scoring.
nn_test_features = np.array(test_dataset_vectorize)
predicted_result = nn_model.predict(nn_test_features).reshape(-1)
nn_accuracy = accuracy_score(test_label, predicted_result)
nn_confusion = confusion_matrix(test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", nn_accuracy)
print("Confusion Matrix is: ", nn_confusion)

The accuracy score is:  0.67655
Confusion Matrix is:  [[6460 3552]
 [2917 7071]]


Measure Neural Network's performance on training dataset


In [165]:
# Predict on the 80,000 vectors the network was fitted on and flatten the
# predictions to a 1-D array before scoring.
nn_train_targets = train_label[:80000]
nn_train_features = np.array(train_dataset_vectorize[:80000])
predicted_result = nn_model.predict(nn_train_features).reshape(-1)
nn_accuracy = accuracy_score(nn_train_targets, predicted_result)
nn_confusion = confusion_matrix(nn_train_targets, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", nn_accuracy)
print("Confusion Matrix is: ", nn_confusion)

The accuracy score is:  0.8483125
Confusion Matrix is:  [[32829  7009]
 [ 5126 35036]]


## Now Test on IMDb Movie Reviews Samples

Get IMDb movie reviews sample from database

In [22]:
# Connect to the local MongoDB server and open two cursors over the IMDb
# movie-review collection: a seeded random training split (about 80%) and the
# complementary test split, selected via the "sequence_no" field.
remote_db="localhost"
remote_link = "mongodb://" + remote_db + ":27017/"
my_client = pymongo.MongoClient(remote_link)
try:
    info = my_client.server_info()  # Forces a call, so an unreachable server fails here.
    my_db=my_client.get_database('IMDb')
    my_tb=my_db['movie_reviews']
    np.random.seed(10)  # fixed seed so the same sequence numbers are drawn on every run
    # Draw 7,920 distinct sequence numbers from 1..9905 for training.
    random_train_set_seq=np.random.choice(range(1,9906), 7920, replace=False).tolist()
    imdb_raw_test_data=my_tb.find({"sequence_no": {'$nin': random_train_set_seq}})
    imdb_raw_train_data=my_tb.find({"sequence_no": {'$in': random_train_set_seq}})
    print("Success in accessing the database")
except pymongo.errors.ServerSelectionTimeoutError:
    # Referenced through the already-imported pymongo package: the bare name
    # ServerSelectionTimeoutError is not imported explicitly in this notebook,
    # so the handler itself could raise NameError when the server is down.
    # NOTE(review): the cursors stay undefined on this path, so later cells will fail.
    print("Database is down.")

Success in accessing the database


In [23]:
# Drain both IMDb cursors into parallel lists of label, token list and raw text.
# Labelling rule: a review score of 6 or lower is negative (-1), anything higher
# is positive (1). Author's rationale: the mean IMDb review score lies between
# 6 and 7, according to Google.
imdb_train_label, imdb_train_token_list_matrix, imdb_train_raw_text = [], [], []
imdb_test_label, imdb_test_token_list_matrix, imdb_test_raw_text = [], [], []
for cursor, labels, token_lists, raw_texts in (
    (imdb_raw_train_data, imdb_train_label, imdb_train_token_list_matrix, imdb_train_raw_text),
    (imdb_raw_test_data, imdb_test_label, imdb_test_token_list_matrix, imdb_test_raw_text),
):
    for review in cursor:
        labels.append(-1 if review['scores'] <= 6 else 1)
        # The analysed text is the review title followed by the comment body.
        imdb_raw_text = review['review_titles'] + " " + review['comment']
        token_lists.append(processText_for_sentiment_analysis(imdb_raw_text))
        raw_texts.append(imdb_raw_text)

In [24]:
# Report the class balance of each IMDb split: labels equal to 1 are counted
# as positive, every other label as negative.
print("Data distribution:")
count_pos = sum(1 for label in imdb_train_label if label == 1)
count_neg = len(imdb_train_label) - count_pos
print("In training dataset, there are {} positive reviews and {} negative reviews".format(count_pos,count_neg))
count_pos = sum(1 for label in imdb_test_label if label == 1)
count_neg = len(imdb_test_label) - count_pos
print("In test dataset, there are {} positive reviews and {} negative reviews".format(count_pos,count_neg))

Data distribution:
In training dataset, there are 4714 positive reviews and 3206 negative reviews
In test dataset, there are 1186 positive reviews and 799 negative reviews


Prepare data for Naive Bayes

In [25]:
# Convert IMDb token lists into the {token: True} feature dicts NLTK expects;
# training rows are paired with a string label, test rows carry features only.
imdb_train_dataset_for_NB = []
for position, tokens in enumerate(imdb_train_token_list_matrix):
    features = {token: True for token in tokens}
    sentiment = "Negative" if imdb_train_label[position] == -1 else "Positive"
    imdb_train_dataset_for_NB.append((features, sentiment))

imdb_test_dataset_for_NB = [
    {token: True for token in tokens} for tokens in imdb_test_token_list_matrix
]

Train Naive Bayes Model

In [157]:
# Fit NLTK's Naive Bayes classifier on every IMDb training example (no slicing here).
# NOTE(review): the recorded execution counts show this cell (In [157]) was last run
# after the evaluation cell that follows it (In [156]); re-run top to bottom to confirm.
IMDb_NB_classifier = NaiveBayesClassifier.train(imdb_train_dataset_for_NB)

Analyze the performance of Naive Bayes on Test Dataset

In [156]:
# Classify the held-out IMDb reviews, map the string output back to -1 / 1
# and report accuracy, F1 and the confusion matrix.
imdb_NB_result = [
    -1 if IMDb_NB_classifier.classify(features) == 'Negative' else 1
    for features in imdb_test_dataset_for_NB
]
imdb_nb_accuracy = accuracy_score(imdb_test_label, imdb_NB_result)
imdb_nb_f1 = f1_score(imdb_test_label, imdb_NB_result)
imdb_nb_confusion = confusion_matrix(imdb_test_label, imdb_NB_result, labels=[-1, 1])
print("The accuracy score is: ", imdb_nb_accuracy)
print("The f1 score is: ", imdb_nb_f1)
print("Confusion Matrix is: ", imdb_nb_confusion)

The accuracy score is:  0.8473551637279597
The f1 score is:  0.8789452656811826
Confusion Matrix is:  [[ 582  217]
 [  86 1100]]


Analyze the performance of Naive Bayes on Training Dataset

In [28]:
# Same evaluation on the IMDb training examples (each entry is a
# (features, label) pair, so only the features are classified).
imdb_NB_result = [
    -1 if IMDb_NB_classifier.classify(features) == 'Negative' else 1
    for features, _ in imdb_train_dataset_for_NB
]
imdb_nb_accuracy = accuracy_score(imdb_train_label, imdb_NB_result)
imdb_nb_f1 = f1_score(imdb_train_label, imdb_NB_result)
imdb_nb_confusion = confusion_matrix(imdb_train_label, imdb_NB_result, labels=[-1, 1])
print("The accuracy score is: ", imdb_nb_accuracy)
print("The f1 score is: ", imdb_nb_f1)
print("Confusion Matrix is: ", imdb_nb_confusion)

The accuracy score is:  0.9142676767676767
The f1 score is:  0.9293370798210011
Confusion Matrix is:  [[2776  430]
 [ 249 4465]]


For SVM, RF and the neural network, the training and test data have to be further processed with Doc2Vec

In [29]:
# Embed the IMDb token lists with the same Doc2Vec model used for Twitter,
# training rows first and test rows second.
imdb_train_dataset_vectorize = [
    doc2vec_model.infer_vector(tokens) for tokens in imdb_train_token_list_matrix
]
imdb_test_dataset_vectorize = [
    doc2vec_model.infer_vector(tokens) for tokens in imdb_test_token_list_matrix
]

Train support vector machine

In [30]:
# RBF-kernel SVM fitted on the full IMDb training split.
imdb_svm_model = SVC(kernel='rbf', C=1, random_state=10)
imdb_svm_model.fit(imdb_train_dataset_vectorize, imdb_train_label)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=10, shrinking=True, tol=0.001,
    verbose=False)

Analyze the performance of SVM on Test Dataset

In [31]:
# Score the IMDb-trained SVM on the held-out IMDb vectors.
predicted_result = imdb_svm_model.predict(imdb_test_dataset_vectorize)
svm_accuracy = accuracy_score(imdb_test_label, predicted_result)
svm_f1 = f1_score(imdb_test_label, predicted_result)
svm_confusion = confusion_matrix(imdb_test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", svm_accuracy)
print("The f1 score is: ", svm_f1)
print("Confusion Matrix is: ", svm_confusion)

The accuracy score is:  0.8841309823677582
The f1 score is:  0.9036850921273032
Confusion Matrix is:  [[ 676  123]
 [ 107 1079]]


Analyze the performance of SVM on Training Dataset

In [32]:
# Score the IMDb-trained SVM on its own training vectors.
predicted_result = imdb_svm_model.predict(imdb_train_dataset_vectorize)
svm_accuracy = accuracy_score(imdb_train_label, predicted_result)
svm_f1 = f1_score(imdb_train_label, predicted_result)
svm_confusion = confusion_matrix(imdb_train_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", svm_accuracy)
print("The f1 score is: ", svm_f1)
print("Confusion Matrix is: ", svm_confusion)

The accuracy score is:  0.9349747474747475
The f1 score is:  0.9457151892062822
Confusion Matrix is:  [[2919  287]
 [ 228 4486]]


Train Random Forest

In [33]:
# 400-tree random forest (Gini, depth capped at 15) fitted on the full IMDb
# training split, using all available cores.
imdb_rf_model = RandomForestClassifier(
    n_estimators=400,
    criterion='gini',
    max_depth=15,
    random_state=10,
    n_jobs=-1,
)
imdb_rf_model.fit(imdb_train_dataset_vectorize, imdb_train_label)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=10, verbose=0,
                       warm_start=False)

Analyze the performance of Random Forest on Test Dataset

In [34]:
# Score the IMDb-trained random forest on the held-out IMDb vectors.
predicted_result = imdb_rf_model.predict(imdb_test_dataset_vectorize)
rf_accuracy = accuracy_score(imdb_test_label, predicted_result)
rf_f1 = f1_score(imdb_test_label, predicted_result)
rf_confusion = confusion_matrix(imdb_test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", rf_accuracy)
print("The f1 score is: ", rf_f1)
print("Confusion Matrix is: ", rf_confusion)

The accuracy score is:  0.8327455919395466
The f1 score is:  0.8684627575277337
Confusion Matrix is:  [[ 557  242]
 [  90 1096]]


Analyze the performance of Random Forest on Training Dataset

In [35]:
# Score the IMDb-trained random forest on its own training vectors.
predicted_result = imdb_rf_model.predict(imdb_train_dataset_vectorize)
rf_accuracy = accuracy_score(imdb_train_label, predicted_result)
rf_f1 = f1_score(imdb_train_label, predicted_result)
rf_confusion = confusion_matrix(imdb_train_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", rf_accuracy)
print("The f1 score is: ", rf_f1)
print("Confusion Matrix is: ", rf_confusion)

The accuracy score is:  0.9998737373737374
The f1 score is:  0.9998939442146568
Confusion Matrix is:  [[3205    1]
 [   0 4714]]


Now Test Vader(Rule Based)

In [36]:
# Score each raw IMDb review (title + comment) with VADER and bucket the
# compound score into negative (-1), neutral (0) or positive (1).
# The cutoffs follow VADER's documented convention: compound <= -0.05 is
# negative and compound >= 0.05 is positive. The original used a strict
# "> 0.05", which labelled a score of exactly 0.05 as neutral.
sia = SentimentIntensityAnalyzer()
vader_result=[]
for each_review in imdb_test_raw_text:
    score=sia.polarity_scores(each_review)['compound']
    if score <= -0.05:
        vader_result.append(-1)
    elif score >= 0.05:
        vader_result.append(1)
    else:
        vader_result.append(0)
# A neutral (0) prediction is never equal to a -1/1 label, so every neutral
# prediction counts as an error in the accuracy below.
print("The accuracy score is: ",accuracy_score(imdb_test_label,vader_result))
print("Confusion Matrix is: ",confusion_matrix(imdb_test_label,vader_result,labels=[-1,0,1]))

The accuracy score is:  0.7400503778337532
Confusion Matrix is:  [[ 456    2  341]
 [   0    0    0]
 [ 170    3 1013]]


Train Neural Network Model

In [37]:
# Same network configuration as the Twitter model (3 hidden layers of 256
# units, Adam, 10 epochs, batches of 32), fitted on the full IMDb training split.
imdb_nn_model = KerasClassifier(
    build_fn=neural_network_model,
    hidden_layer=3,
    hidden_unit=256,
    opt='adam',
    epochs=10,
    batch_size=32,
)
imdb_nn_train_features = np.array(imdb_train_dataset_vectorize)
imdb_nn_train_targets = np.array(imdb_train_label)
imdb_nn_model.fit(imdb_nn_train_features, imdb_nn_train_targets)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1aabcf963c8>

Analyze the performance of Neural Network on test dataset

In [38]:
# Score the IMDb-trained network on the held-out IMDb vectors.
imdb_nn_test_features = np.array(imdb_test_dataset_vectorize)
predicted_result = imdb_nn_model.predict(imdb_nn_test_features)
nn_accuracy = accuracy_score(imdb_test_label, predicted_result)
nn_f1 = f1_score(imdb_test_label, predicted_result)
nn_confusion = confusion_matrix(imdb_test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", nn_accuracy)
print("The f1 score is: ", nn_f1)
print("Confusion Matrix is: ", nn_confusion)

The accuracy score is:  0.8770780856423174
The f1 score is:  0.8959044368600683
Confusion Matrix is:  [[ 691  108]
 [ 136 1050]]


Analyze the performance of Neural Network on training dataset

In [39]:
# Score the IMDb-trained network on its own training vectors.
imdb_nn_train_features = np.array(imdb_train_dataset_vectorize)
predicted_result = imdb_nn_model.predict(imdb_nn_train_features)
nn_accuracy = accuracy_score(imdb_train_label, predicted_result)
nn_f1 = f1_score(imdb_train_label, predicted_result)
nn_confusion = confusion_matrix(imdb_train_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", nn_accuracy)
print("The f1 score is: ", nn_f1)
print("Confusion Matrix is: ", nn_confusion)

The accuracy score is:  0.9853535353535353
The f1 score is:  0.9876857749469213
Confusion Matrix is:  [[3152   54]
 [  62 4652]]


## Now use models trained on IMDb to Test Twitter

Naive Bayes Model

In [158]:
# Cross-domain check: the IMDb-trained Naive Bayes classifier applied to the
# held-out Twitter feature dicts.
NB_result = [
    -1 if IMDb_NB_classifier.classify(features) == 'Negative' else 1
    for features in test_dataset_for_NB
]
cross_accuracy = accuracy_score(test_label, NB_result)
cross_f1 = f1_score(test_label, NB_result)
cross_confusion = confusion_matrix(test_label, NB_result, labels=[-1, 1])
print("The accuracy score is: ", cross_accuracy)
print("The f1 score is: ", cross_f1)
print("Confusion Matrix is: ", cross_confusion)

The accuracy score is:  0.54635
The f1 score is:  0.5815229924818965
Confusion Matrix is:  [[4623 5389]
 [3684 6304]]


Support Vector Machine

In [147]:
# Cross-domain check: the IMDb-trained SVM applied to the Twitter test vectors.
predicted_result = imdb_svm_model.predict(test_dataset_vectorize)
cross_accuracy = accuracy_score(test_label, predicted_result)
cross_f1 = f1_score(test_label, predicted_result)
cross_confusion = confusion_matrix(test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", cross_accuracy)
print("The f1 score is: ", cross_f1)
print("Confusion Matrix is: ", cross_confusion)

The accuracy score is:  0.582
The f1 score is:  0.5744247607411932
Confusion Matrix is:  [[5998 4014]
 [4346 5642]]


Random Forest

In [148]:
# Cross-domain check: the IMDb-trained random forest applied to the Twitter test vectors.
predicted_result = imdb_rf_model.predict(test_dataset_vectorize)
cross_accuracy = accuracy_score(test_label, predicted_result)
cross_f1 = f1_score(test_label, predicted_result)
cross_confusion = confusion_matrix(test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", cross_accuracy)
print("The f1 score is: ", cross_f1)
print("Confusion Matrix is: ", cross_confusion)

The accuracy score is:  0.547
The f1 score is:  0.4497084548104956
Confusion Matrix is:  [[7238 2774]
 [6286 3702]]


Neural Network

In [150]:
# Cross-domain check: the IMDb-trained network applied to the Twitter test vectors.
twitter_test_features = np.array(test_dataset_vectorize)
predicted_result = imdb_nn_model.predict(twitter_test_features)
cross_accuracy = accuracy_score(test_label, predicted_result)
cross_f1 = f1_score(test_label, predicted_result)
cross_confusion = confusion_matrix(test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", cross_accuracy)
print("The f1 score is: ", cross_f1)
print("Confusion Matrix is: ", cross_confusion)

The accuracy score is:  0.5643
The f1 score is:  0.44882985452245416
Confusion Matrix is:  [[7738 2274]
 [6440 3548]]


## Now use models trained on Twitter to Test IMDb

Naive Bayes Model

In [151]:
NB_result=[]
for each_comment in imdb_test_dataset_for_NB:
    if NB_classifier.classify(each_comment)=='Negative':
        NB_result.append(-1)
    else:
        NB_result.append(1)
print("The accuracy score is: ",accuracy_score(imdb_test_label,NB_result))
print("The f1 score is: ",f1_score(imdb_test_label,NB_result))
print("Confusion Matrix is: ",confusion_matrix(imdb_test_label,NB_result,labels=[-1,1]))

The accuracy score is:  0.7234256926952141
The f1 score is:  0.7658848614072497
Confusion Matrix is:  [[538 261]
 [288 898]]


Support Vector Machine

In [152]:
# Cross-domain check: the Twitter-trained SVM applied to the IMDb test vectors.
predicted_result = svm_model.predict(imdb_test_dataset_vectorize)
cross_accuracy = accuracy_score(imdb_test_label, predicted_result)
cross_f1 = f1_score(imdb_test_label, predicted_result)
cross_confusion = confusion_matrix(imdb_test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", cross_accuracy)
print("The f1 score is: ", cross_f1)
print("Confusion Matrix is: ", cross_confusion)

The accuracy score is:  0.4100755667506297
The f1 score is:  0.04563977180114099
Confusion Matrix is:  [[ 786   13]
 [1158   28]]


Random Forest

In [153]:
# Cross-domain check: the Twitter-trained random forest applied to the IMDb test vectors.
predicted_result = rf_model.predict(imdb_test_dataset_vectorize)
cross_accuracy = accuracy_score(imdb_test_label, predicted_result)
cross_f1 = f1_score(imdb_test_label, predicted_result)
cross_confusion = confusion_matrix(imdb_test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", cross_accuracy)
print("The f1 score is: ", cross_f1)
print("Confusion Matrix is: ", cross_confusion)

The accuracy score is:  0.5717884130982368
The f1 score is:  0.5319383259911895
Confusion Matrix is:  [[652 147]
 [703 483]]


Neural Network

In [155]:
# Cross-domain check: the Twitter-trained network applied to the IMDb test vectors.
imdb_test_features = np.array(imdb_test_dataset_vectorize)
predicted_result = nn_model.predict(imdb_test_features)
cross_accuracy = accuracy_score(imdb_test_label, predicted_result)
cross_f1 = f1_score(imdb_test_label, predicted_result)
cross_confusion = confusion_matrix(imdb_test_label, predicted_result, labels=[-1, 1])
print("The accuracy score is: ", cross_accuracy)
print("The f1 score is: ", cross_f1)
print("Confusion Matrix is: ", cross_confusion)

The accuracy score is:  0.5768261964735516
The f1 score is:  0.5945945945945945
Confusion Matrix is:  [[529 270]
 [570 616]]
