In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
train_ds = pd.read_csv( "data_for_sentiment_analysis", delimiter="\t" )
train_ds.head(5)

Unnamed: 0,sentiment,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [4]:
df_model_selection = pd.read_csv("Model_statistics.csv")

In [5]:
COLUMN_NAMES = ["Process","Model Name", "F1 Scores","Range of F1 Scores","Std Deviation of F1 Scores"]

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn import metrics
def stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y):
    global df_model_selection
    skf = StratifiedKFold(n_splits=5, random_state=29)
    weighted_f1_score = []
    for train_index, val_index in skf.split(X,y):
        X_train, X_test = X[train_index], X[val_index] 
        y_train, y_test = y[train_index], y[val_index]
        model_obj.fit(X_train, y_train)##### HERE ###
        test_ds_predicted = model_obj.predict( X_test ) ##### HERE ####   
        #print( metrics.classification_report( y_test, test_ds_predicted ) )    
        weighted_f1_score.append(round(f1_score(y_test, test_ds_predicted , average='weighted'),2))
        
    sd_weighted_f1_score = np.std(weighted_f1_score, ddof=1)
    range_of_f1_scores = "{}-{}".format(min(weighted_f1_score),max(weighted_f1_score))    
    df_model_selection = pd.concat([df_model_selection,pd.DataFrame([[process,model_name,sorted(weighted_f1_score),range_of_f1_scores,sd_weighted_f1_score]], columns =COLUMN_NAMES) ])

In [7]:
from sklearn.feature_extraction import text
my_stop_words = text.ENGLISH_STOP_WORDS
#Printing first few stop words
print("Few stop words: ", list(my_stop_words)[0:10])

Few stop words:  ['another', 'rather', 'these', 'mine', 'nobody', 'twelve', 'his', 'forty', 'became', 'us']


In [8]:
# Adding custom words to the list of stop words
my_stop_words = text.ENGLISH_STOP_WORDS.union( ['harry', 'potter', 'code', 'vinci', 'da',
'harri', 'mountain', 'movie', 'movies'])

In [9]:
# Setting stop words list
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer( stop_words = my_stop_words,
max_features = 1000 )
feature_vector = count_vectorizer.fit( train_ds.text )
train_ds_features = count_vectorizer.transform( train_ds.text )
features = feature_vector.get_feature_names()
features_counts = np.sum( train_ds_features.toarray(), axis = 0 )
feature_counts = pd.DataFrame( dict( features = features,
counts = features_counts ) )

# Data Cleaning - Stemming or Lemmatization

### To get words into root form and hence in a motivation of decreasing few more features

In [10]:
#1. PorterStemmer
#2. LancasterStemmer

In [11]:
from nltk.stem.snowball import PorterStemmer
stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()



#Custom function for stemming and stop word removal
def stemmed_words(doc):
    ### Stemming of words
    stemmed_words = (stemmer.stem(w) for w in analyzer(doc))
    ### Remove the words in stop words list
    non_stop_words = [ word for word in list(set(stemmed_words) - set(my_stop_words)) ]
    return non_stop_words

In [12]:
count_vectorizer = CountVectorizer( analyzer=stemmed_words,
max_features = 1000)
feature_vector = count_vectorizer.fit( train_ds.text )
train_ds_features = count_vectorizer.transform( train_ds.text )
features = feature_vector.get_feature_names()
features_counts = np.sum( train_ds_features.toarray(), axis = 0 )
feature_counts = pd.DataFrame( dict( features = features,
counts = features_counts ) )
feature_counts.sort_values( "counts", ascending = False )[0:15]

Unnamed: 0,features,counts
80,brokeback,1930
406,love,1837
801,suck,1378
922,wa,1142
43,awesom,1116
432,mission,1090
344,imposs,1090
438,movi,1052
392,like,823
298,hate,636


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer( analyzer=stemmed_words,max_features = 1000)


feature_vector = tfidf_vectorizer.fit( train_ds.text )
train_ds_features = tfidf_vectorizer.transform( train_ds.text )
features = feature_vector.get_feature_names()

In [14]:
features

['10',
 '17',
 '33',
 '6th',
 'abl',
 'absolut',
 'absurd',
 'academi',
 'accept',
 'accompani',
 'ach',
 'acn',
 'act',
 'action',
 'actor',
 'actual',
 'admir',
 'ador',
 'adult',
 'ago',
 'agre',
 'alreadi',
 'alway',
 'amaz',
 'ang',
 'angel',
 'ani',
 'anim',
 'anyon',
 'anyth',
 'appar',
 'appeal',
 'articl',
 'asian',
 'ask',
 'asleep',
 'ass',
 'attempt',
 'attract',
 'audrey',
 'author',
 'aw',
 'award',
 'awesom',
 'awesomest',
 'azkaban',
 'bad',
 'ball',
 'ban',
 'bang',
 'basic',
 'bean',
 'beat',
 'beauti',
 'becaus',
 'becom',
 'befor',
 'begin',
 'believ',
 'besid',
 'best',
 'better',
 'bias',
 'big',
 'bit',
 'bitch',
 'black',
 'blame',
 'blond',
 'blood',
 'board',
 'bobbypin',
 'bodi',
 'bogu',
 'bonker',
 'book',
 'bore',
 'bought',
 'boycot',
 'brilliant',
 'brokeback',
 'brown',
 'btw',
 'bullshit',
 'butt',
 'buy',
 'bye',
 'came',
 'capot',
 'car',
 'care',
 'case',
 'catch',
 'catcher',
 'challeng',
 'chang',
 'charact',
 'children',
 'chines',
 'choic',
 'ch

In [15]:
# Convert the document vector matrix into dataframe
train_ds_df = pd.DataFrame(train_ds_features.todense())
# Assign the features names to the column
train_ds_df.columns = features
# Assign the sentiment labels to the train_ds
train_ds_df['sentiment'] = train_ds.sentiment
train_ds_df

Unnamed: 0,10,17,33,6th,abl,absolut,absurd,academi,accept,accompani,...,year,yesterday,yip,young,younger,yuck,yuh,zach,zen,sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [16]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split( train_ds_features,train_ds.sentiment,test_size = 0.3,random_state = 42 )

# Naive Bayes Model for Sentiment Classification

In [17]:
from sklearn.naive_bayes import BernoulliNB
nb_clf = BernoulliNB()
nb_clf.fit( train_X.toarray(), train_y )

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [18]:
test_ds_predicted = nb_clf.predict( test_X.toarray() )

In [19]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       873
           1       0.98      0.99      0.98      1203

    accuracy                           0.98      2076
   macro avg       0.98      0.98      0.98      2076
weighted avg       0.98      0.98      0.98      2076



In [20]:
model_obj = nb_clf
model_name = "Binomial Naive Bayes Classifier"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
1,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.99, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
2,Bag Of Words with NLTK Stemming,Decission Tree,"[0.92, 0.98, 0.98, 0.99, 1.0]",0.92-1.0,0.031305
3,Bag Of Words with NLTK Stemming,Random Forest,"[0.93, 0.97, 0.98, 0.99, 1.0]",0.93-1.0,0.027019
4,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
5,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.95, 0.99, 0.99, 0.99, 1.0]",0.95-1.0,0.019494
6,Bag Of Words with NLTK Stemming,Gausian Process,"[0.94, 0.99, 0.99, 1.0, 1.0]",0.94-1.0,0.0251
7,Bag Of Words with NLTK Stemming,K Nearst Neighbour,"[0.92, 0.97, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
8,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
9,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.98, 0.99, 1.0]",0.95-1.0,0.019235


# Logistic Regression 

In [21]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(train_X.toarray(), train_y)
test_ds_predicted = logreg.predict( test_X.toarray() )

In [22]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       873
           1       0.98      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [23]:
model_obj = logreg
model_name = "Logistic Regression"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
1,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.99, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
2,Bag Of Words with NLTK Stemming,Decission Tree,"[0.92, 0.98, 0.98, 0.99, 1.0]",0.92-1.0,0.031305
3,Bag Of Words with NLTK Stemming,Random Forest,"[0.93, 0.97, 0.98, 0.99, 1.0]",0.93-1.0,0.027019
4,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
5,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.95, 0.99, 0.99, 0.99, 1.0]",0.95-1.0,0.019494
6,Bag Of Words with NLTK Stemming,Gausian Process,"[0.94, 0.99, 0.99, 1.0, 1.0]",0.94-1.0,0.0251
7,Bag Of Words with NLTK Stemming,K Nearst Neighbour,"[0.92, 0.97, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
8,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
9,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.98, 0.99, 1.0]",0.95-1.0,0.019235


# Decision Tree

In [24]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(criterion='entropy')

decision_tree.fit(train_X.toarray(), train_y)
test_ds_predicted = decision_tree.predict( test_X.toarray() )

In [25]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       873
           1       0.99      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [26]:
model_obj = decision_tree
model_name = "Decission Tree"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
1,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.99, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
2,Bag Of Words with NLTK Stemming,Decission Tree,"[0.92, 0.98, 0.98, 0.99, 1.0]",0.92-1.0,0.031305
3,Bag Of Words with NLTK Stemming,Random Forest,"[0.93, 0.97, 0.98, 0.99, 1.0]",0.93-1.0,0.027019
4,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
5,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.95, 0.99, 0.99, 0.99, 1.0]",0.95-1.0,0.019494
6,Bag Of Words with NLTK Stemming,Gausian Process,"[0.94, 0.99, 0.99, 1.0, 1.0]",0.94-1.0,0.0251
7,Bag Of Words with NLTK Stemming,K Nearst Neighbour,"[0.92, 0.97, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
8,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
9,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.98, 0.99, 1.0]",0.95-1.0,0.019235


# Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=10)

In [28]:
random_forest.fit(train_X.toarray(), train_y)
test_ds_predicted = random_forest.predict( test_X.toarray() )

In [29]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       873
           1       0.99      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [30]:
model_obj = random_forest
model_name = "Random Forest"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
1,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.99, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
2,Bag Of Words with NLTK Stemming,Decission Tree,"[0.92, 0.98, 0.98, 0.99, 1.0]",0.92-1.0,0.031305
3,Bag Of Words with NLTK Stemming,Random Forest,"[0.93, 0.97, 0.98, 0.99, 1.0]",0.93-1.0,0.027019
4,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
5,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.95, 0.99, 0.99, 0.99, 1.0]",0.95-1.0,0.019494
6,Bag Of Words with NLTK Stemming,Gausian Process,"[0.94, 0.99, 0.99, 1.0, 1.0]",0.94-1.0,0.0251
7,Bag Of Words with NLTK Stemming,K Nearst Neighbour,"[0.92, 0.97, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
8,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
9,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.98, 0.99, 1.0]",0.95-1.0,0.019235


# XG Boost

In [31]:
from xgboost import XGBClassifier
xgboost = XGBClassifier()

In [32]:
xgboost.fit(train_X.toarray(), train_y)
test_ds_predicted = xgboost.predict( test_X.toarray() )

In [33]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       873
           1       0.99      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [34]:
model_obj = xgboost
model_name = "XG Boost"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
1,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.99, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
2,Bag Of Words with NLTK Stemming,Decission Tree,"[0.92, 0.98, 0.98, 0.99, 1.0]",0.92-1.0,0.031305
3,Bag Of Words with NLTK Stemming,Random Forest,"[0.93, 0.97, 0.98, 0.99, 1.0]",0.93-1.0,0.027019
4,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
5,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.95, 0.99, 0.99, 0.99, 1.0]",0.95-1.0,0.019494
6,Bag Of Words with NLTK Stemming,Gausian Process,"[0.94, 0.99, 0.99, 1.0, 1.0]",0.94-1.0,0.0251
7,Bag Of Words with NLTK Stemming,K Nearst Neighbour,"[0.92, 0.97, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
8,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
9,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.98, 0.99, 1.0]",0.95-1.0,0.019235


# SGD Classifier

In [35]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

sgd = OneVsRestClassifier(SGDClassifier())

In [36]:
sgd.fit(train_X.toarray(), train_y)
test_ds_predicted = sgd.predict( test_X.toarray() )

In [37]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       873
           1       0.99      1.00      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [38]:
model_obj = sgd
model_name = "Stochastic Gradient Descent"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
1,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.99, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
2,Bag Of Words with NLTK Stemming,Decission Tree,"[0.92, 0.98, 0.98, 0.99, 1.0]",0.92-1.0,0.031305
3,Bag Of Words with NLTK Stemming,Random Forest,"[0.93, 0.97, 0.98, 0.99, 1.0]",0.93-1.0,0.027019
4,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
5,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.95, 0.99, 0.99, 0.99, 1.0]",0.95-1.0,0.019494
6,Bag Of Words with NLTK Stemming,Gausian Process,"[0.94, 0.99, 0.99, 1.0, 1.0]",0.94-1.0,0.0251
7,Bag Of Words with NLTK Stemming,K Nearst Neighbour,"[0.92, 0.97, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
8,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
9,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.98, 0.99, 1.0]",0.95-1.0,0.019235


# Gaussian Process Classifier

In [39]:
from sklearn.gaussian_process import GaussianProcessClassifier
gausian_process = GaussianProcessClassifier()

In [40]:
gausian_process.fit(train_X.toarray(), train_y)
test_ds_predicted = gausian_process.predict( test_X.toarray() )

In [41]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       873
           1       0.98      0.99      0.99      1203

    accuracy                           0.98      2076
   macro avg       0.98      0.98      0.98      2076
weighted avg       0.98      0.98      0.98      2076



In [42]:
model_obj = gausian_process
model_name = "Gausian Process"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

KeyboardInterrupt: 

# KNN Classifier

In [43]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [44]:
knn.fit(train_X.toarray(), train_y)
test_ds_predicted = knn.predict( test_X.toarray() )

In [45]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       873
           1       0.97      0.98      0.98      1203

    accuracy                           0.97      2076
   macro avg       0.97      0.97      0.97      2076
weighted avg       0.97      0.97      0.97      2076



In [46]:
model_obj = knn
model_name = "K Nearst Neighbour"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
1,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.99, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
2,Bag Of Words with NLTK Stemming,Decission Tree,"[0.92, 0.98, 0.98, 0.99, 1.0]",0.92-1.0,0.031305
3,Bag Of Words with NLTK Stemming,Random Forest,"[0.93, 0.97, 0.98, 0.99, 1.0]",0.93-1.0,0.027019
4,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
5,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.95, 0.99, 0.99, 0.99, 1.0]",0.95-1.0,0.019494
6,Bag Of Words with NLTK Stemming,Gausian Process,"[0.94, 0.99, 0.99, 1.0, 1.0]",0.94-1.0,0.0251
7,Bag Of Words with NLTK Stemming,K Nearst Neighbour,"[0.92, 0.97, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
8,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
9,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.98, 0.99, 1.0]",0.95-1.0,0.019235


# Linear Discriminant Analysis

In [47]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()

In [48]:
lda.fit(train_X.toarray(), train_y)
test_ds_predicted = lda.predict( test_X.toarray() )

In [49]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       873
           1       0.96      0.97      0.97      1203

    accuracy                           0.96      2076
   macro avg       0.96      0.96      0.96      2076
weighted avg       0.96      0.96      0.96      2076



In [50]:
model_obj = lda
model_name = "Linear Discriminant Analysis"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
1,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.99, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
2,Bag Of Words with NLTK Stemming,Decission Tree,"[0.92, 0.98, 0.98, 0.99, 1.0]",0.92-1.0,0.031305
3,Bag Of Words with NLTK Stemming,Random Forest,"[0.93, 0.97, 0.98, 0.99, 1.0]",0.93-1.0,0.027019
4,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
5,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.95, 0.99, 0.99, 0.99, 1.0]",0.95-1.0,0.019494
6,Bag Of Words with NLTK Stemming,Gausian Process,"[0.94, 0.99, 0.99, 1.0, 1.0]",0.94-1.0,0.0251
7,Bag Of Words with NLTK Stemming,K Nearst Neighbour,"[0.92, 0.97, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
8,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
9,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.98, 0.99, 1.0]",0.95-1.0,0.019235


# Support Vector Machine

In [51]:
from sklearn.svm import SVC
svm = SVC()

In [52]:
svm.fit(train_X.toarray(), train_y)
test_ds_predicted = svm.predict( test_X.toarray() )

In [53]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       1.00      0.12      0.22       873
           1       0.61      1.00      0.76      1203

    accuracy                           0.63      2076
   macro avg       0.81      0.56      0.49      2076
weighted avg       0.77      0.63      0.53      2076



In [None]:
model_obj = svm
model_name = "Support Vector Machine"
process = "TFIDF with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

In [None]:
df_model_selection.to_csv("Model_statistics.csv",index = False)