In [4]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
train_ds = pd.read_csv( "data_for_sentiment_analysis", delimiter="\t" )
train_ds.head(5)

Unnamed: 0,sentiment,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [5]:
COLUMN_NAMES = ["Process","Model Name", "F1 Scores","Range of F1 Scores","Std Deviation of F1 Scores"]

In [6]:
df_model_selection = pd.read_csv("Model_statistics.csv")

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn import metrics
def stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y):
    global df_model_selection
    skf = StratifiedKFold(n_splits=5, random_state=29)
    weighted_f1_score = []
    for train_index, val_index in skf.split(X,y):
        X_train, X_test = X[train_index], X[val_index] 
        y_train, y_test = y[train_index], y[val_index]
        model_obj.fit(X_train, y_train)##### HERE ###
        test_ds_predicted = model_obj.predict( X_test ) ##### HERE ####   
        #print( metrics.classification_report( y_test, test_ds_predicted ) )    
        weighted_f1_score.append(round(f1_score(y_test, test_ds_predicted , average='weighted'),2))
        
    sd_weighted_f1_score = np.std(weighted_f1_score, ddof=1)
    range_of_f1_scores = "{}-{}".format(min(weighted_f1_score),max(weighted_f1_score))    
    df_model_selection = pd.concat([df_model_selection,pd.DataFrame([[process,model_name,sorted(weighted_f1_score),range_of_f1_scores,sd_weighted_f1_score]], columns =COLUMN_NAMES) ])

In [8]:
from sklearn.feature_extraction import text
my_stop_words = text.ENGLISH_STOP_WORDS
#Printing first few stop words
print("Few stop words: ", list(my_stop_words)[0:10])

Few stop words:  ['an', 'out', 'although', 'anything', 'onto', 'call', 'ten', 'noone', 'anywhere', 'six']


In [9]:
# Adding custom words to the list of stop words
my_stop_words = text.ENGLISH_STOP_WORDS.union( ['harry', 'potter', 'code', 'vinci', 'da',
'harri', 'mountain', 'movie', 'movies'])

In [10]:
# Setting stop words list
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer( stop_words = my_stop_words,
max_features = 1000 )
feature_vector = count_vectorizer.fit( train_ds.text )
train_ds_features = count_vectorizer.transform( train_ds.text )
features = feature_vector.get_feature_names()
features_counts = np.sum( train_ds_features.toarray(), axis = 0 )
feature_counts = pd.DataFrame( dict( features = features,
counts = features_counts ) )

# Data Cleaning - Stemming or Lemmatization

### To get words into root form and hence in a motivation of decreasing few more features

In [11]:
#1. PorterStemmer
#2. LancasterStemmer

In [12]:
from nltk.stem.snowball import PorterStemmer
stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()



#Custom function for stemming and stop word removal
def stemmed_words(doc):
    ### Stemming of words
    stemmed_words = (stemmer.stem(w) for w in analyzer(doc))
    ### Remove the words in stop words list
    non_stop_words = [ word for word in list(set(stemmed_words) - set(my_stop_words)) ]
    return non_stop_words

In [13]:
count_vectorizer = CountVectorizer( analyzer=stemmed_words,
max_features = 1000)
feature_vector = count_vectorizer.fit( train_ds.text )
train_ds_features = count_vectorizer.transform( train_ds.text )
features = feature_vector.get_feature_names()
features_counts = np.sum( train_ds_features.toarray(), axis = 0 )
feature_counts = pd.DataFrame( dict( features = features,
counts = features_counts ) )
feature_counts.sort_values( "counts", ascending = False )[0:15]

Unnamed: 0,features,counts
80,brokeback,1930
406,love,1837
801,suck,1378
922,wa,1142
43,awesom,1116
432,mission,1090
344,imposs,1090
438,movi,1052
392,like,823
298,hate,636


In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Z003RJMK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
import nltk
from nltk.stem import PorterStemmer
# library for regular expressions
import re
stemmer = PorterStemmer()

In [16]:
def get_stemmed_tokens( doc ):
    # Tokenize the documents to words
    all_tokens = [word for word in nltk.word_tokenize(doc)]
    clean_tokens = []
    # remove the all characters other than alphabets. It takes a regex for matching.
    for each_token in all_tokens:
        if re.search('[a-zA-Z]', each_token):
            clean_tokens.append(each_token)
    
    
    # Stem the words
    stemmed_tokens = [stemmer.stem(t) for t in clean_tokens]
    return stemmed_tokens

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=500,stop_words='english',tokenizer=get_stemmed_tokens,ngram_range=(1,2))

In [18]:
feature_vector = tfidf_vectorizer.fit( train_ds.text )
train_ds_features = tfidf_vectorizer.transform( train_ds.text )
features = feature_vector.get_feature_names()

In [19]:
features

["'m",
 "'re",
 "'re gon",
 "'s",
 "'s great",
 "'s like",
 "'s mom",
 "'s onli",
 "'s retart",
 "'s right",
 "'s stupid",
 "'yeah",
 "'yeah got",
 'absolut',
 'absolut awesom',
 'accept',
 'ach',
 'ach cock',
 'acn',
 'acn love',
 'alway',
 'alway know',
 'anyon',
 'anyon say',
 'ass',
 'award',
 'award remind',
 'awesom',
 'awesom book',
 'awesom ca',
 'awesom movi',
 "awesom n't",
 'awesom stori',
 'awesome..',
 'b',
 'b suck',
 'beauti',
 'becaus',
 'becaus awesom',
 'becaus hate',
 'becaus know',
 'becaus like',
 'becaus love',
 'becaus outshin',
 'becaus type',
 'becom',
 'becom accept',
 'begin',
 'better',
 'better read',
 'big',
 'big time',
 'bitch',
 'black',
 'black guy',
 'blond',
 'blond rock-hard',
 'bobbypin',
 'bobbypin insan',
 'bonker',
 'book',
 'book catcher',
 'bore',
 'brokeback',
 'brokeback mountain',
 'bye..',
 'ca',
 "ca n't",
 'care',
 'care anyon',
 'catcher',
 'catcher tye',
 'charact',
 'charact die',
 'clean',
 'clean tabl',
 'cock',
 'code',
 'code awes

In [20]:
# Convert the document vector matrix into dataframe
train_ds_df = pd.DataFrame(train_ds_features.todense())
# Assign the features names to the column
train_ds_df.columns = features
# Assign the sentiment labels to the train_ds
train_ds_df['sentiment'] = train_ds.sentiment
train_ds_df

Unnamed: 0,'m,'re,'re gon,'s,'s great,'s like,'s mom,'s onli,'s retart,'s right,...,whimper,whimper nois,worth,worth know,wotshisfac,wotshisfac need,yeah,zen,zen da,sentiment
0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.33902,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6913,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6914,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6915,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6916,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [21]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split( train_ds_features,train_ds.sentiment,test_size = 0.3,random_state = 42 )

# Naive Bayes Model for Sentiment Classification

In [22]:
from sklearn.naive_bayes import BernoulliNB
nb_clf = BernoulliNB()
nb_clf.fit( train_X.toarray(), train_y )

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [23]:
test_ds_predicted = nb_clf.predict( test_X.toarray() )

In [24]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       1.00      0.94      0.97       873
           1       0.96      1.00      0.98      1203

    accuracy                           0.97      2076
   macro avg       0.98      0.97      0.97      2076
weighted avg       0.97      0.97      0.97      2076



In [25]:
model_obj = nb_clf
model_name = "Binomial Naive Bayes Classifier"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# Logistic Regression 

In [26]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(train_X.toarray(), train_y)
test_ds_predicted = logreg.predict( test_X.toarray() )

In [27]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       873
           1       0.99      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [28]:
model_obj = logreg
model_name = "Logistic Regression"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# Decision Tree

In [29]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(criterion='entropy')

decision_tree.fit(train_X.toarray(), train_y)
test_ds_predicted = decision_tree.predict( test_X.toarray() )

In [30]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       873
           1       0.99      0.98      0.98      1203

    accuracy                           0.98      2076
   macro avg       0.98      0.98      0.98      2076
weighted avg       0.98      0.98      0.98      2076



In [31]:
model_obj = decision_tree
model_name = "Decission Tree"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=10)

In [33]:
random_forest.fit(train_X.toarray(), train_y)
test_ds_predicted = random_forest.predict( test_X.toarray() )

In [34]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       873
           1       0.98      0.98      0.98      1203

    accuracy                           0.98      2076
   macro avg       0.98      0.98      0.98      2076
weighted avg       0.98      0.98      0.98      2076



In [35]:
model_obj = random_forest
model_name = "Random Forest"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# XG Boost

In [36]:
from xgboost import XGBClassifier
xgboost = XGBClassifier()

In [37]:
xgboost.fit(train_X.toarray(), train_y)
test_ds_predicted = xgboost.predict( test_X.toarray() )

In [38]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       873
           1       0.99      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [39]:
model_obj = xgboost
model_name = "XG Boost"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# SGD Classifier

In [40]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

sgd = OneVsRestClassifier(SGDClassifier())

In [41]:
sgd.fit(train_X.toarray(), train_y)
test_ds_predicted = sgd.predict( test_X.toarray() )

In [42]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       873
           1       0.99      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [43]:
model_obj = sgd
model_name = "Stochastic Gradient Descent"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# Gaussian Process Classifier

In [44]:
from sklearn.gaussian_process import GaussianProcessClassifier
gausian_process = GaussianProcessClassifier()

In [45]:
gausian_process.fit(train_X.toarray(), train_y)
test_ds_predicted = gausian_process.predict( test_X.toarray() )

In [46]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       873
           1       0.98      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [47]:
model_obj = gausian_process
model_name = "Gausian Process"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# KNN Classifier

In [48]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [49]:
knn.fit(train_X.toarray(), train_y)
test_ds_predicted = knn.predict( test_X.toarray() )

In [50]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       873
           1       0.99      0.98      0.98      1203

    accuracy                           0.98      2076
   macro avg       0.98      0.98      0.98      2076
weighted avg       0.98      0.98      0.98      2076



In [51]:
model_obj = knn
model_name = "K Nearst Neighbour"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# Linear Discriminant Analysis

In [52]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()

In [53]:
lda.fit(train_X.toarray(), train_y)
test_ds_predicted = lda.predict( test_X.toarray() )

In [54]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       873
           1       0.99      0.99      0.99      1203

    accuracy                           0.99      2076
   macro avg       0.99      0.99      0.99      2076
weighted avg       0.99      0.99      0.99      2076



In [55]:
model_obj = lda
model_name = "Linear Discriminant Analysis"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


# Support Vector Machine

In [56]:
from sklearn.svm import SVC
svm = SVC()

In [57]:
svm.fit(train_X.toarray(), train_y)
test_ds_predicted = svm.predict( test_X.toarray() )

In [58]:
from sklearn import metrics
print( metrics.classification_report( test_y, test_ds_predicted ) )

              precision    recall  f1-score   support

           0       1.00      0.09      0.17       873
           1       0.60      1.00      0.75      1203

    accuracy                           0.62      2076
   macro avg       0.80      0.55      0.46      2076
weighted avg       0.77      0.62      0.51      2076



In [59]:
model_obj = svm
model_name = "Support Vector Machine"
process = "Ngram with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,F1 Scores,Range of F1 Scores,Std Deviation of F1 Scores
0,Bag Of Words with NLTK Stemming,Linear Discriminant Analysis,"[0.98, 0.99, 0.99, 0.99, 1.0]",0.98-1.0,0.007071
1,Bag Of Words with NLTK Stemming,Stochastic Gradient Descent,"[0.96, 0.98, 0.99, 1.0, 1.0]",0.96-1.0,0.016733
2,Bag Of Words with NLTK Stemming,XG Boost,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
3,Bag Of Words with NLTK Stemming,Support Vector Machine,"[0.95, 0.97, 0.99, 0.99, 1.0]",0.95-1.0,0.02
4,Bag Of Words with NLTK Stemming,Logistic Regression,"[0.95, 0.98, 0.99, 1.0, 1.0]",0.95-1.0,0.020736
5,TFIDF with NLTK Stemming,Binomial Naive Bayes Classifier,"[0.92, 0.98, 0.99, 0.99, 1.0]",0.92-1.0,0.032094
6,TFIDF with NLTK Stemming,Logistic Regression,"[0.95, 0.97, 1.0, 1.0, 1.0]",0.95-1.0,0.023022
7,TFIDF with NLTK Stemming,Decission Tree,"[0.93, 0.95, 0.98, 0.98, 1.0]",0.93-1.0,0.027749
8,TFIDF with NLTK Stemming,Random Forest,"[0.95, 0.98, 0.99, 0.99, 1.0]",0.95-1.0,0.019235
9,TFIDF with NLTK Stemming,XG Boost,"[0.96, 0.97, 0.97, 0.99, 1.0]",0.96-1.0,0.016432


In [60]:
df_model_selection.to_csv("Model_statistics.csv",index = False)