# <font color=green>Sentiment Analysis</font>

# <font color=orange>Click below to jump to the respective sections</font>

1. [Exploring the dataset](#Exploring-the-dataset)

   
2. [Conversion of text to Cross Sectional Data](#Conversion-of-text-to-Cross-Sectional-Data)


3. [Naive Bayes Model](#Naive-Bayes-Model)

### Load the data 

##### 1 is positive Review and 0 is Negative Review

In [156]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Tab-separated file of labelled reviews, read from the mounted Drive location.
DATA_PATH = "/content/drive/My Drive/DeepLearning_Simili/Projects/Natural Language Processing NLP/Lesson 2_Feature_Engineering_on_Text_Data/data_for_sentiment_analysis"
train_ds = pd.read_csv(DATA_PATH, sep="\t")


In [157]:
# Preview the first rows to check that the file parsed into label and text columns.
train_ds.head()

Unnamed: 0,sentiment,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [None]:
# Preview the last rows as well.
train_ds.tail()

In [None]:
# List the column labels of the loaded frame.
train_ds.columns

In [None]:
# First five reviews labelled positive (sentiment == 1).
train_ds.loc[train_ds["sentiment"] == 1].head(5)

In [None]:
# First five reviews labelled negative (sentiment == 0).
train_ds.loc[train_ds["sentiment"] == 0].head(5)

# Exploring the dataset

In [158]:
# Summarise column dtypes and non-null counts.
train_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6918 entries, 0 to 6917
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  6918 non-null   int64 
 1   text       6918 non-null   object
dtypes: int64(1), object(1)
memory usage: 108.2+ KB


In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline
# Open a 6x5-inch figure.
# NOTE(review): the count plot is drawn in a later cell; with the inline backend each
# cell renders its own figure, so this size may not apply to that plot -- confirm.
plt.figure( figsize=(6,5))

In [None]:
# Bar chart of the number of reviews per sentiment class.
# The figure is created in this cell so that the requested size applies to this plot.
fig, ax = plt.subplots(figsize=(6, 5))
sn.countplot(x='sentiment', data=train_ds, ax=ax)
# Write the count just above each bar (bar heights are floats, so show them as integers).
for p in ax.patches:
    ax.annotate(f"{int(p.get_height())}", (p.get_x() + 0.1, p.get_height() + 50))

In [None]:
# Number of reviews per sentiment label (the resulting Series is indexed by label value).
review_volume = train_ds["sentiment"].value_counts()

In [None]:
# Percentage of reviews labelled 0 (negative).
review_volume.loc[0] / len(train_ds) * 100

In [None]:
# Percentage of reviews labelled 1 (positive).
review_volume.loc[1] / len(train_ds) * 100

### Inference: The number of data points in the two categories is balanced, so it is reasonable to proceed with classification

# Conversion of text to Cross Sectional Data

* Count Vector Model (Bag Of Word)
* Term Frequency model
* Term Frequency - Inverse Document Frequency (TF-IDF) model
* Ngram(s) model

### Count Vector Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Initialize a CountVectorizer with default settings
count_vectorizer = CountVectorizer()
# Learn the vocabulary (word -> column index) from the review texts.
# fit() returns the fitted vectorizer, so `feature_vector` refers to the vectorizer
# object rather than to a matrix of features.
feature_vector = count_vectorizer.fit(train_ds.text)

In [None]:
# Display the fitted vectorizer and its settings.
feature_vector  

In [None]:
# Get the vocabulary learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
# replacement. It returns a NumPy array, so convert it to a list to keep `features`
# a plain sequence of words (usable with random.sample and as column labels).
features = feature_vector.get_feature_names_out().tolist()
print( "Total number of features: ", len(features))

In [None]:
import random
# Show three vocabulary words picked at random.
# No seed is set here, so the sample differs from run to run.
sampleWords = random.sample(features, 3)
print(sampleWords, end=' ')

In [None]:
# Encode every review as a vector of word counts over the learned vocabulary,
# then show the container type of the result.
train_ds_features = count_vectorizer.transform( train_ds.text )
type(train_ds_features)

In [None]:
# Matrix dimensions, followed by the top-left 1x1 corner of the matrix.
print(train_ds_features.shape)
print(train_ds_features[0:1,0:1])

## Displaying Document Vectors

In [None]:
# Materialise the document-term matrix as a dense DataFrame (one row per review)
train_ds_df = pd.DataFrame(train_ds_features.toarray())

In [None]:
# Label each column with the vocabulary word it counts
train_ds_df.columns = features

In [None]:
# Preview the first rows of the dense document-term table.
train_ds_df.head()

In [None]:
# Show the first review, for comparison with its encoded row.
train_ds[0:1]

In [None]:
# Slice of the first review's vector (columns 150 to 156).
# NOTE(review): the column window is hard-coded, presumably chosen to show words that
# occur in the first review -- confirm it is still meaningful if the vocabulary changes.
train_ds_df.iloc[0:1, 150:157]

### Removing low frequency words

In [None]:
# Sum the occurrences of each feature column-wise.
# Summing the sparse matrix directly avoids building a dense documents x vocabulary
# array first; the result is flattened to a 1-D array with one total per word.
features_counts = np.asarray( train_ds_features.sum( axis = 0 ) ).ravel()
features_counts

In [None]:
# Pair each vocabulary word with its total count across the corpus.
feature_counts_df = pd.DataFrame({"features": features, "counts": features_counts})

In [None]:
# Display the word/count table.
feature_counts_df

In [None]:
# Distribution of word frequencies, restricted to counts between 0 and 2000.
plt.figure( figsize=(12,5))
plt.hist(feature_counts_df.counts, bins=50, range = (0, 2000));
plt.xlabel( 'Frequency of words' )
# The histogram shows raw bin counts (density is not requested), so label it as a count.
plt.ylabel( 'Number of words' );

In [None]:
# Rebuild the vocabulary, capped at 1000 features
count_vectorizer = CountVectorizer(max_features=1000)
# Learn the vocabulary from the corpus (fit returns the vectorizer itself)
feature_vector = count_vectorizer.fit( train_ds.text )
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() returns
# an array, converted to a list so `features` stays a plain sequence of words
features = feature_vector.get_feature_names_out().tolist()
# Transform the documents into count vectors
train_ds_features = count_vectorizer.transform( train_ds.text )
# Total occurrences of each retained word across the corpus
features_counts = np.sum( train_ds_features.toarray(), axis = 0 )
feature_counts = pd.DataFrame( dict( features = features,counts = features_counts ) )

In [None]:
# Display the word/count table for the capped vocabulary.
feature_counts

### Removing Stop Words 

In [None]:
from sklearn.feature_extraction import text
# Start from scikit-learn's built-in English stop-word collection.
my_stop_words = text.ENGLISH_STOP_WORDS

In [None]:
# Number of built-in stop words.
len(my_stop_words) 

In [None]:
# Extend the built-in stop words with corpus-specific words
my_stop_words = text.ENGLISH_STOP_WORDS.union(
    {'harry', 'potter', 'da', 'vinci', 'code', 'mountain', 'movie', 'movies'}
)

In [None]:
# Number of stop words after adding the custom ones.
len(my_stop_words)

## Creating count vectors with stop words removed, keeping only the most frequent words

In [None]:
# Count vectors over at most 1000 features, with stop words removed.
# scikit-learn validates stop_words as 'english', a list or None, so the set of
# stop words is converted to a list before being passed in.
count_vectorizer = CountVectorizer( stop_words = list(my_stop_words), max_features = 1000 )
feature_vector = count_vectorizer.fit( train_ds.text )
train_ds_features = count_vectorizer.transform( train_ds.text )
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() returns
# an array, converted to a list so `features` stays a plain sequence of words.
features = feature_vector.get_feature_names_out().tolist()
# Total occurrences of each retained word across the corpus.
features_counts = np.sum( train_ds_features.toarray(), axis = 0 )
feature_counts = pd.DataFrame( dict( features = features,counts = features_counts ) )

In [None]:
# The 15 most frequent words after stop-word removal.
feature_counts.sort_values( "counts", ascending = False )[0:15]

In [None]:
from nltk.stem.snowball import PorterStemmer
# Porter stemmer used to reduce words to their stems.
stemmer = PorterStemmer()
# Callable from a default CountVectorizer that turns a raw document into tokens;
# used by stemmed_words() for preprocessing and tokenisation.
analyzer = CountVectorizer().build_analyzer()

In [None]:
def stemmed_words(doc):
    """Tokenise a document, drop stop words and return the stemmed tokens.

    Intended to be used as the ``analyzer`` callable of a CountVectorizer.

    Stop words are filtered *before* stemming, because the stop-word list holds
    unstemmed words: a stop word whose stem differs from the word itself would
    otherwise survive. Stems that are themselves in the stop-word list are
    dropped as well. Repeated tokens are kept, so the vectorizer counts every
    occurrence of a word rather than only its presence in the document.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    list of str
        Stemmed, stop-word-free tokens in document order.
    """
    stop_words = frozenset(my_stop_words)
    ### Drop stop words from the raw tokens, then stem what is left
    stems = (stemmer.stem(token) for token in analyzer(doc) if token not in stop_words)
    ### Drop stems that are themselves listed as stop words
    return [stem for stem in stems if stem not in stop_words]

### Features from stemmed words, with stop words removed, keeping only the most frequent words

In [None]:
# Count vectors built with the custom stemming analyzer, capped at 1000 features.
count_vectorizer = CountVectorizer( analyzer=stemmed_words, max_features = 1000)
feature_vector = count_vectorizer.fit( train_ds.text )
train_ds_features = count_vectorizer.transform( train_ds.text )
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() returns
# an array, converted to a list so `features` stays a plain sequence of words.
features = feature_vector.get_feature_names_out().tolist()
# Total count of each stem across the corpus.
features_counts = np.sum( train_ds_features.toarray(), axis = 0 )
feature_counts = pd.DataFrame( dict( features = features,
counts = features_counts ) )
# The 15 most frequent stems.
feature_counts.sort_values( "counts", ascending = False )[0:15]

In [None]:
# Dense document-term table with one named column per feature
train_ds_df = pd.DataFrame(train_ds_features.toarray(), columns=features)
# Attach the review labels as an extra column
train_ds_df['sentiment'] = train_ds.sentiment

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the reviews for testing; the fixed random_state makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split( train_ds_features,train_ds.sentiment,test_size = 0.3,random_state = 42 )

### Naive Bayes Model

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

# Fit Bernoulli naive Bayes on the training split
nb_clf = BernoulliNB()
nb_clf.fit( train_X.toarray(), train_y )

# Predict the held-out split and report per-class precision / recall / F1
test_ds_predicted = nb_clf.predict( test_X.toarray() )

print( metrics.classification_report( test_y, test_ds_predicted ) )

# Confusion-matrix cells are integer counts, so annotate them as integers
cm = metrics.confusion_matrix( test_y, test_ds_predicted )
sn.heatmap(cm, annot=True, fmt='d' );

In [None]:
# Columns of the model-comparison table; stratified_K_fold_validation() builds its
# result rows with these same column names.
COLUMN_NAMES = ["Process","Model Name", "F1 Scores","Range of F1 Scores","Std Deviation of F1 Scores"]
# Empty results table that the cross-validation helper appends to.
df_model_selection = pd.DataFrame(columns=COLUMN_NAMES)

In [None]:
# Show the (still empty) results table.
df_model_selection

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn import metrics

In [None]:
def stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y):
    """Cross-validate a model and append a summary row to ``df_model_selection``.

    The model is fitted and scored on each fold of a shuffled, stratified
    K-fold split. Each fold's weighted F1 score is rounded to two decimals.
    One row is appended to the module-level ``df_model_selection`` table,
    holding the sorted scores, their min-max range and their sample standard
    deviation.

    Parameters
    ----------
    model_obj : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold, so afterwards it holds the fit from the last fold.
    model_name : str
        Value stored in the "Model Name" column.
    process : str
        Value stored in the "Process" column.
    n_splits : int
        Number of folds.
    X : array-like
        Feature matrix supporting indexing by an array of row positions.
    y : array-like
        Class labels, one per row of ``X``.

    Returns
    -------
    None
        The result is recorded in ``df_model_selection``.
    """
    global df_model_selection

    # random_state only has an effect when shuffling; current scikit-learn raises
    # a ValueError if random_state is set while shuffle is left at False.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=29)

    # Fold indices are row positions, so index the labels positionally instead of
    # relying on the labels of a pandas index.
    y_values = np.asarray(y)

    weighted_f1_score = []
    for train_index, val_index in skf.split(X, y_values):
        X_train, X_test = X[train_index], X[val_index]
        y_train, y_test = y_values[train_index], y_values[val_index]

        model_obj.fit(X_train, y_train)
        fold_predicted = model_obj.predict(X_test)
        weighted_f1_score.append(round(f1_score(y_test, fold_predicted, average='weighted'), 2))

    sd_weighted_f1_score = np.std(weighted_f1_score, ddof=1)
    range_of_f1_scores = "{}-{}".format(min(weighted_f1_score), max(weighted_f1_score))
    result_row = pd.DataFrame(
        [[process, model_name, sorted(weighted_f1_score), range_of_f1_scores, sd_weighted_f1_score]],
        columns=COLUMN_NAMES,
    )
    # ignore_index keeps the row labels of the accumulated table unique.
    df_model_selection = pd.concat([df_model_selection, result_row], ignore_index=True)

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Re-create and refit the Bernoulli naive Bayes model on the training split
# (same steps as the earlier naive Bayes cell).
nb_clf = BernoulliNB()
nb_clf.fit( train_X.toarray(), train_y )

In [None]:
# 5-fold stratified cross-validation of the naive Bayes model on the full feature
# matrix; the helper appends one summary row to df_model_selection.
model_obj = nb_clf
# The estimator is BernoulliNB, so label it as Bernoulli (not Binomial) naive Bayes.
model_name = "Bernoulli Naive Bayes Classifier"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)


df_model_selection

# Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit logistic regression (default settings) on the training split and predict the held-out split.
logreg = LogisticRegression()
logreg.fit(train_X.toarray(), train_y)
test_ds_predicted = logreg.predict( test_X.toarray() )

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out split.
print( metrics.classification_report( test_y, test_ds_predicted ) )

In [None]:
# 5-fold stratified cross-validation of logistic regression on the full feature matrix;
# the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = logreg
model_name = "Logistic Regression"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-based decision tree. A fixed random_state makes the fitted tree (and hence the
# reported scores) repeatable across runs; 42 matches the seed used for the train/test split.
decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Fit on the training split and predict the held-out split.
decision_tree.fit(train_X.toarray(), train_y)
test_ds_predicted = decision_tree.predict( test_X.toarray() )

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out split.
print( metrics.classification_report( test_y, test_ds_predicted ) )

In [None]:
# 5-fold stratified cross-validation of the decision tree on the full feature matrix;
# the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = decision_tree
model_name = "Decission Tree"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forest of 10 trees. A fixed random_state makes the bootstrap samples and feature draws
# (and hence the reported scores) repeatable; 42 matches the seed used for the split.
random_forest = RandomForestClassifier(n_estimators=10, random_state=42)

In [None]:
# Fit the random forest on the training split and predict the held-out split.
random_forest.fit(train_X.toarray(), train_y)
test_ds_predicted = random_forest.predict( test_X.toarray() )

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out split.
print( metrics.classification_report( test_y, test_ds_predicted ) )

In [None]:
# 5-fold stratified cross-validation of the random forest on the full feature matrix;
# the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = random_forest
model_name = "Random Forest"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

# XG Boost

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted tree classifier with default settings.
# NOTE(review): the variable name `xgboost` is also the name of the package it comes from.
xgboost = XGBClassifier()

In [None]:
# Fit the boosted model on the training split and predict the held-out split.
xgboost.fit(train_X.toarray(), train_y)
test_ds_predicted = xgboost.predict( test_X.toarray() )

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out split.
print( metrics.classification_report( test_y, test_ds_predicted ) )

In [None]:
# 5-fold stratified cross-validation of the boosted model on the full feature matrix;
# the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = xgboost
model_name = "XG Boost"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

# SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

# Linear classifier trained with stochastic gradient descent, wrapped one-vs-rest.
# SGD shuffles the training data, so a fixed random_state keeps the fitted model (and the
# reported scores) repeatable; 42 matches the seed used for the train/test split.
sgd = OneVsRestClassifier(SGDClassifier(random_state=42))

In [None]:
# Fit the SGD classifier on the training split and predict the held-out split.
sgd.fit(train_X.toarray(), train_y)
test_ds_predicted = sgd.predict( test_X.toarray() )

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out split.
print( metrics.classification_report( test_y, test_ds_predicted ) )

In [None]:
# 5-fold stratified cross-validation of the SGD classifier on the full feature matrix;
# the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = sgd
model_name = "Stochastic Gradient Descent"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

# Gaussian Process Classifier

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
# Gaussian process classifier with default settings.
# NOTE(review): fitting is presumably slow and memory-hungry on several thousand
# rows -- confirm the run time is acceptable.
gausian_process = GaussianProcessClassifier()

In [None]:
# Fit the Gaussian process classifier on the training split and predict the held-out split.
gausian_process.fit(train_X.toarray(), train_y)
test_ds_predicted = gausian_process.predict( test_X.toarray() )

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out split.
print( metrics.classification_report( test_y, test_ds_predicted ) )

In [None]:
# 5-fold stratified cross-validation of the Gaussian process classifier on the full feature
# matrix; the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = gausian_process
model_name = "Gausian Process"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

# KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier()

In [None]:
# Fit k-nearest neighbours on the training split and predict the held-out split.
knn.fit(train_X.toarray(), train_y)
test_ds_predicted = knn.predict( test_X.toarray() )

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out split.
print( metrics.classification_report( test_y, test_ds_predicted ) )

In [None]:
# 5-fold stratified cross-validation of k-nearest neighbours on the full feature matrix;
# the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = knn
model_name = "K Nearst Neighbour"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

# Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Linear discriminant analysis classifier with default settings.
lda = LinearDiscriminantAnalysis()

In [None]:
# Fit linear discriminant analysis on the training split and predict the held-out split.
lda.fit(train_X.toarray(), train_y)
test_ds_predicted = lda.predict( test_X.toarray() )

In [None]:
from sklearn import metrics
# Per-class precision / recall / F1 of the latest predictions on the held-out split.
print( metrics.classification_report( test_y, test_ds_predicted ) )

In [None]:
# 5-fold stratified cross-validation of linear discriminant analysis on the full feature
# matrix; the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = lda
model_name = "Linear Discriminant Analysis"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings.
svm = SVC()

In [None]:
# Fit the support vector classifier on the training split and predict the held-out split.
# Unlike the other models, no classification report is printed for these predictions.
svm.fit(train_X.toarray(), train_y)
test_ds_predicted = svm.predict( test_X.toarray() )

In [None]:
# 5-fold stratified cross-validation of the support vector classifier on the full feature
# matrix; the helper appends one summary row to df_model_selection, which is then displayed.
model_obj = svm
model_name = "Support Vector Machine"
process = "Bag Of Words with NLTK Stemming"
n_splits = 5
X = train_ds_features.toarray()
y = train_ds.sentiment
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

In [None]:
# Save the accumulated model-comparison table to a CSV file, without the row index.
df_model_selection.to_csv("Model_statistics.csv",index = False)