# Import modules for data cleansing, modelling and evaluation

In [None]:
import re
import pandas as pd
import numpy as np
from Packages.DataPreperation import CleanData
from textblob import TextBlob as txt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from datetime import time
from nltk.tokenize import RegexpTokenizer
from sklearn import svm
import matplotlib.pyplot as plt

# Read Data

In [2]:
# Load the raw Sentiment140 export. The file has no header row, so pandas
# labels the columns 0..5 and the rest of the notebook addresses them by
# position. A forward slash keeps the relative path valid on Windows, Linux
# and macOS alike (the previous backslash form only resolved on Windows).
sentimentData = pd.read_csv("Data/Sentiment140.csv", header=None)
sentimentData.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Check the class distribution of the sentiment labels

In [3]:
# Work on a copy so the raw frame stays untouched and this cell gives the
# same result every time it is re-run.
Sentiment2 = sentimentData.copy()

# Column 0 holds the polarity code: 0 -> negative, 4 -> positive.
Sentiment2[0] = Sentiment2[0].replace({0: "negative", 4: "positive"})

# Keep only rows that carry one of the two known classes. Any other code left
# in the column is dropped here instead of breaking a string-based filter.
Sentiment2 = Sentiment2[Sentiment2[0].isin(["negative", "positive"])]

# Class balance of the labelled data.
Sentiment2[0].value_counts()

positive    800000
negative    800000
Name: 0, dtype: int64

In [None]:
# Bar chart of the class balance, labelled so the figure can be read on its
# own when the notebook is skimmed.
fig, ax = plt.subplots()
Sentiment2[0].value_counts().plot.bar(ax=ax, rot=0)
ax.set_title("Tweets per sentiment class")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of tweets")
plt.show()

In [4]:
# Cleansing pipeline driver: every switch below enables one CleanData step.
perform = CleanData()

removeEmojis = True
CleanText = True
removeStopWords = True
lemmatizeText = True
POSLemmatizeTextFor = False
Stemtextfor = False
textcol = 5  # positional column passed to the general cleansing step

# General cleansing runs first and is the only step that takes the column index.
if CleanText:
    Sentiment2 = perform.GeneralCleansingFor(Sentiment2, textcol)

# The remaining steps all take Sentiment2 and hand back its replacement, so
# they are driven from a table. Order matters and matches the original chain.
# Methods are looked up by name only when their switch is on.
for enabled, step_name in (
    (removeEmojis, "RemoveEmoticonsFor"),
    (removeStopWords, "RemoveStopWordsFor"),
    (lemmatizeText, "LemmatizeTextFor"),
    (POSLemmatizeTextFor, "POSLemmatizeTextFor"),
    (Stemtextfor, "StemTextFor"),
):
    if enabled:
        Sentiment2 = getattr(perform, step_name)(Sentiment2)

Data cleansing initialised.


In [None]:
# Preview the first ten rows after the cleansing steps.
Sentiment2.head(n=10)

# Perform tokenisation and create the train/test split

In [5]:
# Bag-of-words features: a token is any run of ASCII letters/digits, English
# stop words are dropped and only unigrams are counted.
token = RegexpTokenizer(r"[a-zA-Z0-9]+")
cv = CountVectorizer(
    tokenizer=token.tokenize,
    stop_words="english",
    ngram_range=(1, 1),
)
# "newReview" is presumably the cleaned-text column added by the cleansing
# steps - confirm against CleanData.
text_counts = cv.fit_transform(Sentiment2["newReview"])

# Hold out 5% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    Sentiment2[0],
    test_size=0.05,
    random_state=5,
)

# Naive Bayes

In [None]:
# Multinomial Naive Bayes baseline on the token-count features.
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
predicted = MNB.predict(X_test)

# scikit-learn metrics take (y_true, y_pred), in that order.
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print("accuracy: " + str('{:4.2f}'.format(accuracy_score * 100)) + '%')

# Scale to percent *before* rounding. Rounding the fraction first and then
# multiplying by 100 produces float artefacts (0.57 * 100 ->
# 56.99999999999999) and throws away everything below one percentage point.
macro_precision = metrics.precision_score(Y_test, predicted, average='macro')
macro_precision = round(macro_precision * 100, 2)
print("precision: " + str(macro_precision))

macro_recall = metrics.recall_score(Y_test, predicted, average='macro')
macro_recall = round(macro_recall * 100, 2)
print("recall: " + str(macro_recall))

macro_f1 = metrics.f1_score(Y_test, predicted, average='macro')
macro_f1 = round(macro_f1 * 100, 2)
print("F1: " + str(macro_f1))

# Rows are true classes, columns are predicted classes.
conf_mat = confusion_matrix(Y_test, predicted)
print("confusion matrix: \n" + str(conf_mat))



# Logistic Regression

In [None]:
# Logistic regression on the same token-count features; the fixed
# random_state keeps the liblinear solver reproducible.
lr = LogisticRegression(solver = 'liblinear', random_state = 42, max_iter=1000)
lr.fit(X_train, Y_train)
lg_pred = lr.predict(X_test)
print("Accuracy: ", round((metrics.accuracy_score(Y_test, lg_pred)) * 100, 3), "%")

# Scale to percent *before* rounding. Rounding the fraction first and then
# multiplying by 100 produces float artefacts (0.57 * 100 ->
# 56.99999999999999) and throws away everything below one percentage point.
# Each value is printed with a label so the output is readable on its own.
macro_precision = metrics.precision_score(Y_test, lg_pred, average='macro')
macro_precision = round(macro_precision * 100, 2)
print("Precision: " + str(macro_precision))

macro_recall = metrics.recall_score(Y_test, lg_pred, average='macro')
macro_recall = round(macro_recall * 100, 2)
print("Recall: " + str(macro_recall))

macro_f1 = metrics.f1_score(Y_test, lg_pred, average='macro')
macro_f1 = round(macro_f1 * 100, 2)
print("F1: " + str(macro_f1))

# Rows are true classes, columns are predicted classes.
conf_mat = confusion_matrix(Y_test, lg_pred)
print("Confusion matrix: \n" + str(conf_mat))

# KNN

TODO: try word2vec embeddings.

In [6]:
# Sweep k = 1..9 for k-nearest-neighbours and record the macro-averaged
# metrics (in percent) for each k so they can be compared afterwards.
acc_list = []
prec_list = []
rec_list = []
f1_list = []
for k in range(1, 10):

    # X_train comes from CountVectorizer and is a sparse matrix. scikit-learn
    # documents that sparse input is searched by brute force whatever this
    # setting is, so it is requested explicitly rather than as a speed-up.
    classifier = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    classifier.fit(X_train, Y_train)  # fit on the training split
    predicted_label = classifier.predict(X_test)  # predict the held-out split

    # Scale to percent *before* rounding. Rounding the fraction first and
    # then multiplying by 100 produces float artefacts (0.57 * 100 ->
    # 56.99999999999999) and hides differences between neighbouring k values.
    accuracy_score = metrics.accuracy_score(Y_test, predicted_label)
    accuracy_score = round(accuracy_score * 100, 2)
    acc_list.append(accuracy_score)

    confusion_mat = confusion_matrix(Y_test, predicted_label)
    class_report = classification_report(Y_test, predicted_label)

    macro_precision = metrics.precision_score(Y_test, predicted_label, average='macro')
    macro_precision = round(macro_precision * 100, 2)
    prec_list.append(macro_precision)

    macro_recall = metrics.recall_score(Y_test, predicted_label, average='macro')
    macro_recall = round(macro_recall * 100, 2)
    rec_list.append(macro_recall)

    macro_f1 = metrics.f1_score(Y_test, predicted_label, average='macro')
    macro_f1 = round(macro_f1 * 100, 2)
    f1_list.append(macro_f1)

    print("\n\nConfusion Matrix for k = {} is:\n".format(k))
    print(confusion_mat)
    print("\nClassification Report for k = {} is:\n".format(k))
    print(class_report)



Confusion Matrix for k = 1 is:

[[25013 14609]
 [11979 28399]]

Classification Report for k = 1 is:

              precision    recall  f1-score   support

    negative       0.68      0.63      0.65     39622
    positive       0.66      0.70      0.68     40378

    accuracy                           0.67     80000
   macro avg       0.67      0.67      0.67     80000
weighted avg       0.67      0.67      0.67     80000



# SVM

In [None]:
# Support vector classifier with a linear kernel, trained on the token counts
# and scored on the held-out split.
# NOTE(review): kernel SVC training presumably scales poorly with the number
# of rows, so this may be slow on the full training split - confirm, and
# consider svm.LinearSVC if it is.
svmModel = svm.SVC(kernel='linear')
svmModel.fit(X_train, Y_train)
svmPred = svmModel.predict(X_test)

# Per-class precision / recall / F1 plus the averaged rows.
svm_report = classification_report(Y_test, svmPred)
print(svm_report)