**Sentimental Analysis using Naive Bayes(94% Accuracy) and Support Vector Machine(98% Accuracy)**

In [2]:
#Import all neccessary libraries
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import nltk

In [3]:
#read the data file
Corpus = pd.read_csv('sentiment_tweets3.csv')

In [4]:
Corpus.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [5]:
# Check if there is a null value
Corpus['message to examine'].isnull().sum()

0

In [6]:
#Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['message to examine'] = [entry.lower() for entry in Corpus['message to examine']]

In [7]:
#Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['message to examine']= [word_tokenize(entry) for entry in Corpus['message to examine']]

In [8]:
Corpus['message to examine']

0        [just, had, a, real, good, moment, ., i, misss...
1        [is, reading, manga, http, :, //plurk.com/p/mz...
2        [@, comeagainjen, http, :, //twitpic.com/2y2lx...
3        [@, lapcat, need, to, send, 'em, to, my, accou...
4        [add, me, on, myspace, !, !, !, myspace.com/lo...
                               ...                        
10309    [no, depression, by, g, herbo, is, my, mood, f...
10310    [what, do, you, do, when, depression, succumbs...
10311    [ketamine, nasal, spray, shows, promise, again...
10312    [dont, mistake, a, bad, day, with, depression,...
10313                                                  [0]
Name: message to examine, Length: 10314, dtype: object

In [9]:
#Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting
#We need to tag words with noun, verb or adjective to pass to WordNetLemmatizer
tag_map = defaultdict(lambda : wn.NOUN)
# nltk.download('wordnet')
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [10]:
for index,entry in enumerate(Corpus['message to examine']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

In [12]:
#lets see the data
Corpus['text_final']

0               ['real', 'good', 'moment', 'miss', 'much']
1                                ['read', 'manga', 'http']
2                         ['comeagainjen', 'http', 'http']
3        ['lapcat', 'need', 'send', 'accountant', 'tomo...
4                                       ['add', 'myspace']
                               ...                        
10309    ['depression', 'g', 'herbo', 'mood', 'do', 'st...
10310    ['depression', 'succumb', 'brain', 'make', 'fe...
10311    ['ketamine', 'nasal', 'spray', 'show', 'promis...
10312    ['dont', 'mistake', 'bad', 'day', 'depression'...
10313                                                   []
Name: text_final, Length: 10314, dtype: object


In [13]:
#Data looks good now, lets do the Train and Test split
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label (depression result)'],test_size=0.3)

In [14]:
#I am using TFidf to create the vectors of the input, we can also use bag of words or anything else to see the accuracy
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [15]:
print(Tfidf_vect.vocabulary_)
print(Train_X_Tfidf)

  (0, 2774)	0.4969030627497713
  (0, 2578)	0.4475330542909991
  (0, 2403)	0.3516132798119134
  (0, 2402)	0.432609824665905
  (0, 1470)	0.38294641462047313
  (0, 1160)	0.15481183429672415
  (0, 185)	0.2672146928975254
  (1, 4554)	0.4881416368218064
  (1, 4303)	0.39687480995444907
  (1, 4218)	0.3915651951084232
  (1, 4173)	0.3259652184249401
  (1, 1394)	0.5870532360709982
  (2, 4868)	0.3380210615757937
  (2, 4809)	0.2678059233305374
  (2, 4410)	0.40898290613940613
  (2, 4396)	0.2626708284073402
  (2, 2602)	0.22997786988845764
  (2, 2529)	0.2356548281736698
  (2, 2010)	0.29667531906745004
  (2, 1901)	0.3219389088599733
  (2, 454)	0.34494887427992366
  (2, 233)	0.3983530057889799
  (3, 1812)	0.4277557497878154
  (3, 133)	0.903894362480187
  (4, 3810)	0.4962437472944993
  :	:
  (7216, 1936)	0.2698650029108428
  (7216, 1693)	0.1844303642011407
  (7216, 1160)	0.07749603694382554
  (7216, 1113)	0.2339445538449005
  (7216, 582)	0.5261290511839772
  (7216, 321)	0.18633572183802596
  (7216, 293)	

In [16]:
#Fir the model using NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  93.24717285945073


In [17]:
#Using SVM
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  98.96607431340873


In [18]:
#We can clearly see that this model fit better using SVM with accuracy around 99%