# Sentiment Analysis For Stock Data

## Data Preparation

In [1]:
''' install required libraries '''

# !pip install textblob
# !pip install nltk
# !pip install wordcloud
# !pip install tweepy
# !pip install langdetect

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import pandas as pd
import re ,string, csv

# import tweepy # to access tweet API
# from tweepy import OAuthHandler # for Authentication

from textblob import TextBlob #for Valance of Sentence(polarity)

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# nltk.download('all') # Installing All from NLTK library
from nltk.corpus import stopwords # For Removing Stop words like < the , an , is ,..etc >
n_words= stopwords.words('english') #specify english stop words only
n_words.append("rt") #append rt for stop word dictionary

from nltk.tokenize import word_tokenize # for Tokenizing the sentnces as tokens
from nltk.stem.porter import PorterStemmer # converting words to their root forms ,speed and simplicity
porter = PorterStemmer() #Create stemmer obejct

from nltk.stem import WordNetLemmatizer # also converting words to their actual root forms(noun , verb ,aobjective) ,but it slow
lemmatizer = WordNetLemmatizer() #Create lemmatizer obejct

from wordcloud import WordCloud,STOPWORDS #Look at Words with highest Frequency for expression

from langdetect import detect_langs # Detect language for each tweets 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk import ngrams
from sklearn.model_selection import train_test_split
import time
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Reading Datasets
stocks=pd.read_csv('C:/Users/user/Desktop/Sentiment/stocks_cleaned.csv')
Data=pd.read_csv('C:/Users/user/Desktop/Sentiment/stockerbot-export.csv',error_bad_lines=False)

b'Skipping line 731: expected 8 fields, saw 13\nSkipping line 2836: expected 8 fields, saw 15\nSkipping line 3058: expected 8 fields, saw 12\nSkipping line 3113: expected 8 fields, saw 12\nSkipping line 3194: expected 8 fields, saw 17\nSkipping line 3205: expected 8 fields, saw 17\nSkipping line 3255: expected 8 fields, saw 17\nSkipping line 3520: expected 8 fields, saw 17\nSkipping line 4078: expected 8 fields, saw 17\nSkipping line 4087: expected 8 fields, saw 17\nSkipping line 4088: expected 8 fields, saw 17\nSkipping line 4499: expected 8 fields, saw 12\n'


In [3]:
Data.head()

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified
0,1019696670777503700,VIDEO: “I was in my office. I was minding my o...,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True
1,1019709091038548000,The price of lumber $LB_F is down 22% since hi...,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True
2,1019711413798035500,Who says the American Dream is dead? https://t...,Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True
3,1019716662587740200,Barry Silbert is extremely optimistic on bitco...,Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True
4,1019718460287389700,How satellites avoid attacks and space junk wh...,Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True


### Exploratory data analysis

In [4]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28264 entries, 0 to 28263
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             28264 non-null  int64 
 1   text           28264 non-null  object
 2   timestamp      28264 non-null  object
 3   source         28264 non-null  object
 4   symbols        28264 non-null  object
 5   company_names  28263 non-null  object
 6   url            21895 non-null  object
 7   verified       28264 non-null  bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 1.5+ MB


In [5]:
'''Convert Columns data types '''

# stockerbot["timestamp"] = pd.to_datetime(stockerbot["timestamp"])
Data["text"] = Data["text"].astype(str)
Data["url"] = Data["url"].astype(str)
Data["company_names"] = Data["company_names"].astype("category")
Data["symbols"] = Data["symbols"].astype("category")
Data["source"] = Data["source"].astype("category")
Data=Data.drop(columns=['id'])

In [6]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28264 entries, 0 to 28263
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   text           28264 non-null  object  
 1   timestamp      28264 non-null  object  
 2   source         28264 non-null  category
 3   symbols        28264 non-null  category
 4   company_names  28263 non-null  category
 5   url            28264 non-null  object  
 6   verified       28264 non-null  bool    
dtypes: bool(1), category(3), object(3)
memory usage: 1.1+ MB


In [7]:
''' Split Timestamp Column into Dates and times '''

Data[['dayofweek','month','day','time','timezone', 'year']] = Data.timestamp.str.split(expand=True)
Data[['hour','minute','second']] = Data.time.str.split(':',expand=True)
Data.head(2)

Unnamed: 0,text,timestamp,source,symbols,company_names,url,verified,dayofweek,month,day,time,timezone,year,hour,minute,second
0,VIDEO: “I was in my office. I was minding my o...,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True,Wed,Jul,18,21:33:26,0,2018,21,33,26
1,The price of lumber $LB_F is down 22% since hi...,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True,Wed,Jul,18,22:22:47,0,2018,22,22,47


In [8]:
''' Check for null values '''
Data.isnull().any() 

text             False
timestamp        False
source           False
symbols          False
company_names     True
url              False
verified         False
dayofweek        False
month            False
day              False
time             False
timezone         False
year             False
hour             False
minute           False
second           False
dtype: bool

There are a null values in Company Names Column

In [9]:
''' Check for null values in Company names columns '''

print(f'null :{Data.company_names.isnull().sum()}')
Data[Data['company_names'].isnull()] 

null :1


Unnamed: 0,text,timestamp,source,symbols,company_names,url,verified,dayofweek,month,day,time,timezone,year,hour,minute,second
3369,When you try to gauge sentiment on a $ticker b...,Thu Jul 12 14:28:55 +0000 2018,provotrout,ticker,,,False,Thu,Jul,12,14:28:55,0,2018,14,28,55


Only One Null Values , so not important for us to delete or not

In [10]:
# Take a look at 10 Largest Source 
total_sources = Data["source"].value_counts()
print(f'Most sources:\n{total_sources.nlargest(10)}')
plt.figure(figsize=(15,5))
# total_sources.head(50).sort_values(ascending=False).plot(kind='bar') 

Most sources:
bibeypost_stock    990
whatsonthorold2    963
mmahotstuff1       899
reurope_stock      668
MareaInformativ    640
optioncharts       614
ConsumerFeed       411
dispatchtribune    375
EnterpriseLeade    368
TranscriptDaily    359
Name: source, dtype: int64


<Figure size 1080x360 with 0 Axes>

<Figure size 1080x360 with 0 Axes>

In [11]:
# Take a look at 10 Largest symbols 
total_companies = Data["symbols"].value_counts()
print(f'Most companies:\n{total_companies.nlargest(10)}')
plt.figure(figsize=(15,5))
# total_companies.head(50).sort_values(ascending=False).plot(kind='bar') 

Most companies:
NFLX    101
MOMO    100
HON     100
AMAT    100
GPS     100
ES      100
MTB     100
GRPN     99
MAS      99
ESS      99
Name: symbols, dtype: int64


<Figure size 1080x360 with 0 Axes>

<Figure size 1080x360 with 0 Axes>

In [12]:
len(Data.text)

28264

In [13]:
# Delete Unwanted Some Text 
Data=Data[Data["text"]!='btc']

### Pre-Processing Text

In [14]:
# Define Clean Function to fix text
def Clean(text):

  # Frist converting all letters to lower case
  text= text.lower()
  
  # removing unwanted digits ,special chracters from the text
  text= ' '.join(re.sub("(@[A-Za-z0-9]+)", " ", text).split()) #tags
  text= ' '.join(re.sub("^@?(\w){1,15}$", " ", text).split())
    
  text= ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())   #Links
  text= ' '.join(re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"," ", text).split()) 
  text= ' '.join(re.sub(r'http\S+', '',text).split())
  
  
  text= ' '.join(re.sub(r'www\S+', '',text).split())
  text= ' '.join(re.sub("\s+", " ",text).split()) #Extrem white Space
  text= ' '.join(re.sub("[^-9A-Za-z ]", "" ,text).split()) #digits 
  text= ' '.join(re.sub('-', ' ', text).split()) 
  text= ' '.join(re.sub('_', ' ', text).split()) #underscore 
  
  # Display available PUNCTUATION for examples
  #for c in string.punctuation:
       #print(f"[{c}]")
  
  # removing stopwards and numbers from STRING library
  table= str.maketrans('', '', string.punctuation+string.digits)
  text = text.translate(table)
  
  # Split Sentence as tokens words 
  tokens = word_tokenize(text)
  
  # converting words to their root forms by STEMMING THE WORDS 
#   stemmed1 = [lemmatizer.lemmatize(word) for word in tokens] #Covert words to their actual root
  stemmed2 = [porter.stem(word) for word in tokens] # Covert words to their rootbut not actual
  
  # Delete each stop words from English stop words
#   words = [w for w in stemmed1 if not w in n_words] #n_words contains English stop words
  words = [w for w in stemmed2 if not w in n_words] #n_words contains English stop words

  text  = ' '.join(words)
    
  return text

In [15]:
# Text Before Pre-processing
Data.text

0        VIDEO: “I was in my office. I was minding my o...
1        The price of lumber $LB_F is down 22% since hi...
2        Who says the American Dream is dead? https://t...
3        Barry Silbert is extremely optimistic on bitco...
4        How satellites avoid attacks and space junk wh...
                               ...                        
28259           $FB : 29234a9c-7f08-4d5a-985f-cb1a5554ecf9
28260    【仮想通貨】ビットコインの価格上昇、８０万円台回復　約１カ月半ぶり　　　　　　$BTC ht...
28261    RT @invest_in_hd: 'Nuff said!  $TEL #telcoin #...
28262    【仮想通貨】ビットコインの価格上昇、８０万円台回復　約１カ月半ぶり　　　　　　$BTC ht...
28263    Stellar $XLM price: $0.297852 Binance registra...
Name: text, Length: 28264, dtype: object

In [16]:
#Delete unwanted source form our text 
Data=Data[Data["source"] != "test5f1798"]

In [17]:
# apply Clean Funsction to our Text
Data.text=[Clean(x) for x in Data.text]

In [18]:
Data.text

0        video wa offic wa mind busi david solomon tell...
1        price lumber lbf sinc hit ytd high maci turnar...
2                                  say american dream dead
3        barri silbert extrem optimist bitcoin predict ...
4        satellit avoid attack space junk circl earth paid
                               ...                        
28258    new exchang telcoin mid august im glad tel big...
28260                                                  btc
28261    inhd nuff said tel telcoin telfam crypto block...
28262                                                  btc
28263     stellar xlm price binanc registr open limit time
Name: text, Length: 28239, dtype: object

In [19]:
# Delete Unwanted Some Text 
Data=Data[Data["text"]!='btc']

In [20]:
# Text after Pre-processing
Data.text

0        video wa offic wa mind busi david solomon tell...
1        price lumber lbf sinc hit ytd high maci turnar...
2                                  say american dream dead
3        barri silbert extrem optimist bitcoin predict ...
4        satellit avoid attack space junk circl earth paid
                               ...                        
28256    exxon onc perfect machin run dri wall street j...
28257                         fallen hero today btc action
28258    new exchang telcoin mid august im glad tel big...
28261    inhd nuff said tel telcoin telfam crypto block...
28263     stellar xlm price binanc registr open limit time
Name: text, Length: 28225, dtype: object

In [21]:
''' Detect Emotions for each text Form TextBlob Library '''

detectEmotion=[]
detectPolarity=[]

for txt in Data.text:
    
    analysis=TextBlob(txt)
    Polarity=analysis.sentiment.polarity
    
    if Polarity  <0:
        emotion='2'  #Negative
    elif Polarity>0: 
        emotion='1'  #Positive
    else:
        emotion='0'  #Neutral
        
    detectEmotion.append(emotion)
    detectPolarity.append(Polarity)
    
# detectEmotion=pd.DataFrame()

Data['Polarity']=detectPolarity
Data['Emotion'] =detectEmotion

In [65]:
Data.head(3)

Unnamed: 0,text,url,year,month,day,dayofweek,hour,minute,second,source,symbols,Polarity,Emotion,language,verified
0,video wa offic wa mind busi david solomon tell...,https://twitter.com/i/web/status/1019696670777...,2018,Jul,18,Wed,21,33,26,GoldmanSachs,GS,0.0,0,en,True
1,price lumber lbf sinc hit ytd high maci turnar...,https://twitter.com/i/web/status/1019709091038...,2018,Jul,18,Wed,22,22,47,StockTwits,M,0.16,1,en,True
2,say american dream dead,https://buff.ly/2L3kmc4,2018,Jul,18,Wed,22,32,1,TheStreet,AIG,-0.1,2,en,True


In [23]:
# Data  = Data[Data['verified'] == True]

In [24]:
#check for valid string only to detect languages

TextValid=[]

for i in range(len(Data)):
    TextValid.append(bool(re.match('^(?=.*[a-zA-Z])', Data.iloc[i,0])))
    
Data['valid']=TextValid
print(len(Data[Data['valid']==False]))
print(len(Data[Data['valid']==True]))

18
28207


In [25]:
# valid string only

Data=Data[Data['valid']==True]

In [26]:
'''Detect languages for each text to filter into specific Lang'''

languages = []

# Loop over the sentences in the data and detect their language
for row in range(len(Data)):
    languages.append(detect_langs(Data.iloc[row, 0]))
    
# print('The detected languages are: ', languages) >>> ['en':'N']
languages = [str(lang).split(':')[0][1:] for lang in languages] 

# Assign the list to a new feature 
Data['language'] = languages

In [27]:
# look at Lang detected from our text

Data['language'].value_counts()

en    19388
ca     1519
it     1155
fr     1123
ro      839
sv      655
nl      511
da      404
no      331
es      325
cy      282
af      236
so      200
id      168
pt      149
sl      137
et      112
tl      105
sw       75
pl       64
de       60
hr       59
sq       59
fi       57
sk       41
hu       34
cs       29
tr       26
lt       25
vi       22
lv       17
Name: language, dtype: int64

In [28]:
# len(Data)

In [29]:
# We Only want to deal with english text for now , so we will filter data for EN Only

Data=Data[Data['language']=='en']

In [30]:
# len(Data)

In [31]:
Data=Data[['text','url','year','month','day','dayofweek','hour','minute','second','source','symbols','Polarity','Emotion','language','verified']]
Data.head(4)

Unnamed: 0,text,url,year,month,day,dayofweek,hour,minute,second,source,symbols,Polarity,Emotion,language,verified
0,video wa offic wa mind busi david solomon tell...,https://twitter.com/i/web/status/1019696670777...,2018,Jul,18,Wed,21,33,26,GoldmanSachs,GS,0.0,0,en,True
1,price lumber lbf sinc hit ytd high maci turnar...,https://twitter.com/i/web/status/1019709091038...,2018,Jul,18,Wed,22,22,47,StockTwits,M,0.16,1,en,True
2,say american dream dead,https://buff.ly/2L3kmc4,2018,Jul,18,Wed,22,32,1,TheStreet,AIG,-0.1,2,en,True
3,barri silbert extrem optimist bitcoin predict ...,https://twitter.com/i/web/status/1019716662587...,2018,Jul,18,Wed,22,52,52,MarketWatch,BTC,0.136364,1,en,True


In [32]:
apple=Data[['text','year','month','day','Polarity','Emotion']][Data.symbols=='AAPL']
apple

Unnamed: 0,text,year,month,day,Polarity,Emotion
26452,kind fun watch race trillion market cap ani in...,2018,Jul,18,0.45,1
26507,appl ha join suppli invest almost next four ye...,2018,Jul,18,0.183333,1
26553,end day corn introduc gorilla glass intel turn...,2018,Jul,18,0.0,0
26747,wa think icahn letter say appl wa worth sold h...,2018,Jul,18,0.3,1
26789,extern true tone display featur onli work macb...,2018,Jul,18,0.175,1
26797,motola agre work thi space use aw isnt reliabl...,2018,Jul,18,0.8,1
27005,googl blast eu commiss rule question whi refus...,2018,Jul,18,0.0,0
27089,instant appl unlock cent iphon gs c unlockfus ...,2018,Jul,18,0.0,0
27241,corn debut gorilla glass unpreced multi drop p...,2018,Jul,18,0.0,0
27650,aapl appl inc grow earn annual next year marke...,2018,Jul,18,0.0,0


In [33]:
# Percentage of each Emotions for apple only

app_neutral   = apple['text'][ apple['Emotion'] == '0']
app_positive = apple['text'][ apple['Emotion'] == '1']
app_negative = apple['text'][ apple['Emotion'] == '2']

print(f' Percentage Positive: {len(app_positive)/len(apple)}\n Percentage Negetive: {len(app_negative)/len(apple)}\n Percentage Neutral : {len(app_neutral)/len(apple)}')

 Percentage Positive: 0.3888888888888889
 Percentage Negetive: 0.05555555555555555
 Percentage Neutral : 0.5555555555555556


In [34]:
# the below function will create a word cloud

def wordcloud_draw(data, color = 'black'):
    words = ' '.join(data)
    cleaned_word = " ".join([word for word in words.split()
                            if 'http' not in word  # double check for nay links
                                and not word.startswith('#')  # removing hash tags
                                and word != 'rt'  
                            ])
    wordcloud = WordCloud(stopwords=STOPWORDS, # using stopwords provided by Word cloud its optional since we already removed stopwords :)
                      background_color=color,
                      width=2500,
                      height=2000
                     ).generate(cleaned_word)
    # using matplotlib to display the images in notebook itself.
    plt.figure(1,figsize=(13, 13))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
  

In [59]:
# print("Most Positive words Frequency")
# wordcloud_draw(app_positive, 'white')
# print("Most Negative words Frequency")
# wordcloud_draw(app_negative)
# print("Most Neutral words Frequency")
# wordcloud_draw(app_neutral, 'white')

In [36]:
# Percentage of each Emotions overall symbols

df_neutral   = Data['text'][ Data['Emotion'] == '0']
df_positive  = Data['text'][ Data['Emotion'] == '1']
df_negative  = Data['text'][ Data['Emotion'] == '2']


print(f' Percentage Positive: {len(df_positive)/len(Data)}\n Percentage Negetive: {len(df_negative)/len(Data)}\n Percentage Neutral: {len(df_neutral)/len(Data)}')

 Percentage Positive: 0.2599030328037962
 Percentage Negetive: 0.08087476789766866
 Percentage Neutral: 0.6592221992985352


In [60]:
# print("Most Positive words Frequency")
# wordcloud_draw(df_positive, 'white')
# print("Most Negative words Frequency")
# wordcloud_draw(df_negative)
# print("Most Neutral words Frequency")
# wordcloud_draw(df_neutral, 'white')

In [38]:
# Save Dataset
Data.to_csv("MystockData.csv",index = False)

## Model

In [39]:
def NgramModels(Model , txt, n):
    
    x_train, x_test, y_train, y_test = train_test_split(Data['text'], Data['Emotion'], test_size=0.2, random_state=50)
    
    vect      = CountVectorizer(max_features=1000 , ngram_range=(n,n))
    train_vect= vect.fit_transform(x_train)
    test_vect = vect.transform(x_test)
    
    model     = Model
    t0        = time.time()
    model.fit(train_vect, y_train)
    t1        = time.time()
    predicted = model.predict(test_vect)
    t2        = time.time()
    time_train= t1-t0
    time_pred = t2-t1
    
    accuracy  = model.score(train_vect, y_train)
    predicted = model.predict(test_vect)
    
    report = classification_report(y_test, predicted, output_dict=True)
    print("Models with " , n , "-grams :\n")
    print('********************** \n')
    print(txt)
    print("Training time: %fs; Prediction time: %fs \n" % (time_train, time_pred))
    print('Accuracy score train set :', accuracy)
    print('Accuracy score test set  :', accuracy_score(y_test, predicted),'\n')
    print('Positive: ', report['1'])
    print('Neutral : ', report['0'])
    print('Negative: ', report['2'])
    print('\n --------------------------------------------------------------------------------------------------- \n')

In [40]:
def KNN_Ngram(n):
    
    x_train, x_test, y_train, y_test = train_test_split(Data['text'], Data['Emotion'], test_size=0.2, random_state=50)
    
    vect      = CountVectorizer(max_features=1000 , ngram_range=(n,n))
    train_vect= vect.fit_transform(x_train)
    test_vect = vect.transform(x_test)
    
    for k in [1,3,5,7,10]:

        model = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
        t0        = time.time()
        model.fit(train_vect, y_train)
        t1        = time.time()
        predicted = model.predict(test_vect)
        t2        = time.time()
        time_train= t1-t0
        time_pred = t2-t1

        accuracy  = model.score(train_vect, y_train)
        predicted = model.predict(test_vect)

        report = classification_report(y_test, predicted, output_dict=True)

        print("Models with " , n , "-grams :\n")
        print('********************** \n')
        print("Classification Report for k = {} is:\n".format(k))
        print("Training time: %fs ; Prediction time: %fs \n" % (time_train, time_pred))
        print('Accuracy score train set :', accuracy)
        print('Accuracy score test set  :', accuracy_score(y_test, predicted),'\n')
        print('Positive: ', report['1'])
        print('Neutral : ', report['0'])
        print('Negative: ', report['2'])
        print('\n -------------------------------------------------------------------------------------- \n')

In [41]:
def TFIDFModels(Model,txt):
    
    x_train, x_test, y_train, y_test = train_test_split(Data['text'], Data['Emotion'], test_size=0.2, random_state=50)
    
    vect      = TfidfVectorizer(min_df = 5, max_df =0.8, sublinear_tf = True, use_idf = True)
    train_vect= vect.fit_transform(x_train)
    test_vect = vect.transform(x_test)
    
    model     = Model
    t0        = time.time()
    model.fit(train_vect, y_train)
    t1        = time.time()
    predicted = model.predict(test_vect)
    t2        = time.time()
    time_train= t1-t0
    time_pred = t2-t1
    
    accuracy  = model.score(train_vect, y_train)
    predicted = model.predict(test_vect)
    
    report = classification_report(y_test, predicted, output_dict=True)
    
    print(txt)
    print("Training time: %fs; Prediction time: %fs \n" % (time_train, time_pred))
    print('Accuracy score train set :', accuracy)
    print('Accuracy score test set  :', accuracy_score(y_test, predicted),'\n')
    print('Positive: ', report['1'])
    print('Neutral : ', report['0'])
    print('Negative: ', report['2'])
    print('\n -------------------------------------------------------------------------------------- \n')

In [42]:
def KNN_TFIDF():
    
    x_train, x_test, y_train, y_test = train_test_split(Data['text'], Data['Emotion'], test_size=0.2, random_state=50)
    
    vect      = TfidfVectorizer(min_df = 5, max_df =0.8, sublinear_tf = True, use_idf = True)
    train_vect= vect.fit_transform(x_train)
    test_vect = vect.transform(x_test)
    
    for k in [1,3,5,7,10]:

        model = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
        t0        = time.time()
        model.fit(train_vect, y_train)
        t1        = time.time()
        predicted = model.predict(test_vect)
        t2        = time.time()
        time_train= t1-t0
        time_pred = t2-t1

        accuracy  = model.score(train_vect, y_train)
        predicted = model.predict(test_vect)

        report = classification_report(y_test, predicted, output_dict=True)

        print("Classification Report for k = {} is:\n".format(k))
        print("Training time: %fs ; Prediction time: %fs \n" % (time_train, time_pred))
        print('Accuracy score train set :', accuracy)
        print('Accuracy score test set  :', accuracy_score(y_test, predicted),'\n')
        print('Positive: ', report['1'])
        print('Neutral : ', report['0'])
        print('Negative: ', report['2'])
        print('\n -------------------------------------------------------------------------------------- \n')

In [43]:
SupportVectorClassifier=svm.SVC(kernel='linear')

LogReg2=NgramModels(Model=LogisticRegression(),txt='Logistic Regression Model : \n ', n=2)
LogReg3=NgramModels(Model=LogisticRegression(),txt='Logistic Regression Model : \n ', n=3)

svm2=NgramModels(Model=SupportVectorClassifier ,txt='Support Vectoer Classifier Model : \n ', n=2)
svm3=NgramModels(Model=SupportVectorClassifier ,txt='Support Vectoer Classifier Model : \n ', n=3)

DecTree2=NgramModels(Model=tree.DecisionTreeClassifier(),txt='Decision Tree Classifier Model : \n ', n=2)
DecTree3=NgramModels(Model=tree.DecisionTreeClassifier(),txt='Decision Tree Classifier Model : \n ', n=3)

KNN2=KNN_Ngram(2)
KNN3=KNN_Ngram(3)

Models with  2 -grams :

********************** 

Logistic Regression Model : 
 
Training time: 1.701037s; Prediction time: 0.000000s 

Accuracy score train set : 0.8079303675048356
Accuracy score test set  : 0.8032490974729242 

Positive:  {'precision': 0.9135021097046413, 'recall': 0.44777662874870733, 'f1-score': 0.600971547536433, 'support': 967}
Neutral :  {'precision': 0.7820940819423369, 'recall': 0.990392006149116, 'f1-score': 0.8740037307105307, 'support': 2602}
Negative:  {'precision': 0.963302752293578, 'recall': 0.33980582524271846, 'f1-score': 0.5023923444976076, 'support': 309}

 --------------------------------------------------------------------------------------------------- 

Models with  3 -grams :

********************** 

Logistic Regression Model : 
 
Training time: 1.823998s; Prediction time: 0.000000s 

Accuracy score train set : 0.7780786589297227
Accuracy score test set  : 0.7753996905621454 

Positive:  {'precision': 0.9514285714285714, 'recall': 0.3443640124

In [44]:
SupportVectorClassifier=svm.SVC(kernel='linear')

print('Models with Tfidf Feature extraction Techniques : \n')
print('************************************************ \n')

LogReg=TFIDFModels(Model=LogisticRegression(),txt='Logistic Regression Model : \n ')
svm=TFIDFModels(Model=SupportVectorClassifier,txt='Support Vector Classifier Model : \n ')
DecTree=TFIDFModels(Model=tree.DecisionTreeClassifier(),txt='Decision Tree Classifier Model : \n ')
knn_tfidf=KNN_TFIDF()

Models with Tfidf Feature extraction Techniques : 

************************************************ 

Logistic Regression Model : 
 
Training time: 2.321228s; Prediction time: 0.000000s 

Accuracy score train set : 0.9706640876853643
Accuracy score test set  : 0.9522949974213513 

Positive:  {'precision': 0.9725877192982456, 'recall': 0.9172699069286453, 'f1-score': 0.9441192123469931, 'support': 967}
Neutral :  {'precision': 0.9436363636363636, 'recall': 0.9973097617217525, 'f1-score': 0.9697309417040357, 'support': 2602}
Negative:  {'precision': 0.9768518518518519, 'recall': 0.6828478964401294, 'f1-score': 0.8038095238095239, 'support': 309}

 -------------------------------------------------------------------------------------- 

Support Vector Classifier Model : 
 
Training time: 27.824473s; Prediction time: 4.186903s 

Accuracy score train set : 0.9889748549323018
Accuracy score test set  : 0.9783393501805054 

Positive:  {'precision': 0.9831223628691983, 'recall': 0.963805584281

In [57]:
idx = pd.MultiIndex.from_product([['2-grams', '3-grams', 'TFIDF'],['Accuracy Training %','Accuracy Testing %']],names=['FeatureExtraction', 'Metric'])
col = ['LogisticRegression', 'SupportVectorClassifier', 'DecisionTree', 'KNeighborsClassifier']

Result = pd.DataFrame('*', idx, col)
Result.LogisticRegression=['80.79','80.32','77.80','77.53','97.06','95.22']
Result.SupportVectorClassifier=['80.73','80.42','77.80','77.74','98.89','97.83']
Result.DecisionTree=['82.28','80.50','78.23','77.59','99.97','98.63']
Result.KNeighborsClassifier=['80.84','77.61','77.54','75.99','99.96','87.05']

In [58]:
Result

Unnamed: 0_level_0,Unnamed: 1_level_0,LogisticRegression,SupportVectorClassifier,DecisionTree,KNeighborsClassifier
FeatureExtraction,Metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2-grams,Accuracy Training %,80.79,80.73,82.28,80.84
2-grams,Accuracy Testing %,80.32,80.42,80.5,77.61
3-grams,Accuracy Training %,77.8,77.8,78.23,77.54
3-grams,Accuracy Testing %,77.53,77.74,77.59,75.99
TFIDF,Accuracy Training %,97.06,98.89,99.97,99.96
TFIDF,Accuracy Testing %,95.22,97.83,98.63,87.05


In [64]:
# Result.to_csv("FinalResult.csv")