In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

In [5]:
# Print many statements at same time using the below command
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

In [6]:
df= pd.read_csv("tweets.csv")
df.head()
df.info()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [11]:
pd.pivot_table(df,index='label',values=["id"],aggfunc='count')

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,29720
1,2242


In [4]:
#To remove all '@user' pattern from the given text. It takes two arguments one is original string 
#of the text and other is the pattern of the text.
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt 

In [5]:
# remove twitter handles (@user)
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")

In [6]:
# remove special characters, numbers, punctuations
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")

In [7]:
#Removing short words having length of three
df['tidy_tweet'] = df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [8]:
df.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause they offer wheelchai...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model love take with time
4,5,0,factsguide: society now #motivation,factsguide society #motivation


In [9]:
#Tokenization
tokenized_tweet = df['tidy_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [when, father, dysfunctional, selfish, drags, ...
1    [thanks, #lyft, credit, cause, they, offer, wh...
2                              [bihday, your, majesty]
3                     [#model, love, take, with, time]
4                   [factsguide, society, #motivation]
Name: tidy_tweet, dtype: object

In [10]:
#Stemming
from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet.head()

0    [when, father, dysfunct, selfish, drag, kid, i...
1    [thank, #lyft, credit, caus, they, offer, whee...
2                              [bihday, your, majesti]
3                     [#model, love, take, with, time]
4                         [factsguid, societi, #motiv]
Name: tidy_tweet, dtype: object

In [11]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

df['tidy_tweet'] = tokenized_tweet

In [14]:
# To append list as column in existing dataframe
#df['tokenized_tweet'] = pd.Series(tokenized_tweet, index=df.index)

In [12]:
df.head()

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunct selfish drag kid into dys...
1,2,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit caus they offer wheelchair ...
2,3,0,bihday your majesty,bihday your majesti
3,4,0,#model i love u take with u all the time in ...,#model love take with time
4,5,0,factsguide: society now #motivation,factsguid societi #motiv


In [13]:
df['label'].head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [14]:
type(df['label'])

pandas.core.series.Series

In [15]:
df1 = df.iloc[:,[1,3]]

In [16]:
df1.head()

Unnamed: 0,label,tidy_tweet
0,0,when father dysfunct selfish drag kid into dys...
1,0,thank #lyft credit caus they offer wheelchair ...
2,0,bihday your majesti
3,0,#model love take with time
4,0,factsguid societi #motiv


In [17]:
df1.tail()

Unnamed: 0,label,tidy_tweet
31957,0,that youuu
31958,0,nina turner airwav tri wrap herself mantl genu...
31959,0,listen song monday morn work
31960,1,#sikh #templ vandalis #calgari #wso condemn
31961,0,thank follow


In [21]:
#df1 = df.drop(['tweet'], axis = 1)

In [22]:
#df2 = df1.drop(['id'],axis = 1)
#df2.head()

In [18]:
#Interchanging columns in dataframe
cols = list(df1)
cols[1] , cols[0] = cols[0] , cols[1]
cols

['tidy_tweet', 'label']

In [19]:
df2 = df1.ix[:, cols]

In [20]:
df2.head()

Unnamed: 0,tidy_tweet,label
0,when father dysfunct selfish drag kid into dys...,0
1,thank #lyft credit caus they offer wheelchair ...,0
2,bihday your majesti,0
3,#model love take with time,0
4,factsguid societi #motiv,0


In [21]:
#Shuffling rows of the dataframe
df3 = df2.reindex(np.random.permutation(df.index))

In [22]:
df3.head()

Unnamed: 0,tidy_tweet,label
6379,thought factori left right polaris #trump #use...,1
15115,think babi like plaintain some breaki thi morn...,0
29090,when gonna back #missthebigguy #feedmemor,0
894,friendszxc,0
12919,thi make #mad afternoon walk thi what #animalc...,0


In [23]:
df3.tail()

Unnamed: 0,tidy_tweet,label
21611,realli happi with #eng lineup shape hope prove...,0
5826,sound like dare,0
5512,go believ thi #gaysham expect better from,0
8200,noah laugh contagi #laugher #babi #love,0
16126,veri #ilovey #mother #mama #maria #god #bless,0


In [24]:
#Extracting 3200 records from the dataframe
df4 = df3.head(3200)

In [25]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3200 entries, 6379 to 21324
Data columns (total 2 columns):
tidy_tweet    3200 non-null object
label         3200 non-null int64
dtypes: int64(1), object(1)
memory usage: 75.0+ KB


In [26]:
df4.head()

Unnamed: 0,tidy_tweet,label
6379,thought factori left right polaris #trump #use...,1
15115,think babi like plaintain some breaki thi morn...,0
29090,when gonna back #missthebigguy #feedmemor,0
894,friendszxc,0
12919,thi make #mad afternoon walk thi what #animalc...,0


In [32]:
#from sklearn.feature_extraction.text import CountVectorizer
#bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')

In [33]:
# bag-of-words feature matrix 
#df5['tidy_tweet'] = bow_vectorizer.fit_transform(df5['tidy_tweet'])

In [34]:
#df5.head()

In [35]:
#Create features and labels
#x= df5.drop(['label'],axis=1)
#y = df5['label']
#x.head()
#y.head()

In [33]:
# To split the data set in to parts
train=df4.sample(frac=0.75,random_state=2)
test=df4.drop(train.index)

In [34]:
test.info()
test.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 28921 to 5115
Data columns (total 2 columns):
tidy_tweet    800 non-null object
label         800 non-null int64
dtypes: int64(1), object(1)
memory usage: 18.8+ KB


Unnamed: 0,tidy_tweet,label
28921,your comment reflect ignor,1
23280,there peopl there will fuher their caus contro...,0
10378,thank need channel go stream,0
7783,love puff #dinner #lalov #smile #saturd #kebab...,0
5805,happi work confer right mindset lead cultur de...,0


In [35]:
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400 entries, 30042 to 12064
Data columns (total 2 columns):
tidy_tweet    2400 non-null object
label         2400 non-null int64
dtypes: int64(1), object(1)
memory usage: 56.2+ KB


Unnamed: 0,tidy_tweet,label
30042,bewar market taint messag,1
31585,sadist dead aliv #dead #aliv #ist,0
1656,life remot chang yourself #success #career,0
27513,peopl buy thi stunt hook line sinker wait free...,0
27663,nose perfum spark furor,1


In [36]:
train.shape
test.shape

(2400, 2)

(800, 2)

In [37]:
train.groupby('label').count()

Unnamed: 0_level_0,tidy_tweet
label,Unnamed: 1_level_1
0,2207
1,193


In [38]:
test.groupby('label').count()

Unnamed: 0_level_0,tidy_tweet
label,Unnamed: 1_level_1
0,751
1,49


In [39]:
#train_x = train['tidy_tweet']
#train_y = train['label']

In [40]:
#train_x.head()

In [38]:
# Creating Features and Labels and running correlation Heat map
# create train and test data with 75% and 25% split
#train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.25,random_state=1)train_x.shape
#test_x.shape
#train_y.shape
#test_y.shape

(2400, 1)

(800, 1)

(2400,)

(800,)

In [41]:
#train_x.head()

Unnamed: 0,tidy_tweet
7214,"(0, 26)\t1\n (0, 414)\t1\n (0, 761)\t1\n ..."
27104,"(0, 26)\t1\n (0, 414)\t1\n (0, 761)\t1\n ..."
8101,"(0, 26)\t1\n (0, 414)\t1\n (0, 761)\t1\n ..."
17646,"(0, 26)\t1\n (0, 414)\t1\n (0, 761)\t1\n ..."
21005,"(0, 26)\t1\n (0, 414)\t1\n (0, 761)\t1\n ..."


In [42]:
#train_y.head()

7214     0
27104    0
8101     0
17646    0
21005    0
Name: label, dtype: int64

In [25]:
# To split the data set in to parts
#train=df5.sample(frac=0.7,random_state=200)
#test=df5.drop(train.index)

In [26]:
#test.info()
#test.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 960 entries, 9911 to 22608
Data columns (total 2 columns):
tidy_tweet    960 non-null object
label         960 non-null int64
dtypes: int64(1), object(1)
memory usage: 22.5+ KB


Unnamed: 0,tidy_tweet,label
9911,#bbq #ktyd#bbq#incident#lol#enjoy #sorry#and#t...,0
11135,#piracydimaadi watch movi theatr onli,0
22593,friday rememb judg other what #love peac #success,0
22056,well want,0
18090,#nochebuena #lasvega #usa vega strip,0


In [27]:
#train.info()
#train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2240 entries, 12896 to 4704
Data columns (total 2 columns):
tidy_tweet    2240 non-null object
label         2240 non-null int64
dtypes: int64(1), object(1)
memory usage: 52.5+ KB


Unnamed: 0,tidy_tweet,label
12896,hire first,0
16712,remak doesn have token asian chick #carolsbyca...,1
31757,stop these chelski fan #equal #divers,1
16976,,0
15764,midweek,0


In [41]:
#Converting dataframe into list of tuples
train_1 = list(map(tuple,train.values))
train_1[0:4]

[('bewar market taint messag', 1),
 ('sadist dead aliv #dead #aliv #ist', 0),
 ('life remot chang yourself #success #career', 0),
 ('peopl buy thi stunt hook line sinker wait free click media #lazi', 0)]

In [42]:
test_1= list(map(tuple,test.values))
test_1[0:4]

[('your comment reflect ignor', 1),
 ('there peopl there will fuher their caus control there', 0),
 ('thank need channel go stream', 0),
 ('love puff #dinner #lalov #smile #saturd #kebab hossein persian kebab', 0)]

In [24]:
#! pip install -U textblob nltk

Collecting textblob
  Downloading https://files.pythonhosted.org/packages/7c/7d/ad09a26b63d4ad3f9395840c72c95f2fc9fa2b192094ef14e9e720be56f9/textblob-0.15.2-py2.py3-none-any.whl (636kB)
[K    100% |████████████████████████████████| 645kB 1.8MB/s eta 0:00:01
[?25hCollecting nltk
  Downloading https://files.pythonhosted.org/packages/6f/ed/9c755d357d33bc1931e157f537721efb5b88d2c583fe593cc09603076cc3/nltk-3.4.zip (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 852kB/s eta 0:00:01
[?25hCollecting six (from nltk)
  Downloading https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl
Requirement already up-to-date: singledispatch in ./anaconda3/lib/python3.6/site-packages (from nltk)
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... [?25ldone
[?25h  Stored in directory: /home/munirajumalathi/.cache/pip/wheels/4b/c8/24/b2343664bcceb7147efeb21c0b23703a05b23f

In [25]:
#! pip install --upgrade pip

Collecting pip
  Downloading https://files.pythonhosted.org/packages/d7/41/34dd96bd33958e52cb4da2f1bf0818e396514fd4f4725a79199564cd0c20/pip-19.0.2-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 889kB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 9.0.1
    Uninstalling pip-9.0.1:
      Successfully uninstalled pip-9.0.1
Successfully installed pip-19.0.2


In [43]:
from textblob.classifiers import NaiveBayesClassifier

In [22]:
#train_sample = train[0:2000]

In [23]:
#cl = NaiveBayesClassifier(train_sample)

In [None]:
#cl.accuracy(train_sample)

In [44]:
cl = NaiveBayesClassifier(train_1)

In [45]:
cl.accuracy(train_1)

0.96625

In [46]:
cl.accuracy(test_1)

0.9475

In [47]:
cl.informative_features(15)

[('contains(violent)', True),
 ('contains(suppoer)', True),
 ('contains(liber)', True),
 ('contains(polit)', True),
 ('contains(racist)', True),
 ('contains(misogynist)', True),
 ('contains(research)', True),
 ('contains(miami)', True),
 ('contains(messag)', True),
 ('contains(white)', True),
 ('contains(latest)', True),
 ('contains(profil)', True),
 ('contains(boot)', True),
 ('contains(scum)', True),
 ('contains(effect)', True)]

In [48]:
cl.show_informative_features(15)

Most Informative Features
       contains(violent) = True                1 : 0      =     34.1 : 1.0
       contains(suppoer) = True                1 : 0      =     34.1 : 1.0
         contains(liber) = True                1 : 0      =     31.6 : 1.0
         contains(polit) = True                1 : 0      =     29.1 : 1.0
        contains(racist) = True                1 : 0      =     27.6 : 1.0
    contains(misogynist) = True                1 : 0      =     26.6 : 1.0
      contains(research) = True                1 : 0      =     26.6 : 1.0
         contains(miami) = True                1 : 0      =     25.0 : 1.0
        contains(messag) = True                1 : 0      =     25.0 : 1.0
         contains(white) = True                1 : 0      =     23.4 : 1.0
        contains(latest) = True                1 : 0      =     20.5 : 1.0
        contains(profil) = True                1 : 0      =     19.0 : 1.0
          contains(boot) = True                1 : 0      =     19.0 : 1.0

# Lexicon Based Classifier

In [49]:
#Creating list of tweet from train dataset
train_list = train['tidy_tweet'].tolist()
train_list[0:4]

['bewar market taint messag',
 'sadist dead aliv #dead #aliv #ist',
 'life remot chang yourself #success #career',
 'peopl buy thi stunt hook line sinker wait free click media #lazi']

In [70]:
#Converting train dataframe into list
train_list_df = train.values.tolist()
train_list_df[0:4]
len(train_list_df)

[['bewar market taint messag', 1],
 ['sadist dead aliv #dead #aliv #ist', 0],
 ['life remot chang yourself #success #career', 0],
 ['peopl buy thi stunt hook line sinker wait free click media #lazi', 0]]

2400

In [51]:
#Converting test dataframe into list
test_list_df = test.values.tolist()
test_list_df[0:4]
len(test_list_df)

[['your comment reflect ignor', 1],
 ['there peopl there will fuher their caus control there', 0],
 ['thank need channel go stream', 0],
 ['love puff #dinner #lalov #smile #saturd #kebab hossein persian kebab', 0]]

800

In [52]:
#Creating a list of tweet from test data
test_list = test['tidy_tweet'].tolist()
test_list[0:4]

['your comment reflect ignor',
 'there peopl there will fuher their caus control there',
 'thank need channel go stream',
 'love puff #dinner #lalov #smile #saturd #kebab hossein persian kebab']

In [53]:
#Tokenization of list of train tweet
tokenized_Train_tweet = train['tidy_tweet'].apply(lambda x: x.split())
tokenized_Train_tweet.head()

30042                       [bewar, market, taint, messag]
31585             [sadist, dead, aliv, #dead, #aliv, #ist]
1656     [life, remot, chang, yourself, #success, #career]
27513    [peopl, buy, thi, stunt, hook, line, sinker, w...
27663                         [nose, perfum, spark, furor]
Name: tidy_tweet, dtype: object

In [54]:
#To create a list of tokenized train tweet
train_list = tokenized_Train_tweet.tolist()

In [55]:
train_list[0:4]
len(train_list)

[['bewar', 'market', 'taint', 'messag'],
 ['sadist', 'dead', 'aliv', '#dead', '#aliv', '#ist'],
 ['life', 'remot', 'chang', 'yourself', '#success', '#career'],
 ['peopl',
  'buy',
  'thi',
  'stunt',
  'hook',
  'line',
  'sinker',
  'wait',
  'free',
  'click',
  'media',
  '#lazi']]

2400

In [56]:
#Tokenization of test tweet 
tokenized_Test_tweet = test['tidy_tweet'].apply(lambda x: x.split())
tokenized_Test_tweet.head()

28921                      [your, comment, reflect, ignor]
23280    [there, peopl, there, will, fuher, their, caus...
10378                   [thank, need, channel, go, stream]
7783     [love, puff, #dinner, #lalov, #smile, #saturd,...
5805     [happi, work, confer, right, mindset, lead, cu...
Name: tidy_tweet, dtype: object

In [57]:
#To create a list of tokenized test tweet
test_list = tokenized_Test_tweet.tolist()

In [58]:
test_list[0:4]
len(test_list)

[['your', 'comment', 'reflect', 'ignor'],
 ['there',
  'peopl',
  'there',
  'will',
  'fuher',
  'their',
  'caus',
  'control',
  'there'],
 ['thank', 'need', 'channel', 'go', 'stream'],
 ['love',
  'puff',
  '#dinner',
  '#lalov',
  '#smile',
  '#saturd',
  '#kebab',
  'hossein',
  'persian',
  'kebab']]

800

In [59]:
#Importing Affin datastet
lex_file = pd.read_csv("AFINN-111.csv", sep=',', engine='python')

In [60]:
lex_file.head()

Unnamed: 0,words,score
0,abandon,-2
1,abandoned,-2
2,abandons,-2
3,abducted,-2
4,abduction,-2


In [62]:
#Converting dataframe into dictionary
lexicons = dict(zip(lex_file.words,lex_file.score))

In [63]:
lexicons["abandon"]

-2

In [64]:
#For each tweet
#For each word in tweet
#Get the word score (score is a number if the word is in Lexicon, 0 if not)
#Add all the scores and find the polarity
strength = []
prediction = []
for tweet in test_list:
    score = 0
    for word in tweet:
        if word in (lexicons):
            score = score + lexicons[word]
    strength.append(score)
    if (score >= 0):
        prediction.append('0')
    else:
        prediction.append('1')
#print(strength)
#print(prediction)

In [65]:
import pandas as pd
#Get the columns of data
corpus = []
class_label = []
for record in test_list_df :
    corpus.append(record[0])
    class_label.append(record[1])
#print(corpus)
print(class_label)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 

In [66]:
corpus_list = {'data':corpus}
labels = {'label':class_label}
senti_strength = {'senti_strength':strength}

In [67]:
#Create an Analysis Base Table (ABT)
senti_matrix = pd.DataFrame(corpus_list, strength)
senti_matrix['Label'] = class_label
senti_matrix['prediction']=prediction

In [68]:
senti_matrix[0:5]

Unnamed: 0,data,Label,prediction
0,your comment reflect ignor,1,0
0,there peopl there will fuher their caus contro...,0,0
2,thank need channel go stream,0,0
3,love puff #dinner #lalov #smile #saturd #kebab...,0,0
0,happi work confer right mindset lead cultur de...,0,0


In [69]:
senti_matrix.pivot_table(index = ['prediction', 'Label'], aggfunc = 'count')

Unnamed: 0_level_0,Unnamed: 1_level_0,data
prediction,Label,Unnamed: 2_level_1
0,0,669
0,1,34
1,0,82
1,1,15
