In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import nltk

In [2]:
# Load the training set from a path relative to the current working directory.
data = pd.read_csv('data/train.csv')

In [3]:
# (rows, columns) of the frame held in `data` at this point.
data.shape

(7613, 5)

In [4]:
# Class balance: fraction of rows carrying each `target` label (normalize=True
# returns proportions rather than raw counts).
data.target.value_counts(normalize=True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [5]:
# Remove tweets whose text occurs more than once. keep=False discards *every*
# copy of a repeated text (not only the extra ones), and ignore_index=True
# renumbers the remaining rows 0..n-1 so later label-based lookups and the
# index-aligned join keep working.
# Reassigning (instead of inplace=True) keeps the step an explicit expression.
data = data.drop_duplicates(subset=['text'], keep=False, ignore_index=True)

In [6]:
# (rows, columns) again, to see how many rows remain in `data`.
data.shape

(7434, 5)

In [7]:
# Pick the tweet at row label 40 as a running example for the next cells.
exemple = data.loc[40, 'text']
exemple

"on the outside you're ablaze and alive\nbut you're dead inside"

In [8]:
# Split the example tweet into tokens with NLTK and preview the first five.
tokens = nltk.word_tokenize(exemple)
tokens[:5]

['on', 'the', 'outside', 'you', "'re"]

In [9]:
# Tag each token with its part of speech; preview the first five (token, tag) pairs.
tagged = nltk.pos_tag(tokens)
tagged[:5]

[('on', 'IN'),
 ('the', 'DT'),
 ('outside', 'NN'),
 ('you', 'PRP'),
 ("'re", 'VBP')]

In [10]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [11]:
# Sentiment scorer instance, reused for every tweet in the cells below.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be
# available locally (nltk.download('vader_lexicon')) — confirm on a fresh environment.
sia = SentimentIntensityAnalyzer()

In [12]:
# Score the example tweet; the result is a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sia.polarity_scores(exemple)

{'neg': 0.355, 'neu': 0.537, 'pos': 0.107, 'compound': -0.7311}

In [13]:
# Show the full text of the tweet at row label 2.
data.loc[2, 'text']

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [14]:
# Score every tweet, keyed by the tweet's `id`.
# Iterating over the column values directly (rather than looking up
# data.text[i] with a positional counter) avoids a KeyError when the frame's
# index is not a clean 0..n-1 range.
# NOTE(review): assumes `id` values are unique — a repeated id would silently
# overwrite the earlier score; confirm against the dataset.
results = {
    tweet_id: sia.polarity_scores(tweet_text)
    for tweet_id, tweet_text in zip(data['id'], data['text'])
}

In [15]:
# Turn the {id: scores} mapping into a table with one row per tweet.
# The outer dict keys (ids) become columns first, so transpose to get them as
# rows, then replace the id labels with a fresh 0..n-1 index.
vaders = pd.DataFrame(results).T.reset_index(drop=True)

In [16]:
# Display the score table (pandas truncates the middle rows in the rendered output).
vaders

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.851,0.149,0.2732
1,0.286,0.714,0.000,-0.3400
2,0.095,0.905,0.000,-0.2960
3,0.000,1.000,0.000,0.0000
4,0.000,1.000,0.000,0.0000
...,...,...,...,...
7429,0.249,0.751,0.000,-0.6841
7430,0.122,0.878,0.000,-0.4939
7431,0.328,0.574,0.098,-0.7650
7432,0.262,0.738,0.000,-0.4939


In [17]:
# Attach the tweet columns to the scores. The join aligns on the row index, so
# it relies on `vaders` and `data` both being numbered 0..n-1 in the same order.
# Columns already copied over from `data` are dropped first so the cell can be
# re-run safely: a second plain `vaders.join(data)` raises ValueError because
# the column names overlap.
vaders = vaders.drop(columns=data.columns, errors='ignore').join(data)

In [18]:
# Preview the first five rows of `vaders`.
vaders.head(5)

Unnamed: 0,neg,neu,pos,compound,id,keyword,location,text,target
0,0.0,0.851,0.149,0.2732,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,0.286,0.714,0.0,-0.34,4,,,Forest fire near La Ronge Sask. Canada,1
2,0.095,0.905,0.0,-0.296,5,,,All residents asked to 'shelter in place' are ...,1
3,0.0,1.0,0.0,0.0,6,,,"13,000 people receive #wildfires evacuation or...",1
4,0.0,1.0,0.0,0.0,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [19]:
# Order rows by ascending compound score and renumber them 0..n-1.
sorted_vaders = vaders.sort_values(by='compound', ignore_index=True)

In [20]:
# Count missing vs. present compound scores to confirm every row was scored.
sorted_vaders.compound.isnull().value_counts()

False    7434
Name: compound, dtype: int64

In [21]:
# Class balance among the rows whose compound score is below the cutoff.
# NOTE(review): the compound values shown in this notebook all lie within
# [-1, 1], so `< 2` looks like it keeps every row — confirm the intended threshold.
sorted_vaders.loc[sorted_vaders['compound'] < 2, 'target'].value_counts(normalize=True)

0    0.576271
1    0.423729
Name: target, dtype: float64