In [1]:
# dependencies
import pandas as pd
from textblob import TextBlob

In [2]:
# Import in data
df = pd.read_json('preprocessed_headlines.json')

In [3]:
df = df[['title', 'sentiment_human', 'compound_score']]

In [4]:
# Determine sentiment category for each NLTK compound score
df['NLTK'] = 0
df.loc[df['compound_score'] > 0.2, 'NLTK'] = 1
df.loc[df['compound_score'] < -0.2, 'NLTK'] = -1
df.head(5)

Unnamed: 0,title,sentiment_human,compound_score,NLTK
0,I could be deported because my parents came he...,-1,0.1027,0
1,First Latino tapped to head DHS signals shift ...,0,0.0,0
2,"Tony Pham, interim director of Immigration and...",0,0.0,0
3,More than two-thirds of undocumented immigrant...,0,-0.3818,-1
4,"Biden to meet with struggling workers, small-b...",-1,-0.7845,-1


In [5]:
# Label each NLTK sentiment as correct or incorrect
df['NLTK_acc'] = 0
df.loc[df['NLTK'] == df['sentiment_human'], 'NLTK_acc'] = 1
df.head(5)

Unnamed: 0,title,sentiment_human,compound_score,NLTK,NLTK_acc
0,I could be deported because my parents came he...,-1,0.1027,0,0
1,First Latino tapped to head DHS signals shift ...,0,0.0,0,1
2,"Tony Pham, interim director of Immigration and...",0,0.0,0,1
3,More than two-thirds of undocumented immigrant...,0,-0.3818,-1,0
4,"Biden to meet with struggling workers, small-b...",-1,-0.7845,-1,1


In [6]:
# Determine total number of correct classifications by NLTK
NLTK_correct = df.groupby('NLTK_acc').count()['title'][1]
NLTK_correct

2092

In [7]:
# Determine NLTK accuracy
NLTK_accuracy = NLTK_correct/len(df)
NLTK_accuracy

0.6763659877141933

In [10]:
# Calculate sentiment polarity using TextBlob
df['textblob_score'] = df['title'].apply(lambda x: TextBlob(x).sentiment.polarity)
df.head(5)

Unnamed: 0,title,sentiment_human,compound_score,NLTK,NLTK_acc,textblob_score
0,I could be deported because my parents came he...,-1,0.1027,0,0,0.2
1,First Latino tapped to head DHS signals shift ...,0,0.0,0,1,0.25
2,"Tony Pham, interim director of Immigration and...",0,0.0,0,1,-0.155556
3,More than two-thirds of undocumented immigrant...,0,-0.3818,-1,0,0.25
4,"Biden to meet with struggling workers, small-b...",-1,-0.7845,-1,1,0.2


In [11]:
# Determine sentiment category for each TextBlog compound score
df['TextBlob'] = 0
df.loc[df['textblob_score'] > 0.2, 'TextBlob'] = 1
df.loc[df['textblob_score'] < -0.2, 'TextBlob'] = -1
df.head(5)

Unnamed: 0,title,sentiment_human,compound_score,NLTK,NLTK_acc,textblob_score,TextBlob
0,I could be deported because my parents came he...,-1,0.1027,0,0,0.2,0
1,First Latino tapped to head DHS signals shift ...,0,0.0,0,1,0.25,1
2,"Tony Pham, interim director of Immigration and...",0,0.0,0,1,-0.155556,0
3,More than two-thirds of undocumented immigrant...,0,-0.3818,-1,0,0.25,1
4,"Biden to meet with struggling workers, small-b...",-1,-0.7845,-1,1,0.2,0


In [12]:
# Label each NLTK sentiment as correct or incorrect
df['TextBlob_acc'] = 0
df.loc[df['TextBlob'] == df['sentiment_human'], 'TextBlob'] = 1
df.head(5)

Unnamed: 0,title,sentiment_human,compound_score,NLTK,NLTK_acc,textblob_score,TextBlob,TextBlob_acc
0,I could be deported because my parents came he...,-1,0.1027,0,0,0.2,0,0
1,First Latino tapped to head DHS signals shift ...,0,0.0,0,1,0.25,1,0
2,"Tony Pham, interim director of Immigration and...",0,0.0,0,1,-0.155556,1,0
3,More than two-thirds of undocumented immigrant...,0,-0.3818,-1,0,0.25,1,0
4,"Biden to meet with struggling workers, small-b...",-1,-0.7845,-1,1,0.2,0,0


In [15]:
df.groupby('TextBlob').count()

Unnamed: 0_level_0,title,sentiment_human,compound_score,NLTK,NLTK_acc,textblob_score,TextBlob_acc
TextBlob,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1,33,33,33,33,33,33,33
0,1629,1629,1629,1629,1629,1629,1629
1,1431,1431,1431,1431,1431,1431,1431


In [14]:
# Determine total number of correct classifications by NLTK
TextBlob_correct = df.groupby('TextBlob_acc').count()['title']
TextBlob_correct

TextBlob_acc
0    3093
Name: title, dtype: int64