In [140]:
import pandas as pd
import numpy as np
import json

from sklearn.metrics import precision_score, recall_score

# Read data and scores dict

In [141]:
df = pd.read_csv('../data/warm_up_data.csv')

In [142]:
with open('../data/AFINN-111-scores.json', 'r') as fp:
     scores_dict = json.load(fp)

In [143]:
df

Unnamed: 0,tweet_id,airline_sentiment,text,airline
0,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...,Virgin America
1,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...,Virgin America
2,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...,Virgin America
3,570300248553349120,neutral,@VirginAmerica Really missed a prime opportuni...,Virgin America
4,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ...",Virgin America
...,...,...,...,...
9484,569587813856841728,neutral,"“@AmericanAir: @TilleyMonsta George, that does...",American
9485,569587705937600512,negative,@AmericanAir my flight was Cancelled Flightled...,American
9486,569587686496825344,positive,@AmericanAir thank you we got on a different f...,American
9487,569587371693355008,negative,@AmericanAir leaving over 20 minutes Late Flig...,American


List unique airlines within the dataframe

In [144]:
# List unique airlines within the dataframe
df.airline.unique()

array(['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways',
       'American'], dtype=object)

# Data preparation

Convert airline sentiment column to numeric class

In [145]:
setiment_class_dict = {'negative':-1, 'neutral':0, 'positive':1}

In [146]:
df["sentiment_class_true"] = df.airline_sentiment.apply(lambda x: setiment_class_dict.get(x))

In [147]:
scores_dict

{'abandon': -2,
 'abandoned': -2,
 'abandons': -2,
 'abducted': -2,
 'abduction': -2,
 'abductions': -2,
 'abhor': -3,
 'abhorred': -3,
 'abhorrent': -3,
 'abhors': -3,
 'abilities': 2,
 'ability': 2,
 'aboard': 1,
 'absentee': -1,
 'absentees': -1,
 'absolve': 2,
 'absolved': 2,
 'absolves': 2,
 'absolving': 2,
 'absorbed': 1,
 'abuse': -3,
 'abused': -3,
 'abuses': -3,
 'abusive': -3,
 'accept': 1,
 'accepted': 1,
 'accepting': 1,
 'accepts': 1,
 'accident': -2,
 'accidental': -2,
 'accidentally': -2,
 'accidents': -2,
 'accomplish': 2,
 'accomplished': 2,
 'accomplishes': 2,
 'accusation': -2,
 'accusations': -2,
 'accuse': -2,
 'accused': -2,
 'accuses': -2,
 'accusing': -2,
 'ache': -2,
 'achievable': 1,
 'aching': -2,
 'acquit': 2,
 'acquits': 2,
 'acquitted': 2,
 'acquitting': 2,
 'acrimonious': -3,
 'active': 1,
 'adequate': 1,
 'admire': 3,
 'admired': 3,
 'admires': 3,
 'admiring': 3,
 'admit': -1,
 'admits': -1,
 'admitted': -1,
 'admonish': -2,
 'admonished': -2,
 'adopt': 

# Assign text sentiment score based on scores dict

Write assign_score function, which will summarize scores of all words present within scores_dict and calculate sentiment_score_hat for each tweet

In [148]:
df

Unnamed: 0,tweet_id,airline_sentiment,text,airline,sentiment_class_true
0,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...,Virgin America,-1
1,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...,Virgin America,-1
2,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...,Virgin America,-1
3,570300248553349120,neutral,@VirginAmerica Really missed a prime opportuni...,Virgin America,0
4,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ...",Virgin America,1
...,...,...,...,...,...
9484,569587813856841728,neutral,"“@AmericanAir: @TilleyMonsta George, that does...",American,0
9485,569587705937600512,negative,@AmericanAir my flight was Cancelled Flightled...,American,-1
9486,569587686496825344,positive,@AmericanAir thank you we got on a different f...,American,1
9487,569587371693355008,negative,@AmericanAir leaving over 20 minutes Late Flig...,American,-1


In [149]:
def assign_score(text):
    score = 0
    for word in text.split():
        word_score = scores_dict.get(word.lower(), 0)
        score += word_score
    return score

In [150]:
assign_score(df.text[0])

-5

In [151]:
df["sentiment_score_hat"] = df.text.apply(assign_score)
df

Unnamed: 0,tweet_id,airline_sentiment,text,airline,sentiment_class_true,sentiment_score_hat
0,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...,Virgin America,-1,-5
1,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...,Virgin America,-1,-2
2,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...,Virgin America,-1,-4
3,570300248553349120,neutral,@VirginAmerica Really missed a prime opportuni...,Virgin America,0,0
4,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ...",Virgin America,1,3
...,...,...,...,...,...,...
9484,569587813856841728,neutral,"“@AmericanAir: @TilleyMonsta George, that does...",American,0,1
9485,569587705937600512,negative,@AmericanAir my flight was Cancelled Flightled...,American,-1,-1
9486,569587686496825344,positive,@AmericanAir thank you we got on a different f...,American,1,2
9487,569587371693355008,negative,@AmericanAir leaving over 20 minutes Late Flig...,American,-1,-7


## Split sentiment scores to class

Write a classify_sentiment functions, which will assign label numeric labels [-1,0,1] for based on predicted sentiment score. Recommended use of pd.cut

In [152]:
def classify_sentiment(score_col, negative_th, positive_th):
    bins = [-100, negative_th, positive_th, 100]
    labels = [-1, 0, 1]
    score_class = pd.cut(score_col, bins=bins, labels=labels).astype(int)
    return score_class

In [153]:
df.sentiment_score_hat.describe()

count    9489.000000
mean        0.284540
std         2.853243
min       -13.000000
25%        -2.000000
50%         0.000000
75%         2.000000
max        16.000000
Name: sentiment_score_hat, dtype: float64

In [154]:
df["sentiment_class_hat"] = classify_sentiment(df.sentiment_score_hat, -1, 1)

In [155]:
df.groupby('sentiment_class_hat', as_index = False).tweet_id.count()

Unnamed: 0,sentiment_class_hat,tweet_id
0,-1,4332
1,0,1607
2,1,3550


Create a correct_classification bool column defining if our prediction is correct

In [156]:
df["correct_classification"] = df.sentiment_class_hat == df.sentiment_class_true

In [157]:
df['correct_classification'].mean()

0.6059648013489304

## Evaluate classification performance per class

Evaluate accuracy by class

In [158]:
df.groupby('airline_sentiment', as_index = False).correct_classification.mean()

Unnamed: 0,airline_sentiment,correct_classification
0,negative,0.609733
1,neutral,0.261321
2,positive,0.848869


# Select only negative and positive texts

Create df_np dataframe as subset of df excluding neutral (0) sentiment class

In [161]:
df_np = df.loc[df.airline_sentiment != 'neutral']

In [162]:
df_np.head()

Unnamed: 0,tweet_id,airline_sentiment,text,airline,sentiment_class_true,sentiment_score_hat,sentiment_class_hat,correct_classification
0,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...,Virgin America,-1,-5,-1,True
1,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...,Virgin America,-1,-2,-1,True
2,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...,Virgin America,-1,-4,-1,True
4,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ...",Virgin America,1,3,1,True
6,570289724453216256,positive,@VirginAmerica I &lt;3 pretty graphics. so muc...,Virgin America,1,3,1,True


## Calculate Precision and Recall for classifying negative review

Precision = TP / TP + FP

Recall = TP / TP + FN

Create negative_review and negative_review_hat bool column for negative review detection classifier and calculate Precision and Recall

In [166]:
df_np["negative_review"] = np.where(df_np.sentiment_class_true==-1,1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_np["negative_review"] = np.where(df_np.sentiment_class_true==-1,1,0)


In [167]:
df_np["negative_review_hat"] =  np.where(df_np.sentiment_class_hat==-1,1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_np["negative_review_hat"] =  np.where(df_np.sentiment_class_hat==-1,1,0)


In [168]:
precision_score(df_np["negative_review"] , df_np["negative_review_hat"] )

0.9708176100628931

In [169]:
recall_score(df_np["negative_review"] , df_np["negative_review_hat"] )

0.6097329751935535