In [228]:
import pandas as pd
import numpy as np
import json

from sklearn.metrics import precision_score, recall_score

In [236]:
cd ALK-NLP-course

C:\Users\Jan Majewski\Documents\Conferences&Courses\Kozminski\ALK-NLP-course


# Read data and scores dict

In [237]:
# Load the warm-up dataset; the path is relative to the current working directory.
df = pd.read_csv("data/warm_up_data.csv")

In [238]:
# Load the AFINN-111 scores file into a dict.
# JSON is UTF-8 by specification; pass the encoding explicitly instead of relying on
# the platform default (e.g. cp1252 on Windows).
with open('data/AFINN-111-scores.json', 'r', encoding='utf-8') as fp:
    scores_dict = json.load(fp)

In [239]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,text,airline
0,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...,Virgin America
1,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...,Virgin America
2,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...,Virgin America
3,570300248553349120,neutral,@VirginAmerica Really missed a prime opportuni...,Virgin America
4,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ...",Virgin America


# Data preparation

Convert airline sentiment column to numeric class

In [240]:
# Numeric encoding of the airline_sentiment labels.
setiment_class_dict = {
    "negative": -1,
    "neutral": 0,
    "positive": 1,
}

In [241]:
# Encode the text labels as -1 / 0 / 1 via a direct dict lookup.
# Labels missing from the mapping become NaN.
df["sentiment_class_true"] = df["airline_sentiment"].map(setiment_class_dict)

# Assign text sentiment score based on scores dict

In [242]:
def assign_score(text, scores=None):
    """Sum the lexicon scores of the whitespace-separated tokens in ``text``.

    Each token is lower-cased before lookup; tokens that are not in the
    lexicon contribute 0. Tokens are not stripped of punctuation, so
    ``"great!"`` only matches if the lexicon contains that exact key.

    Parameters
    ----------
    text : str
        Text to score.
    scores : mapping, optional
        Word -> score lookup. When omitted, the notebook-level
        ``scores_dict`` is used.

    Returns
    -------
    The summed score; 0 when ``text`` is empty or no token matches.
    """
    if scores is None:
        # The lexicon is loaded into `scores_dict`; the previous body read an
        # undefined name `scores`, which raised NameError on a fresh kernel.
        scores = scores_dict
    return sum(scores.get(word.lower(), 0) for word in text.split())
        

In [243]:
# Score every tweet text with the lexicon-based scorer.
df["sentiment_score_hat"] = df["text"].apply(assign_score)

## Split sentiment scores into classes

In [244]:
def classify_sentiment(score_col, negative_th, positive_th):
    """Bucket numeric sentiment scores into the classes -1, 0 and 1.

    Bins are right-closed (the ``pd.cut`` default), so:

    * ``score <= negative_th``               -> -1
    * ``negative_th < score <= positive_th`` ->  0
    * ``score > positive_th``                ->  1

    Parameters
    ----------
    score_col : array-like of numbers
        Scores to classify.
    negative_th, positive_th : number
        Upper (inclusive) edges of the negative and neutral bins;
        ``negative_th`` must be smaller than ``positive_th``.

    Returns
    -------
    The binned classes cast to ``int``.

    Notes
    -----
    NaN scores stay unbinned, so the integer cast fails for them.
    """
    # Open-ended outer edges: the previous fixed -100 / 100 limits left any
    # score <= -100 or > 100 unbinned (NaN), which then broke the int cast.
    bins = [-np.inf, negative_th, positive_th, np.inf]
    labels = [-1, 0, 1]

    return pd.cut(score_col, bins=bins, labels=labels).astype(int)

In [245]:
# Summary statistics of the predicted scores, to help pick class thresholds.
df["sentiment_score_hat"].describe()

count    9489.000000
mean        0.284540
std         2.853243
min       -13.000000
25%        -2.000000
50%         0.000000
75%         2.000000
max        16.000000
Name: sentiment_score_hat, dtype: float64

In [246]:
# Scores <= -1 become negative (-1), scores > 1 become positive (1), the rest neutral (0).
# NOTE(review): a score of -1 is negative while +1 is neutral - confirm this asymmetry is intended.
df["sentiment_class_hat"] = classify_sentiment(
    df["sentiment_score_hat"],
    negative_th=-1,
    positive_th=1,
)

In [247]:
# Flag the rows where the predicted class equals the labelled class.
df["correct_classification"] = df["sentiment_class_hat"].eq(df["sentiment_class_true"])

## Evaluate classification performance per class

In [248]:
# The mean of a boolean column is the share of True values, i.e. the fraction
# of correctly classified rows within each labelled sentiment.
(
    df
    .groupby("airline_sentiment")["correct_classification"]
    .mean()
)

airline_sentiment
negative    0.609733
neutral     0.261321
positive    0.848869
Name: correct_classification, dtype: float64

# Select only negative and positive texts

In [249]:
# Keep only non-neutral rows; reset_index() moves the old row labels into an "index" column.
df_np = df[df["sentiment_class_true"].ne(0)].reset_index()

In [250]:
# Display the filtered (negative / positive only) frame.
df_np

Unnamed: 0,index,tweet_id,airline_sentiment,text,airline,sentiment_class_true,sentiment_score_hat,sentiment_class_hat,correct_classification
0,0,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...,Virgin America,-1,-5,-1,True
1,1,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...,Virgin America,-1,-2,-1,True
2,2,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...,Virgin America,-1,-4,-1,True
3,4,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ...",Virgin America,1,3,1,True
4,6,570289724453216256,positive,@VirginAmerica I &lt;3 pretty graphics. so muc...,Virgin America,1,3,1,True
...,...,...,...,...,...,...,...,...,...
8137,9482,569588651925098496,positive,Thank you. “@AmericanAir: @jlhalldc Customer R...,American,1,2,1,True
8138,9483,569588464896876545,negative,@AmericanAir thx for nothing on getting us out...,American,-1,-1,-1,True
8139,9485,569587705937600512,negative,@AmericanAir my flight was Cancelled Flightled...,American,-1,-1,-1,True
8140,9486,569587686496825344,positive,@AmericanAir thank you we got on a different f...,American,1,2,1,True


## Calculate Precision and Recall for classifying negative reviews

Precision = TP / (TP + FP)

Recall = TP / (TP + FN)

In [251]:
# Binary ground truth: 1 when the labelled class is negative, else 0.
df_np["negative_review"] = np.where(df_np["sentiment_class_true"].eq(-1), 1, 0)

In [252]:
# Binary prediction: 1 when the predicted class is negative, else 0.
df_np["negative_review_hat"] = np.where(df_np["sentiment_class_hat"].eq(-1), 1, 0)

In [253]:
# Precision of the "negative" prediction: of the rows predicted negative, the share labelled negative.
precision_score(
    y_true=df_np["negative_review"],
    y_pred=df_np["negative_review_hat"],
)

0.9708176100628931

In [254]:
# Recall of the "negative" prediction: of the rows labelled negative, the share predicted negative.
recall_score(
    y_true=df_np["negative_review"],
    y_pred=df_np["negative_review_hat"],
)

0.6097329751935535