In [None]:
import pandas as  pd

# Location of the JSON dataset, relative to the notebook's working directory.
DATASET_PATH = 'data/dataset_ready_to_train.json'
# Load the whole file into a DataFrame; later cells read its `drivers`, `text`
# and `sentiment` columns.
df = pd.read_json(DATASET_PATH)
# Last expression of the cell: shows the first rows as the cell output.
df.head()

In [None]:
# Keep only the rows whose `drivers` value has a non-zero length.
has_drivers = df['drivers'].apply(len) > 0
drivers_ = df[has_drivers]

# Total number of occurrences counted across the distinct non-empty values.
print(drivers_['drivers'].value_counts().sum())

# Frequency table over the full (unfiltered) column, shown as the cell output.
df['drivers'].value_counts()


In [None]:
from afinn import Afinn

# Build the AFINN scorer with no arguments, so the library's defaults apply.
afinn = Afinn()

def map_score(score):
    """Translate a numeric polarity score into a sentiment label.

    Args:
        score: A number; only its sign is inspected.

    Returns:
        'positive' when the score is above zero, 'negative' when it is below
        zero, and 'neutral' in every other case.
    """
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'

# Score every text with AFINN, then reduce each numeric score to a label.
# Note that the `afinn_score` column holds the label, not the raw number.
df['afinn_score'] = df['text'].map(afinn.score).map(map_score)

# Distribution of the predicted labels.
print(df['afinn_score'].value_counts())

# Show the first 15 rows, including the new column, as the cell output.
df.head(15)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Evaluate the AFINN labels against the gold `sentiment` column.
y_true = df['sentiment']
y_pred = df['afinn_score']
print(accuracy_score(y_true, y_pred))

# Use ONE explicit label order for both the matrix and the tick labels.
# confusion_matrix orders classes sorted when no `labels` are given, while
# Series.unique() returns them in order of first appearance, so taking the
# ticks from unique() could put the wrong class name on a row/column. The
# union also covers classes that occur only in the predictions, which would
# otherwise make the tick-label list shorter than the matrix.
labels = sorted(set(y_true) | set(y_pred))
conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
print(conf_matrix)

# Rows of the matrix are true labels, columns are predicted labels.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
# Save before show(): the figure may be cleared once it has been displayed.
plt.savefig("afinn_confusion_matrix.png")
plt.show()

In [None]:
import nltk
# Fetch the 'vader_lexicon' resource through NLTK's downloader so that the
# VADER analyzer imported from nltk.sentiment.vader can be used afterwards.
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Build the VADER analyzer with no arguments, so the library's defaults apply.
vader = SentimentIntensityAnalyzer()

def map_score_vader(score, threshold=0.05):
    """Translate a VADER polarity result into a sentiment label.

    Args:
        score: Mapping that contains a numeric 'compound' entry; only that
            entry is read.
        threshold: Non-negative cutoff applied symmetrically around zero.
            Defaults to 0.05, the value that was previously hard-coded, so
            existing single-argument calls behave exactly as before.

    Returns:
        'positive' when compound >= threshold, 'negative' when
        compound <= -threshold, and 'neutral' otherwise.

    Raises:
        KeyError: If `score` has no 'compound' entry.
    """
    compound = score['compound']
    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'
    
# Run VADER on every text, then reduce each polarity result to a label.
# Note that the `vader_score` column holds the label, not the raw scores.
df['vader_score'] = df['text'].map(vader.polarity_scores).map(map_score_vader)

# Distribution of the predicted labels, followed by accuracy against the
# gold `sentiment` column.
print(df['vader_score'].value_counts())
print(accuracy_score(df['sentiment'], df['vader_score']))

In [None]:
# Compare the VADER labels against the gold `sentiment` column.
y_true = df['sentiment']
y_pred = df['vader_score']

# Use ONE explicit label order for both the matrix and the tick labels.
# confusion_matrix orders classes sorted when no `labels` are given, while
# Series.unique() returns them in order of first appearance, so taking the
# ticks from unique() could put the wrong class name on a row/column. The
# union also covers classes that occur only in the predictions, which would
# otherwise make the tick-label list shorter than the matrix.
labels = sorted(set(y_true) | set(y_pred))
conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
print(conf_matrix)

# Rows of the matrix are true labels, columns are predicted labels.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
# Save before show(): the figure may be cleared once it has been displayed.
plt.savefig("vader_confusion_matrix.png")
plt.show()

In [None]:
# Rows where the two lexicons disagree with each other ...
lexicons_disagree = df['afinn_score'] != df['vader_score']
# ... and where VADER's label also differs from the gold label.
vader_is_wrong = df['sentiment'] != df['vader_score']

df_diff = df[lexicons_disagree & vader_is_wrong]

# Persist the selected rows as a JSON array of records for later inspection.
df_diff.to_json('data/affin_vader_diff.json', orient='records')
df_diff.head()