# EDA Understand Negative Sentiment Per Entity 

In [133]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import pearsonr

# Load Previously Processed Data 

In [134]:
processed_df = pd.read_csv('~/sentiment_analysis/archive/twitter_training_preprocessed.csv') 

In [135]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70958 entries, 0 to 70957
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              70958 non-null  int64 
 1   entity          70958 non-null  object
 2   label           70958 non-null  object
 3   text            70958 non-null  object
 4   label_num       70958 non-null  int64 
 5   text_processed  69892 non-null  object
 6   combined        70958 non-null  object
dtypes: int64(2), object(5)
memory usage: 3.8+ MB


In [136]:
processed_df['entity'].value_counts()

entity
CallOfDuty                           2304
Verizon                              2301
MaddenNFL                            2294
NBA2K                                2290
TomClancysRainbowSix                 2287
Facebook                             2280
Microsoft                            2278
TomClancysGhostRecon                 2263
WorldOfCraft                         2248
johnson&johnson                      2247
ApexLegends                          2246
Battlefield                          2236
CallOfDutyBlackopsColdWar            2233
LeagueOfLegends                      2228
FIFA                                 2224
Dota2                                2218
Amazon                               2213
Overwatch                            2208
Hearthstone                          2201
GrandTheftAuto(GTA)                  2201
HomeDepot                            2199
Borderlands                          2192
Google                               2188
PlayStation5(PS5)          

# Filter Out Irrelevant Data 

In [137]:
processed_df = processed_df[processed_df['label'] != 'Irrelevant']

# Apply Ordinal Encoding To Label 

In [138]:
encoder = OrdinalEncoder(categories=[['Negative', 'Neutral', 'Positive']])
processed_df['encoded_label'] = encoder.fit_transform(processed_df[['label']]) #?

# Group by entity 

In [139]:
processed_grouped_df = processed_df.groupby('entity')

In [140]:
for key, group in processed_grouped_df:
    print("entity: ", key)
    print(group['label'].value_counts())


entity:  Amazon
label
Neutral     1165
Negative     564
Positive     302
Name: count, dtype: int64
entity:  ApexLegends
label
Neutral     897
Positive    598
Negative    568
Name: count, dtype: int64
entity:  AssassinsCreed
label
Positive    1372
Negative     364
Neutral      153
Name: count, dtype: int64
entity:  Battlefield
label
Positive    556
Negative    439
Neutral     335
Name: count, dtype: int64
entity:  Borderlands
label
Positive    961
Neutral     578
Negative    415
Name: count, dtype: int64
entity:  CS-GO
label
Positive    705
Neutral     513
Negative    334
Name: count, dtype: int64
entity:  CallOfDuty
label
Negative    855
Positive    425
Neutral     365
Name: count, dtype: int64
entity:  CallOfDutyBlackopsColdWar
label
Positive    816
Negative    537
Neutral     337
Name: count, dtype: int64
entity:  Cyberpunk2077
label
Positive    889
Neutral     453
Negative    360
Name: count, dtype: int64
entity:  Dota2
label
Negative    700
Neutral     579
Positive    538
Name: cou

# Covariance Analysis Between Word and Label

In [141]:
group = processed_grouped_df.get_group('TomClancysRainbowSix')

## Tokenization With Binary Presence

In [144]:
vectorizer = CountVectorizer(binary=True)
word_matrix = vectorizer.fit_transform(group['text']).toarray()
words_df = pd.DataFrame(word_matrix, columns=vectorizer.get_feature_names_out())

In [145]:
words_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2195 entries, 0 to 2194
Columns: 2885 entries, 00 to zwykv8qghp
dtypes: int64(2885)
memory usage: 48.3 MB


In [146]:
words_df['encoded_label'] = group['encoded_label'].values

In [147]:
words_df['encoded_label'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2195 entries, 0 to 2194
Series name: encoded_label
Non-Null Count  Dtype  
--------------  -----  
2195 non-null   float64
dtypes: float64(1)
memory usage: 17.3 KB


In [148]:
words_df['encoded_label'].value_counts()

encoded_label
0.0    1101
1.0     607
2.0     487
Name: count, dtype: int64

## Calculate Pearson Correlation Coefficient Between Each Word and Label

In [149]:
correlations = {}
for word in vectorizer.get_feature_names_out():
    corr, _ = pearsonr(words_df[word], words_df['encoded_label'])
    correlations[word] = corr

In [150]:
sorted_correlations_negative = sorted(correlations.items(), key=lambda x: abs(x[1]), reverse=True)

In [151]:
print("top ten words associated with negative sentiment")
for word, corr in sorted_correlations_negative[:10]:
    print(f"Word: {word}, Correlation: {corr:.2f}")

top ten words associated with negative sentiment
Word: rainbow6game, Correlation: -0.46
Word: fix, Correlation: -0.28
Word: love, Correlation: 0.26
Word: game, Correlation: -0.25
Word: the, Correlation: -0.22
Word: servers, Correlation: -0.21
Word: can, Correlation: -0.20
Word: ubisoftsupport, Correlation: -0.19
Word: why, Correlation: -0.19
Word: ubisoft, Correlation: -0.19


In [131]:
print("top ten words associated with positive sentiment")
for word, corr in sorted_correlations_negative[-10:]:
    print(f"Word: {word}, Correlation: {corr:.2f}")

top ten words associated with positive sentiment
Word: talk, Correlation: -0.00
Word: possible, Correlation: -0.00
Word: charged, Correlation: -0.00
Word: goes, Correlation: -0.00
Word: poppy, Correlation: 0.00
Word: and, Correlation: 0.00
Word: decision, Correlation: 0.00
Word: very, Correlation: -0.00
Word: through, Correlation: -0.00
Word: any, Correlation: 0.00
