In [2]:
# References: https://www.geeksforgeeks.org/python-pearsons-chi-square-test/
# Dependencies
import pandas as pd
from scipy.stats import chi2_contingency 

In [3]:
# Load the sentiment-labelled article dump and keep only the three columns
# this notebook works with.
DATASET_PATH = '../sentiment_classification/updated_mongo_dataset.json'
KEEP_COLUMNS = ['source', 'sentiment_category', 'url']

df = pd.read_json(DATASET_PATH)[KEEP_COLUMNS]
df.head()

Unnamed: 0,source,sentiment_category,url
0,NBC News,negative,https://www.nbcnews.com/think/opinion/biden-s-...
1,NBC News,neutral,https://www.nbcnews.com/news/latino/first-lati...
2,NBC News,neutral,https://www.nbcnews.com/news/asian-america/ton...
3,NBC News,neutral,https://www.nbcnews.com/politics/immigration/s...
4,NBC News,negative,https://www.nbcnews.com/politics/politics-news...


In [4]:
# Breakdown of articles per (source, sentiment) pair. `url` is the only
# column left after grouping, so its non-null count is the tally for each pair.
by_source_and_sentiment = df.groupby(['source', 'sentiment_category'])
by_source_and_sentiment.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,url
source,sentiment_category,Unnamed: 2_level_1
Al Jazeera English,negative,211
Al Jazeera English,neutral,119
Al Jazeera English,positive,36
Breitbart News,negative,500
Breitbart News,neutral,139
Breitbart News,positive,124
CBS News,negative,60
CBS News,neutral,63
CBS News,positive,13
CNN,negative,163


In [5]:
# Count the articles for each (sentiment, source) pair and flatten the result
# into a plain three-column frame.
# NOTE: the column is named 'sum' but holds a count of `url` values; the name
# is kept because the pivot cell further down reads it.
df_count = (
    df.groupby(['sentiment_category', 'source'])
    .count()
    .reset_index()
    .rename(columns={'url': 'sum'})
)
df_count.head()

Unnamed: 0,sentiment_category,source,sum
0,negative,Al Jazeera English,211
1,negative,Breitbart News,500
2,negative,CBS News,60
3,negative,CNN,163
4,negative,Fox News,259


In [7]:
# Build the sentiment x source contingency table of headline counts
# (rows = sentiment category, columns = source).
#
# aggfunc='sum' states the intent explicitly: pivot_table defaults to 'mean',
# which is the wrong operation for counts and only gave the right numbers
# because df_count holds exactly one row per (sentiment, source) pair.
# fill_value=0 records a source with no headlines in some category as a zero
# count instead of NaN, which is not a valid observed frequency for the
# chi-square test run on this table.
df_count_pivot = pd.pivot_table(
    df_count,
    index='sentiment_category',
    columns='source',
    values='sum',
    aggfunc='sum',
    fill_value=0,
)
df_count_pivot

source,Al Jazeera English,Breitbart News,CBS News,CNN,Fox News,NBC News,Politico,The Washington Post,USA Today
sentiment_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
negative,211,500,60,163,259,156,63,114,124
neutral,119,139,63,118,150,139,34,55,93
positive,36,124,13,71,85,79,21,23,81


In [8]:
# Drop the pandas row/column labels and keep only the raw counts as a 2-D
# array (rows = sentiment categories, columns = sources); this is the table
# handed to chi2_contingency below.
data = df_count_pivot.to_numpy()
# Bare expression so the notebook displays the array for a visual check.
data

array([[211, 500,  60, 163, 259, 156,  63, 114, 124],
       [119, 139,  63, 118, 150, 139,  34,  55,  93],
       [ 36, 124,  13,  71,  85,  79,  21,  23,  81]], dtype=int64)

In [9]:
# Pearson chi-square test of independence between news source and headline
# sentiment, run on the observed-count table `data`.
# H0: sentiment category is independent of source.
stat, p, dof, expected = chi2_contingency(data)

# Significance level the p-value is compared against.
alpha = 0.05
print("p value is " + str(p))

# p <= alpha -> significant relationship between the two variables;
# otherwise no significant relationship was detected.
# NOTE(review): a large p-value only means H0 could not be rejected, which is
# weaker than the printed "H0 holds true" wording suggests.
verdict = 'Dependent (reject H0)' if p <= alpha else 'Independent (H0 holds true)'
print(verdict)

p value is 5.789772777538245e-23
Dependent (reject H0)
