In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn

In [2]:
cleaned_df = pd.read_csv('satisfaction-data-clean-updated.csv')

In [3]:
cleaned_df.loc[cleaned_df["Indicators"] == "Life satisfaction rating between 0 and 5", 'Rating'] = 2.5
cleaned_df.loc[cleaned_df["Indicators"] == "Life satisfaction rating of 6 or 7", 'Rating'] = 6.5
cleaned_df.loc[cleaned_df["Indicators"] == "Life satisfaction rating of 8, 9 or 10", 'Rating'] = 9

In [4]:
total_df = cleaned_df[(cleaned_df['Gender'] == 'Total, all persons') & (cleaned_df['GEO'] != 'Canada (excluding territories)')]

In [5]:
total_df['score'] = total_df['VALUE'] * total_df['Rating']/100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_df['score'] = total_df['VALUE'] * total_df['Rating']/100


In [6]:
total_df = total_df.groupby(['GEO', 'REF_DATE'])['score'].sum().reset_index(name ='Satisfaction Score')

In [7]:
satisfaction_score = total_df.groupby('GEO')['Satisfaction Score'].mean().reset_index(name ='Satisfaction Score')

In [8]:
satisfaction_score = satisfaction_score.rename(columns={'GEO':'Province'})

In [9]:
sentiment_score_annual = pd.read_csv('sentiment_score_annual.csv')

In [10]:
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'Manitoba', 'Province'] = 'Manitoba'
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'NovaScotia', 'Province'] = 'Nova Scotia'
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'PEI', 'Province'] = 'Prince Edward Island'
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'alberta', 'Province'] = 'Alberta'
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'britishcolumbia', 'Province'] = 'British Columbia'
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'newbrunswickcanada', 'Province'] = 'New Brunswick'
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'newfoundland', 'Province'] = 'Newfoundland and Labrabdor'
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'ontario', 'Province'] = 'Ontario'
sentiment_score_annual.loc[sentiment_score_annual['subreddit'] == 'saskatchewan', 'Province'] = 'Saskatchewan'

In [11]:
sentiment_score_annual = sentiment_score_annual[sentiment_score_annual['year'] == 2022]

In [12]:
sentiment_score_annual = sentiment_score_annual[sentiment_score_annual['Province'] != 'nan']

In [13]:
sentiment_score_annual = sentiment_score_annual.drop(columns=['year'])

In [14]:
sentiment_scores = sentiment_score_annual.rename(columns={'score':'Sentiment Score'})

In [15]:
merged = sentiment_scores.merge(satisfaction_score, on='Province')

In [16]:
line = stats.linregress(merged['Sentiment Score'], merged['Satisfaction Score'])

In [17]:
merged['prediction'] = line.intercept + line.slope*merged['Sentiment Score']

In [18]:
merged

Unnamed: 0,subreddit,Sentiment Score,Province,Satisfaction Score,prediction
0,Manitoba,0.342205,Manitoba,7.059,7.024552
1,NovaScotia,0.387696,Nova Scotia,6.9315,7.115489
2,PEI,0.365866,Prince Edward Island,7.322875,7.07185
3,alberta,0.306322,Alberta,6.9125,6.952819
4,britishcolumbia,0.323016,British Columbia,6.777,6.986192
5,newbrunswickcanada,0.331267,New Brunswick,7.048,7.002684
6,ontario,0.246919,Ontario,6.79725,6.83407
7,saskatchewan,0.298894,Saskatchewan,7.0775,6.937969


In [40]:
line

LinregressResult(slope=1.9990536812600275, intercept=6.340464849925753, rvalue=0.48881449621846695, pvalue=0.21900395273690554, stderr=1.4565128271532044, intercept_stderr=0.47740716846914844)

In [39]:
seaborn.set()
plt.figure()
fig, ax = plt.subplots()
ax.scatter(merged['Sentiment Score'], merged['Satisfaction Score'])
fig.set_figheight(7)
fig.set_figwidth(10)
ax.set_title('Average Sentiment Score vs Average Satisfaction Score per Province, 2022')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Satisfaction Score')
ax.set_ylim(6.7, 7.4)
ax.set_xlim(0.225,0.4)

for i, txt in enumerate(merged['Province']):
    ax.annotate(txt, (merged['Sentiment Score'][i], merged['Satisfaction Score'][i]), ha='center', xytext=((merged['Sentiment Score'][i], merged['Satisfaction Score'][i]+0.01)))

ax.plot(merged['Sentiment Score'], merged['prediction'], 'r-', linewidth=2)
ax.legend(['Provincial Scores', 'Linear regression line'],loc='lower right')
plt.savefig('average')

In [22]:
df = pd.read_csv('sentiment_score.csv')

In [23]:
df

Unnamed: 0.1,Unnamed: 0,id,year,month,subreddit,selftext,ups,downs,num_comments,score,date
0,0,4rebbl,2016,7,alberta,Hello! I'm new to Alberta and have a dog exper...,1.0,0.0,1,0.7934,2016-07-01
1,1,4revo5,2016,7,saskatchewan,My car got it's ass handed to it by a hail sto...,5.0,0.0,21,0.5940,2016-07-01
2,2,4rf1xq,2016,7,NovaScotia,29m 26f looking for a lovely lady to join us. ...,0.0,0.0,4,0.7184,2016-07-01
3,3,4rgr7l,2016,7,alberta,"Hello all,\n\nI'm interested in becoming a REP...",3.0,0.0,2,0.9732,2016-07-01
4,4,4rht02,2016,7,alberta,I can't believe I supported the ndp so deeply....,0.0,0.0,9,0.7550,2016-07-01
...,...,...,...,...,...,...,...,...,...,...,...
97297,97756,fbl0rb,2020,2,ontario,Just wondering if I don’t get a T4 from electi...,,,11,-0.4588,2020-02-01
97298,97758,8nnb0u,2018,5,ontario,His quote for his political campaign is “For t...,0.0,0.0,9,0.0000,2018-05-01
97299,97759,e45toy,2019,11,newfoundland,Just curious because I've heard literally noth...,,,11,-0.9491,2019-11-01
97300,97760,qk1t11,2021,10,newfoundland,Anyone have their kid(s) in CBR Minor Hockey? ...,,,11,-0.7470,2021-10-01


In [24]:
df['selftext'] = df['selftext'].str.lower()

In [25]:
x = df.iloc[3]['selftext']

In [26]:
x.split()

['hello',
 'all,',
 "i'm",
 'interested',
 'in',
 'becoming',
 'a',
 'repo',
 '/',
 'ttd',
 'here',
 'in',
 'alberta.',
 'maybe',
 "i'm",
 'not',
 'using',
 'the',
 'appropriate',
 'terms',
 'when',
 'searching',
 'google...',
 'so',
 'the',
 'results',
 'are',
 'kind',
 'of',
 'scattered.',
 'job',
 'posts,',
 'info',
 'not',
 'applicable',
 'or',
 'alberta',
 'specific,',
 'etc.',
 'so...',
 'as',
 'a',
 'last',
 'hope...',
 'i',
 'was',
 'wondering',
 'if',
 'maybe',
 'someone',
 'here',
 'on',
 'reddit',
 'can',
 'point',
 'me',
 'in',
 'the',
 'right',
 'direction?',
 "it's",
 'worth',
 'mentioning',
 'as',
 'well',
 'i',
 'am',
 'interested',
 'in',
 'working',
 'for',
 'a',
 'company',
 '(for',
 'now)',
 'so',
 'anything',
 'related',
 'to',
 'being',
 'an',
 'independent',
 'contractor',
 'is',
 'irrelevant',
 'at',
 'the',
 'moment',
 'in',
 'regards',
 'to',
 'my',
 'queries.',
 'what',
 'sort',
 'of',
 'licences',
 'are',
 'needed?',
 'are',
 'there',
 'certain',
 'governmen

In [27]:
df[df['selftext'].str.contains('https')]

Unnamed: 0.1,Unnamed: 0,id,year,month,subreddit,selftext,ups,downs,num_comments,score,date
10,10,4rkiov,2016,7,britishcolumbia,###**pemberton music festival and clean vibes ...,4.0,0.0,0,0.9953,2016-07-01
76,76,53j9hx,2016,9,alberta,https://soundcloud.com/user-61637261/jessica-l...,0.0,0.0,1,0.3724,2016-09-01
82,82,53l1ey,2016,9,alberta,the alberta rcmp tweeted that two people from ...,39.0,0.0,2,-0.0516,2016-09-01
99,99,4suxye,2016,7,ontario,i’m a graduate student at university of toront...,0.0,0.0,2,0.9782,2016-07-01
113,113,4vt4bc,2016,8,newfoundland,collected some data for corner brook's pokemon...,7.0,0.0,4,0.8162,2016-08-01
...,...,...,...,...,...,...,...,...,...,...,...
97272,97730,npgp0l,2021,5,NovaScotia,can anyone explain **why we pay 45% more** for...,,,28,0.9345,2021-05-01
97279,97738,mhiaxw,2021,3,alberta,"i’m just curious, i feel like we need a lockdo...",,,53,0.6542,2021-03-01
97281,97740,mhiw7a,2021,3,Manitoba,"hello, \n\nhealth canada is conducting a surve...",,,2,0.9224,2021-03-01
97283,97742,ovh9o4,2021,7,ontario,"hey folks, a few years back i did the circle t...",,,2,0.7847,2021-07-01


In [28]:
df[df['score'] == 0]

Unnamed: 0.1,Unnamed: 0,id,year,month,subreddit,selftext,ups,downs,num_comments,score,date
7,7,4rj1p4,2016,7,newfoundland,hey! i'm trying to figure out how to get to ca...,4.0,0.0,4,0.0,2016-07-01
14,14,4rl9cz,2016,7,saskatchewan,won't be back up until 10 at the earliest.,0.0,0.0,1,0.0,2016-07-01
23,23,53th3k,2016,9,newbrunswickcanada,http://www.cbc.ca/news/canada/prince-edward-is...,11.0,0.0,2,0.0,2016-09-01
37,37,4sdqhh,2016,7,newfoundland,the website hasn't been updated in a while. do...,2.0,0.0,3,0.0,2016-07-01
40,40,4sgwyi,2016,7,NovaScotia,although they are located only mere miles acro...,2.0,0.0,13,0.0,2016-07-01
...,...,...,...,...,...,...,...,...,...,...,...
97282,97741,ovh8jy,2021,7,britishcolumbia,does anyone know where i can buy little tree f...,,,4,0.0,2021-07-01
97285,97744,ovhif1,2021,7,saskatchewan,https://www.cbc.ca/news/canada/saskatchewan/co...,,,13,0.0,2021-07-01
97288,97747,vomo1y,2022,6,ontario,i need a booster shot to get on a flight (same...,,,12,0.0,2022-06-01
97294,97753,6qshuw,2017,7,newbrunswickcanada,visiting a cottage next week and i was wonderi...,,,9,0.0,2017-07-01


In [29]:
df.iloc[97285]['selftext']

'https://www.cbc.ca/news/canada/saskatchewan/covid-19-in-sask-50000-cases-july-31-1.6125706'

In [30]:
cleaned_df = pd.read_csv('satisfaction-data-clean-updated.csv')

In [31]:
cleaned_df = cleaned_df[(cleaned_df['Gender'] == 'Total, all persons') & (cleaned_df['GEO'] != 'Canada (excluding territories)')]

In [32]:
cleaned_df = cleaned_df.groupby(['GEO', 'Indicators'])['VALUE'].mean().reset_index(name ='VALUE')

In [33]:
cleaned_df['VALUE']= cleaned_df['VALUE'].round(0).astype(int)

In [34]:
cleaned_df

Unnamed: 0,GEO,Indicators,VALUE
0,Alberta,Life satisfaction rating between 0 and 5,21
1,Alberta,Life satisfaction rating of 6 or 7,30
2,Alberta,"Life satisfaction rating of 8, 9 or 10",49
3,Atlantic Region,Life satisfaction rating between 0 and 5,19
4,Atlantic Region,Life satisfaction rating of 6 or 7,27
5,Atlantic Region,"Life satisfaction rating of 8, 9 or 10",54
6,British Columbia,Life satisfaction rating between 0 and 5,22
7,British Columbia,Life satisfaction rating of 6 or 7,30
8,British Columbia,"Life satisfaction rating of 8, 9 or 10",47
9,Manitoba,Life satisfaction rating between 0 and 5,19


In [35]:
pivot = pd.pivot_table(cleaned_df, values='VALUE', index=['GEO'], columns=['Indicators'])

In [36]:
pivot

Indicators,Life satisfaction rating between 0 and 5,Life satisfaction rating of 6 or 7,"Life satisfaction rating of 8, 9 or 10"
GEO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alberta,21.0,30.0,49.0
Atlantic Region,19.0,27.0,54.0
British Columbia,22.0,30.0,47.0
Manitoba,19.0,28.0,53.0
New Brunswick,20.0,26.0,54.0
Newfoundland and Labrador,16.0,27.0,57.0
Nova Scotia,21.0,27.0,51.0
Ontario,22.0,30.0,47.0
Prairies Region,20.0,30.0,51.0
Prince Edward Island,16.0,26.0,58.0


In [37]:
from scipy import stats
chi2 = stats.chi2_contingency(pivot)
print(chi2.pvalue)
print(chi2.expected_freq)

0.9965747693438591
[[19.03171953 28.21368948 52.75459098]
 [19.03171953 28.21368948 52.75459098]
 [18.84140234 27.93155259 52.22704508]
 [19.03171953 28.21368948 52.75459098]
 [19.03171953 28.21368948 52.75459098]
 [19.03171953 28.21368948 52.75459098]
 [18.84140234 27.93155259 52.22704508]
 [18.84140234 27.93155259 52.22704508]
 [19.22203673 28.49582638 53.28213689]
 [19.03171953 28.21368948 52.75459098]
 [19.03171953 28.21368948 52.75459098]
 [19.03171953 28.21368948 52.75459098]]
