In [1]:
import pandas as pd

# Read the five article tables in order, then the table that is later merged
# with the sentiment averages.
list1, list2, list3, list4, list5 = (
    pd.read_csv(csv_name)
    for csv_name in ('list1.csv', 'list2.csv', 'list3.csv', 'list4.csv', 'list5.csv')
)
final = pd.read_csv('merged_df.csv')

In [2]:
# Preview the first rows of list1 (columns: State, Year, Quarter, Text)
list1.head()

Unnamed: 0,State,Year,Quarter,Text
0,Alabama,2020,Q1,The move empowers the government to lock down ...
1,Alabama,2020,Q1,"Across the country, it’s mostly business as us..."
2,Alabama,2020,Q1,The spending package dwarfs what the Trump adm...
3,Alabama,2020,Q1,Should you wear a mask? Does airplane travel p...
4,Alabama,2020,Q1,Some tests distributed by the agency deliver “...


In [3]:
# Preview the first rows of list2; note that Year is displayed as a float here
# (2020.0), unlike list1
list2.head()

Unnamed: 0,State,Year,Quarter,Text
0,Georgia,2020.0,Q1,A fourth-generation owner of one of the bigges...
1,Georgia,2020.0,Q1,With Louisiana and Georgia delaying their prim...
2,Georgia,2020.0,Q1,Compare the number of deaths and the rate of i...
3,Georgia,2020.0,Q1,They are among the senators facing criticism f...
4,Georgia,2020.0,Q1,The warning was another indication of the Trum...


In [4]:
# Collect the five per-file frames in load order
dataframes = [list1, list2, list3, list4, list5]

# Stack them vertically and renumber the rows 0..n-1
final_list = pd.concat(objs=dataframes, axis=0, ignore_index=True)

In [5]:
# Cast Year to an integer dtype (list2 displays Year as a float such as 2020.0)
final_list = final_list.assign(Year=final_list['Year'].astype(int))

In [6]:
# Inspect the concatenated frame after the Year cast
final_list

Unnamed: 0,State,Year,Quarter,Text
0,Alabama,2020,Q1,The move empowers the government to lock down ...
1,Alabama,2020,Q1,"Across the country, it’s mostly business as us..."
2,Alabama,2020,Q1,The spending package dwarfs what the Trump adm...
3,Alabama,2020,Q1,Should you wear a mask? Does airplane travel p...
4,Alabama,2020,Q1,Some tests distributed by the agency deliver “...
...,...,...,...,...
8106,Wyoming,2023,Q4,"Jeanine Tesori and George Brant’s “Grounded,” ..."
8107,Wyoming,2023,Q4,"When not driving around, electric buses and ot..."
8108,Wyoming,2023,Q4,How Adidas looked past misconduct in its partn...
8109,Wyoming,2023,Q4,The Natural Resources Defense Council is elimi...


In [19]:
# NOTE(review): this cell's recorded execution count (19) is out of order, and
# its saved output already lists pos/neg/neu/compound, which are only added by
# the later final_list.assign(...) cell. On a fresh top-to-bottom run this call
# would not show those four columns yet.
final_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8111 entries, 0 to 8110
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   State     8111 non-null   object 
 1   Year      8111 non-null   int64  
 2   Quarter   8111 non-null   object 
 3   Text      8111 non-null   object 
 4   pos       8111 non-null   float64
 5   neg       8111 non-null   float64
 6   neu       8111 non-null   float64
 7   compound  8111 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 507.1+ KB


In [7]:
# The Text column is the corpus that gets sentiment-scored below
corpus = final_list.loc[:, 'Text']

In [8]:
# Coerce every entry to str before the texts are handed to the analyzer
corpus = corpus.astype(dtype=str)

In [9]:
# Score every text in `corpus` with NLTK's VADER sentiment analyzer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initiate the analyzer. The constructor loads the VADER lexicon from the local
# NLTK data directory and raises LookupError when that resource has not been
# downloaded, so fetch it once and retry instead of failing on a fresh machine.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

# polarity_scores returns a dictionary per text; the keys read below are
# 'pos', 'neg', 'neu' and 'compound'. Each text is scored exactly once, in
# corpus order.
polarity = [sia.polarity_scores(sentence) for sentence in corpus]

# One list per score, aligned element-for-element with corpus
senti_pos = [ss['pos'] for ss in polarity]
senti_neg = [ss['neg'] for ss in polarity]
senti_neu = [ss['neu'] for ss in polarity]
senti_comp = [ss['compound'] for ss in polarity]

In [10]:
# Attach the four per-text score lists as new columns, in this order
sentiment_columns = {
    'pos': senti_pos,
    'neg': senti_neg,
    'neu': senti_neu,
    'compound': senti_comp,
}
final_list = final_list.assign(**sentiment_columns)

In [11]:
# Inspect final_list with the pos/neg/neu/compound columns attached
final_list

Unnamed: 0,State,Year,Quarter,Text,pos,neg,neu,compound
0,Alabama,2020,Q1,The move empowers the government to lock down ...,0.000,0.041,0.959,-0.1027
1,Alabama,2020,Q1,"Across the country, it’s mostly business as us...",0.191,0.062,0.747,0.8402
2,Alabama,2020,Q1,The spending package dwarfs what the Trump adm...,0.144,0.207,0.649,-0.5574
3,Alabama,2020,Q1,Should you wear a mask? Does airplane travel p...,0.109,0.089,0.802,0.2824
4,Alabama,2020,Q1,Some tests distributed by the agency deliver “...,0.000,0.000,1.000,0.0000
...,...,...,...,...,...,...,...,...
8106,Wyoming,2023,Q4,"Jeanine Tesori and George Brant’s “Grounded,” ...",0.123,0.115,0.762,0.1280
8107,Wyoming,2023,Q4,"When not driving around, electric buses and ot...",0.156,0.036,0.808,0.8608
8108,Wyoming,2023,Q4,How Adidas looked past misconduct in its partn...,0.015,0.000,0.985,0.0129
8109,Wyoming,2023,Q4,The Natural Resources Defense Council is elimi...,0.161,0.129,0.710,0.0516


In [12]:
# Average each sentiment score within every (Year, State, Quarter) group;
# as_index=False keeps the grouping keys as ordinary columns
group_keys = ['Year', 'State', 'Quarter']
score_cols = ['pos', 'neu', 'neg', 'compound']
state_quarter_avg_scores = final_list.groupby(group_keys, as_index=False)[score_cols].mean()

# Show the aggregated table
state_quarter_avg_scores

Unnamed: 0,Year,State,Quarter,pos,neu,neg,compound
0,2020,Alabama,Q1,0.0965,0.7927,0.1109,-0.01450
1,2020,Alabama,Q2,0.0749,0.7965,0.1287,-0.19299
2,2020,Alabama,Q3,0.1116,0.8149,0.0735,0.13934
3,2020,Alabama,Q4,0.0730,0.8705,0.0563,0.15929
4,2020,Alaska,Q1,0.0340,0.8955,0.0705,-0.18579
...,...,...,...,...,...,...,...
811,2023,Wisconsin,Q4,0.0733,0.8733,0.0533,0.19950
812,2023,Wyoming,Q1,0.0730,0.8467,0.0806,-0.08449
813,2023,Wyoming,Q2,0.0920,0.8383,0.0697,0.08459
814,2023,Wyoming,Q3,0.0522,0.8758,0.0720,-0.05592


In [13]:
# Drop the leading 'Q' so Quarter reads '1'..'4'; it is used as a merge key
# against final['Quarter'] further down
state_quarter_avg_scores = state_quarter_avg_scores.assign(
    Quarter=state_quarter_avg_scores['Quarter'].str.lstrip('Q')
)

In [14]:
# Confirm that the 'Q' prefix is gone from the Quarter column
state_quarter_avg_scores

Unnamed: 0,Year,State,Quarter,pos,neu,neg,compound
0,2020,Alabama,1,0.0965,0.7927,0.1109,-0.01450
1,2020,Alabama,2,0.0749,0.7965,0.1287,-0.19299
2,2020,Alabama,3,0.1116,0.8149,0.0735,0.13934
3,2020,Alabama,4,0.0730,0.8705,0.0563,0.15929
4,2020,Alaska,1,0.0340,0.8955,0.0705,-0.18579
...,...,...,...,...,...,...,...
811,2023,Wisconsin,4,0.0733,0.8733,0.0533,0.19950
812,2023,Wyoming,1,0.0730,0.8467,0.0806,-0.08449
813,2023,Wyoming,2,0.0920,0.8383,0.0697,0.08459
814,2023,Wyoming,3,0.0522,0.8758,0.0720,-0.05592


In [15]:
# Rename Geography to State so the column name matches the sentiment table
final = final.rename({'Geography': 'State'}, axis='columns')

In [20]:
# Quarter in state_quarter_avg_scores is a string, so cast this side to str
# as well before merging on it
final = final.assign(Quarter=final['Quarter'].astype(str))

In [26]:
# Left-join the averaged sentiment scores onto `final`, keeping every row of
# `final` and matching on Year, State and Quarter
merge_keys = ['Year', 'State', 'Quarter']
sentiment_cols = ['pos', 'neu', 'neg', 'compound']
final_df = final.merge(
    state_quarter_avg_scores[merge_keys + sentiment_cols],
    how='left',
    on=merge_keys,
)

In [28]:
# Inspect the merged frame; the four sentiment columns are appended on the right
final_df

Unnamed: 0,Year,Quarter,State,Total Telehealth Eligible Users,Total Medicare Part B Enrollment,Total Telehealth Users,Pct_Telehealth,Real_GDP,Total_Population,Total_Male_Population%,...,White,Black,Hispanic,Asian,American Indian or Alaska Native,Multiple Races,pos,neu,neg,compound
0,2020,1,Alabama,393549.0,5.082080e+05,21796.0,0.0554,222288.8,4903185,48.3,...,0.654,0.265,0.044,0.014,0.004,0.019,0.0965,0.7927,0.1109,-0.01450
1,2020,1,Alaska,54569.0,8.833933e+04,4139.0,0.0758,50332.8,731545,52.0,...,0.600,0.022,0.070,0.060,0.151,0.083,0.0340,0.8955,0.0705,-0.18579
2,2020,1,Arizona,508632.0,6.788920e+05,31362.0,0.0617,365027.7,7278717,49.7,...,0.542,0.043,0.318,0.033,0.039,0.024,0.0764,0.8259,0.0977,-0.16932
3,2020,1,Arkansas,308190.0,4.092010e+05,14549.0,0.0472,128340.9,3017804,48.9,...,0.721,0.152,0.078,0.016,0.006,0.024,0.0742,0.8370,0.0889,0.02960
4,2020,1,California,2119057.0,2.918656e+06,193877.0,0.0915,2933320.2,39512223,49.7,...,0.364,0.053,0.395,0.147,0.004,0.033,0.1210,0.8126,0.0665,0.07139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,2023,4,Virginia,709544.0,8.941027e+05,73929.0,0.1042,597597.1,8715698,49.4,...,0.583,0.179,0.111,0.070,0.001,0.056,0.0307,0.9059,0.0634,-0.12002
812,2023,4,Washington,483457.0,6.821407e+05,63062.0,0.1304,677238.0,7812880,50.4,...,0.625,0.038,0.145,0.099,0.008,0.078,0.0685,0.8366,0.0949,-0.13560
813,2023,4,West Virginia,145095.0,1.894707e+05,13676.0,0.0943,80798.2,1770071,49.9,...,0.902,0.026,0.020,0.007,0.004,0.044,0.0399,0.8414,0.1186,-0.43365
814,2023,4,Wisconsin,365045.0,5.011933e+05,34198.0,0.0937,344570.8,5910955,50.1,...,0.787,0.057,0.081,0.029,0.005,0.040,0.0733,0.8733,0.0533,0.19950


In [30]:
# Write the merged table to disk without the row index
final_df.to_csv(path_or_buf='final_df.csv', index=False)