### Load essential libraries for exploratory analysis

In [None]:
%matplotlib inline
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import matplotlib.ticker as ticker
from shapely import wkt

import datetime

### Load the scores produced by the analyses listed below, together with location information and timestamps
<br>   a) LabMT Happiness Score
<br>   b) Vader Compound Score
<br>   c) CoreNLP Sentiment Score
<br>   d) HuggingFace Sentiment Score
<br>   e) Flair Sentiment Score
<br>   f) Anxiety related keyword density
<br>   g) COVID-19 related keyword density
<br> Then sanitize the data by eliminating abnormal scores

In [None]:
# Exclusive upper bound (ISO date string) applied to both the tweet and the COVID data.
end_date_str = '2022-10-01'

In [None]:
from csa.utils import CsaUtils

# Project helper object; it supplies the data locations used in this notebook.
utils = CsaUtils()

# Read the preprocessed tweets, then keep only rows dated before the cutoff.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(utils.preprocess_tweets_pkl)
df = df.loc[df['localDate'] < end_date_str]
df

In [None]:
# Keep only the tweet id, coordinates, score columns and the local timestamp.
df = df[['tid','lat','long','happinessScore','vaderScore','cnlpScore','huggingfaceScore', 'flairScore','anxietyKeywordDensity','covid19KeywordDensity','localDate']]

# Discard abnormal happiness scores (anything below 1.3). The explicit copy makes
# the filtered frame independent, so the column assignment below does not write
# to a slice of the previous frame (which triggers SettingWithCopyWarning).
df = df[df["happinessScore"] >= 1.3].copy()

# Binary flag per tweet: 1 when at least one anxiety keyword was found, else 0.
# Averaging this flag over a group yields the ratio of anxious tweets.
df["anxiousTweetRatio"] = (df["anxietyKeywordDensity"] > 0).astype(int)

# Point geometry from longitude/latitude; the CRS is declared at construction time.
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.long, df.lat), crs="EPSG:4326")

### Load both shapefiles for County map and State map of USA
<br> Apply required projection to display/plot properly

In [None]:
from csa.utils import CsaUtils

utils = CsaUtils()

# Read the county shapefile and express it in EPSG:4326 (the CRS assigned to the tweet points).
county_map = gpd.read_file(f'{utils.raw_dir}/maps/us_counties/sh.shp').to_crs("EPSG:4326")

In [None]:
# Lambert azimuthal equal-area projection string used for every map drawn below.
projection = "+proj=laea +lat_0=30 +lon_0=-95"

# Read the state shapefile, reproject it (via EPSG:4326) into the plotting
# projection and expose the state code under the column name 'FIPS'.
state_map = (
    gpd.read_file(f'{utils.raw_dir}/maps/us_states/sh.shp')
    .to_crs("EPSG:4326")
    .to_crs(projection)
    .rename(columns={'STATEFP': 'FIPS'})
)

In [None]:
# One row per state abbreviation, ordered alphabetically by abbreviation.
state_map = (
    state_map
    .drop_duplicates(subset=['STUSPS'])
    .sort_values(by=['STUSPS'], ascending=True)
)

In [None]:
# Visual sanity check of the reprojected, de-duplicated state polygons.
state_map.plot(cmap='magma', figsize=(12, 12))

In [None]:
# State abbreviations present in the state map.
STUSPS = state_map["STUSPS"].unique().tolist()

# Keep only counties whose STATE value is one of those abbreviations.
county_map = county_map.loc[county_map["STATE"].isin(STUSPS)]

In [None]:
# Visual sanity check of the county polygons that remain after filtering by state.
county_map.plot(cmap='magma', figsize=(12, 12))

### Join each score with the county map and apply a rolling average to each score (window of 30 rows)

In [None]:
# Attach the county attributes (including FIPS) to every tweet whose point lies
# within a county polygon. `predicate` is the current name of the keyword that
# used to be called `op`; `op` was deprecated in GeoPandas 0.10 and later removed.
joined_gdf = gpd.sjoin(gdf, county_map, how='inner', predicate='within')

# Per-score tables (tweet id, raw un-smoothed score, county FIPS) consumed by the
# county choropleth maps further down.
happinessScoreFIPS=joined_gdf[['tid', 'happinessScore', 'FIPS']]
vaderScoreFIPS=joined_gdf[['tid', 'vaderScore', 'FIPS']]
cnlpScoreFIPS=joined_gdf[['tid', 'cnlpScore', 'FIPS']]
huggingfaceScoreFIPS=joined_gdf[['tid', 'huggingfaceScore', 'FIPS']]
flairScoreFIPS=joined_gdf[['tid', 'flairScore', 'FIPS']]
anxiousTweetRatioFIPS=joined_gdf[['tid', 'anxiousTweetRatio', 'FIPS']]
covid19KeywordDensityFIPS=joined_gdf[['tid', 'covid19KeywordDensity', 'FIPS']]

# Explicit copy: the smoothing below overwrites columns, and doing that on a
# column slice of joined_gdf raises SettingWithCopyWarning.
allScoreFIPS1=joined_gdf[['tid','FIPS', 'happinessScore','vaderScore','cnlpScore','huggingfaceScore', 'flairScore','anxietyKeywordDensity','covid19KeywordDensity','localDate','anxiousTweetRatio']].copy()

# Replace every score by its rolling mean over 30 consecutive rows.
# NOTE(review): the window counts rows in their current order; it is neither a
# 30-day calendar window nor grouped by FIPS — confirm this is the intent.
for col in ['happinessScore','vaderScore','cnlpScore','huggingfaceScore','flairScore','anxietyKeywordDensity','covid19KeywordDensity','anxiousTweetRatio']:
    allScoreFIPS1[col] = allScoreFIPS1[col].rolling(30).mean()
# The first 29 rows have no complete window (NaN); drop them along with any other incomplete row.
allScoreFIPS1.dropna(axis = 0, inplace=True)

### Load New York Times COVID-19 Data

In [None]:
# State-level New York Times table as returned by the project helper,
# restricted to rows dated before the cutoff.
covidData_nytimes1 = utils.get_raw_nytimes_us_states()
covidData_nytimes1 = covidData_nytimes1.loc[covidData_nytimes1['date'] < end_date_str]

### Get Top Infected states

<br>Generate the list of state IDs (FIPS) containing
<br>    a) Three most infected states' ID
<br>    b) Three least infected states' ID
<br>    c) Three median states' ID 

In [None]:
allScoreFIPS = allScoreFIPS1.copy()

# Divide the integer FIPS by 1000 and truncate, keeping only the leading digits.
# NOTE(review): presumably this turns a county FIPS (state code * 1000 + county
# code) into the state code used by the NYT table — confirm.
allScoreFIPS['FIPS'] = (allScoreFIPS['FIPS'].astype(int) / 1000).astype(int)

# Distinct state codes for which tweet scores exist.
valid_fips = allScoreFIPS['FIPS'].unique()
valid_fips

In [None]:
# Work on a copy and keep only the states that also appear in the tweet data.
covidData_nytimes2 = covidData_nytimes1.copy()
covidData_nytimes2 = covidData_nytimes2.loc[covidData_nytimes2['fips'].isin(valid_fips)]
covidData_nytimes2['fips'].unique()

In [None]:
# Derive per-state daily increments from the 'cases' and 'deaths' columns:
#   * diff() within each state gives the change from the previous row
#     (NOTE(review): assumes each state's rows are in chronological order);
#   * a state's first row has no predecessor, so it falls back to the raw value;
#   * negative increments are clamped to 0.
# assign() builds a new frame, avoiding the original chained assignment
# (`frame.col[mask] = 0`), which raises SettingWithCopyWarning and silently does
# nothing under pandas Copy-on-Write.
covidData_nytimes2 = covidData_nytimes2.assign(
    newCases=(
        covidData_nytimes2.groupby('state')['cases'].diff()
        .fillna(covidData_nytimes2['cases'])
        .clip(lower=0)
    ),
    newDeaths=(
        covidData_nytimes2.groupby('state')['deaths'].diff()
        .fillna(covidData_nytimes2['deaths'])
        .clip(lower=0)
    ),
)
covidData_nytimes2

In [None]:
# Total new cases and deaths per state, largest totals first.
tdf = (
    covidData_nytimes2
    .groupby('state')[['newCases', 'newDeaths']]
    .sum()
    .sort_values(['newCases', 'newDeaths'], ascending=False)
)
tdf

In [None]:
covidData_nytimes = covidData_nytimes2.copy()

# 30-row rolling mean of the daily increments, computed separately for each
# state. The table holds many states in long format, so rolling over the whole
# column would average rows that belong to different states.
# NOTE(review): assumes one row per state per day in chronological order, so
# that 30 rows correspond to 30 days — confirm against the loader.
for col in ['newDeaths','newCases']:
    covidData_nytimes[col] = (
        covidData_nytimes
        .groupby('state')[col]
        .transform(lambda daily: daily.rolling(30).mean())
    )
# The first 29 rows of each state have no complete window (NaN) and are dropped.
covidData_nytimes.dropna(axis=0, inplace=True)

In [None]:
# COVID table: parse dates, derive a 'YYYY-MM' month label, align the column
# names with the tweet table, and order by date then state code.
covidData_nytimes['date'] = pd.to_datetime(covidData_nytimes['date'])
covidData_nytimes['month_number'] = covidData_nytimes['date'].map(lambda day: day.strftime('%Y-%m'))
covidData_nytimes = (
    covidData_nytimes
    .rename(columns={'date': 'localDate', 'fips': 'FIPS'})
    .sort_values(by=['localDate','FIPS'], ascending=True)
)

# Tweet table: same month label and the same ordering.
allScoreFIPS['localDate'] = pd.to_datetime(allScoreFIPS['localDate'])
allScoreFIPS['month_number'] = allScoreFIPS['localDate'].map(lambda day: day.strftime('%Y-%m'))
allScoreFIPS = allScoreFIPS.sort_values(by=['localDate','FIPS'], ascending=True)

In [None]:
def getListOfStateNames(listOfFIPS):
    """Translate state FIPS codes into state names.

    For every code in ``listOfFIPS`` the distinct ``state`` values of the
    matching rows in the global ``covidData_nytimes`` table are concatenated
    into one string (an empty string when no row matches).

    Returns a list of strings in the same order as ``listOfFIPS``.
    """
    return [
        ''.join(covidData_nytimes.loc[covidData_nytimes['FIPS'] == code, 'state'].unique())
        for code in listOfFIPS
    ]

In [None]:
# Rank the states by their summed (smoothed) new cases, ties broken by new deaths.
tdf = (
    covidData_nytimes
    .groupby('FIPS')[['newCases', 'newDeaths']]
    .sum()
    .sort_values(['newCases', 'newDeaths'], ascending=False)
)

# Positions of the three rows sitting in the middle of the ranking.
n = len(tdf)
mid_start = (n - 3) // 2
mid_end = mid_start + 3

top3 = tdf.iloc[:3]                   # three highest-ranked states
mid3 = tdf.iloc[mid_start:mid_end]    # three middle-ranked states
bottom3 = tdf.iloc[-3:]               # three lowest-ranked states

separator = "======================"
print(separator)
for label, subset in (
    ("Three most infected states:", top3),
    ("Three median states:", mid3),
    ("Three least infected states:", bottom3),
):
    print(label, getListOfStateNames(subset.index))
    print(separator)

# State codes analysed in the rest of the notebook: top, middle, bottom.
fips_list = np.concatenate([top3.index, mid3.index, bottom3.index])
print(fips_list)

### For each selected State

In [None]:
def get_start_date(year, month):
    """Return the first day of the given month as a ``datetime.date``.

    ``year`` and ``month`` are interpolated into a ``'<year>-<month>-1'``
    string and parsed with the ``%Y-%m-%d`` format, so values that do not form
    a valid date raise ``ValueError``.
    """
    first_day = datetime.datetime.strptime("{}-{}-1".format(year, month), "%Y-%m-%d")
    return first_day.date()

def maxMinNormalize(dataTable, colName):
    """Min-max scale ``dataTable[colName]`` to the range [0, 1], in place.

    The column is left untouched when its maximum is not strictly greater than
    its minimum (constant column, or a comparison involving NaN).

    Returns the same ``dataTable`` object that was passed in.
    """
    column = dataTable[colName]
    lowest, highest = column.min(), column.max()
    if not highest > lowest:
        return dataTable
    dataTable[colName] = (column - lowest) / (highest - lowest)
    return dataTable

In [None]:
# Inspect the smoothed state-level COVID table (sorted by date, then state code).
covidData_nytimes

In [None]:
maxCorrFactors=[]
# Maps each selected state code to its monthly, normalized score/COVID table.
merged={}

# Kept so that the notebook's warning behaviour after this cell is unchanged;
# the loop below no longer depends on it because it never mutates a slice.
pd.options.mode.chained_assignment = None

# Columns rescaled to [0, 1] so that they can share one y-axis in the charts.
normalized_columns = [
    'newCases', 'newDeaths',
    'happinessScore', 'vaderScore', 'cnlpScore', 'huggingfaceScore', 'flairScore',
    'anxiousTweetRatio', 'covid19KeywordDensity',
]

for fips in fips_list:
    # Monthly mean of every numeric tweet column for this state.
    # numeric_only=True skips text/datetime columns explicitly; recent pandas
    # versions warn or raise when asked to average them.
    allScoreFIPS_county = (
        allScoreFIPS[allScoreFIPS['FIPS'] == fips]
        .drop(columns='FIPS')
        .dropna(axis=0)
        .groupby('month_number')
        .mean(numeric_only=True)
    )

    # Monthly mean of every numeric COVID column for this state.
    covidData_nytimes_county = (
        covidData_nytimes[covidData_nytimes['FIPS'] == fips]
        .drop(columns='FIPS')
        .groupby('month_number')
        .mean(numeric_only=True)
    )

    # Left join: every month with tweets is kept, COVID values are NaN where absent.
    monthly = allScoreFIPS_county.merge(covidData_nytimes_county, on=['month_number'], how='left')
    monthly = monthly.sort_values(by=['month_number'], ascending=True)

    for column in normalized_columns:
        monthly = maxMinNormalize(monthly, column)

    # Rebuild the index as plain 'YYYY-MM' labels taken from each month's first day.
    new_index = []
    for month_number in monthly.index:
        year, month = (int(part) for part in month_number.split("-")[:2])
        new_index.append(get_start_date(year, month).strftime('%Y-%m'))
    monthly.index = new_index

    merged[fips] = monthly

### For each selected State
<br>    Draw line charts with all attributes against time for each state.

In [None]:
covid_stats_attrs = ["newCases","newDeaths"]
sentiment_score_attrs = ["happinessScore","vaderScore","cnlpScore", "huggingfaceScore", "flairScore"]
line_color_set = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
line_marker_set = ['.', 'x', 'd', '^', '<']


def _state_name_for(fips):
    """Return the distinct state name(s) stored for ``fips`` in covidData_nytimes, concatenated."""
    return ''.join(covidData_nytimes[covidData_nytimes['FIPS'] == fips]['state'].unique())


def _plot_sentiment_panel(ax, frame, title, covid_attr, covid_linewidth, covid_alpha, first_score_alpha):
    """Draw one COVID statistic (dashed black) and all sentiment scores on ``ax``.

    Each successive sentiment line is drawn 0.1 more transparent than the
    previous one, starting from ``first_score_alpha``.
    """
    ax.margins(x=0.005)
    ax.set_title(title)
    # One tick per row of the table, i.e. per month.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    frame.plot(ax=ax, y=covid_attr, linewidth=covid_linewidth, linestyle='--', color='black', alpha=covid_alpha)
    for line_index, score in enumerate(sentiment_score_attrs):
        frame.plot(ax=ax, y=score, linestyle='-', marker=line_marker_set[line_index],
                   color=line_color_set[line_index], alpha=first_score_alpha - (0.1 * line_index))
    ax.grid(axis="x", color='black', alpha=0.1, linewidth=0.7)


def _plot_series_vs_covid_stats(frame, title, series_attr):
    """Draw ``series_attr`` (solid black) against both COVID statistics in a new figure."""
    fig, ax = plt.subplots(1, figsize=(18,4))
    ax.set_title(title)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    frame.plot(ax=ax, y=series_attr, linestyle='-', color='black', alpha=0.8)
    for line_index, covid_attr in enumerate(covid_stats_attrs):
        frame.plot(ax=ax, y=covid_attr, linewidth=2, linestyle='--',
                   marker=line_marker_set[line_index], color=line_color_set[line_index], alpha=0.6)
    ax.margins(x=0.005)
    ax.grid(axis="x", color='black', alpha=0.1, linewidth=0.7)
    fig.autofmt_xdate()
    plt.show()
    # Release the figure explicitly; many figures are created in these loops.
    plt.close(fig)


# Two stacked panels per state: new cases, then new deaths, each against all sentiment scores.
for fips in fips_list:
    state_name = _state_name_for(fips)
    print("\n======================================= State =", state_name, "===========================================")

    fig = plt.figure(figsize=(18,9))
    _plot_sentiment_panel(fig.add_subplot(211), merged[fips],
                          state_name + " (newCases vs All Sentiment Scores)",
                          covid_attr="newCases", covid_linewidth=2, covid_alpha=0.7, first_score_alpha=0.6)
    _plot_sentiment_panel(fig.add_subplot(212), merged[fips],
                          state_name + " (newDeaths vs All Sentiment Scores)",
                          covid_attr="newDeaths", covid_linewidth=1, covid_alpha=0.8, first_score_alpha=0.7)
    fig.autofmt_xdate()
    plt.show()
    plt.close(fig)

print("\n======================== Plotting Anxious Tweet Ratio Scores =============================")
for fips in fips_list:
    _plot_series_vs_covid_stats(merged[fips],
                                _state_name_for(fips) + " (Anxious Tweet Ratio vs COVID Stats)",
                                "anxiousTweetRatio")

print("\n======================== Plotting COVID-19 Keyword Density Scores =============================")
for fips in fips_list:
    _plot_series_vs_covid_stats(merged[fips],
                                _state_name_for(fips) + " (COVID-19 Keyword Density vs COVID Stats)",
                                "covid19KeywordDensity")

### Prepare US County map to Plot Happiness Score, Vader Sentiment Score, CoreNLP Sentiment Score

In [None]:
def mergeSkewAndgetMean(dataTable, mapTable, projectionStr):
    """Average the scores per county and return them as a projected GeoDataFrame.

    Parameters
    ----------
    dataTable : table with a 'FIPS' column plus the score column(s) to average.
    mapTable : county table with 'FIPS' and a 'geometry' column that holds WKT
        strings (they are grouped on, then parsed back with ``wkt.loads``).
    projectionStr : target CRS passed to ``GeoDataFrame.to_crs``.

    Returns
    -------
    GeoDataFrame with one row per (FIPS, geometry) pair, holding the mean of
    every numeric column, reprojected from EPSG:4326 to ``projectionStr``.
    """
    county_data = pd.merge(mapTable, dataTable, on='FIPS', how='inner')
    # numeric_only=True: the merged table also carries text columns from the map,
    # which recent pandas versions refuse to average instead of silently dropping.
    county_avg_data = (
        county_data
        .groupby(['FIPS', 'geometry'], as_index=False, sort=False)
        .mean(numeric_only=True)
    )
    # Turn the WKT strings back into geometry objects.
    county_avg_data['geometry'] = county_avg_data['geometry'].apply(wkt.loads)
    # Declare the CRS when building the GeoDataFrame rather than setting a
    # `.crs` attribute on a plain DataFrame, where it has no meaning.
    county_avg_data = gpd.GeoDataFrame(county_avg_data, geometry='geometry', crs="EPSG:4326")
    return county_avg_data.to_crs(projectionStr)

In [None]:
# mergeSkewAndgetMean groups on the geometry column and re-parses it with
# wkt.loads, so the geometries are stored here as their string representation.
county_map['geometry'] = county_map['geometry'].astype(str)
# Colormap name passed to every county map plotted below.
cmap = "Paired"

### Plot the Happiness Scores

In [None]:
# County-level mean happiness score, in the plotting projection.
county_avg_happinessScore = mergeSkewAndgetMean(
    dataTable=happinessScoreFIPS,
    mapTable=county_map,
    projectionStr=projection,
)
county_avg_happinessScore

In [None]:
fig, ax = plt.subplots(1, figsize=(11,8.5))
ax.set_axis_off()

# Counties coloured by quantile class of their mean happiness score.
county_avg_happinessScore.plot(
    ax=ax,
    column='happinessScore',
    scheme='quantiles',
    cmap=cmap,
    legend=True,
    legend_kwds={'loc': 'lower left'},
    linewidth=0.3,
    edgecolor='0.8',
)
# State borders drawn on top, unfilled.
state_map.plot(ax=ax, facecolor="none", edgecolor='0', linewidth=0.4)

### Plot the Vader Scores

In [None]:
# County-level mean Vader score, in the plotting projection.
county_avg_vaderScore = mergeSkewAndgetMean(
    dataTable=vaderScoreFIPS,
    mapTable=county_map,
    projectionStr=projection,
)
county_avg_vaderScore

In [None]:
# Label each county from its mean Vader score: above 0.5 is "Positive",
# below -0.5 is "Negative", everything else "Neutral".
# NOTE(review): VADER's commonly cited compound cut-offs are +/-0.05 — confirm
# that +/-0.5 is intended here.
vader_mean = county_avg_vaderScore.vaderScore
county_avg_vaderScore["sentiText"] = np.select(
    [vader_mean > 0.5, vader_mean < -0.5],
    ["Positive", "Negative"],
    default="Neutral",
)

fig, ax = plt.subplots(1, figsize=(11,8.5))
ax.set_axis_off()
county_avg_vaderScore.plot(
    ax=ax,
    column='sentiText',
    cmap=cmap,
    legend=True,
    legend_kwds={'loc': 'lower left'},
    linewidth=0.3,
    edgecolor='0.8',
)
state_map.plot(ax=ax, facecolor="none", edgecolor='0', linewidth=0.4)

### Plot CNLP Scores

In [None]:
# County-level mean CoreNLP score, in the plotting projection.
county_avg_cnlpScore = mergeSkewAndgetMean(
    dataTable=cnlpScoreFIPS,
    mapTable=county_map,
    projectionStr=projection,
)
county_avg_cnlpScore

In [None]:
# Five-way label from the mean CoreNLP score; the bands are mutually exclusive:
#   score >= 1.5 "Very positive", [0.5, 1.5) "Positive",
#   (-1.5, -0.5] "Negative", score <= -1.5 "Very negative", otherwise "Neutral".
cnlp_mean = county_avg_cnlpScore.cnlpScore
county_avg_cnlpScore["sentiText"] = np.select(
    [
        cnlp_mean >= 1.5,
        (cnlp_mean >= 0.5) & (cnlp_mean < 1.5),
        (cnlp_mean <= -0.5) & (cnlp_mean > -1.5),
        cnlp_mean <= -1.5,
    ],
    ["Very positive", "Positive", "Negative", "Very negative"],
    default="Neutral",
)

fig, ax = plt.subplots(1, figsize=(11,8.5))
ax.set_axis_off()
county_avg_cnlpScore.plot(
    ax=ax,
    column='sentiText',
    cmap=cmap,
    legend=True,
    legend_kwds={'loc': 'lower left'},
    linewidth=0.3,
    edgecolor='0.8',
)
state_map.plot(ax=ax, facecolor="none", edgecolor='0', linewidth=0.4)

### Plot Hugging Face Scores

In [None]:
# County-level mean HuggingFace score, in the plotting projection.
county_avg_huggingfaceScore = mergeSkewAndgetMean(
    dataTable=huggingfaceScoreFIPS,
    mapTable=county_map,
    projectionStr=projection,
)
county_avg_huggingfaceScore

In [None]:
# Sign of the mean HuggingFace score decides the label; exactly 0 (or missing) stays "Neutral".
huggingface_mean = county_avg_huggingfaceScore.huggingfaceScore
county_avg_huggingfaceScore["sentiText"] = np.select(
    [huggingface_mean > 0, huggingface_mean < 0],
    ["Positive", "Negative"],
    default="Neutral",
)

fig, ax = plt.subplots(1, figsize=(12,12))
ax.set_axis_off()
county_avg_huggingfaceScore.plot(
    ax=ax,
    column='sentiText',
    cmap=cmap,
    legend=True,
    legend_kwds={'loc': 'lower left'},
    linewidth=0.3,
    edgecolor='0.8',
)
state_map.plot(ax=ax, facecolor="none", edgecolor='0', linewidth=0.4)

### Plot Flair Scores

In [None]:
# County-level mean Flair score, in the plotting projection.
county_avg_flairScore = mergeSkewAndgetMean(
    dataTable=flairScoreFIPS,
    mapTable=county_map,
    projectionStr=projection,
)
county_avg_flairScore

In [None]:
# Sign of the mean Flair score decides the label; exactly 0 (or missing) stays "Neutral".
flair_mean = county_avg_flairScore.flairScore
county_avg_flairScore["sentiText"] = np.select(
    [flair_mean > 0, flair_mean < 0],
    ["Positive", "Negative"],
    default="Neutral",
)

fig, ax = plt.subplots(1, figsize=(12,12))
ax.set_axis_off()
county_avg_flairScore.plot(
    ax=ax,
    column='sentiText',
    cmap=cmap,
    legend=True,
    legend_kwds={'loc': 'lower left'},
    linewidth=0.3,
    edgecolor='0.8',
)
state_map.plot(ax=ax, facecolor="none", edgecolor='0', linewidth=0.4)

### Plot Average Anxious Tweet Ratio Scores

In [None]:
# County-level share of anxious tweets (mean of the 0/1 flag), in the plotting projection.
county_avg_anxiousTweetRatio = mergeSkewAndgetMean(
    dataTable=anxiousTweetRatioFIPS,
    mapTable=county_map,
    projectionStr=projection,
)
county_avg_anxiousTweetRatio

In [None]:
# Bucket the county ratio into percentage bands. Conditions are tested from the
# highest band down and the first match wins, which gives the bands
# [1%, 5%], (5%, 10%], (10%, 50%], above 50%; anything else is "Below 1%".
anxious_ratio = county_avg_anxiousTweetRatio.anxiousTweetRatio
county_avg_anxiousTweetRatio["percentText"] = np.select(
    [
        anxious_ratio > 0.50,
        anxious_ratio > 0.10,
        anxious_ratio > 0.05,
        anxious_ratio >= 0.01,
    ],
    ["Above 50%", "10% ~ 50%", "5% ~ 10%", "1% ~ 5%"],
    default="Below 1%",
)

fig, ax = plt.subplots(1, figsize=(11,8.5))
ax.set_axis_off()
county_avg_anxiousTweetRatio.plot(
    ax=ax,
    column='percentText',
    cmap=cmap,
    legend=True,
    legend_kwds={'loc': 'lower left'},
    linewidth=0.3,
    edgecolor='0.8',
)
state_map.plot(ax=ax, facecolor="none", edgecolor='0', linewidth=0.4)

### Plot COVID-19 keyword density Scores

In [None]:
# County-level mean COVID-19 keyword density, in the plotting projection.
county_avg_covid19KeywordDensity = mergeSkewAndgetMean(
    dataTable=covid19KeywordDensityFIPS,
    mapTable=county_map,
    projectionStr=projection,
)
county_avg_covid19KeywordDensity

In [None]:
# Bucket the county density into percentage bands. Conditions are tested from
# the highest band down and the first match wins, which gives the bands
# [1%, 2%], (2%, 5%], above 5%; anything else is "Below 1%".
keyword_density = county_avg_covid19KeywordDensity.covid19KeywordDensity
county_avg_covid19KeywordDensity["percentText"] = np.select(
    [
        keyword_density > 0.05,
        keyword_density > 0.02,
        keyword_density >= 0.01,
    ],
    ["Above 5%", "2% ~ 5%", "1% ~ 2%"],
    default="Below 1%",
)

fig, ax = plt.subplots(1, figsize=(11,8.5))
ax.set_axis_off()
county_avg_covid19KeywordDensity.plot(
    ax=ax,
    column='percentText',
    cmap=cmap,
    legend=True,
    legend_kwds={'loc': 'lower left'},
    linewidth=0.3,
    edgecolor='0.8',
)
state_map.plot(ax=ax, facecolor="none", edgecolor='0', linewidth=0.4)