In [None]:
import numpy as np
import pandas as pd
import os
import csv
import matplotlib.pyplot as plt
from linearmodels import PooledOLS
import statsmodels.api as sm
import matplotlib
import seaborn as sns
import altair as alt
import plotly.express as px
from urllib.request import urlopen
import json
# County-level GeoJSON that the plotly choropleth cells pass as `geojson=counties`.
COUNTIES_GEOJSON_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(COUNTIES_GEOJSON_URL) as geojson_response:
    counties = json.load(geojson_response)

In [None]:
# --- Load all input data ---------------------------------------------------

# Eligible voters: first CSV column is the index, one column per election year.
voters = pd.read_csv('msc_thesis/data/eligible voters', index_col=0)

# Model variables: two header rows (variable name, election year).
data = pd.read_csv('msc_thesis/data/totaldata', index_col=0, header=[0, 1])
data.columns.names = ['variable', 'years']
data.index.names = ['FIPS']

# Reshape to "years x (variable, FIPS)" so that shift(1) lags along the election axis.
data = data.stack('years').unstack('FIPS')
data.index = data.index.map(int)

# (source variable, name of its one-election lag), in the order the lags are appended.
lag_specs = [
    ('dem_votesshare', 'lagged_dem_voteshare'),
    ('exposure_dem', 'lagged_exposure_dem'),
    ('corrected exposure dem', 'lagged_corrected_exposure_dem'),
    ('rep_voteshare', 'lagged_rep_voteshare'),
    ('exposure_rep', 'lagged_exposure_rep'),
    ('corrected exposure rep', 'lagged_corrected_exposure_rep'),
]
lagged_frames = []
for source_name, lagged_name in lag_specs:
    lagged = data.loc[:, source_name].shift(1)
    lagged.columns = [np.repeat(lagged_name, len(lagged.columns)), lagged.columns]
    lagged_frames.append(lagged)

# dropna() removes every election (row) that has a missing value in any column;
# this includes the first election, whose lagged values do not exist.
data = pd.concat([data] + lagged_frames, axis=1).dropna()

# Back to a long panel indexed by (FIPS, years), sorted by FIPS.
data = data.stack('FIPS').swaplevel(0, 1).sort_index(level=0)

# SCI matrix
SCI = pd.read_csv('msc_thesis/data/SCI', index_col=0)

# Number of eligible friends based on SCI; column labels are converted to int years.
eligiblefriends = pd.read_csv('msc_thesis/data/eligible friends', index_col=0)
eligiblefriends.columns = eligiblefriends.columns.map(int)

# First, some general descriptive plots of the data and the exposure

In [None]:
# Exposure per eligible voter: each party's exposure divided by the county's
# eligible voters, for every election year, plus the dem-minus-rep difference.
election_years = [2004, 2008, 2012, 2016, 2020]
panel_by_year = data.unstack('years')  # one row per county, columns (variable, year)

df_exposure_pop = pd.DataFrame(index=voters.index)
for election_year in election_years:
    eligible = voters.loc[panel_by_year.index, str(election_year)]
    for party in ('dem', 'rep'):
        df_exposure_pop['expo_pop_' + party + str(election_year)] = (
            panel_by_year.loc[:, 'exposure_' + party].loc[:, election_year] / eligible
        )

# Two-level column labels (measure, year) so a whole measure can be selected at once.
party_labels = ['expo_pop_dem', 'expo_pop_rep'] * len(election_years)
df_exposure_pop.columns = [party_labels, np.repeat(election_years, 2)]

temp = df_exposure_pop['expo_pop_dem'] - df_exposure_pop['expo_pop_rep']
df_exposure_pop = pd.concat([df_exposure_pop, temp], axis=1)
df_exposure_pop.columns = [
    party_labels + ['expo_pop_diff'] * len(election_years),
    [yr for yr in election_years for _ in range(2)] + election_years,
]


In [None]:
# Kernel density of Democratic exposure, one curve per election year
dem_exposure_by_year = data['exposure_dem'].unstack('years')
dem_exposure_by_year.plot(
    kind='kde',
    figsize=(15, 5),
    use_index=False,
    title='democratic exposure',
)

In [None]:
# Density of Democratic minus Republican exposure, one curve per election year (figure 1 in thesis)
exposure_gap_by_year = data['exposure_dem'].unstack('years') - data['exposure_rep'].unstack('years')
exposure_gap_by_year.plot(
    kind='kde',
    figsize=(15, 5),
    use_index=False,
    xlim=[-0.7, 0.5],
)

In [None]:
# Density of the Democratic-minus-Republican exposure per eligible voter, one curve per election year
per_voter_gap = df_exposure_pop['expo_pop_diff']
per_voter_gap.plot(
    kind='kde',
    figsize=(15, 5),
    use_index=False,
    title='difference in exposure',
    xlim=[-0.0002, 0.0002],
)

In [None]:
# Scatter of Democratic against Republican exposure
panel_wide = data.unstack('years')
panel_wide.plot(
    kind='scatter',
    x='exposure_dem',
    y='exposure_rep',
    figsize=(15, 5),
    use_index=False,
    title='correlation between exposure dem and rep all year',
)

In [None]:
# Correlogram: heatmap of the pairwise correlations between all columns of `data`
variable_correlations = data.corr()
diverging_colours = sns.diverging_palette(20, 220, n=200)

f, ax = plt.subplots(figsize=(21, 15))
ax = sns.heatmap(
    variable_correlations,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=diverging_colours,
    square=True,
)

In [None]:
# Democratic vote share plotted against Democratic exposure (every county-year in `data`)
exposure_values = data.loc[:, 'exposure_dem']
voteshare_values = data.loc[:, 'dem_votesshare']

fig, ax = plt.subplots()
ax.scatter(exposure_values, voteshare_values, color='blue')
ax.set_xlabel('Exposure', fontsize=15)
ax.set_ylabel('Voteshare', fontsize=15)
ax.set_title('With lagged values', fontsize=30)
plt.show()

# The main regressions of the thesis

In [None]:
# Main regression for the Democratic vote share (pooled OLS on the county panel).
y = data['dem_votesshare']

# Raw column in `data` -> label shown in the regression table.
# Keeping each variable next to its label means the two can never get out of step.
# Variables that were commented out of the original specification (not used):
# ORTHRATE; NHIA/NHNA/NHTOM (both sexes) and NHAA_FEMALE; the per-candidate visit
# counts and 'corrected total' visits; visits_rep; contemporaneous and corrected
# exposure; lagged vote shares; lagged_exposure_rep; 'democratic incumbant'.
dem_regressors = {
    # age groups
    '0-19 years old': '% 0-19 years old',
    '20-29 years old': '% 20-29 years old',
    '30-44 years old': '% 30-44 years old',
    '45-64 years old': '% 45-64 years old',
    # education
    'Bachelor': '% bachelor',
    'Highschool': '% highschool',
    'Less than highschool': '% less than highschool',
    # religion
    'AMSHRATE': '% Amish',
    'BAPTRATE': '% Baptist',
    'CATHRATE': '% Catholic',
    'EVANRATE': '% Evangelicals',
    'JEWRATE': '% Jewish',
    'SALVRATE': '% Salvation army',
    'SBCRATE': '% Southern batptis',
    'UMCRATE': '% United methodist',
    'MSLMRATE': '% Muslim',
    # race and ethnicity
    'NHWA_FEMALE': '% white female',
    'NHWA_MALE': '% white male',
    'NHBA_FEMALE': '% black female',
    'NHBA_MALE': '% black male',
    'H_FEMALE': '% Hispanic female',
    'H_MALE': '% Hispanic male',
    'NHAA_MALE': '% asian male',
    # economic variables
    'unemployment rate': 'Unemployment rate',
    'SP500': 'SP500',
    'nasdaq': 'Nasdaq',
    'inflation': 'Inflation',
    'median income': 'Median income',
    'youth poverty': 'Youth poverty',
    'poverty': 'Poverty',
    # location variables
    'homestate of democratic president': 'Homestate democratic candidate',
    'homestate of democratic vp': 'Homestate democratic running mate',
    'homestate of republican president': 'Homestate republican candidate',
    'homestate of republican vp': 'Homestate republican running mate',
    'metro': 'Metro',
    'metro adjacent': 'Metro adjacent',
    # campaign visits
    'visits_dem': 'Campaign visits',
    # exposure (lagged by one election)
    'lagged_exposure_dem': 'Exposure',
    # political variables
    'third party vote': 'Third party vote',
    'voter turnout': 'Voter turnout',
}

x = data.loc[:, list(dem_regressors)]
x.columns = list(dem_regressors.values())
x = sm.add_constant(x)

mod = PooledOLS(y, x)
pooledOLS_res = mod.fit()
# `res1` is the name the later cells use for the Democratic fit; it is the same
# result object, so the model is estimated only once.
res1 = pooledOLS_res

# Store values for checking homoskedasticity graphically
fittedvals_pooled_OLS = pooledOLS_res.predict().fitted_values
residuals_pooled_OLS = pooledOLS_res.resids
print(pooledOLS_res)

In [None]:
# Main regression for the Republican vote share (pooled OLS on the county panel).
y = data['rep_voteshare']

# Raw column in `data` -> label shown in the regression table.
# The previous version selected the economic columns as
# ('poverty', 'youth poverty', 'median income') but labelled them
# ('Median income', 'Youth poverty', 'Poverty'), so the poverty and median-income
# coefficients were reported under each other's names. Pairing each variable with
# its label fixes that and keeps the column order identical to the Democratic model.
# Variables that were commented out of the original specification (not used):
# ORTHRATE; NHIA/NHNA/NHTOM (both sexes) and NHAA_FEMALE; the per-candidate visit
# counts and 'corrected total' visits; visits_dem; contemporaneous and corrected
# exposure; lagged vote shares; lagged_exposure_dem; 'democratic incumbant'.
rep_regressors = {
    # age groups
    '0-19 years old': '% 0-19 years old',
    '20-29 years old': '% 20-29 years old',
    '30-44 years old': '% 30-44 years old',
    '45-64 years old': '% 45-64 years old',
    # education
    'Bachelor': '% bachelor',
    'Highschool': '% highschool',
    'Less than highschool': '% less than highschool',
    # religion
    'AMSHRATE': '% Amish',
    'BAPTRATE': '% Baptist',
    'CATHRATE': '% Catholic',
    'EVANRATE': '% Evangelicals',
    'JEWRATE': '% Jewish',
    'SALVRATE': '% Salvation army',
    'SBCRATE': '% Southern batptis',
    'UMCRATE': '% United methodist',
    'MSLMRATE': '% Muslim',
    # race and ethnicity
    'NHWA_FEMALE': '% white female',
    'NHWA_MALE': '% white male',
    'NHBA_FEMALE': '% black female',
    'NHBA_MALE': '% black male',
    'H_FEMALE': '% Hispanic female',
    'H_MALE': '% Hispanic male',
    'NHAA_MALE': '% asian male',
    # economic variables
    'unemployment rate': 'Unemployment rate',
    'SP500': 'SP500',
    'nasdaq': 'Nasdaq',
    'inflation': 'Inflation',
    'median income': 'Median income',
    'youth poverty': 'Youth poverty',
    'poverty': 'Poverty',
    # location variables
    'homestate of democratic president': 'Homestate democratic candidate',
    'homestate of democratic vp': 'Homestate democratic running mate',
    'homestate of republican president': 'Homestate republican candidate',
    'homestate of republican vp': 'Homestate republican running mate',
    'metro': 'Metro',
    'metro adjacent': 'Metro adjacent',
    # campaign visits
    'visits_rep': 'Campaign visits',
    # exposure (lagged by one election)
    'lagged_exposure_rep': 'Exposure',
    # political variables
    'third party vote': 'Third party vote',
    'voter turnout': 'Voter turnout',
}

x = data.loc[:, list(rep_regressors)]
x.columns = list(rep_regressors.values())
x = sm.add_constant(x)

mod = PooledOLS(y, x)
res2 = mod.fit()
# Same result object under the generic name; the model is estimated only once.
pooledOLS_res = res2

# Store values for checking homoskedasticity graphically
fittedvals_pooled_OLS = pooledOLS_res.predict().fitted_values
residuals_pooled_OLS = pooledOLS_res.resids
print(pooledOLS_res)

# The simulation of the election, which is used to compute the influence measure.
# This part can be skipped: the precomputed values can be loaded in further below instead.
# Running this part takes several hours.

In [None]:
#the function for the simulation
def simulation(predictedvalues, N, sigma, year, voters,StateFIPS,EV, compact=False, rng=None):
    """Simulate N elections and return the electoral-vote total of every draw.

    Predicted county vote shares are multiplied by the eligible voters of
    ``year`` to get votes, which are summed per state. Each state's resulting
    vote share is used as the mean of a normal draw with standard deviation
    ``sigma``; a state's electoral votes are counted in a draw when its drawn
    value exceeds 0.5.

    Parameters
    ----------
    predictedvalues : DataFrame or Series
        Predicted vote shares indexed by county FIPS, with an additional
        'years' index level unless ``compact`` is True.
    N : int
        Number of simulated elections.
    sigma : float
        Standard deviation of the normal noise added to each state's share.
    year : int
        Election year; column ``str(year)`` of ``voters`` is used.
    voters : DataFrame
        Eligible voters indexed by county FIPS, one column per year.
    StateFIPS : DataFrame
        One row per state; the first column holds the state FIPS code.
        Modified in place: the columns 'casted votes', 'voters', 'win chance'
        and 'EV' are added or overwritten.
    EV : array-like
        Electoral votes per state, in the row order of ``StateFIPS``.
    compact : bool, default False
        If False, the 'years' level is dropped from ``predictedvalues.index``
        IN PLACE. Callers in this notebook rely on that side effect (they keep
        using the same object with a FIPS-only index afterwards).
    rng : numpy random generator, optional
        Object with a ``normal(loc, scale, size)`` method, e.g.
        ``np.random.default_rng(seed)``. Defaults to the global ``np.random``
        state, as before.

    Returns
    -------
    list
        Electoral-vote totals, one per simulated election (length N).
    """
    if not compact:
        predictedvalues.index=predictedvalues.index.droplevel('years')
    votes=pd.concat([predictedvalues,voters[str(year)]],axis=1).dropna()
    votes.columns=['predicted values','voters']
    votes['casted votes']=votes['predicted values']*votes['voters']

    # A county's state code is its FIPS code without the last three digits.
    # Parse it once here instead of once per state inside the loop.
    county_state=np.array([int(str(fips)[:-3]) for fips in votes.index],dtype=np.int64)
    statevotes=[]
    statevoters=[]
    for state_code in StateFIPS.iloc[:,0]:
        in_state=county_state==int(state_code)
        statevotes.append(votes.loc[in_state,'casted votes'].sum())
        statevoters.append(votes.loc[in_state,'voters'].sum())
    StateFIPS['casted votes']=statevotes
    StateFIPS['voters']=statevoters
    StateFIPS['win chance']=StateFIPS['casted votes']/StateFIPS['voters']
    StateFIPS['EV']=EV

    win_chance=np.asarray(StateFIPS['win chance'],dtype=float)
    electoral_votes=np.asarray(StateFIPS['EV'])
    sampler=np.random if rng is None else rng

    # Draw whole batches of elections at once (rows = elections, columns = states)
    # instead of one pandas lookup per election; batching bounds the memory use.
    batch_size=50000
    ev_results=[]
    for start in range(0,N,batch_size):
        n_rows=min(batch_size,N-start)
        draws=sampler.normal(loc=win_chance,scale=sigma,size=(n_rows,len(win_chance)))
        # A state with an undefined (NaN) share never passes the > 0.5 test.
        ev_results.extend(((draws>0.5)@electoral_votes).tolist())

    return ev_results
                    

In [None]:
# Hold out one election as the test set; all other election years form the training set.
i = 2020
election_of_y = y.index.get_level_values('years')
election_of_x = x.index.get_level_values('years')

y_train, y_test = y.loc[election_of_y != i], y.loc[election_of_y == i]
x_train, x_test = x.loc[election_of_x != i], x.loc[election_of_x == i]

In [None]:
# Example of a single simulation run: builds the state table and electoral votes,
# then simulates N elections and shows the average electoral-vote total.
N=10000
county='01001'
delta=0
# `i` is the hold-out year set in the train/test cell
year=i
voters=pd.read_csv('msc_thesis/data/eligible voters')
voters=voters.set_index('FIPS')
# NOTE(review): `dateofdata` is not used anywhere in the visible notebook
dateofdata='10-9'

influence=pd.DataFrame(index=SCI.columns,columns=['influence'])
# State table: the Excel rows above position 4 and the first two columns are discarded;
# the two remaining columns are named 'State FIPS' and 'State', with 'State' as index.
StateFIPS=pd.read_excel('msc_data_raw/state-geocodes-v2016.xls')
StateFIPS.columns=StateFIPS.iloc[3]
StateFIPS=StateFIPS.iloc[4:,2:]
StateFIPS.columns= ['State FIPS', 'State']
StateFIPS=StateFIPS.set_index('State')

# Electoral votes per state (51 entries), one allocation for years after 2010 and one for earlier years.
# presumably ordered like the rows of StateFIPS — TODO confirm
if year>2010:
    EV=np.array([9,3,11,6,55,9,7,3,3,29,16,4,4,20,11,6,6,8,8,4,10,11,16,10,6,10,3,5,6,
                      4,14,5,29,15,3,18,7,7,20,4,9,3,11,38,6,3,13,12,5,10,3])
else:
    EV=np.array([9,3,10,6,55,9,7,3,3,27,15,4,4,21,11,7,6,8,9,4,10,12,17,10,6,11,3,5,5,
                      4,15,5,31,15,3,20,7,7,21,4,8,3,11,34,5,3,13,11,5,10,3])

# NOTE(review): `res1` is the Democratic fit, while `x_test` was split from whichever `x`
# was assigned last (the Republican regressors if the cells ran top to bottom) — confirm intent.
y_hat=res1.predict(x_test)
# Prediction shifted by the exposure coefficient times delta * SCI[county] / eligible friends of year-4.
# NOTE(review): the year 2020 is hard-coded here instead of `year`, and `y_hat_camp` is not used
# again in this cell.
y_hat_camp=(y_hat.unstack('years')['predictions'][2020]+res1.params['Exposure']*delta*SCI.loc[:,county]/eligiblefriends[(year-4)]).dropna()
# NOTE(review): this overwrites the prediction above with the in-sample fitted values of 2004,
# which are then combined with the voters column of `year` in simulation() — confirm this is intended.
y_hat=res1.predict().fitted_values.loc[fittedvals_pooled_OLS.index.get_level_values('years')==2004]
# compact=False: simulation() drops the 'years' level from y_hat.index in place
est=simulation(predictedvalues=y_hat, N=N, sigma=0.05, year=year, voters=voters,StateFIPS=StateFIPS,EV=EV, compact=False)
#average electoral votes
np.sum(est)/N

In [None]:
# Simulation with network effects for all counties.
# For every county in SCI.columns the prediction is shifted by
#   params['Exposure'] * delta * SCI[:, county] / eligiblefriends[year - 4]
# and the change in the share of simulated elections with more than 269
# electoral votes (relative to the unshifted baseline) is stored as its influence.

N = 150000
delta = 150000
year = 2020
sigma = 0.2

voters = pd.read_csv('msc_thesis/data/eligible voters').set_index('FIPS')

mod = PooledOLS(y, x)
pooledOLS_res = mod.fit()

influence = pd.DataFrame(index=SCI.columns, columns=['influence'])

# State table with columns 'State FIPS' (and 'State' as the index)
StateFIPS = pd.read_excel('msc_data_raw/state-geocodes-v2016.xls')
StateFIPS.columns = StateFIPS.iloc[3]
StateFIPS = StateFIPS.iloc[4:, 2:]
StateFIPS.columns = ['State FIPS', 'State']
StateFIPS = StateFIPS.set_index('State')

# Electoral votes per state: one allocation for years after 2010, one for earlier years
ev_after_2010 = [9,3,11,6,55,9,7,3,3,29,16,4,4,20,11,6,6,8,8,4,10,11,16,10,6,10,3,5,6,
                 4,14,5,29,15,3,18,7,7,20,4,9,3,11,38,6,3,13,12,5,10,3]
ev_until_2010 = [9,3,10,6,55,9,7,3,3,27,15,4,4,21,11,7,6,8,9,4,10,12,17,10,6,11,3,5,5,
                 4,15,5,31,15,3,20,7,7,21,4,8,3,11,34,5,3,13,11,5,10,3]
EV = np.array(ev_after_2010 if year > 2010 else ev_until_2010)

# Baseline; compact=False makes simulation() drop the 'years' level of y_hat.index
# in place, which the per-county loop below depends on.
y_hat = pooledOLS_res.predict(x_test)
est = simulation(predictedvalues=y_hat, N=N, sigma=sigma, year=year, voters=voters,
                 StateFIPS=StateFIPS, EV=EV, compact=False)
baseline = np.sum(np.array(est) > 269) / N

exposure_coefficient = pooledOLS_res.params['Exposure']
friends_previous_election = eligiblefriends[(year - 4)]
for i in SCI.columns:
    county = i
    y_hat_camp = (
        y_hat['predictions']
        + exposure_coefficient * delta * SCI.loc[:, county] / friends_previous_election
    ).dropna()
    est = simulation(predictedvalues=y_hat_camp, N=N, sigma=sigma, year=year, voters=voters,
                     StateFIPS=StateFIPS, EV=EV, compact=True)
    influence.loc[i, 'influence'] = np.sum(np.array(est) > 269) / N - baseline

influence.to_csv(
    'msc_thesis/results/year={}_delta={}_N={}_sigma={}_for_model_with_exposure'.format(year, delta, N, sigma)
)

In [None]:
#simulation without network effects for all counties
# For one state at a time, `delta` is split equally over that state's counties and
# divided by each county's eligible voters before being added to the predicted share.
# The stored influence is the change in the share of simulated elections with more
# than 269 electoral votes, relative to the unshifted baseline.

N=150000
delta=150000
year=2020
sigma=0.2

voters=pd.read_csv('msc_thesis/data/eligible voters')
voters=voters.set_index('FIPS')

# The state table and electoral votes are built BEFORE the first simulation() call,
# so this cell no longer depends on StateFIPS/EV left over from an earlier cell.
StateFIPS=pd.read_excel('msc_data_raw/state-geocodes-v2016.xls')
StateFIPS.columns=StateFIPS.iloc[3]
StateFIPS=StateFIPS.iloc[4:,2:]
StateFIPS.columns= ['State FIPS', 'State']
StateFIPS=StateFIPS.set_index('State')

if year>2010:
    EV=np.array([9,3,11,6,55,9,7,3,3,29,16,4,4,20,11,6,6,8,8,4,10,11,16,10,6,10,3,5,6,
                      4,14,5,29,15,3,18,7,7,20,4,9,3,11,38,6,3,13,12,5,10,3])
else:
    EV=np.array([9,3,10,6,55,9,7,3,3,27,15,4,4,21,11,7,6,8,9,4,10,12,17,10,6,11,3,5,5,
                      4,15,5,31,15,3,20,7,7,21,4,8,3,11,34,5,3,13,11,5,10,3])

mod = PooledOLS(y, x)
pooledOLS_res = mod.fit()

y_hat=pooledOLS_res.predict(x_test)
# compact=False: simulation() drops the 'years' level of y_hat.index in place;
# the code below relies on the resulting FIPS-only index.
est=simulation(predictedvalues=y_hat, N=N, sigma=sigma, year=year, voters=voters,StateFIPS=StateFIPS,EV=EV, compact=False)

baseline=np.sum(np.array(est)>269)/N

# NOTE(review): this frame is indexed by county (SCI.columns) but filled by state
# position 0..len(StateFIPS)-1, which appends new rows below the all-NaN county rows.
# The loading cell appears to rely on that layout (it calls .dropna() on the file),
# so the layout is kept; confirm before changing the index to StateFIPS.index.
influencewithoutnetwork=pd.DataFrame(index=SCI.columns,columns=['influence'])

# Loop invariants: state code of every county (FIPS without its last three digits)
# and the eligible voters of the simulated year (previously hard-coded as '2020').
county_state_codes=np.array([int(str(j)[:-3]) for j in y_hat.index])
eligible_voters=voters.loc[y_hat.index,str(year)]

for i in range(len(StateFIPS)):
    state=StateFIPS.iloc[i,0]
    countiesinstate=county_state_codes==int(state)
    y_hat_camp=(y_hat['predictions']+delta*countiesinstate/np.sum(countiesinstate)/eligible_voters).dropna()
    est=simulation(predictedvalues=y_hat_camp, N=N, sigma=sigma, year=year, voters=voters,StateFIPS=StateFIPS,EV=EV, compact=True)
    influencewithoutnetwork.loc[i,'influence']=np.sum(np.array(est)>269)/N-baseline
influencewithoutnetwork.to_csv('msc_thesis/results/year='+str(year)+'_delta='+str(delta)+'_N='+str(N)+'_sigma='+str(sigma)+'_for_model_with_exposure_without_network_effect')

In [None]:
#baseline of electoral votes
# With delta=0 the unchanged prediction is simulated once per state slot and compared
# with `baseline`; presumably this gauges the Monte-Carlo noise of the influence
# measure — TODO confirm.

N=150000
delta=0
year=2020
sigma=0.2

voters=pd.read_csv('msc_thesis/data/eligible voters')
voters=voters.set_index('FIPS')

# The state table and electoral votes are built BEFORE the first simulation() call,
# so this cell no longer depends on StateFIPS/EV left over from an earlier cell.
StateFIPS=pd.read_excel('msc_data_raw/state-geocodes-v2016.xls')
StateFIPS.columns=StateFIPS.iloc[3]
StateFIPS=StateFIPS.iloc[4:,2:]
StateFIPS.columns= ['State FIPS', 'State']
StateFIPS=StateFIPS.set_index('State')

if year>2010:
    EV=np.array([9,3,11,6,55,9,7,3,3,29,16,4,4,20,11,6,6,8,8,4,10,11,16,10,6,10,3,5,6,
                      4,14,5,29,15,3,18,7,7,20,4,9,3,11,38,6,3,13,12,5,10,3])
else:
    EV=np.array([9,3,10,6,55,9,7,3,3,27,15,4,4,21,11,7,6,8,9,4,10,12,17,10,6,11,3,5,5,
                      4,15,5,31,15,3,20,7,7,21,4,8,3,11,34,5,3,13,11,5,10,3])

mod = PooledOLS(y, x)
pooledOLS_res = mod.fit()

y_hat=pooledOLS_res.predict(x_test)
# compact=False: simulation() drops the 'years' level of y_hat.index in place
est=simulation(predictedvalues=y_hat, N=N, sigma=sigma, year=year, voters=voters,StateFIPS=StateFIPS,EV=EV, compact=False)

baseline=np.sum(np.array(est)>269)/N

# The results now go into `influencebaseline`. Previously this frame was created but
# never filled: the loop wrote into (and saved) a re-initialised `influencewithoutnetwork`,
# destroying the result of the no-network simulation cell. The saved file is unchanged,
# because both frames start out identical.
influencebaseline=pd.DataFrame(index=SCI.columns,columns=['influence'])

for i in range(len(StateFIPS)):
    y_hat_camp=y_hat
    est=simulation(predictedvalues=y_hat_camp, N=N, sigma=sigma, year=year, voters=voters,StateFIPS=StateFIPS,EV=EV, compact=True)
    influencebaseline.loc[i,'influence']=np.sum(np.array(est)>269)/N-baseline
influencebaseline.to_csv('msc_thesis/results/year='+str(year)+'_delta='+str(delta)+'_N='+str(N)+'_sigma='+str(sigma)+'_for_model_with_exposure_baseline')

# Visuals of the influence measure

In [None]:
#if the previous part is not performed the values can be loaded in here
# Note that the file names below encode the run settings (year, delta, N, sigma) and
# differ from the settings used in the simulation cells above.

#For democrats
influence = pd.read_csv('msc_thesis/results/year=2020_delta=50000_N=150000_sigma=0.03_for_model_with_exposure',index_col=0)
influencebaseline = pd.read_csv('msc_thesis/results/year=2020_delta=0_N=50000_sigma=0.03_for_model_with_exposure_baseline',index_col=0)
influencewithoutnetwork = pd.read_csv('msc_thesis/results/year=2020_delta=50000_N=50000_sigma=0.03_for_model_with_exposure_without_network_effect',index_col=0)
influencewithoutnetworkstate = pd.read_csv('msc_thesis/results/year=2020_delta=50000_N=150000_sigma=0.03_for_model_with_exposure_without_network_effect',index_col=0).dropna()

StateFIPS=pd.read_excel('msc_data_raw/state-geocodes-v2016.xls')
StateFIPS.columns=StateFIPS.iloc[3]
StateFIPS=StateFIPS.iloc[4:,2:]
StateFIPS.columns= ['State FIPS', 'State']
StateFIPS=StateFIPS.set_index('State')
df_influence=pd.concat([influence,influencebaseline,influencewithoutnetwork],axis=1)
df_influence.columns=['influence','influencebaseline','influencewithoutnetwork']

#for republicans
influencerep = pd.read_csv('msc_thesis/results/year=2020_delta=50000_N=50000_sigma=0.03_for_model_with_exposure_republican',index_col=0)
influencebaselinerep = pd.read_csv('msc_thesis/results/year=2020_delta=0_N=50000_sigma=0.03_for_model_with_exposure_baseline_republican',index_col=0).dropna()
influencewithoutnetworkstaterep = pd.read_csv('msc_thesis/results/year=2020_delta=50000_N=50000_sigma=0.03_for_model_with_exposure_without_network_effect_republican',index_col=0).dropna()

# Average Republican county influence per state.
# State code of every county = FIPS without its last three digits, parsed once.
# (The two extra per-state masks over `data` and `voters` that used to be built here
# were never used in this cell; the aggregation cell below builds its own.)
# NOTE(review): the mask is derived from `influence.index` but applied to `influencerep`;
# this assumes both files list the same counties in the same order — confirm.
county_state_codes=np.array([int(str(j)[:-3]) for j in influence.index])
stateinfluencerep=pd.DataFrame(index=StateFIPS.index,columns=['influence'])
for i in range(len(StateFIPS)):
    state=StateFIPS.iloc[i,0]
    countiesinstate=county_state_codes==int(state)
    stateinfluencerep.iloc[i]=sum(influencerep.loc[countiesinstate,'influence'])/sum(countiesinstate)

# State-level Republican frame: network influence, baseline and no-network influence.
# The last two are attached by position, so the files must be in StateFIPS row order.
df_influencestaterep=stateinfluencerep
df_influencestaterep['temp1']=np.array(influencebaselinerep['influence'])
df_influencestaterep['temp']=np.array(influencewithoutnetworkstaterep['influence'])
df_influencestaterep.columns=['influencerep','influencebaselinerep','influencewithoutnetworkrep']

In [None]:
# Aggregate the county-level results to state level: mean influence, mean baseline
# influence, and Democratic 2020 campaign visits per 10,000 eligible voters.
stateinfluence = pd.DataFrame(index=StateFIPS.index, columns=['influence'])
stateinfluencebaseline = pd.DataFrame(index=StateFIPS.index, columns=['influence'])
visitsstatedem = pd.DataFrame(index=StateFIPS.index, columns=['visits'])

# State code of every county (FIPS without its last three digits) for the three
# county lists involved; each list has its own ordering, hence three arrays.
influence_state_codes = np.array([int(str(j)[:-3]) for j in influence.index])
data_state_codes = np.array([int(str(j)[:-3]) for j in data.index.get_level_values('FIPS').unique()])
voter_state_codes = np.array([int(str(j)[:-3]) for j in voters.index])
data_2020 = data.loc[data.index.get_level_values('years') == 2020]

for i in range(len(StateFIPS)):
    state = StateFIPS.iloc[i, 0]
    state_code = int(state)
    countiesinstate = influence_state_codes == state_code
    countiesinstatefromdata = data_state_codes == state_code
    countiesinstate2 = voter_state_codes == state_code

    n_counties = sum(countiesinstate)
    stateinfluence.iloc[i] = sum(influence.loc[countiesinstate, 'influence']) / n_counties
    stateinfluencebaseline.iloc[i] = sum(influencebaseline.loc[countiesinstate, 'influence']) / n_counties

    visits_per_voter = (
        data_2020.loc[countiesinstatefromdata, 'visits_dem']
        / voters['2020'].loc[countiesinstate2]
    ).dropna()
    visitsstatedem.iloc[i] = sum(visits_per_voter) * 10000

# State-level frame; the no-network influence is attached by position.
df_influencestate = pd.concat([stateinfluence, stateinfluencebaseline], axis=1)
df_influencestate['temp'] = np.array(influencewithoutnetworkstate['influence'])
df_influencestate.columns = ['influence', 'influencebaseline', 'influencewithoutnetwork']

# Add the Republican influence to the county-level frame
df_influence['FIPS'] = df_influence.index
df_influence['influence_rep'] = np.array(influencerep)

# FIPS codes as strings; four-digit codes get their leading zero back
fipslist = [
    '0' + str(round(fips)) if len(str(round(fips))) == 4 else str(round(fips))
    for fips in df_influence.index
]
df_influence.index = fipslist
df_influence['FIPS'] = fipslist

In [None]:
# Ripple effects implied by the SCI and the Democratic model: for the county with the
# lowest and the county with the highest influence, the log10 of
#   params['Exposure'] * delta * SCI[:, county] / eligiblefriends[year - 4]
delta = 50000
year = 2020

# FIPS codes as strings; four-digit codes get their leading zero back
fipslist = ['0' + str(fips) if len(str(fips)) == 4 else str(fips) for fips in SCI.index]
changes = pd.DataFrame(index=SCI.index)
changes['FIPS'] = fipslist

countylow = influence.sort_values('influence').index[0]
countyhigh = influence.sort_values('influence', ascending=False).index[0]

exposure_times_delta = res1.params['Exposure'] * delta
friends_previous_election = eligiblefriends[(year - 4)]
changes['change_low'] = np.log10(exposure_times_delta * SCI.loc[:, str(countylow)] / friends_previous_election)
changes['change_high'] = np.log10(exposure_times_delta * SCI.loc[:, str(countyhigh)] / friends_previous_election)

In [None]:
# Kernel density of every numeric column of df_influence
df_influence.plot(
    kind='kde',
    figsize=(15, 5),
    title='density estimation plot of different influence',
)

In [None]:
# States ranked from highest to lowest mean influence (network model)
states_by_influence = stateinfluence.sort_values('influence', ascending=False)
states_by_influence.plot(
    kind='bar',
    figsize=(15, 5),
    title='influence of states with network',
)

In [None]:
# County-level influence in Georgia (state FIPS 13) and Kentucky (state FIPS 21)
influence_state_code = np.array([int(str(j)[:-3]) for j in influence.index])
georgia = influence_state_code == 13
ky = influence_state_code == 21

topbottomstates = pd.concat(
    [influence.loc[ky], influence.loc[georgia]],
    ignore_index=True,
    axis=1,
)
topbottomstates.columns = ['Kentucky', 'Georgia']
topbottomstates.plot(
    kind='hist',
    figsize=(15, 5),
    use_index=False,
    bins=75,
    title='counties in highest and lowest influence state',
)

In [None]:
# Convergence check: line plot of the stored convergence matrix
converge = pd.read_csv('msc_thesis/results/converance matrix', index_col=0)
converge.plot(
    kind='line',
    ylim=[0.8, 1],
    xlim=[0, 200000],
    figsize=(15, 5),
    title='test for convergance',
)

In [None]:
#(figure 6a in thesis)
# State-level influence (network model) against Democratic campaign visits
# (the 'visits' column of visitsstatedem).
#   blue   : states with at least one visit
#   red    : states without visits among the ten states with the highest influence
#   curve  : kernel density of the influence (secondary y-axis)
#   dashed : mean influence
# Plot coordinates are kept in local names so that the regression inputs `x` and `y`
# are no longer overwritten by this cell.
from scipy import stats

influencetemp=df_influencestate
test=pd.concat([influencetemp,visitsstatedem],axis=1).dropna()
test=test.sort_values('influence')

visited=np.array(test['visits']>0)
notvisited=~visited.astype(bool)
visited_states=test.loc[visited]
# test is sorted ascending, so the last ten rows are the ten highest-influence states
top_unvisited=test.iloc[-10:].loc[notvisited[-10:]]

fig, ax = plt.subplots()
fig.set_size_inches(15,5)
ax.scatter(visited_states['influence'],visited_states['visits'], color ='blue')
ax.scatter(top_unvisited['influence'],top_unvisited['visits'], color ='red')

ax2 = ax.twinx()
kde = stats.gaussian_kde(np.array(test.loc[:,'influence']).astype(np.float64))
xx = np.linspace(min(test['influence']),max(test['influence']), 1000)
ax2.plot(xx, kde(xx))

ax.axvline(sum(test.influence/len(test)), color = 'r', ls = '--')##average influence
ax.set_xlabel('influence', fontsize = 15)
ax.set_ylabel('visits', fontsize = 15)
ax.set_title('Corrected visits and influence')

# State labels; a few are nudged by hand so they do not overlap.
for txt, x_pos, y_pos in zip(visited_states.index, visited_states['influence'].tolist(), visited_states['visits'].tolist()):
    if txt =='Tennessee':
        ax.annotate(txt, (x_pos-0.001,y_pos+0.02))
    else:
        ax.annotate(txt, (x_pos,y_pos+0.02))

for txt, x_pos, y_pos in zip(top_unvisited.index, top_unvisited['influence'].tolist(), top_unvisited['visits'].tolist()):
    if txt =='Virginia':
        ax.annotate(txt, (x_pos-0.0008,y_pos+0.02))
    elif txt=='Alaska':
        ax.annotate(txt, (x_pos+0.0001,y_pos-0.02))
    else:
        ax.annotate(txt, (x_pos-0.0003,y_pos+0.02))

plt.show()

In [None]:
#(figure 6a in thesis)
# NOTE(review): the previous cell carries the same figure label; presumably this one is
# the companion panel — TODO confirm the figure number.
# State-level influence WITHOUT network effects against Democratic campaign visits.
#   blue   : states with at least one visit
#   red    : states without visits among the ten states with the highest influence
#   curve  : kernel density of the influence (secondary y-axis)
#   dashed : mean influence
# Plot coordinates are kept in local names so that the regression inputs `x` and `y`
# are no longer overwritten by this cell.
from scipy import stats  # imported here too, so the cell does not depend on the previous one for it

test=test.sort_values('influencewithoutnetwork')
visited=np.array(test['visits']>0)
notvisited=~visited.astype(bool)
visited_states=test.loc[visited]
# test is sorted ascending, so the last ten rows are the ten highest-influence states
top_unvisited=test.iloc[-10:].loc[notvisited[-10:]]

fig, ax = plt.subplots()
fig.set_size_inches(15,5)
ax.scatter(visited_states['influencewithoutnetwork'],visited_states['visits'], color ='blue')
ax.axvline(sum(test['influencewithoutnetwork']/len(test)), color = 'r', ls = '--')##average influence
ax.set_xlabel('influence', fontsize = 15)
ax.set_ylabel('visits', fontsize = 15)
ax.scatter(top_unvisited['influencewithoutnetwork'],top_unvisited['visits'], color ='red')

ax2 = ax.twinx()
kde = stats.gaussian_kde(np.array(test.loc[:,'influencewithoutnetwork']).astype(np.float64))
xx = np.linspace(min(test.loc[:,'influencewithoutnetwork']),max(test.loc[:,'influencewithoutnetwork']), 1000)
ax2.plot(xx, kde(xx))

ax.set_title('Corrected visits and influence without network')

# State labels; most are rotated, a few are nudged by hand so they do not overlap.
for txt, x_pos, y_pos in zip(visited_states.index, visited_states['influencewithoutnetwork'].tolist(), visited_states['visits'].tolist()):
    if txt !='Michigan':
        ax.annotate(txt, (x_pos,y_pos+0.03), rotation=90)
    else:
        ax.annotate(txt, (x_pos+0.0002,y_pos))

for txt, x_pos, y_pos in zip(top_unvisited.index, top_unvisited['influencewithoutnetwork'].tolist(), top_unvisited['visits'].tolist()):
    if txt in ('New Hampshire','Alaska'):
        ax.annotate(txt, (x_pos,y_pos+0.03), rotation=90)
    elif txt=='Maine':
        ax.annotate(txt, (x_pos-0.0007,y_pos-0.04))
    else:
        ax.annotate(txt, (x_pos+0.0001,y_pos-0.04))

plt.show()

In [None]:
# TODO: for this part, make sure both the dem and rep influence are in the df

In [None]:
# County map of the Democratic influence measure (figure 3 in thesis)
fig = px.choropleth(
    df_influence,
    geojson=counties,
    locations='FIPS',
    color='influence',
    range_color=(0, 0.02),
    scope="usa",
    color_continuous_scale='Blues',
)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

In [None]:
# County map of the Republican influence measure (figure 4 in thesis)
fig = px.choropleth(
    df_influence,
    geojson=counties,
    locations='FIPS',
    color='influence_rep',
    range_color=(0, 0.025),
    scope="usa",
    color_continuous_scale='OrRd',
)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

In [None]:
# County map of the ripple effect (log10 scale, column 'change_high') of the
# highest-influence county (figure 5 in thesis)
fig = px.choropleth(
    changes,
    geojson=counties,
    locations='FIPS',
    color='change_high',
    range_color=(-4, 0),
    scope="usa",
    color_continuous_scale='OrRd',
    title='influence of '+str(countyhigh),
)
fig.update_layout(margin=dict(r=0, t=35, l=0, b=0))
fig.show()