In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt

In [2]:
data_covid = pd.read_csv('covid_data.csv')
data_land = pd.read_excel('LND01_land_area_columnH.xls')
data_county = pd.read_csv('us_county.csv')

In [3]:
data_covid.head()

Unnamed: 0,fips,county,state,lat,long,date,cases,state_code,deaths
0,1001.0,Autauga,Alabama,32.539527,-86.644082,2021-02-14,6023,AL,84
1,1003.0,Baldwin,Alabama,30.72775,-87.722071,2021-02-14,19105,AL,252
2,1005.0,Barbour,Alabama,31.868263,-85.387129,2021-02-14,2042,AL,48
3,1007.0,Bibb,Alabama,32.996421,-87.125115,2021-02-14,2395,AL,57
4,1009.0,Blount,Alabama,33.982109,-86.567906,2021-02-14,5961,AL,121


In [4]:
data_land.head()

Unnamed: 0,Areaname,STCOU,LND010200D
0,UNITED STATES,0,3794083.06
1,ALABAMA,1000,52419.02
2,"Autauga, AL",1001,604.45
3,"Baldwin, AL",1003,2026.93
4,"Barbour, AL",1005,904.52


In [5]:
data_county.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long
0,1001,Autauga County,Alabama,AL,26874,28326,37.8,55200,51.315217,32.534923,-86.64273
1,1003,Baldwin County,Alabama,AL,101188,106919,42.8,208107,51.376936,30.727479,-87.722564
2,1005,Barbour County,Alabama,AL,13697,12085,39.9,25782,46.873788,31.869581,-85.39321
3,1007,Bibb County,Alabama,AL,12152,10375,39.9,22527,46.055844,32.998628,-87.126475
4,1009,Blount County,Alabama,AL,28434,29211,40.8,57645,50.673953,33.980869,-86.56738


In [6]:
print(data_covid.isna().sum())
print("\n")
print(data_land.isna().sum())
print("\n")
print(data_county.isna().sum())
print(data_county['population'].isna().sum())

fips          10
county         6
state          0
lat            0
long           0
date           0
cases          0
state_code    89
deaths         0
dtype: int64


Areaname      0
STCOU         0
LND010200D    0
dtype: int64


fips                  0
county                0
state                 0
state_code           79
male                  0
female                0
median_age            0
population            0
female_percentage     0
lat                   0
long                  0
dtype: int64
0


In [7]:
data_covid = data_covid.dropna()
data_county = data_county.dropna()
print(data_covid.isna().sum())
print(data_county.isna().sum())

fips          0
county        0
state         0
lat           0
long          0
date          0
cases         0
state_code    0
deaths        0
dtype: int64
fips                 0
county               0
state                0
state_code           0
male                 0
female               0
median_age           0
population           0
female_percentage    0
lat                  0
long                 0
dtype: int64


In [8]:
print(len(data_covid))
print(len(data_land))
print(len(data_county))

3241
3198
3141


In [9]:
df = pd.merge(data_county, data_land, left_on='fips', right_on='STCOU')
df.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long,Areaname,STCOU,LND010200D
0,1001,Autauga County,Alabama,AL,26874,28326,37.8,55200,51.315217,32.534923,-86.64273,"Autauga, AL",1001,604.45
1,1003,Baldwin County,Alabama,AL,101188,106919,42.8,208107,51.376936,30.727479,-87.722564,"Baldwin, AL",1003,2026.93
2,1005,Barbour County,Alabama,AL,13697,12085,39.9,25782,46.873788,31.869581,-85.39321,"Barbour, AL",1005,904.52
3,1007,Bibb County,Alabama,AL,12152,10375,39.9,22527,46.055844,32.998628,-87.126475,"Bibb, AL",1007,626.16
4,1009,Blount County,Alabama,AL,28434,29211,40.8,57645,50.673953,33.980869,-86.56738,"Blount, AL",1009,650.6


In [10]:
df['lat'] = df['lat'].round(decimals=1)
df['long'] = df['long'].round(decimals=1)
data_covid['lat'] = data_covid['lat'].round(decimals=1)
data_covid['long'] = data_covid['long'].round(decimals=1)
df.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long,Areaname,STCOU,LND010200D
0,1001,Autauga County,Alabama,AL,26874,28326,37.8,55200,51.315217,32.5,-86.6,"Autauga, AL",1001,604.45
1,1003,Baldwin County,Alabama,AL,101188,106919,42.8,208107,51.376936,30.7,-87.7,"Baldwin, AL",1003,2026.93
2,1005,Barbour County,Alabama,AL,13697,12085,39.9,25782,46.873788,31.9,-85.4,"Barbour, AL",1005,904.52
3,1007,Bibb County,Alabama,AL,12152,10375,39.9,22527,46.055844,33.0,-87.1,"Bibb, AL",1007,626.16
4,1009,Blount County,Alabama,AL,28434,29211,40.8,57645,50.673953,34.0,-86.6,"Blount, AL",1009,650.6


In [11]:
df = pd.merge(df, data_covid.drop(['fips', 'county', 'state', 'state_code'], axis=1), on=['lat','long'])
df = df.drop(['STCOU'], axis=1)
df.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long,Areaname,LND010200D,date,cases,deaths
0,1001,Autauga County,Alabama,AL,26874,28326,37.8,55200,51.315217,32.5,-86.6,"Autauga, AL",604.45,2021-02-14,6023,84
1,1003,Baldwin County,Alabama,AL,101188,106919,42.8,208107,51.376936,30.7,-87.7,"Baldwin, AL",2026.93,2021-02-14,19105,252
2,1005,Barbour County,Alabama,AL,13697,12085,39.9,25782,46.873788,31.9,-85.4,"Barbour, AL",904.52,2021-02-14,2042,48
3,1007,Bibb County,Alabama,AL,12152,10375,39.9,22527,46.055844,33.0,-87.1,"Bibb, AL",626.16,2021-02-14,2395,57
4,1009,Blount County,Alabama,AL,28434,29211,40.8,57645,50.673953,34.0,-86.6,"Blount, AL",650.6,2021-02-14,5961,121


In [12]:
print(len(df))
print(df.isna().sum())

3021
fips                 0
county               0
state                0
state_code           0
male                 0
female               0
median_age           0
population           0
female_percentage    0
lat                  0
long                 0
Areaname             0
LND010200D           0
date                 0
cases                0
deaths               0
dtype: int64


In [13]:
df = df[df['LND010200D'] != 0]
print(len(df))

3019


In [14]:
df = df[df['population'] < 1000000]
print(len(df))

2976


In [15]:
df.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long,Areaname,LND010200D,date,cases,deaths
0,1001,Autauga County,Alabama,AL,26874,28326,37.8,55200,51.315217,32.5,-86.6,"Autauga, AL",604.45,2021-02-14,6023,84
1,1003,Baldwin County,Alabama,AL,101188,106919,42.8,208107,51.376936,30.7,-87.7,"Baldwin, AL",2026.93,2021-02-14,19105,252
2,1005,Barbour County,Alabama,AL,13697,12085,39.9,25782,46.873788,31.9,-85.4,"Barbour, AL",904.52,2021-02-14,2042,48
3,1007,Bibb County,Alabama,AL,12152,10375,39.9,22527,46.055844,33.0,-87.1,"Bibb, AL",626.16,2021-02-14,2395,57
4,1009,Blount County,Alabama,AL,28434,29211,40.8,57645,50.673953,34.0,-86.6,"Blount, AL",650.6,2021-02-14,5961,121


In [16]:
df = df[df['cases'] != 0]
print(len(df))

2950


In [17]:
df['cases'] = np.log(df['cases'])
df['male'] = np.log(df['male'])
df['female'] = np.log(df['female'])
df['population'] = np.log(df['population'])
df.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long,Areaname,LND010200D,date,cases,deaths
0,1001,Autauga County,Alabama,AL,10.198915,10.251535,37.8,10.918718,51.315217,32.5,-86.6,"Autauga, AL",604.45,2021-02-14,8.703341,84
1,1003,Baldwin County,Alabama,AL,11.524735,11.579827,42.8,12.245808,51.376936,30.7,-87.7,"Baldwin, AL",2026.93,2021-02-14,9.857705,252
2,1005,Barbour County,Alabama,AL,9.524932,9.39972,39.9,10.157432,46.873788,31.9,-85.4,"Barbour, AL",904.52,2021-02-14,7.621685,48
3,1007,Bibb County,Alabama,AL,9.405249,9.247154,39.9,10.02247,46.055844,33.0,-87.1,"Bibb, AL",626.16,2021-02-14,7.781139,57
4,1009,Blount County,Alabama,AL,10.255341,10.282301,40.8,10.962059,50.673953,34.0,-86.6,"Blount, AL",650.6,2021-02-14,8.692994,121


In [18]:
df['log_pop_density'] = df['population']/np.log(df['LND010200D'])
df.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,long,Areaname,LND010200D,date,cases,deaths,log_pop_density
0,1001,Autauga County,Alabama,AL,10.198915,10.251535,37.8,10.918718,51.315217,32.5,-86.6,"Autauga, AL",604.45,2021-02-14,8.703341,84,1.704899
1,1003,Baldwin County,Alabama,AL,11.524735,11.579827,42.8,12.245808,51.376936,30.7,-87.7,"Baldwin, AL",2026.93,2021-02-14,9.857705,252,1.608269
2,1005,Barbour County,Alabama,AL,9.524932,9.39972,39.9,10.157432,46.873788,31.9,-85.4,"Barbour, AL",904.52,2021-02-14,7.621685,48,1.492115
3,1007,Bibb County,Alabama,AL,9.405249,9.247154,39.9,10.02247,46.055844,33.0,-87.1,"Bibb, AL",626.16,2021-02-14,7.781139,57,1.556379
4,1009,Blount County,Alabama,AL,10.255341,10.282301,40.8,10.962059,50.673953,34.0,-86.6,"Blount, AL",650.6,2021-02-14,8.692994,121,1.692225


In [19]:
data_hosp = pd.read_csv('Hospital_Beds_per_County_and_per_capita.csv')
data_hosp = data_hosp.drop(['FID','GEOID','NAME','Pop18','Co','UnwelPct','pct65pls','Staffed','Beds','F_Count','F_Staffed','F_Beds','F_ICUBeds',
                            'F_BedsPC','F_ICUPC','F_StaffPC','ICUPC','SHAPE_Length','SHAPE_Area'], axis=1)
data_hosp.head()

Unnamed: 0,St,CoSt,ICUBeds,BedsPC,StaffPC,FoodInsc
0,AL,Franklin County,7,254.752,430.324324,13.0
1,GA,Fannin County,5,532.879999,532.879999,11.4
2,IA,Kossuth County,0,608.039998,633.374997,9.8
3,OK,Ottawa County,9,271.752137,338.24468,17.2
4,PA,Susquehanna County,4,846.299998,846.299998,11.4


In [20]:
df2 = pd.merge(df, data_hosp, left_on=['county', 'state_code'], right_on=['CoSt', 'St'])
df2 = df2.drop(columns=['CoSt', 'St'])
df2.head()


Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,...,Areaname,LND010200D,date,cases,deaths,log_pop_density,ICUBeds,BedsPC,StaffPC,FoodInsc
0,1001,Autauga County,Alabama,AL,10.198915,10.251535,37.8,10.918718,51.315217,32.5,...,"Autauga, AL",604.45,2021-02-14,8.703341,84,1.704899,6,669.447058,1034.599998,13.4
1,1003,Baldwin County,Alabama,AL,11.524735,11.579827,42.8,12.245808,51.376936,30.7,...,"Baldwin, AL",2026.93,2021-02-14,9.857705,252,1.608269,44,623.985465,725.172297,12.3
2,1005,Barbour County,Alabama,AL,9.524932,9.39972,39.9,10.157432,46.873788,31.9,...,"Barbour, AL",904.52,2021-02-14,7.621685,48,1.492115,5,359.256756,886.166664,23.2
3,1007,Bibb County,Alabama,AL,9.405249,9.247154,39.9,10.02247,46.055844,33.0,...,"Bibb, AL",626.16,2021-02-14,7.781139,57,1.556379,0,657.22857,920.119996,15.8
4,1009,Blount County,Alabama,AL,10.255341,10.282301,40.8,10.962059,50.673953,34.0,...,"Blount, AL",650.6,2021-02-14,8.692994,121,1.692225,6,1449.274996,2318.839991,11.0


In [21]:
print(len(df2))
print(df2.isna().sum())

2914
fips                 0
county               0
state                0
state_code           0
male                 0
female               0
median_age           0
population           0
female_percentage    0
lat                  0
long                 0
Areaname             0
LND010200D           0
date                 0
cases                0
deaths               0
log_pop_density      0
ICUBeds              0
BedsPC               0
StaffPC              0
FoodInsc             0
dtype: int64


In [22]:
df2['BedsPC'] = np.log(df2['BedsPC'] + 1)
df2['StaffPC'] = np.log(df2['StaffPC'] + 1)
df2['ICUBeds'] = df2['ICUBeds']**0.25
df2.head()

Unnamed: 0,fips,county,state,state_code,male,female,median_age,population,female_percentage,lat,...,Areaname,LND010200D,date,cases,deaths,log_pop_density,ICUBeds,BedsPC,StaffPC,FoodInsc
0,1001,Autauga County,Alabama,AL,10.198915,10.251535,37.8,10.918718,51.315217,32.5,...,"Autauga, AL",604.45,2021-02-14,8.703341,84,1.704899,1.565085,6.507945,6.942736,13.4
1,1003,Baldwin County,Alabama,AL,11.524735,11.579827,42.8,12.245808,51.376936,30.7,...,"Baldwin, AL",2026.93,2021-02-14,9.857705,252,1.608269,2.57551,6.437728,6.587787,12.3
2,1005,Barbour County,Alabama,AL,9.524932,9.39972,39.9,10.157432,46.873788,31.9,...,"Barbour, AL",904.52,2021-02-14,7.621685,48,1.492115,1.495349,5.886817,6.788033,23.2
3,1007,Bibb County,Alabama,AL,9.405249,9.247154,39.9,10.02247,46.055844,33.0,...,"Bibb, AL",626.16,2021-02-14,7.781139,57,1.556379,0.0,6.489552,6.82559,15.8
4,1009,Blount County,Alabama,AL,10.255341,10.282301,40.8,10.962059,50.673953,34.0,...,"Blount, AL",650.6,2021-02-14,8.692994,121,1.692225,1.565085,7.279508,7.749253,11.0


In [23]:
print(np.mean(df2['BedsPC']))
print(np.std(df2['BedsPC']))
print(np.mean(df2['StaffPC']))
print(np.std(df2['StaffPC']))
print(np.mean(df2['ICUBeds']))
print(np.std(df2['ICUBeds']))
print(np.mean(df2['FoodInsc']))
print(np.std(df2['FoodInsc']))

4.767908907455111
2.575672809729914
4.905458181441756
2.6513266417634846
0.9269694252531202
1.1147376850802941
13.658819492107089
4.217859141494941


In [24]:
df2 = df2.drop(['fips', 'state_code', 'Areaname', 'date', 'deaths'], axis=1)
df = df2
df = df.drop(['county', 'state'], axis=1)
df.head()

Unnamed: 0,male,female,median_age,population,female_percentage,lat,long,LND010200D,cases,log_pop_density,ICUBeds,BedsPC,StaffPC,FoodInsc
0,10.198915,10.251535,37.8,10.918718,51.315217,32.5,-86.6,604.45,8.703341,1.704899,1.565085,6.507945,6.942736,13.4
1,11.524735,11.579827,42.8,12.245808,51.376936,30.7,-87.7,2026.93,9.857705,1.608269,2.57551,6.437728,6.587787,12.3
2,9.524932,9.39972,39.9,10.157432,46.873788,31.9,-85.4,904.52,7.621685,1.492115,1.495349,5.886817,6.788033,23.2
3,9.405249,9.247154,39.9,10.02247,46.055844,33.0,-87.1,626.16,7.781139,1.556379,0.0,6.489552,6.82559,15.8
4,10.255341,10.282301,40.8,10.962059,50.673953,34.0,-86.6,650.6,8.692994,1.692225,1.565085,7.279508,7.749253,11.0


In [25]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ['cases']), df['cases'], test_size=0.2, random_state=10302000)
print(len(X_test))

583


In [26]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns)

In [27]:
model = LinearRegression(fit_intercept = True)
model.fit(X_train, y_train) 

# The following gives the R-square score
print(model.score(X_train, y_train))

# This is the coefficient Beta_1, ..., Beta_7
print(model.coef_)

# This is the coefficient Beta_0
model.intercept_

0.9387645509725394
[-1.47062189e+01  2.43176806e+00 -1.57122692e-01  1.37131581e+01
 -5.92711274e-01 -6.12225906e-02  2.73769406e-02 -1.69873473e-02
 -2.25637002e-02  9.00892353e-03  1.77351777e-02  2.95014428e-02
 -3.79435112e-02]


7.711183807698767

In [28]:
test_output = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
mean_absolute_error = abs(test_output['pred_Y'] - test_output['cases']).mean()
print(mean_absolute_error)
print(np.mean(df['cases']))

1.6492510848473663
7.718717234648996


In [29]:
test_output.head()

Unnamed: 0,pred_Y,cases
3,8.205648,7.781139
9,9.65425,7.463937
14,7.99282,8.546558
20,6.150083,9.082166
28,10.514292,8.280711


In [30]:
model_lasso = Lasso(alpha=0.2)
model_lasso.fit(X_train, y_train)
print(model_lasso.score(X_train,y_train))
print(model_lasso.coef_)
print(model_lasso.intercept_)

0.9079930302206338
[ 1.18539608  0.         -0.01261634  0.          0.         -0.
  0.         -0.          0.          0.          0.          0.
  0.        ]
7.71118380769877


In [31]:
test_output = pd.DataFrame(model_lasso.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
mean_absolute_error = abs(test_output['pred_Y'] - test_output['cases']).mean()
print(mean_absolute_error)
print(np.mean(df['cases']))
print(model_lasso.score(X_test, y_test))

1.538862626162342
7.718717234648996
0.9018979323678886


PREDICTED CASES vs ACTUAL CASES

In [32]:
test_output.head()

Unnamed: 0,pred_Y,cases
3,8.037726,7.781139
9,9.375906,7.463937
14,8.087657,8.546558
20,6.099235,9.082166
28,10.103368,8.280711


In [33]:
presidents = pd.read_csv('presidents.csv')
#presidents = presidents.sort_values(by=['state', 'county'])
iter = 0
county = presidents['county'].loc[presidents.index[0]]
state = presidents['state'].loc[presidents.index[0]]
tempdf = {
    'state': [],
    'county': [],
    'DEM%': []
}
totalvotescounty = 1
demvotescounty = 0

for x in presidents['total_votes']:
    if county == presidents['county'].loc[presidents.index[iter]]:
        totalvotescounty += x
        if presidents['party'].loc[presidents.index[iter]] == 'DEM':
            demvotescounty = x
    else:
        demperc =  float(demvotescounty) / float(totalvotescounty)
        tempdf['state'].append(state)
        tempdf['county'].append(county)
        tempdf['DEM%'].append(demperc)
        county = presidents['county'].loc[presidents.index[iter]]
        state = presidents['state'].loc[presidents.index[iter]]
        totalvotescounty = x + 1
        if presidents['party'].loc[presidents.index[iter]] == 'DEM':
            demvotescounty = x
        else:
            demvotescounty = 0
    iter += 1

tempdf = pd.DataFrame(tempdf)
tempdf.head(10)


Unnamed: 0,state,county,DEM%
0,Delaware,Kent County,0.511939
1,Delaware,New Castle County,0.678063
2,Delaware,Sussex County,0.438196
3,District of Columbia,District of Columbia,0.936639
4,District of Columbia,Ward 2,0.884314
5,District of Columbia,Ward 3,0.89069
6,District of Columbia,Ward 4,0.938009
7,District of Columbia,Ward 5,0.939533
8,District of Columbia,Ward 6,0.901461
9,District of Columbia,Ward 7,0.947818


First importing new dataframe 'presidents.csv'

Then Process it so that each county only takes one line and includes the total percent of votes Joe Biden and the Democrat party received in the 2020 election

In [34]:
print(len(df2))
dftemp = df2[df2['state'] == 'Massachusetts']
dftemp = dftemp.sort_values(by='county')
dftemp.head(10)

2914


Unnamed: 0,county,state,male,female,median_age,population,female_percentage,lat,long,LND010200D,cases,log_pop_density,ICUBeds,BedsPC,StaffPC,FoodInsc
1112,Barnstable County,Massachusetts,11.534452,11.621905,52.9,12.272282,52.184941,41.7,-70.3,1305.62,9.169518,1.710558,2.140695,6.318078,6.350483,8.2
1113,Berkshire County,Massachusetts,11.028466,11.093235,46.7,11.754522,51.618654,42.4,-73.2,946.27,8.46168,1.715356,2.213364,5.993645,6.269436,10.1
1114,Bristol County,Massachusetts,12.508486,12.571691,41.0,13.233735,51.579607,41.8,-71.1,691.19,10.894292,2.023997,3.130169,6.464891,6.545847,10.3
1115,Franklin County,Massachusetts,10.451638,10.500509,46.1,11.169519,51.221541,42.6,-72.6,724.74,7.554335,1.695997,1.565085,6.686043,6.686043,9.0
1116,Hampden County,Massachusetts,12.330743,12.399009,38.9,13.058605,51.705975,42.1,-72.6,634.12,10.590415,2.023888,3.018349,5.766073,5.827264,9.8
1117,Hampshire County,Massachusetts,11.230907,11.358993,36.3,11.990147,53.197774,42.3,-72.7,545.44,8.817446,1.902717,1.82116,7.076286,7.247111,9.9
1118,Norfolk County,Massachusetts,12.722894,12.801913,40.9,13.456331,51.974439,42.2,-71.2,443.94,10.678238,2.207516,2.771488,6.317614,6.433537,7.8
1119,Suffolk County,Massachusetts,12.855346,12.921314,32.7,13.582021,51.648593,42.3,-71.1,120.19,11.231027,2.836043,4.613437,4.950425,4.968782,14.2
1120,Worcester County,Massachusetts,12.912713,12.940472,40.1,13.619836,50.693924,42.4,-71.9,1579.02,11.050572,1.849376,3.302834,5.977944,6.092464,8.8


In [35]:
print(len(tempdf))
dftemp = tempdf[tempdf['state'] == 'Massachusetts']
dftemp = dftemp.sort_values(by='county')
dftemp.head(10)

4632


Unnamed: 0,state,county,DEM%
1650,Massachusetts,Abington,0.539178
1567,Massachusetts,Acton,0.797315
1437,Massachusetts,Acushnet,0.438952
1405,Massachusetts,Adams,0.651508
1524,Massachusetts,Agawam,0.481527
1406,Massachusetts,Alford,0.831288
1464,Massachusetts,Amesbury,0.652751
1547,Massachusetts,Amherst,0.902796
1465,Massachusetts,Andover,0.666989
1457,Massachusetts,Aquinnah,0.873596


In [36]:
df = pd.merge(df2, tempdf, left_on=['county', 'state'], right_on=['county', 'state'])
print(len(df))
df.head(10)

2851


Unnamed: 0,county,state,male,female,median_age,population,female_percentage,lat,long,LND010200D,cases,log_pop_density,ICUBeds,BedsPC,StaffPC,FoodInsc,DEM%
0,Autauga County,Alabama,10.198915,10.251535,37.8,10.918718,51.315217,32.5,-86.6,604.45,8.703341,1.704899,1.565085,6.507945,6.942736,13.4,0.270174
1,Baldwin County,Alabama,11.524735,11.579827,42.8,12.245808,51.376936,30.7,-87.7,2026.93,9.857705,1.608269,2.57551,6.437728,6.587787,12.3,0.224088
2,Barbour County,Alabama,9.524932,9.39972,39.9,10.157432,46.873788,31.9,-85.4,904.52,7.621685,1.492115,1.495349,5.886817,6.788033,23.2,0.457838
3,Bibb County,Alabama,9.405249,9.247154,39.9,10.02247,46.055844,33.0,-87.1,626.16,7.781139,1.556379,0.0,6.489552,6.82559,15.8,0.206961
4,Blount County,Alabama,10.255341,10.282301,40.8,10.962059,50.673953,34.0,-86.6,650.6,8.692994,1.692225,1.565085,7.279508,7.749253,11.0,0.09569
5,Bullock County,Alabama,8.641709,8.452975,39.6,9.244935,45.295595,32.1,-85.7,626.06,7.040536,1.435672,0.0,5.169925,5.876708,26.0,0.746857
6,Butler County,Alabama,9.145055,9.273972,40.7,9.904737,53.218477,31.8,-86.7,777.92,7.550661,1.487952,1.626577,5.652392,6.143503,21.7,0.417852
7,Calhoun County,Alabama,10.920799,10.998477,39.7,11.653539,51.940955,33.8,-85.8,612.32,9.449357,1.815969,2.213364,5.485514,5.647652,16.7,0.298447
8,Chambers County,Alabama,9.696279,9.773891,43.0,10.428985,51.939337,32.9,-85.4,603.11,8.108021,1.628994,0.0,5.702942,0.0,19.5,0.416421
9,Cherokee County,Alabama,9.458762,9.47524,45.9,10.160182,50.411944,34.2,-85.6,599.95,7.463937,1.588311,0.0,6.061379,6.348478,12.5,0.132011


In [37]:
print(df.isna().sum())
print(len(df))

county               0
state                0
male                 0
female               0
median_age           0
population           0
female_percentage    0
lat                  0
long                 0
LND010200D           0
cases                0
log_pop_density      0
ICUBeds              0
BedsPC               0
StaffPC              0
FoodInsc             0
DEM%                 0
dtype: int64
2851


In [38]:
df.head()

Unnamed: 0,county,state,male,female,median_age,population,female_percentage,lat,long,LND010200D,cases,log_pop_density,ICUBeds,BedsPC,StaffPC,FoodInsc,DEM%
0,Autauga County,Alabama,10.198915,10.251535,37.8,10.918718,51.315217,32.5,-86.6,604.45,8.703341,1.704899,1.565085,6.507945,6.942736,13.4,0.270174
1,Baldwin County,Alabama,11.524735,11.579827,42.8,12.245808,51.376936,30.7,-87.7,2026.93,9.857705,1.608269,2.57551,6.437728,6.587787,12.3,0.224088
2,Barbour County,Alabama,9.524932,9.39972,39.9,10.157432,46.873788,31.9,-85.4,904.52,7.621685,1.492115,1.495349,5.886817,6.788033,23.2,0.457838
3,Bibb County,Alabama,9.405249,9.247154,39.9,10.02247,46.055844,33.0,-87.1,626.16,7.781139,1.556379,0.0,6.489552,6.82559,15.8,0.206961
4,Blount County,Alabama,10.255341,10.282301,40.8,10.962059,50.673953,34.0,-86.6,650.6,8.692994,1.692225,1.565085,7.279508,7.749253,11.0,0.09569


In [39]:
print('lat Mean and Standard Dev:')
print(np.mean(df['lat']))
print(np.std(df['lat']))

print('long Mean and Standard Dev:')
print(np.mean(df['long']))
print(np.std(df['long']))

print('Cases Mean and Standard Dev:')
print(np.mean(df['cases']))
print(np.std(df['cases']))

print('male Mean and Standard Dev:')
print(np.mean(df['male']))
print(np.std(df['male']))

print('female Mean and Standard Dev:')
print(np.mean(df['female']))
print(np.std(df['female']))

print('median_age Mean and Standard Dev:')
print(np.mean(df['median_age']))
print(np.std(df['median_age']))

print('population Mean and Standard Dev:')
print(np.mean(df['population']))
print(np.std(df['population']))

print('female_percentage Mean and Standard Dev:')
print(np.mean(df['female_percentage']))
print(np.std(df['female_percentage']))

lat Mean and Standard Dev:
38.22528937215003
4.805573266708192
long Mean and Standard Dev:
-92.00708523325174
11.10310937705081
Cases Mean and Standard Dev:
7.712253064519018
1.4274106742482227
male Mean and Standard Dev:
9.496162473729262
1.3807787100408004
female Mean and Standard Dev:
9.492908081011894
1.4029060787889494
median_age Mean and Standard Dev:
41.3431778323395
5.366418530636826
population Mean and Standard Dev:
10.188840129008602
1.3903938386276464
female_percentage Mean and Standard Dev:
49.924549688848046
2.3730783058636113


In [40]:
print('Landarea Mean and Standard Dev:')
print(np.mean(df['LND010200D']))
print(np.std(df['LND010200D']))

print('log_pop_density Mean and Standard Dev:')
print(np.mean(df['log_pop_density']))
print(np.std(df['log_pop_density']))

print('BedsPC Mean and Standard Dev:')
print(np.mean(df['BedsPC']))
print(np.std(df['BedsPC']))

print('StaffPC Mean and Standard Dev:')
print(np.mean(df['StaffPC']))
print(np.std(df['StaffPC']))

print('ICUBeds Mean and Standard Dev:')
print(np.mean(df['ICUBeds']))
print(np.std(df['ICUBeds']))

print('FoodInsc Mean and Standard Dev:')
print(np.mean(df['FoodInsc']))
print(np.std(df['FoodInsc']))

print('DEM% Mean and Standard Dev:')
print(np.mean(df['DEM%']))
print(np.std(df['DEM%']))

Landarea Mean and Standard Dev:
964.1473553139263
1235.2538381807128
log_pop_density Mean and Standard Dev:
1.6148895386401845
0.5252294848538431
BedsPC Mean and Standard Dev:
4.747893594721119
2.5849706586592007
StaffPC Mean and Standard Dev:
4.884188140551053
2.6604991836950433
ICUBeds Mean and Standard Dev:
0.9105844669798194
1.1078976158692566
FoodInsc Mean and Standard Dev:
13.700596282006325
4.237527926457397
DEM% Mean and Standard Dev:
0.3217797821340124
0.1514758133021496


In [41]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ['cases', 'county', 'state']), df['cases'], test_size=0.2, random_state=10302000)
#Drop county and state
print(len(X_train))
print(len(X_test))

2280
571


In [42]:
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)

In [43]:
model = LinearRegression(fit_intercept = True)
model.fit(X_train, y_train) 

# The following gives the R-square score
print(model.score(X_train, y_train))

# This is the coefficient Beta_0
print(model.intercept_)

test_output0 = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output0 = test_output0.merge(y_test, left_index = True, right_index = True)

#mean absolute error
mean_absolute_error = abs(test_output0['pred_Y'] - test_output0['cases']).mean()
print(mean_absolute_error / np.mean(test_output0['cases']))

0.9490329626569416
7.701682154621525
0.03287702088684975


In [44]:
model = Lasso(alpha=0.2)
model.fit(X_train, y_train)

# The following gives the R-square score
print(model.score(X_train, y_train))

# This is the coefficient Beta_0
print(model.intercept_)

test_output1 = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output1 = test_output1.merge(y_test, left_index = True, right_index = True)

#mean absolute error
mean_absolute_error = abs(test_output1['pred_Y'] - test_output1['cases']).mean()
print(mean_absolute_error / np.mean(test_output1['cases']))

0.9164791422350689
7.7016821546215235
0.04090367735757503


In [45]:
from sklearn.ensemble import BaggingRegressor
model = BaggingRegressor(random_state=10302000, max_samples = 10)
model = model.fit(X_train, y_train)


# The following gives the R-square score
print(model.score(X_train, y_train))

test_output2 = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output2 = test_output2.merge(y_test, left_index = True, right_index = True)

#mean absolute error
mean_absolute_error = abs(test_output2['pred_Y'] - test_output2['cases']).mean()
print(mean_absolute_error / np.mean(test_output2['cases']))

0.7822022376403249
0.061035676615690815


In [46]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=10302000, min_samples_leaf = 3, max_features = "sqrt")
model = model.fit(X_train, y_train) 

# The following gives the R-square score
print(model.score(X_train, y_train))

test_output3 = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output3 = test_output3.merge(y_test, left_index = True, right_index = True)

#mean absolute error
mean_absolute_error = abs(test_output3['pred_Y'] - test_output3['cases']).mean()
print(mean_absolute_error)
print(mean_absolute_error / np.mean(test_output3['cases']))

0.9818362160801409
0.23001998248535685
0.029662917058485793


In [47]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(random_state=10302000, min_samples_split = 5, min_samples_leaf = 3, max_depth = 4)
model = model.fit(X_train, y_train) 

# The following gives the R-square score
print(model.score(X_train, y_train))

test_output4 = pd.DataFrame(model.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output4 = test_output4.merge(y_test, left_index = True, right_index = True)

#mean absolute error
mean_absolute_error = abs(test_output4['pred_Y'] - test_output4['cases']).mean()
print(mean_absolute_error)
print(mean_absolute_error / np.mean(test_output4['cases']))

0.9854427424605477
0.20308879092255583
0.02618992443853194


In [48]:
AnalysisDF = test_output0[['cases', 'pred_Y']]
AnalysisDF = AnalysisDF.drop(columns=['pred_Y'])
AnalysisDF['Pred_Cases_Linear'] = test_output0['pred_Y']
AnalysisDF['Pred_Cases_Lasso'] = test_output1['pred_Y']
AnalysisDF['Pred_Cases_Bagging'] = test_output2['pred_Y']
AnalysisDF['Pred_Cases_RandomForest'] = test_output3['pred_Y']
AnalysisDF['Pred_Cases_Gradient'] = test_output4['pred_Y']
AnalysisDF['cases'] = test_output0['cases']

AnalysisDF.head()

Unnamed: 0,cases,Pred_Cases_Linear,Pred_Cases_Lasso,Pred_Cases_Bagging,Pred_Cases_RandomForest,Pred_Cases_Gradient
620,8.56274,8.148306,8.021122,8.54481,8.082146,8.204683
1019,7.358831,7.054051,7.111189,7.400609,7.174273,7.191312
1002,6.893656,7.088489,7.04941,6.880174,7.03215,7.052058
2337,8.800867,9.014959,8.611073,8.620367,8.786212,8.820541
2462,7.904335,7.862212,7.758491,7.673137,7.905596,7.93471


In [49]:
AnalysisDFExp = np.exp(test_output0[['cases', 'pred_Y']])
AnalysisDFExp = AnalysisDFExp.drop(columns=['pred_Y'])
AnalysisDFExp['Pred_Cases_Linear'] = np.exp(test_output0['pred_Y'])
AnalysisDFExp['Pred_Cases_Lasso'] = np.exp(test_output1['pred_Y'])
AnalysisDFExp['Pred_Cases_Bagging'] = np.exp(test_output2['pred_Y'])
AnalysisDFExp['Pred_Cases_RandomForest'] = np.exp(test_output3['pred_Y'])
AnalysisDFExp['Pred_Cases_Gradient'] = np.exp(test_output4['pred_Y'])
AnalysisDFExp['cases'] = np.exp(test_output0['cases'])

AnalysisDFExp.head()

Unnamed: 0,cases,Pred_Cases_Linear,Pred_Cases_Lasso,Pred_Cases_Bagging,Pred_Cases_RandomForest,Pred_Cases_Gradient
620,5233.0,3457.515921,3044.590369,5140.007343,3236.171323,3658.040952
1019,1570.0,1157.538879,1225.603982,1636.981487,1305.410697,1327.843891
1002,986.0,1198.095909,1152.179171,972.795167,1132.462427,1155.233849
2337,6640.0,8225.207964,5492.137776,5543.419644,6543.400243,6771.927758
2462,2709.0,2597.259751,2341.36949,2149.815315,2712.417791,2792.548064


In [50]:
print('Linear')
mean_absolute_error = abs(np.exp(AnalysisDF['Pred_Cases_Linear']) - np.exp(AnalysisDF['cases'])).mean()
print(mean_absolute_error)
print(mean_absolute_error / np.mean(np.exp(AnalysisDF['cases'])))

print('Lasso')
mean_absolute_error = abs(np.exp(AnalysisDF['Pred_Cases_Lasso']) - np.exp(AnalysisDF['cases'])).mean()
print(mean_absolute_error)
print(mean_absolute_error / np.mean(np.exp(AnalysisDF['cases'])))


print('Bagging')
mean_absolute_error = abs(np.exp(AnalysisDF['Pred_Cases_Bagging']) - np.exp(AnalysisDF['cases'])).mean()
print(mean_absolute_error)
print(mean_absolute_error / np.mean(np.exp(AnalysisDF['cases'])))


print('Random Forest')
mean_absolute_error = abs(np.exp(AnalysisDF['Pred_Cases_RandomForest']) - np.exp(AnalysisDF['cases'])).mean()
print(mean_absolute_error)
print(mean_absolute_error / np.mean(np.exp(AnalysisDF['cases'])))


print('Gradient Boosting')
mean_absolute_error = abs(np.exp(AnalysisDF['Pred_Cases_Gradient']) - np.exp(AnalysisDF['cases'])).mean()
print(mean_absolute_error)
print(mean_absolute_error / np.mean(np.exp(AnalysisDF['cases'])))


Linear
1324.646081342803
0.213248889438372
Lasso
2131.5124855433587
0.34314272828661685
Bagging
3209.124201475774
0.5166226523999443
Random Forest
1313.9568399397047
0.2115280759393892
Gradient Boosting
1106.2853370617227
0.17809596302977745


In [51]:
#sMAPE function
def smape(actual, predicted):
    n = len(actual)
    sMAPE = np.sum(abs(actual - predicted) / (abs(actual) + abs(predicted)))
    return 100 * sMAPE / n

In [52]:
#With Logarithmic Cases
linear_sMAPE = smape(AnalysisDF['cases'], AnalysisDF['Pred_Cases_Linear'])
print('Linear Log sMAPE:')
print(linear_sMAPE)

#With Exponential Cases
linear_sMAPE = smape(np.exp(AnalysisDF['cases']), np.exp(AnalysisDF['Pred_Cases_Linear']))
print('Linear Exp sMAPE:')
print(linear_sMAPE)

Linear Log sMAPE:
1.7414163877094233
Linear Exp sMAPE:
12.355091018718793


In [53]:
#With Logarithmic Cases
lasso_sMAPE = smape(AnalysisDF['cases'], AnalysisDF['Pred_Cases_Lasso'])
print('Lasso Log sMAPE:')
print(lasso_sMAPE)

#With Exponential Cases
lasso_sMAPE = smape(np.exp(AnalysisDF['cases']), np.exp(AnalysisDF['Pred_Cases_Lasso']))
print('Lasso Exp sMAPE:')
print(lasso_sMAPE)

Lasso Log sMAPE:
2.1391565581359004
Lasso Exp sMAPE:
15.253445971159039


In [54]:
#With Logarithmic Cases
bagging_sMAPE = smape(AnalysisDF['cases'], AnalysisDF['Pred_Cases_Bagging'])
print('Bagging Log sMAPE:')
print(bagging_sMAPE)

#With Exponential Cases
bagging_sMAPE = smape(np.exp(AnalysisDF['cases']), np.exp(AnalysisDF['Pred_Cases_Bagging']))
print('Bagging Exp sMAPE:')
print(bagging_sMAPE)

Bagging Log sMAPE:
3.1844540359798796
Bagging Exp sMAPE:
22.011025375532565


In [55]:
#With Logarithmic Cases
random_sMAPE = smape(AnalysisDF['cases'], AnalysisDF['Pred_Cases_RandomForest'])
print('Random Log sMAPE:')
print(random_sMAPE)
#With Exponential Cases
random_sMAPE = smape(np.exp(AnalysisDF['cases']), np.exp(AnalysisDF['Pred_Cases_RandomForest']))
print('Random Exp sMAPE:')
print(random_sMAPE)

Random Log sMAPE:
1.5636498835964052
Random Exp sMAPE:
11.145952471633658


In [56]:
#With Logarithmic Cases
gradient_sMAPE = smape(AnalysisDF['cases'], AnalysisDF['Pred_Cases_Gradient'])
print('Gradient Log sMAPE:')
print(gradient_sMAPE)
#With Exponential Cases
gradient_sMAPE = smape(np.exp(AnalysisDF['cases']), np.exp(AnalysisDF['Pred_Cases_Gradient']))
print('Gradient Exp sMAPE:')
print(gradient_sMAPE)

Gradient Log sMAPE:
1.3929496532002503
Gradient Exp sMAPE:
9.863801764362575


In [57]:
dftemp = df[['cases', 'county']]
AnalysisDF = pd.merge(AnalysisDF, dftemp, on='cases')
AnalysisDF.head()
BestAlg = []
iter = 0
for entry in AnalysisDF['cases']:
    temp = []
    temp.append([abs(entry - AnalysisDF['Pred_Cases_Linear'].loc[AnalysisDF.index[iter]]), 'Linear'])
    temp.append([abs(entry - AnalysisDF['Pred_Cases_Lasso'].loc[AnalysisDF.index[iter]]), 'Lasso'])
    temp.append([abs(entry - AnalysisDF['Pred_Cases_Bagging'].loc[AnalysisDF.index[iter]]), 'Bagging'])
    temp.append([abs(entry - AnalysisDF['Pred_Cases_RandomForest'].loc[AnalysisDF.index[iter]]), 'RandomForest'])
    temp.append([abs(entry - AnalysisDF['Pred_Cases_Gradient'].loc[AnalysisDF.index[iter]]), 'Gradient'])
    iter += 1
    temp.sort()
    BestAlg.append(temp[0][1])
print(BestAlg)

['Bagging', 'Bagging', 'Bagging', 'Bagging', 'RandomForest', 'RandomForest', 'Bagging', 'Bagging', 'Gradient', 'Gradient', 'Linear', 'Bagging', 'Gradient', 'Gradient', 'RandomForest', 'RandomForest', 'Linear', 'Bagging', 'RandomForest', 'RandomForest', 'RandomForest', 'Gradient', 'Lasso', 'Lasso', 'Gradient', 'Gradient', 'RandomForest', 'Linear', 'Linear', 'Bagging', 'Lasso', 'Lasso', 'Bagging', 'Gradient', 'Bagging', 'Bagging', 'Linear', 'Gradient', 'Lasso', 'Gradient', 'Linear', 'Gradient', 'Gradient', 'Gradient', 'Lasso', 'Lasso', 'Gradient', 'Bagging', 'Linear', 'Linear', 'Linear', 'Gradient', 'Bagging', 'Bagging', 'Lasso', 'Lasso', 'Bagging', 'Bagging', 'Bagging', 'Lasso', 'Lasso', 'Lasso', 'Lasso', 'Bagging', 'Bagging', 'Linear', 'Linear', 'Gradient', 'Bagging', 'Gradient', 'Gradient', 'Gradient', 'Gradient', 'Gradient', 'Gradient', 'Gradient', 'Gradient', 'RandomForest', 'RandomForest', 'Bagging', 'Bagging', 'Lasso', 'Lasso', 'Gradient', 'Gradient', 'Gradient', 'Gradient', 'Grad

In [58]:
lin = 0
las = 0
bag = 0
ran = 0
grd = 0
for x in BestAlg:
    if x == 'Linear':
        lin += 1
    if x == 'Lasso':
        las += 1
    if x == 'Bagging':
        bag += 1 
    if x == 'RandomForest':
        ran += 1 
    if x == 'Gradient':
        grd += 1

print('Linear:') 
print(lin/len(BestAlg))
print('Lasso:')
print(las/len(BestAlg))
print('Bagging:')
print(bag/len(BestAlg))
print('RandomForest:')
print(ran/len(BestAlg))
print('Gradient:')
print(grd/len(BestAlg))

Linear:
0.1696969696969697
Lasso:
0.1696969696969697
Bagging:
0.1503030303030303
RandomForest:
0.16727272727272727
Gradient:
0.343030303030303
