# Two-Week Average Positivity Rate by Zip Code

In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 100)

In [175]:
# Load the ZIP-level weekly COVID-19 extract. ZIP Code is pinned to string because
# the filters below compare it against string literals ('Unknown', '60707', ...);
# without the pin, a file containing only numeric ZIPs would parse as integers
# and those comparisons would silently match nothing.
df = pd.read_csv(
    'COVID-19_Cases__Tests__and_Deaths_by_ZIP_Code.csv',
    dtype={'ZIP Code': str},
)

In [176]:
# Inspect the distinct ZIP Code values to spot entries that need to be excluded
pd.unique(df['ZIP Code'])

array(['60601', '60603', '60620', '60624', '60606', '60607', '60608',
       '60626', '60602', '60609', '60610', '60611', '60612', '60623',
       '60613', '60614', '60615', '60617', '60618', '60619', '60621',
       '60622', '60604', '60625', '60628', '60605', '60629', '60631',
       '60632', '60640', '60633', '60634', '60630', '60636', '60637',
       'Unknown', '60638', '60639', '60641', '60649', '60652', '60827',
       '60656', '60659', '60707', '60661', '60642', '60643', '60644',
       '60645', '60646', '60647', '60651', '60653', '60654', '60660',
       '60655', '60657', '60666', '60616'], dtype=object)

In [177]:
# Exclude rows with an unknown ZIP code, ZIP codes that are not entirely inside
# Chicago, and the O'Hare ZIP code (no one lives there).
excluded_zips = ['Unknown', '60707', '60827', '60666']
df = df[~df['ZIP Code'].isin(excluded_zips)]

In [178]:
# Distinct week numbers in ascending order, so the most recent weeks sit at the end.
# NOTE(review): this assumes 'Week Number' keeps increasing through the file (no
# rollover back to week 1 at a year boundary) — confirm against the data.
weeks = sorted(df['Week Number'].unique())

In [179]:
# Keep only rows from the two most recent weeks
latest_two_weeks = [weeks[-2], weeks[-1]]
df = df[df['Week Number'].isin(latest_two_weeks)]

## Creating Column for 2-week Avg Positivity Rate

In [131]:
# Recompute the positivity rate from raw counts (summed cases / summed tests per
# ZIP code) to get a more granular value than the percentage shipped in the dataset
zip_totals = df.groupby('ZIP Code')[['Cases - Weekly', 'Tests - Weekly']].sum()
pos_ratio = zip_totals['Cases - Weekly'] / zip_totals['Tests - Weekly']
two_wk_avg_pos = pd.DataFrame(pos_ratio)

In [132]:
# Move the ZIP code out of the index and name both columns for the later merges
two_wk_avg_pos = two_wk_avg_pos.reset_index()
two_wk_avg_pos.columns = ['ZCTA', 'posRate']

In [133]:
# A missing rate (e.g. 0 cases / 0 tests) is treated as a positivity rate of 0.
# Fill with the number 0, not the string '0', so posRate stays numeric and can be
# used in arithmetic later.
two_wk_avg_pos['posRate'] = two_wk_avg_pos['posRate'].fillna(0)

## Creating Column for Number of Tests Administered 

In [134]:
# Total number of tests administered per ZIP code over the two-week window; used
# later to compute the share of the population tested in that period
testing_df = (
    df.groupby('ZIP Code')['Tests - Weekly']
    .sum()
    .reset_index()
)

# Rename columns
testing_df.columns = ['ZCTA', 'tests2weeks']

In [135]:
# Combine the two-week positivity rate with the two-week test totals, matched on ZCTA
fin_df = two_wk_avg_pos.merge(testing_df, on='ZCTA')

## Creating Column for Distance from Testing Site

### Gathering Zip Code Lat/Lon

In [136]:
# Keep the first row seen for each ZIP code so every ZIP code appears once
df = df.drop_duplicates(subset='ZIP Code')

In [137]:
# ZIP code plus its point location; used later to calculate distance to testing centers.
# .copy() makes this an independent frame, so later column assignments on coords
# do not raise SettingWithCopyWarning.
coords = df[['ZIP Code', 'ZIP Code Location']].copy()

In [138]:
# Use the column names expected by the later merge
coords = coords.rename(columns={'ZIP Code': 'ZCTA', 'ZIP Code Location': 'coords'})

In [139]:
# Preview the first five ZIP code / location pairs
coords.head(5)

Unnamed: 0,ZCTA,coords
0,60601,POINT (-87.622844 41.886262)
1,60603,POINT (-87.625473 41.880112)
17,60606,POINT (-87.63676 41.882634)
18,60607,POINT (-87.652727 41.876104)
22,60608,POINT (-87.670366 41.849879)


### Getting Testing Site Lat/Lon

In [140]:
# Load the list of COVID-19 testing sites
testing_loc = pd.read_csv(filepath_or_buffer='COVID-19_Testing_Sites.csv')

In [141]:
# Keep only the columns that identify and locate each site
site_columns = ['Facility', 'Address', 'Location']
testing_loc = testing_loc.loc[:, site_columns]

In [142]:
# Manually supply the lat/lon that is missing for the row with index label 0.
# NOTE(review): relies on that row keeping label 0 in the source CSV — confirm.
# Assign through .loc on an explicit copy: the chained form
# `testing_loc.Location[0] = ...` may write to a temporary object and is not
# guaranteed to update testing_loc.
testing_loc = testing_loc.copy()
testing_loc.loc[0, 'Location'] = 'POINT (-87.705330 41.805860)'

In [143]:
# Drop rows that still have a missing value: Howard Brown Health Mobile has no
# specific location attached to the testing site
testing_loc = testing_loc.dropna()
testing_loc.shape

(131, 3)

In [144]:
# Preview the first five testing sites
testing_loc.head(5)

Unnamed: 0,Facility,Address,Location
0,ACCESS Kedzie Family Health Center,"3229-3243 W 47th Pl Chicago, IL 60632",POINT (-87.705330 41.805860)
1,Howard Brown Health 63rd St,"641 W 63rd St Chicago, IL 60621",POINT (-87.64124500000001 41.77981400000001)
2,Norwegian American Hospital,"1044 N Francisco Ave Chicago, IL 60622",POINT (-87.699288 41.900481)
3,Aayu Clinics - Lakeview,"1645 A W School St Chicago, IL 60657",POINT (-87.670228 41.941508)
4,Michigan Avenue Primary Care,"180 N Michigan Ave #1720 Chicago, IL 60601",POINT (-87.624569 41.885151)


In [145]:
import geopandas as gpd
from scipy.spatial import cKDTree
from shapely.geometry import Point
from shapely import wkt

# Parse each WKT string into a geometry object
testing_loc['Location'] = testing_loc['Location'].apply(wkt.loads)
# assign() builds a new frame instead of writing into what may be a slice of df,
# which avoids the SettingWithCopyWarning
coords = coords.assign(coords=coords['coords'].apply(wkt.loads))

# Wrap both tables as GeoDataFrames, naming the column that holds the geometry
gpd1 = gpd.GeoDataFrame(testing_loc, geometry='Location')

gpd2 = gpd.GeoDataFrame(coords, geometry='coords')

# Defining function for calculating nearest distance from each zip code to a testing center
def ckdnearest(gdA, gdB):
    """For every row of ``gdA``, find the nearest point in ``gdB``.

    Parameters
    ----------
    gdA, gdB : frames exposing a ``.geometry`` column of point objects that
        have ``x`` and ``y`` attributes.

    Returns
    -------
    A frame with one row per row of ``gdA`` (index reset to 0..n-1) holding the
    columns of ``gdA``, the columns of the matched ``gdB`` row except a column
    literally named 'geometry', and ``dist``: the Euclidean distance between
    the two points, in the units of the input coordinates.
    """
    pts_a = np.array([(pt.x, pt.y) for pt in gdA.geometry])
    pts_b = np.array([(pt.x, pt.y) for pt in gdB.geometry])
    tree = cKDTree(pts_b)
    dist, idx = tree.query(pts_a, k=1)
    # idx holds row *positions* within gdB, so rows are selected with .iloc.
    # Label-based .loc returns the wrong rows (or raises KeyError) whenever
    # gdB's index is not exactly 0..n-1, e.g. after rows have been dropped.
    nearest = gdB.iloc[idx].loc[:, gdB.columns != 'geometry'].reset_index(drop=True)
    return pd.concat(
        [gdA.reset_index(drop=True), nearest, pd.Series(dist, name='dist')],
        axis=1)

# Pass the ZIP codes first so that each ZIP code row gets its nearest testing
# center, rather than each testing center getting its nearest ZIP code
df_distance = ckdnearest(gdA=gpd2, gdB=gpd1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coords['coords'] = coords['coords'].apply(wkt.loads)


In [146]:
# Trim down to just the ZIP code and its distance to the nearest testing site
keep_cols = ['ZCTA', 'dist']
dist_posRate = df_distance[keep_cols]

In [147]:
# Preview the first five ZIP code / distance pairs
dist_posRate.head(5)

Unnamed: 0,ZCTA,dist
0,60601,0.002052
1,60603,0.005119
2,60606,0.006805
3,60607,0.010535
4,60608,0.009333


In [148]:
# Add the distance column to the combined frame, matched on ZCTA
fin_df = fin_df.merge(dist_posRate, on='ZCTA')

## Creating Column for Baseline 2-Week Avg Positivity Rate (2 weeks prior)

In [149]:
# Re-read the full extract for the baseline period. ZIP Code is pinned to string
# because the filter that follows compares it against string literals; without
# the pin, a file containing only numeric ZIPs would parse as integers and
# those comparisons would silently match nothing.
baseline_pos = pd.read_csv(
    'COVID-19_Cases__Tests__and_Deaths_by_ZIP_Code.csv',
    dtype={'ZIP Code': str},
)

In [150]:
# Exclude the unknown ZIP code and the ZIP codes left out of the analysis
baseline_excluded_zips = ['Unknown', '60707', '60827', '60666']
baseline_pos = baseline_pos[~baseline_pos['ZIP Code'].isin(baseline_excluded_zips)]


In [151]:
# Keep the two weeks immediately before the two most recent weeks
baseline_weeks = [weeks[-4], weeks[-3]]
baseline_pos = baseline_pos[baseline_pos['Week Number'].isin(baseline_weeks)]

In [152]:
# Two-week positivity rate (summed cases / summed tests per ZIP code) for the
# two weeks prior to the analysis period
baseline_totals = baseline_pos.groupby('ZIP Code')[['Cases - Weekly', 'Tests - Weekly']].sum()
baseline_ratio = baseline_totals['Cases - Weekly'] / baseline_totals['Tests - Weekly']
prior_pos = pd.DataFrame(baseline_ratio)


In [153]:
# Move the ZIP code out of the index and name both columns for the merge
prior_pos = prior_pos.reset_index()
prior_pos.columns = ['ZCTA', '2weeksprior']

In [154]:
# A missing rate (e.g. 0 cases / 0 tests) is treated as a positivity rate of 0.
# Fill with the number 0, not the string '0', so the column stays numeric and can
# be subtracted from posRate later.
prior_pos['2weeksprior'] = prior_pos['2weeksprior'].fillna(0)

In [155]:
# Add the baseline (prior two weeks) positivity rate, matched on ZCTA
fin_df = fin_df.merge(prior_pos, on='ZCTA')

In [159]:
# Cast the ZIP code strings to integers.
# NOTE(review): presumably this matches the dtype of ZCTA in the census frame
# merged next — confirm.
fin_df['ZCTA'] = fin_df['ZCTA'].astype(int)

## Merging into Census Data

In [157]:
# Load the pickled census frame.
# NOTE(review): unpickling can execute arbitrary code — only load a file from a
# trusted source.
census = pd.read_pickle('census_data_zcta.pickle')

In [160]:
# Attach the COVID-derived columns to the census columns, matched on ZCTA
fin_df = census.merge(fin_df, on='ZCTA')

## Tailoring Features

In [164]:
# Tests administered in the two-week window divided by the total population
fin_df['testpercent'] = fin_df['tests2weeks'].div(fin_df['totalPop'])

In [165]:
# Drop the raw count now that testpercent, the target, has been derived from it
fin_df = fin_df.drop(columns=['tests2weeks'])

In [167]:
# Move posRate to the last position so that both targets (testpercent and
# posRate) are at the end
cols = list(fin_df.columns)
cols.remove('posRate')
# .copy() makes the reordered frame independent of the original, so adding
# columns to it later does not raise SettingWithCopyWarning
fin_df = fin_df[cols + ['posRate']].copy()

## Creating Feature for Percentage Point Difference in posRate from previous 2 weeks

In [169]:
# Difference between the most recent two-week positivity rate and the rate for
# the prior two weeks (a plain subtraction, not a relative change)
fin_df['percent_change_pos_rate'] = fin_df['posRate'].sub(fin_df['2weeksprior'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fin_df['percent_change_pos_rate'] = (fin_df['posRate'] - fin_df['2weeksprior'])


In [172]:
# Preview the first five rows of the assembled frame
fin_df.head(5)

Unnamed: 0,ZCTA,totalPop,medianAge,medianHHInc,mean_HHsize,pctHispanic,pctBlack,pctWhite,pctUndocumented,pctUndocumentedFBLA,pctUninsured,pctUnemployed,pctHealthWorkers,pctEssential,dist,2weeksprior,testpercent,posRate,percent_change_pos_rate
0,60601,14675.0,34.9,103243.0,1.655573,0.086814,0.055741,0.74167,0.239387,0.035434,0.029972,0.040341,0.083543,0.145963,0.002052,0.037657,0.077888,0.054243,0.016586
1,60602,1244.0,30.6,157125.0,2.066445,0.065113,0.037781,0.681672,0.117363,0.047428,0.008842,0.000804,0.126206,0.245981,0.004248,0.065421,0.083601,0.048077,-0.017344
2,60603,1174.0,28.9,151765.0,2.08156,0.097956,0.032368,0.634583,0.160136,0.0,0.069847,0.055366,0.080068,0.231687,0.005119,0.034483,0.061329,0.041667,0.007184
3,60604,782.0,32.4,116250.0,1.632568,0.043478,0.056266,0.634271,0.162404,0.008951,0.003836,0.057545,0.067775,0.12532,0.008123,0.026667,0.084399,0.090909,0.064242
4,60605,27519.0,33.9,111164.0,1.984782,0.058432,0.171772,0.61205,0.103601,0.0137,0.026408,0.025146,0.101821,0.252516,0.008169,0.03348,0.086231,0.063211,0.029731


In [174]:
# Pickle data frame to merge later
import pickle
# The context manager closes the file even if pickle.dump raises
with open('complete_df.pickle', 'wb') as pickle_out:
    pickle.dump(fin_df, pickle_out)