In [1]:
import pandas as pd
import numpy as np
import statistics as st


## Epidemiological Data

Source: https://www.nyc.gov/site/doh/covid/covid-19-data-totals.page

Raw data files on GitHub:

- cases: https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/cases-by-day.csv

- deaths: https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/deaths-by-day.csv

In [23]:
### cases
# Daily case counts: keep only the date and count columns and give them
# the short names used in the merged output file.
cases = (
    pd.read_csv("https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/cases-by-day.csv")
    .loc[:, ["date_of_interest", "CASE_COUNT"]]
    .rename(columns={"date_of_interest": "date", "CASE_COUNT": "new_cases"})
)
# Dates are parsed as month/day/year strings.
cases["date"] = pd.to_datetime(cases["date"], format="%m/%d/%Y")

### deaths
# Daily death counts, processed the same way as the cases above.
deaths = (
    pd.read_csv("https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/deaths-by-day.csv")
    .loc[:, ["date_of_interest", "DEATH_COUNT"]]
    .rename(columns={"date_of_interest": "date", "DEATH_COUNT": "new_deaths"})
)
deaths["date"] = pd.to_datetime(deaths["date"], format="%m/%d/%Y")

### merge and save
# Outer join on date: a day present in only one of the two series is kept,
# with a missing value in the other series' column.
data = deaths.merge(cases, how="outer", on="date")
data.to_csv("./epi-data/epi_data.csv", index=False)

## Google Mobility Report

Source: https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv

In [2]:
# Local copy of the 2020 US regional mobility report (path is relative to the notebook).
mobility_report_path = '../google-mob-raw/2020_US_Region_Mobility_Report.csv'
df = pd.read_csv(mobility_report_path)
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-15,6.0,2.0,15.0,3.0,2.0,-1.0
1,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-16,7.0,1.0,16.0,2.0,0.0,-1.0
2,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-17,6.0,0.0,28.0,-9.0,-24.0,5.0
3,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-18,0.0,-1.0,6.0,1.0,0.0,1.0
4,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-19,2.0,0.0,8.0,1.0,1.0,0.0


In [30]:
# Counties selected as New York City; sub_region_1 is also checked so that
# counties with the same name in other states are excluded.
nyc_counties = ['Queens County', 'New York County', 'Kings County', 'Richmond County', 'Bronx County']
is_nyc_row = df.sub_region_2.isin(nyc_counties) & (df.sub_region_1 == "New York")

# Use the non-inplace drop(): it returns a new DataFrame, so the filtered
# slice of `df` is never mutated in place. The previous
# `drop(..., inplace=True)` on the slice raised SettingWithCopyWarning.
df_nyc = df.loc[is_nyc_row].drop("sub_region_2", axis=1)

# Collapse the county rows into one row per date:
#   - identifier columns take their most common value within the date group
#     (statistics.mode);
#   - each mobility indicator is averaged across the counties (pandas "mean"
#     skips missing values).
# 'date' is aggregated as well so that it is available as a regular column
# after the groupby (the next cell selects it by name).
df_nyc = df_nyc.groupby(by=['date']).agg({'country_region_code': st.mode,
                                            'country_region': st.mode,
                                            'sub_region_1': st.mode,
                                            'metro_area': st.mode,
                                            'iso_3166_2_code': st.mode,
                                            'census_fips_code': st.mode,
                                            'place_id': st.mode,
                                            'date': st.mode,
                                            'retail_and_recreation_percent_change_from_baseline': "mean",
                                            'grocery_and_pharmacy_percent_change_from_baseline': "mean",
                                            'parks_percent_change_from_baseline': "mean",
                                            'transit_stations_percent_change_from_baseline': "mean",
                                            'workplaces_percent_change_from_baseline': "mean",
                                            'residential_percent_change_from_baseline': "mean"})



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nyc.drop("sub_region_2", axis=1, inplace=True)


In [31]:
# Columns written to the output file: the date plus the six mobility indicators.
mobility_columns = [
    'date',
    'retail_and_recreation_percent_change_from_baseline',
    'grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'workplaces_percent_change_from_baseline',
    'residential_percent_change_from_baseline',
]

# Keep only those columns and replace the existing index with a fresh 0..n-1 one.
df_nyc = df_nyc[mobility_columns].reset_index(drop=True)
df_nyc["date"] = pd.to_datetime(df_nyc["date"])

df_nyc.to_csv('./google-mobility-report/google_mobility_data.csv', index=False)

## Population Data
Source: https://data.census.gov/table?q=S0101&g=050XX00US36005,36061,36081,36085,36047

In [12]:
# Population table exported as CSV; the cells below use its
# "Age Group" and "Total" columns.
population_table_path = './raw-data/ACSST1Y2022.S0101-2023-10-02T195537.csv'
pop = pd.read_csv(population_table_path)
# Preview the first rows.
pop.head()

Unnamed: 0,Age Group,Total
0,Under 5 years,472671
1,5 to 9 years,445677
2,10 to 14 years,485286
3,15 to 19 years,450637
4,20 to 24 years,520613


In [13]:
# Output age bracket -> the "Age Group" labels of the population table that
# are summed into it. Dict order fixes the column order of the output file.
age_brackets = {
    '0-9': ['Under 5 years', '5 to 9 years'],
    '10-19': ['10 to 14 years', '15 to 19 years'],
    '20-24': ['20 to 24 years'],
    '25-29': ['25 to 29 years'],
    '30-39': ['30 to 34 years', '35 to 39 years'],
    '40-49': ['40 to 44 years', '45 to 49 years'],
    '50-59': ['50 to 54 years', '55 to 59 years'],
    '60-69': ['60 to 64 years', '65 to 69 years'],
    '70-79': ['70 to 74 years', '75 to 79 years'],
    '80+': ['80 to 84 years', '85 years and over'],
}

# Each value is a one-element list so the resulting DataFrame has a single
# row with one column per bracket.
populations = {
    bracket: [pop.loc[pop["Age Group"].isin(labels)].Total.sum()]
    for bracket, labels in age_brackets.items()
}

pop_grouped = pd.DataFrame(data=populations)
# Sanity check: total population across all brackets, in millions.
print(pop_grouped.sum(axis=1) / 10**6)
pop_grouped.to_csv('./population-data/pop_data_Nk.csv', index=False)

0    8.335897
dtype: float64


# Contact Matrix

In [1]:
import numpy as np 

# weights of different contacts matrices
weight_home   = 4.11
weight_school = 11.41
weight_work   = 8.07
weight_comm   = 2.79


def _load_contact_layer(path):
    """Return the array stored under "arr_0" in the .npz archive at `path`.

    np.load() on an .npz file returns an NpzFile that keeps the archive open
    until it is closed. The context manager closes it as soon as the array
    has been read, so no file handle is left dangling.
    """
    with np.load(path) as archive:
        return archive["arr_0"]


work = _load_contact_layer("./contact_matrix/work.npz")
home = _load_contact_layer("./contact_matrix/home.npz")
school = _load_contact_layer("./contact_matrix/school.npz")
community = _load_contact_layer("./contact_matrix/community.npz")

# Overall contact matrix: weighted sum of the four setting-specific matrices.
total = (weight_home * home) + (weight_school * school) + (weight_work * work) + (weight_comm * community)

# Report the largest real part among the eigenvalues of the combined matrix.
print(np.linalg.eigvals(total).real.max())
# Saved as a positional array, i.e. stored under the key "arr_0".
np.savez_compressed("./contact_matrix/contact_matrix.npz", total)

13.72854395309837
