In [449]:
import pandas as pd
import numpy as np
import os

In [450]:
# Read the two static (non time-varying) per-country tables in one pass.
static_feature_1, static_feature_2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/covid19_merged.csv',
        '../data/covid19_by_country.csv',
    )
)

In [451]:
# Column names picked from the first static table, one per line so additions
# and removals show up cleanly in diffs.
# NOTE(review): neither list is referenced by the later cells visible in this
# notebook -- confirm whether they are still needed.
feature_selected_1 = [
    'country',
    # columns related to flu / H1N1
    'Flu_pneumonia_death_rate_per_100000',
    'h1n1_Geographic_spread',
    'h1n1_Intensity',
    'h1n1_Impact_on_healthcare_services',
    'h1n1_Cases_underestimate',
    'h1n1_Cases_confirmed',
    'h1n1_Deaths_confirmed',
    # traffic, affordability and health-care capacity
    'airport_traffic_2018_thousands',
    'property_Affordability_Index',
    'Health_Care_Index',
    'hosp_beds_per_1000_2017',
    'ICU-CCB_beds_per_100000',
    # population size and structure
    'Population_2020',
    'Density_KM2m',
    'Fertility_rate',
    'Median_age',
    'Urban_pop_pct',
    'gdp_usd_million',
    'age_0_to_14_years_percent',
    'age_15_to_64_years_percent',
    'age_over_65_years_percent',
    # columns prefixed sex_male_to_female_
    'sex_male_to_female_At_birth_CIA_estimate_2020',
    'sex_male_to_female_0_14_years',
    'sex_male_to_female_15_24_years',
    'sex_male_to_female_25_54_years',
    'sex_male_to_female_55_64_years',
    'sex_male_to_female_over_65',
    'sex_male_to_female_total',
]

# Column names picked from the second static table.
feature_selected_2 = [
    'lung',
    'Female Lung',
    'Male Lung',
    'Smoking 2016',
]

In [452]:
# Columns written to the population feature export.
population_features = [
    'airport_traffic_2018_thousands',
    'property_Affordability_Index',
    'Population_2020',
    'Density_KM2m',
    'Fertility_rate',
    'Median_age',
    'Urban_pop_pct',
    'gdp_usd_million',
    'age_0_to_14_years_percent',
    'age_15_to_64_years_percent',
    'age_over_65_years_percent',
    'sex_male_to_female_At_birth_CIA_estimate_2020',
    'sex_male_to_female_0_14_years',
    'sex_male_to_female_15_24_years',
    'sex_male_to_female_25_54_years',
    'sex_male_to_female_55_64_years',
    'sex_male_to_female_over_65',
    'sex_male_to_female_total',
]

# Columns written to the health-care feature export.
healthcare_features = [
    'Flu_pneumonia_death_rate_per_100000',
    'h1n1_Geographic_spread',
    'h1n1_Intensity',
    'h1n1_Impact_on_healthcare_services',
    'h1n1_Cases_underestimate',
    'h1n1_Cases_confirmed',
    'h1n1_Deaths_confirmed',
    'Health_Care_Index',
    'hosp_beds_per_1000_2017',
    'ICU-CCB_beds_per_100000',
    'lung',
    'Female Lung',
    'Male Lung',
    'Smoking 2016',
]

In [453]:
# (rows, columns) of each static table, shown before they are merged.
(static_feature_1.shape, static_feature_2.shape)

((173, 65), (96, 28))

In [454]:
# Left join: every row of the first table is kept; rows with no match in the
# second table get NaN in the second table's columns.
static_feature = static_feature_1.merge(
    static_feature_2,
    how='left',
    left_on='country',
    right_on='Country',
)
# The relabel happens after the join, so the join itself still compares the
# original 'United States' spelling.
static_feature['country'] = static_feature['country'].replace('United States', 'US')

In [455]:
# Write one CSV per feature group; each file carries the 'country' key column
# followed by that group's columns.
for feature_group, out_path in (
    (population_features, '../data/population_features.csv'),
    (healthcare_features, '../data/healthcare_features.csv'),
):
    static_feature[['country'] + feature_group].to_csv(out_path, index=False, header=True)

In [456]:
# Load the policy table and relabel 'United States' as 'US' in CountryName.
daily_policy_feature = pd.read_csv('../data/oxford_cgrt.csv').assign(
    CountryName=lambda frame: frame['CountryName'].replace('United States', 'US')
)

In [457]:
# Columns kept from the policy table: the two key columns, the S1-S11
# indicator columns (with their *_IsGeneral companions where listed), and the
# StringencyIndex column.
policy_features = [
    'CountryName',
    'Date',
    'S1_School closing',
    'S1_IsGeneral',
    'S2_Workplace closing',
    'S2_IsGeneral',
    'S3_Cancel public events',
    'S3_IsGeneral',
    'S4_Close public transport',
    'S4_IsGeneral',
    'S5_Public information campaigns',
    'S5_IsGeneral',
    'S6_Restrictions on internal movement',
    'S6_IsGeneral',
    'S7_International travel controls',
    'S8_Fiscal measures',
    'S9_Monetary measures',
    'S10_Emergency investment in health care',
    'S11_Investment in Vaccines',
    'StringencyIndex',
]

In [458]:
def _dashed_date(raw):
    """Return ``str(raw)`` with dashes inserted after its 4th and 6th characters."""
    text = str(raw)
    return '-'.join([text[:4], text[4:6], text[6:]])


# NOTE: not idempotent -- running this cell a second time would slice the
# already-dashed strings again and corrupt them.
daily_policy_feature['Date'] = daily_policy_feature['Date'].map(_dashed_date)

In [459]:
# Keep only the selected policy columns, rename the two key columns to
# 'country' / 'date', and write the result to disk.
(
    daily_policy_feature[policy_features]
    .rename(columns={'CountryName': 'country', 'Date': 'date'})
    .to_csv('../data/daily_policy_feature.csv', index=False, header=True)
)

In [460]:
# Load the daily table that carries the weather columns.
daily_weather_feature = pd.read_csv(
    filepath_or_buffer='../data/training_data_with_weather_info_week_4.csv',
)

In [461]:
# Show the column labels of the weather table.
daily_weather_feature.columns

Index(['Id', 'Province_State', 'Country_Region', 'Date', 'ConfirmedCases',
       'Fatalities', 'country+province', 'Lat', 'Long', 'day_from_jan_first',
       'temp', 'min', 'max', 'stp', 'slp', 'dewp', 'rh', 'ah', 'wdsp', 'prcp',
       'fog'],
      dtype='object')

In [462]:
# Keys for every country except the four handled separately below are built
# as '<country>-'.
# NOTE(review): presumably that is the 'country+province' value of a row with
# no province -- confirm against the data.
countries = [
    name + '-'
    for name in daily_weather_feature['Country_Region'].unique()
    if name not in ('US', 'China', 'Canada', 'Australia')
]

# For the four excluded countries a single '<country>-<province>' key is used.
countries_others = [
    'US-New York',
    'China-Hubei',
    'Canada-Quebec',
    'Australia-Australian Capital Territory',
]

countries_used = [*countries, *countries_others]

In [463]:
# Keep only the rows whose 'country+province' key is in countries_used.
is_selected_location = daily_weather_feature['country+province'].isin(countries_used)
daily_weather_feature = daily_weather_feature[is_selected_location]

In [464]:
# Key columns followed by the weather measurement columns to export.
weather_features = [
    'Country_Region', 'Date',
    'temp', 'min', 'max',
    'stp', 'slp', 'dewp',
    'rh', 'ah', 'wdsp', 'prcp', 'fog',
]

# Select, renumber the rows, rename the key columns, and write to disk.
(
    daily_weather_feature[weather_features]
    .reset_index(drop=True)
    .rename(columns={'Country_Region': 'country', 'Date': 'date'})
    .to_csv('../data/daily_weather_feature.csv', index=False, header=True)
)

In [465]:
# Load the confirmed / deaths / recovered global time-series tables.
daily_confirmed, daily_deaths, daily_recovered = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
        '../data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
        '../data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv',
    )
)

In [466]:
# Every column of the confirmed table other than the four identifier columns
# (i.e. the per-date columns), in their original order.
daily_features = [
    column
    for column in daily_confirmed.columns
    if column not in {'Province/State', 'Country/Region', 'Lat', 'Long'}
]

In [467]:
# Replace each date column by its difference from the previous date column
# (row-wise). The first date column has no predecessor, so it becomes NaN.
for frame in (daily_confirmed, daily_deaths, daily_recovered):
    frame[daily_features] = frame[daily_features].diff(axis=1)

In [468]:
# Replace negative day-over-day differences with 0; other values (including
# NaN) are left as they are.
for frame in (daily_confirmed, daily_deaths, daily_recovered):
    deltas = frame[daily_features]
    frame[daily_features] = deltas.mask(deltas < 0, 0)

In [469]:
# Sum the date columns over all rows sharing a 'Country/Region' value, giving
# one row per country in each table.
daily_confirmed, daily_deaths, daily_recovered = (
    frame.groupby('Country/Region')[daily_features].sum().reset_index()
    for frame in (daily_confirmed, daily_deaths, daily_recovered)
)

In [470]:
# Index each table by country, then unstack the date columns so every
# (date column, country) pair becomes one entry of a Series.
daily_confirmed, daily_deaths, daily_recovered = (
    frame.rename(columns={'Country/Region': 'country'})
    .set_index('country')[daily_features]
    .unstack()
    for frame in (daily_confirmed, daily_deaths, daily_recovered)
)

In [471]:
# Turn each unstacked Series into a three-column frame: the former column
# labels become 'date', and the value column gets the metric's name.
daily_confirmed, daily_deaths, daily_recovered = (
    pd.DataFrame(series)
    .reset_index()
    .rename(columns={'level_0': 'date', 0: value_name})
    for series, value_name in (
        (daily_confirmed, 'confirmed'),
        (daily_deaths, 'deaths'),
        (daily_recovered, 'recovered'),
    )
)

In [472]:
# The three long-format tables are merged next; show their shapes side by side.
(daily_recovered.shape, daily_deaths.shape, daily_confirmed.shape)

((19822, 3), (19822, 3), (19822, 3))

In [473]:
# Attach deaths and recovered counts to the confirmed counts, matching on
# (date, country). Left joins keep every row of the confirmed table.
daily_ts = (
    daily_confirmed
    .merge(daily_deaths, how='left', on=['date', 'country'])
    .merge(daily_recovered, how='left', on=['date', 'country'])
)

In [474]:
def normalized_ts_date(x):
    """Convert an ``M/D/YY`` date string to ``YYYY-MM-DD``.

    Parameters
    ----------
    x : str
        Date written as month/day/year separated by ``/``, e.g. ``'1/22/20'``.
        Month and day may or may not be zero-padded. A two-digit year is
        taken to be in the 2000s; a longer year is used as given.

    Returns
    -------
    str
        The date with a four-digit year and two-digit month and day, joined
        by ``-``.

    Raises
    ------
    ValueError
        If ``x`` does not split into exactly three ``/``-separated parts.
    """
    m, d, y = x.split('/')
    # Only add the century when the year is given with two digits.
    if len(y) == 2:
        y = '20' + y
    # zfill pads single-digit values and leaves two-digit ones alone; the
    # previous unconditional '0' prefix turned months 10-12 into '010'-'012'.
    return '-'.join([y, m.zfill(2), d.zfill(2)])

# Rewrite every value of the 'date' column with normalized_ts_date.
daily_ts['date'] = daily_ts['date'].map(normalized_ts_date)

In [475]:
# Write the merged daily time-series table to disk, without the row index.
daily_ts.to_csv(
    '../data/daily_ts_feature.csv',
    header=True,
    index=False,
)

In [476]:
# Number of rows per country in the recovered table, in ascending order.
daily_recovered['country'].value_counts().sort_values()

Bosnia and Herzegovina    106
Guinea                    106
Burma                     106
Ireland                   106
Andorra                   106
                         ... 
Senegal                   106
Fiji                      106
Georgia                   106
Cyprus                    106
Botswana                  106
Name: country, Length: 187, dtype: int64

In [477]:
# Number of rows per country in the confirmed table, in ascending order.
daily_confirmed['country'].value_counts().sort_values()

Bosnia and Herzegovina    106
Guinea                    106
Burma                     106
Ireland                   106
Andorra                   106
                         ... 
Senegal                   106
Fiji                      106
Georgia                   106
Cyprus                    106
Botswana                  106
Name: country, Length: 187, dtype: int64

In [478]:
# Number of rows per country in the deaths table, in ascending order.
daily_deaths['country'].value_counts().sort_values()

Bosnia and Herzegovina    106
Guinea                    106
Burma                     106
Ireland                   106
Andorra                   106
                         ... 
Senegal                   106
Fiji                      106
Georgia                   106
Cyprus                    106
Botswana                  106
Name: country, Length: 187, dtype: int64

In [479]:
# Show the country key next to the three health-care capacity columns.
static_feature.loc[
    :,
    [
        'country',
        'Health_Care_Index',
        'hosp_beds_per_1000_2017',
        'ICU-CCB_beds_per_100000',
    ],
]

Unnamed: 0,country,Health_Care_Index,hosp_beds_per_1000_2017,ICU-CCB_beds_per_100000
0,Afghanistan,,,
1,Albania,53.05,,
2,Algeria,54.86,,
3,Andorra,,,
4,Angola,,,
...,...,...,...,...
168,Uzbekistan,,,
169,Venezuela,39.66,,
170,Vietnam,57.70,,
171,Zambia,,,
