#### import the required libs (numpy and pandas)

In [1]:
import pandas as pd
import numpy as np
# Print wide DataFrames on one line instead of wrapping their columns across several stacked blocks.
pd.set_option('expand_frame_repr', False)

#### load csvs to pandas dataframes, we will use online data for the best accuracy

In [2]:
# Remote CSV exports from the google-research/open-covid-19-data GitHub repository.
search_trend_url ="https://raw.githubusercontent.com/google-research/open-covid-19-data/master/data/exports/search_trends_symptoms_dataset/United%20States%20of%20America/2020_US_weekly_symptoms_dataset.csv"
hospitalization_url ="https://raw.githubusercontent.com/google-research/open-covid-19-data/master/data/exports/cc_by/aggregated_cc_by.csv"

# Download both files, in the same order as before: search trends first, then hospitalization.
search_trend_df, hospitalization_df = (
    pd.read_csv(csv_url) for csv_url in (search_trend_url, hospitalization_url)
)


### We will try to clean the search trend dataset first

After loading the dataset into a pandas DataFrame, we check how many columns (parameters) the DataFrame contains.

In [3]:
# Number of columns in the raw search-trend frame (second entry of the shape tuple).
search_trend_df.shape[1]

430

#### Since we have 430 parameters, we now examine the data, and try to determine if we can drop some of the columns to avoid over-fitting

In [4]:
# Show the first ten and the last ten rows in one view.
preview_positions = np.concatenate([np.arange(0, 10), np.arange(-10, 0)])
search_trend_df.iloc[preview_positions]

Unnamed: 0,open_covid_region_code,country_region_code,country_region,sub_region_1,sub_region_1_code,sub_region_2,sub_region_2_code,date,symptom:Abdominal obesity,symptom:Abdominal pain,...,symptom:Wart,symptom:Water retention,symptom:Weakness,symptom:Weight gain,symptom:Wheeze,symptom:Xeroderma,symptom:Xerostomia,symptom:Yawn,symptom:hyperhidrosis,symptom:pancreatitis
0,US-AK,US,United States,Alaska,US-AK,,,2020-01-06,,,...,,,,,,,,14.28,,
1,US-AK,US,United States,Alaska,US-AK,,,2020-01-13,,,...,,,,,,,,16.26,,
2,US-AK,US,United States,Alaska,US-AK,,,2020-01-20,,,...,,,,,,,,17.48,,
3,US-AK,US,United States,Alaska,US-AK,,,2020-01-27,,,...,,,,,,,,10.93,,
4,US-AK,US,United States,Alaska,US-AK,,,2020-02-03,,,...,,,,,,,,18.93,,
5,US-AK,US,United States,Alaska,US-AK,,,2020-02-10,,,...,,,,,,,,17.05,,
6,US-AK,US,United States,Alaska,US-AK,,,2020-02-17,,,...,,,,,,,,16.62,,
7,US-AK,US,United States,Alaska,US-AK,,,2020-02-24,,,...,,,,,,,,14.42,,
8,US-AK,US,United States,Alaska,US-AK,,,2020-03-02,,,...,,,,,,,,9.3,,
9,US-AK,US,United States,Alaska,US-AK,,,2020-03-09,,,...,,,,,,,,10.77,,


Since we have several NaN columns, we can drop the columns where all data in the column are NaN

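Also, by looking at the dataset, the columns `open_covid_region_code`, `country_region_code`, `country_region`, `sub_region_1_code`, `sub_region_2`, and `sub_region_2_code` can be dropped: in the rows shown above they are constant, empty, or redundant with `sub_region_1`.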

In [5]:
# Identifier columns that are not needed for this analysis.
# They are dropped by name rather than by position so the cell stays correct if
# the upstream column order changes. `errors='ignore'` makes the cell safe to
# re-run: a positional drop would remove `region_name`, `date` and symptom
# columns on a second execution.
redundant_columns = [
    'open_covid_region_code',
    'country_region_code',
    'country_region',
    'sub_region_1_code',
    'sub_region_2',
    'sub_region_2_code',
]
search_trend_df = search_trend_df.drop(columns=redundant_columns, errors='ignore')

# Remove every column that holds no observation at all.
search_trend_df = search_trend_df.dropna(axis=1, how='all')

# Use the same region column name as the hospitalization data, which is the key for the later merge.
search_trend_df = search_trend_df.rename(columns={'sub_region_1':'region_name'})

search_trend_df


11.31113043478261
10.484588235294119
7.344206896551725
4.651794871794872
10.525286343612335
6.945277777777777
12.655911330049259
7.666146788990826
5.019736842105263
13.511048034934499
10.37778761061947
7.999104477611941
11.736163793103447
10.11824034334764
9.084640522875818
7.084149659863945
10.21642857142857
10.433171806167401
13.372307692307693
12.59245614035088
12.070344827586206
8.611477832512316
4.6049999999999995
8.139107142857142
8.185633802816902
18.813728813559322
9.02555023923445
8.85990243902439
17.825909090909093
8.464285714285714
10.902202643171806
9.052499999999998
12.941082474226805
10.60651282051282
8.046908212560387
9.733695652173912
11.260410256410257
9.954026548672564
11.296896551724139
8.951491228070175
9.889439655172414
11.32217391304348
5.587435897435896
10.01203463203463
9.268571428571429
10.789423076923075
10.708711340206186
11.002709677419356
11.612068965517242
8.670828729281768
8.909565217391306
8.477163461538462
9.617467248908298
10.904889867841412
10.3611353

Unnamed: 0,region_name,date,symptom:Adrenal crisis,symptom:Ageusia,symptom:Allergic conjunctivitis,symptom:Amblyopia,symptom:Amenorrhea,symptom:Angular cheilitis,symptom:Anosmia,symptom:Aphonia,...,symptom:Tenderness,symptom:Thyroid nodule,symptom:Trichoptilosis,symptom:Upper respiratory tract infection,symptom:Urethritis,symptom:Urinary urgency,symptom:Vasculitis,symptom:Ventricular fibrillation,symptom:Viral pneumonia,symptom:Yawn
0,Alaska,2020-01-06,12.69000,10.484588,7.344207,4.651795,12.290000,6.945278,12.655911,7.666147,...,14.28,15.86,11.900000,18.840000,13.880000,7.244731,14.08,9.910000,12.117,14.28
1,Alaska,2020-01-13,9.56000,10.484588,7.344207,4.651795,12.620000,6.945278,12.655911,7.666147,...,13.39,12.24,7.840000,16.830000,13.010000,7.650000,15.30,8.410000,12.117,16.26
2,Alaska,2020-01-20,11.31113,10.484588,7.344207,4.651795,14.170000,6.945278,7.360000,7.666147,...,12.88,17.66,15.270000,18.400000,12.140000,7.540000,10.30,16.190000,12.117,17.48
3,Alaska,2020-01-27,15.31000,7.470000,7.344207,4.651795,15.670000,7.470000,12.655911,7.650000,...,13.30,12.76,9.840000,20.410000,17.490000,7.244731,11.30,7.978223,12.117,10.93
4,Alaska,2020-02-03,8.81000,10.484588,7.344207,4.651795,13.870000,8.060000,12.655911,12.560000,...,11.81,16.68,10.680000,20.620000,16.870000,7.244731,14.24,10.680000,10.500,18.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,Wyoming,2020-08-31,5.03000,2.790000,7.344207,5.750000,4.190000,6.945278,12.655911,7.666147,...,3.07,3.07,2.570000,2.680000,9.817435,7.244731,5.08,7.978223,12.117,3.63
620,Wyoming,2020-09-07,3.36000,3.250000,2.200000,3.630000,10.525286,2.250000,2.700000,7.666147,...,4.29,2.81,9.608908,3.190000,3.190000,7.244731,4.51,7.978223,12.117,4.02
621,Wyoming,2020-09-14,4.15000,4.970000,7.344207,5.140000,5.790000,6.945278,4.590000,7.666147,...,3.93,5.63,3.060000,4.590000,3.220000,2.680000,3.88,7.978223,12.117,4.37
622,Wyoming,2020-09-21,5.64000,3.670000,7.344207,5.250000,3.770000,6.945278,5.090000,7.666147,...,3.28,3.72,3.170000,3.230000,2.900000,7.244731,5.14,3.230000,12.117,4.76


now, let's check the dataset again

In [6]:
# Show the first ten and the last ten rows of the cleaned frame in one view.
preview_positions = np.concatenate([np.arange(0, 10), np.arange(-10, 0)])
search_trend_df.iloc[preview_positions]

Unnamed: 0,region_name,date,symptom:Adrenal crisis,symptom:Ageusia,symptom:Allergic conjunctivitis,symptom:Amblyopia,symptom:Amenorrhea,symptom:Angular cheilitis,symptom:Anosmia,symptom:Aphonia,...,symptom:Tenderness,symptom:Thyroid nodule,symptom:Trichoptilosis,symptom:Upper respiratory tract infection,symptom:Urethritis,symptom:Urinary urgency,symptom:Vasculitis,symptom:Ventricular fibrillation,symptom:Viral pneumonia,symptom:Yawn
0,Alaska,2020-01-06,12.69,10.484588,7.344207,4.651795,12.29,6.945278,12.655911,7.666147,...,14.28,15.86,11.9,18.84,13.88,7.244731,14.08,9.91,12.117,14.28
1,Alaska,2020-01-13,9.56,10.484588,7.344207,4.651795,12.62,6.945278,12.655911,7.666147,...,13.39,12.24,7.84,16.83,13.01,7.65,15.3,8.41,12.117,16.26
2,Alaska,2020-01-20,11.31113,10.484588,7.344207,4.651795,14.17,6.945278,7.36,7.666147,...,12.88,17.66,15.27,18.4,12.14,7.54,10.3,16.19,12.117,17.48
3,Alaska,2020-01-27,15.31,7.47,7.344207,4.651795,15.67,7.47,12.655911,7.65,...,13.3,12.76,9.84,20.41,17.49,7.244731,11.3,7.978223,12.117,10.93
4,Alaska,2020-02-03,8.81,10.484588,7.344207,4.651795,13.87,8.06,12.655911,12.56,...,11.81,16.68,10.68,20.62,16.87,7.244731,14.24,10.68,10.5,18.93
5,Alaska,2020-02-10,11.97,10.484588,7.344207,4.651795,11.97,6.945278,12.655911,10.88,...,9.97,15.05,12.51,18.68,14.69,7.44,9.07,7.978223,7.8,17.05
6,Alaska,2020-02-17,9.16,10.484588,7.344207,4.651795,9.33,6.945278,12.655911,9.16,...,22.56,14.42,10.18,16.96,14.93,7.244731,13.57,8.65,8.48,16.62
7,Alaska,2020-02-24,10.44,6.63,7.344207,4.651795,11.44,10.61,9.78,7.29,...,11.77,17.57,15.58,18.56,10.44,7.13,12.26,8.62,11.1,14.42
8,Alaska,2020-03-02,14.62,10.484588,7.344207,4.651795,9.97,6.945278,12.655911,7.666147,...,10.8,19.43,12.96,21.76,11.29,7.244731,13.29,10.46,14.12,9.3
9,Alaska,2020-03-09,10.6,10.484588,11.61,4.651795,12.79,6.945278,7.07,7.91,...,12.96,11.11,13.97,46.95,10.27,7.244731,9.26,7.978223,18.85,10.77


Thus, we reduce the number of columns from 430 to 123

Yet, we still see a lot of NaNs, let's check the number of NaNs in each column

In [7]:
# Per-column count of missing values.
search_trend_df.isna().sum()

region_name                         0
date                                0
symptom:Adrenal crisis              0
symptom:Ageusia                     0
symptom:Allergic conjunctivitis     0
                                   ..
symptom:Urinary urgency             0
symptom:Vasculitis                  0
symptom:Ventricular fibrillation    0
symptom:Viral pneumonia             0
symptom:Yawn                        0
Length: 123, dtype: int64

We will determine if we need to deal with these during training

### Now, it's time to clean the hospitalization dataset

We still need to have a look at the dataset first

In [8]:
# Display the raw hospitalization frame as the cell output (pandas abbreviates large frames).
hospitalization_df

Unnamed: 0,open_covid_region_code,region_name,date,cases_cumulative,cases_new,cases_cumulative_per_million,cases_new_per_million,deaths_cumulative,deaths_new,deaths_cumulative_per_million,...,stringency_index,stringency_index_for_display,stringency_legacy_index,stringency_legacy_index_for_display,government_response_index,government_response_index_for_display,containment_health_index,containment_health_index_for_display,economic_support_index,economic_support_index_for_display
0,AFG,Afghanistan,2019-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,AFG,Afghanistan,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
2,AFG,Afghanistan,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
3,AFG,Afghanistan,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
4,AFG,Afghanistan,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98941,VUT,Vanuatu,2020-10-05,,,,,,,,...,22.22,22.22,28.57,28.57,25.64,25.64,25.76,25.76,25.0,25.0
98942,VUT,Vanuatu,2020-10-06,,,,,,,,...,,22.22,,28.57,,25.64,,25.76,,25.0
98943,VUT,Vanuatu,2020-10-07,,,,,,,,...,,22.22,,28.57,,25.64,,25.76,,25.0
98944,VUT,Vanuatu,2020-10-08,,,,,,,,...,,22.22,,28.57,,25.64,,25.76,,25.0


Since we only need `open_covid_region_code`, `region_name`,  `date`, and `hospitalized_new`, we will select only these columns

And in this mini-project only, we will use only data gathered from US


In [9]:
# Keep only rows whose region code starts with `US-`.
# `startswith` is a literal prefix match, and `na=False` treats a missing
# region code as "no match" instead of producing a NaN mask that cannot be
# used for boolean filtering.
is_us_region = hospitalization_df['open_covid_region_code'].str.startswith('US-', na=False)

# Select the three columns that are kept. The region code is only needed for
# the filter above, so it is not carried over. The explicit copy makes the
# result an independent frame, so the later column assignment is unambiguous.
hospitalization_df = hospitalization_df.loc[
    is_us_region, ['region_name', 'date', 'hospitalized_new']
].copy()

# Regions whose `hospitalized_new` is 0 on every row are kept as they are.
hospitalization_df

Unnamed: 0,region_name,date,hospitalized_new
85278,Wyoming,2020-03-07,0.0
85279,Wyoming,2020-03-08,0.0
85280,Wyoming,2020-03-09,0.0
85281,Wyoming,2020-03-10,0.0
85282,Wyoming,2020-03-11,0.0
...,...,...,...
97526,Alaska,2020-10-04,0.0
97527,Alaska,2020-10-05,0.0
97528,Alaska,2020-10-06,0.0
97529,Alaska,2020-10-07,0.0


Since the data here are reported daily while the data in `search_trend_df` are weekly, we need to aggregate the hospitalization data to a weekly basis.

In [10]:
# Parse the date strings so the column can drive a time-based resample.
hospitalization_df['date'] = pd.to_datetime(hospitalization_df['date'], format='%Y-%m-%d')

# Sum the daily counts per region into weekly bins. 'W' bins end on a Sunday
# and are labelled with that Sunday. Selecting `hospitalized_new` explicitly
# keeps the aggregation numeric-only.
hospitalization_df = (
    hospitalization_df
    .groupby('region_name')
    .resample('W', on='date')['hospitalized_new']
    .sum()
    .reset_index()
)

# Move each weekly label forward one day (Sunday -> Monday) so it matches the
# Monday-dated rows of `search_trend_df`. This replaces the `loffset='1d'`
# argument of `resample`, which was deprecated in pandas 1.1 and removed in 2.0.
# NOTE(review): with this labelling a Monday label covers the seven days that
# end on the preceding Sunday. Confirm that is the intended alignment with the
# search-trend weeks.
# NOTE: re-running this cell re-aggregates the weekly rows and shifts the
# labels again; restart from the loading cell before re-running.
hospitalization_df['date'] = hospitalization_df['date'] + pd.Timedelta(days=1)
hospitalization_df

Unnamed: 0,region_name,date,hospitalized_new
0,Alabama,2020-03-09,0.0
1,Alabama,2020-03-16,0.0
2,Alabama,2020-03-23,0.0
3,Alabama,2020-03-30,0.0
4,Alabama,2020-04-06,231.0
...,...,...,...
1795,Wyoming,2020-09-14,9.0
1796,Wyoming,2020-09-21,15.0
1797,Wyoming,2020-09-28,19.0
1798,Wyoming,2020-10-05,20.0


Now that the cleaning is done, we merge the datasets

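The column `open_covid_region_code` has already been dropped from both datasets above, since the region name carries the same information.

Likewise, `sub_region_1` has already been renamed to `region_name`, so both datasets share the join keys `region_name` and `date`.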

In [11]:
# Parse the search-trend dates so both frames hold datetime values in the `date` key.
search_trend_df['date'] = pd.to_datetime(search_trend_df['date'], format='%Y-%m-%d')

# Inner join: only (region, week) pairs present in both frames are kept.
merge_keys = ['region_name', 'date']
result_df = pd.merge(search_trend_df, hospitalization_df, how='inner', on=merge_keys)
result_df

Unnamed: 0,region_name,date,symptom:Adrenal crisis,symptom:Ageusia,symptom:Allergic conjunctivitis,symptom:Amblyopia,symptom:Amenorrhea,symptom:Angular cheilitis,symptom:Anosmia,symptom:Aphonia,...,symptom:Thyroid nodule,symptom:Trichoptilosis,symptom:Upper respiratory tract infection,symptom:Urethritis,symptom:Urinary urgency,symptom:Vasculitis,symptom:Ventricular fibrillation,symptom:Viral pneumonia,symptom:Yawn,hospitalized_new
0,Alaska,2020-03-09,10.60,10.484588,11.610000,4.651795,12.790000,6.945278,7.070000,7.910000,...,11.11,13.970000,46.950000,10.270000,7.244731,9.26,7.978223,18.850,10.77,0.0
1,Alaska,2020-03-16,11.69,10.484588,6.390000,4.651795,10.525286,6.945278,15.590000,7.950000,...,14.49,9.040000,78.710000,12.000000,7.244731,11.53,7.978223,24.940,14.34,0.0
2,Alaska,2020-03-23,11.15,16.570000,7.344207,4.651795,9.530000,6.945278,45.170000,7.666147,...,7.63,8.360000,37.540000,8.800000,7.244731,6.75,7.040000,13.790,15.69,0.0
3,Alaska,2020-03-30,8.96,8.960000,7.110000,4.651795,8.530000,6.945278,19.050000,7.666147,...,9.38,14.790000,24.460000,9.810000,7.960000,7.54,7.110000,8.820,17.63,6.0
4,Alaska,2020-04-06,10.08,7.350000,7.344207,4.651795,10.800000,7.060000,6.770000,7.666147,...,8.64,13.830000,16.280000,10.080000,7.244731,11.09,7.978223,9.070,14.12,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,Wyoming,2020-08-31,5.03,2.790000,7.344207,5.750000,4.190000,6.945278,12.655911,7.666147,...,3.07,2.570000,2.680000,9.817435,7.244731,5.08,7.978223,12.117,3.63,8.0
176,Wyoming,2020-09-07,3.36,3.250000,2.200000,3.630000,10.525286,2.250000,2.700000,7.666147,...,2.81,9.608908,3.190000,3.190000,7.244731,4.51,7.978223,12.117,4.02,4.0
177,Wyoming,2020-09-14,4.15,4.970000,7.344207,5.140000,5.790000,6.945278,4.590000,7.666147,...,5.63,3.060000,4.590000,3.220000,2.680000,3.88,7.978223,12.117,4.37,9.0
178,Wyoming,2020-09-21,5.64,3.670000,7.344207,5.250000,3.770000,6.945278,5.090000,7.666147,...,3.72,3.170000,3.230000,2.900000,7.244731,5.14,3.230000,12.117,4.76,15.0


We may need to use the dataframe in another file, so we export it as a CSV file

And also, we do not want to have index in our generated CSV file, so we use `index=False`

In [12]:
# Write the merged frame to disk without the row index.
result_df.to_csv(path_or_buf='result.csv', index=False)

### The cleaning process is done