#### import the required libs (numpy and pandas)

In [1]:
import pandas as pd
import numpy as np
# Print wide DataFrames on one line instead of wrapping their columns across several blocks.
pd.options.display.expand_frame_repr = False

#### load csvs to pandas dataframes, we will use online data for the best accuracy

In [2]:
# Both CSVs are fetched over HTTP from the google-research/open-covid-19-data
# repository, so this cell needs network access.
search_trend_url = (
    "https://raw.githubusercontent.com/google-research/open-covid-19-data/master/"
    "data/exports/search_trends_symptoms_dataset/United%20States%20of%20America/"
    "2020_US_weekly_symptoms_dataset.csv"
)
hospitalization_url = (
    "https://raw.githubusercontent.com/google-research/open-covid-19-data/master/"
    "data/exports/cc_by/aggregated_cc_by.csv"
)
search_trend_df, hospitalization_df = (
    pd.read_csv(csv_url) for csv_url in (search_trend_url, hospitalization_url)
)


### We will try to clean the search trend dataset first

After loading the dataset into a pandas DataFrame, we check how many parameters (columns) the DataFrame contains

In [3]:
# Number of columns = second entry of the (rows, columns) shape tuple.
search_trend_df.shape[1]

430

#### Since we have 430 parameters, we now examine the data, and try to determine if we can drop some of the columns to avoid over-fitting

In [4]:
# Positions of the first ten and the last ten rows, shown together in one table.
head_and_tail = np.r_[0:10, -10:0]
search_trend_df.iloc[head_and_tail]

Unnamed: 0,open_covid_region_code,country_region_code,country_region,sub_region_1,sub_region_1_code,sub_region_2,sub_region_2_code,date,symptom:Abdominal obesity,symptom:Abdominal pain,...,symptom:Wart,symptom:Water retention,symptom:Weakness,symptom:Weight gain,symptom:Wheeze,symptom:Xeroderma,symptom:Xerostomia,symptom:Yawn,symptom:hyperhidrosis,symptom:pancreatitis
0,US-AK,US,United States,Alaska,US-AK,,,2020-01-06,,,...,,,,,,,,14.28,,
1,US-AK,US,United States,Alaska,US-AK,,,2020-01-13,,,...,,,,,,,,16.26,,
2,US-AK,US,United States,Alaska,US-AK,,,2020-01-20,,,...,,,,,,,,17.48,,
3,US-AK,US,United States,Alaska,US-AK,,,2020-01-27,,,...,,,,,,,,10.93,,
4,US-AK,US,United States,Alaska,US-AK,,,2020-02-03,,,...,,,,,,,,18.93,,
5,US-AK,US,United States,Alaska,US-AK,,,2020-02-10,,,...,,,,,,,,17.05,,
6,US-AK,US,United States,Alaska,US-AK,,,2020-02-17,,,...,,,,,,,,16.62,,
7,US-AK,US,United States,Alaska,US-AK,,,2020-02-24,,,...,,,,,,,,14.42,,
8,US-AK,US,United States,Alaska,US-AK,,,2020-03-02,,,...,,,,,,,,9.3,,
9,US-AK,US,United States,Alaska,US-AK,,,2020-03-09,,,...,,,,,,,,10.77,,


Since we have several NaN columns, we can drop the columns where all data in the column are NaN

Also, by looking at the dataset, the columns country_region_code, country_region, and sub_region_1_code can be dropped, because open_covid_region_code alone already identifies the region (sub_region_1 is kept as the readable region name)

In [5]:
# Step 1: discard every column that holds no data at all.
# Step 2: discard the region descriptor columns listed below; the region is
# still identified by open_covid_region_code.
redundant_region_cols = ['country_region_code', 'country_region', 'sub_region_1_code']
search_trend_df = (
    search_trend_df
    .dropna(axis='columns', how='all')
    .drop(columns=redundant_region_cols)
)
search_trend_df


Unnamed: 0,open_covid_region_code,sub_region_1,date,symptom:Adrenal crisis,symptom:Ageusia,symptom:Allergic conjunctivitis,symptom:Amblyopia,symptom:Amenorrhea,symptom:Angular cheilitis,symptom:Anosmia,...,symptom:Tenderness,symptom:Thyroid nodule,symptom:Trichoptilosis,symptom:Upper respiratory tract infection,symptom:Urethritis,symptom:Urinary urgency,symptom:Vasculitis,symptom:Ventricular fibrillation,symptom:Viral pneumonia,symptom:Yawn
0,US-AK,Alaska,2020-01-06,12.69,,,,12.29,,,...,14.28,15.86,11.90,18.84,13.88,,14.08,9.91,,14.28
1,US-AK,Alaska,2020-01-13,9.56,,,,12.62,,,...,13.39,12.24,7.84,16.83,13.01,7.65,15.30,8.41,,16.26
2,US-AK,Alaska,2020-01-20,,,,,14.17,,7.36,...,12.88,17.66,15.27,18.40,12.14,7.54,10.30,16.19,,17.48
3,US-AK,Alaska,2020-01-27,15.31,7.47,,,15.67,7.47,,...,13.30,12.76,9.84,20.41,17.49,,11.30,,,10.93
4,US-AK,Alaska,2020-02-03,8.81,,,,13.87,8.06,,...,11.81,16.68,10.68,20.62,16.87,,14.24,10.68,10.5,18.93
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,US-WY,Wyoming,2020-08-24,4.42,3.31,3.42,5.19,3.42,,2.65,...,3.98,3.87,2.26,3.98,4.09,,3.64,2.48,,3.87
604,US-WY,Wyoming,2020-08-31,5.03,2.79,,5.75,4.19,,,...,3.07,3.07,2.57,2.68,,,5.08,,,3.63
605,US-WY,Wyoming,2020-09-07,3.36,3.25,2.20,3.63,,2.25,2.70,...,4.29,2.81,,3.19,3.19,,4.51,,,4.02
606,US-WY,Wyoming,2020-09-14,4.15,4.97,,5.14,5.79,,4.59,...,3.93,5.63,3.06,4.59,3.22,2.68,3.88,,,4.37


now, let's check the dataset again

In [6]:
# Same preview as before the column drop: first ten rows followed by the last ten.
head_and_tail = np.r_[0:10, -10:0]
search_trend_df.iloc[head_and_tail]

Unnamed: 0,open_covid_region_code,sub_region_1,date,symptom:Adrenal crisis,symptom:Ageusia,symptom:Allergic conjunctivitis,symptom:Amblyopia,symptom:Amenorrhea,symptom:Angular cheilitis,symptom:Anosmia,...,symptom:Tenderness,symptom:Thyroid nodule,symptom:Trichoptilosis,symptom:Upper respiratory tract infection,symptom:Urethritis,symptom:Urinary urgency,symptom:Vasculitis,symptom:Ventricular fibrillation,symptom:Viral pneumonia,symptom:Yawn
0,US-AK,Alaska,2020-01-06,12.69,,,,12.29,,,...,14.28,15.86,11.9,18.84,13.88,,14.08,9.91,,14.28
1,US-AK,Alaska,2020-01-13,9.56,,,,12.62,,,...,13.39,12.24,7.84,16.83,13.01,7.65,15.3,8.41,,16.26
2,US-AK,Alaska,2020-01-20,,,,,14.17,,7.36,...,12.88,17.66,15.27,18.4,12.14,7.54,10.3,16.19,,17.48
3,US-AK,Alaska,2020-01-27,15.31,7.47,,,15.67,7.47,,...,13.3,12.76,9.84,20.41,17.49,,11.3,,,10.93
4,US-AK,Alaska,2020-02-03,8.81,,,,13.87,8.06,,...,11.81,16.68,10.68,20.62,16.87,,14.24,10.68,10.5,18.93
5,US-AK,Alaska,2020-02-10,11.97,,,,11.97,,,...,9.97,15.05,12.51,18.68,14.69,7.44,9.07,,7.8,17.05
6,US-AK,Alaska,2020-02-17,9.16,,,,9.33,,,...,22.56,14.42,10.18,16.96,14.93,,13.57,8.65,8.48,16.62
7,US-AK,Alaska,2020-02-24,10.44,6.63,,,11.44,10.61,9.78,...,11.77,17.57,15.58,18.56,10.44,7.13,12.26,8.62,11.1,14.42
8,US-AK,Alaska,2020-03-02,14.62,,,,9.97,,,...,10.8,19.43,12.96,21.76,11.29,,13.29,10.46,14.12,9.3
9,US-AK,Alaska,2020-03-09,10.6,,11.61,,12.79,,7.07,...,12.96,11.11,13.97,46.95,10.27,,9.26,,18.85,10.77


Thus, we reduced the number of columns from 430 to 124

Yet, we still see a lot of NaNs, let's check the number of NaNs in each column

In [7]:
# Per-column count of missing values (isna is the pandas alias of isnull).
search_trend_df.isna().sum()

open_covid_region_code                0
sub_region_1                          0
date                                  0
symptom:Adrenal crisis              346
symptom:Ageusia                     370
                                   ... 
symptom:Urinary urgency             218
symptom:Vasculitis                  535
symptom:Ventricular fibrillation    189
symptom:Viral pneumonia             218
symptom:Yawn                        533
Length: 124, dtype: int64

In case we later need to drop the columns that contain too many NaNs, we can enable and adjust the code below

In [8]:
# Optional step, intentionally left disabled: drop every column in which more
# than 75% of the values are NaN. Uncomment the lines below to enable it.
# (Python 3 `/` is already true division, so no float cast is needed, and the
# `if` line must end with a colon.)
# cols_drop = []
# for col in search_trend_df.columns.values:
#     if search_trend_df[col].isnull().sum() / len(search_trend_df) > .75:
#         cols_drop.append(col)
# search_trend_df = search_trend_df.drop(cols_drop, axis=1)

### Now, it's time to clean the hospitalization dataset

We still need to have a look at the dataset first

In [9]:
# Show the raw hospitalization frame (pandas renders a truncated preview) to
# see which columns are available before selecting the ones we need.
hospitalization_df

Unnamed: 0,open_covid_region_code,region_name,date,cases_cumulative,cases_new,cases_cumulative_per_million,cases_new_per_million,deaths_cumulative,deaths_new,deaths_cumulative_per_million,...,stringency_index,stringency_index_for_display,stringency_legacy_index,stringency_legacy_index_for_display,government_response_index,government_response_index_for_display,containment_health_index,containment_health_index_for_display,economic_support_index,economic_support_index_for_display
0,AFG,Afghanistan,2019-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,AFG,Afghanistan,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
2,AFG,Afghanistan,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
3,AFG,Afghanistan,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
4,AFG,Afghanistan,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98429,VUT,Vanuatu,2020-10-04,,,,,,,,...,22.22,22.22,28.57,28.57,25.64,25.64,25.76,25.76,25.0,25.0
98430,VUT,Vanuatu,2020-10-05,,,,,,,,...,22.22,22.22,28.57,28.57,25.64,25.64,25.76,25.76,25.0,25.0
98431,VUT,Vanuatu,2020-10-06,,,,,,,,...,,22.22,,28.57,,25.64,,25.76,,25.0
98432,VUT,Vanuatu,2020-10-07,,,,,,,,...,,22.22,,28.57,,25.64,,25.76,,25.0


Since we only need open_covid_region_code, date, and hospitalized_new, we will select only these columns

In this mini-project, we use only the data gathered from the US


In [10]:
# Keep only the region key, the date and the target column, restricted to rows
# whose region code starts with "US-".
#
# - str.startswith is a literal prefix test, so no regular expression is needed;
#   na=False makes a missing region code count as "not US" instead of putting
#   NaN into the boolean mask (which boolean indexing rejects).
# - .copy() returns an independent frame, so later cells can modify it without
#   pandas warning about writing to a slice of the full dataset
#   (SettingWithCopyWarning).
is_us_region = hospitalization_df['open_covid_region_code'].str.startswith('US-', na=False)
hospitalization_df = hospitalization_df.loc[
    is_us_region, ['open_covid_region_code', 'date', 'hospitalized_new']
].copy()
hospitalization_df

Unnamed: 0,open_covid_region_code,date,hospitalized_new
84830,US-WY,2020-03-07,0.0
84831,US-WY,2020-03-08,0.0
84832,US-WY,2020-03-09,0.0
84833,US-WY,2020-03-10,0.0
84834,US-WY,2020-03-11,0.0
...,...,...,...
97019,US-AK,2020-10-03,0.0
97020,US-AK,2020-10-04,0.0
97021,US-AK,2020-10-05,0.0
97022,US-AK,2020-10-06,0.0


Since the data here is on a daily basis while the data in search_trend_df is on a weekly basis, we need to aggregate the daily data to a weekly basis

In [11]:
# Aggregate the daily hospitalization counts into weekly totals per region so
# they can be joined with the weekly search-trend rows.

# Parse the date strings; .assign builds a new frame rather than writing into
# the (possibly sliced) input frame.
hospitalization_df = hospitalization_df.assign(
    date=pd.to_datetime(hospitalization_df['date'], format='%Y-%m-%d')
)

# 'W' bins run Monday..Sunday and are labelled with the closing Sunday.
# The value column is selected explicitly so the string region key is never
# part of the aggregation, whatever the pandas version.
# resample(loffset=...) was deprecated in pandas 1.1 and removed in 2.0, so the
# one-day shift of the label (Sunday -> following Monday) is applied explicitly.
hospitalization_df = (
    hospitalization_df
    .groupby('open_covid_region_code')
    .resample('W', on='date')['hospitalized_new']
    .sum()
    .reset_index()
    .assign(date=lambda weekly: weekly['date'] + pd.Timedelta(days=1))
)
# NOTE(review): each label is therefore the Monday *after* the Mon-Sun week it
# sums. If the search-trend `date` marks the first day of its week, the merge
# pairs searches with the previous week's hospitalizations - confirm this is
# the intended alignment.
hospitalization_df

Unnamed: 0,open_covid_region_code,date,hospitalized_new
0,US-AK,2020-03-09,0.0
1,US-AK,2020-03-16,0.0
2,US-AK,2020-03-23,0.0
3,US-AK,2020-03-30,6.0
4,US-AK,2020-04-06,14.0
...,...,...,...
1795,US-WY,2020-09-14,9.0
1796,US-WY,2020-09-21,15.0
1797,US-WY,2020-09-28,19.0
1798,US-WY,2020-10-05,20.0


Now that the cleaning is done, we merge the datasets

We also drop column `open_covid_region_code` since we already have the region name in `sub_region_1`

In [12]:
# Both frames must carry datetime dates for the join keys to match.
search_trend_df['date'] = pd.to_datetime(search_trend_df['date'], format='%Y-%m-%d')

# Inner join (the merge default): only region/week pairs present in both
# frames are kept. The region code is dropped afterwards.
join_keys = ['open_covid_region_code', 'date']
result_df = (
    pd.merge(search_trend_df, hospitalization_df, on=join_keys)
    .drop(columns='open_covid_region_code')
)
result_df

Unnamed: 0,open_covid_region_code,sub_region_1,date,symptom:Adrenal crisis,symptom:Ageusia,symptom:Allergic conjunctivitis,symptom:Amblyopia,symptom:Amenorrhea,symptom:Angular cheilitis,symptom:Anosmia,...,symptom:Thyroid nodule,symptom:Trichoptilosis,symptom:Upper respiratory tract infection,symptom:Urethritis,symptom:Urinary urgency,symptom:Vasculitis,symptom:Ventricular fibrillation,symptom:Viral pneumonia,symptom:Yawn,hospitalized_new
0,US-AK,Alaska,2020-03-09,10.60,,11.61,,12.79,,7.07,...,11.11,13.97,46.95,10.27,,9.26,,18.85,10.77,0.0
1,US-AK,Alaska,2020-03-16,11.69,,6.39,,,,15.59,...,14.49,9.04,78.71,12.00,,11.53,,24.94,14.34,0.0
2,US-AK,Alaska,2020-03-23,11.15,16.57,,,9.53,,45.17,...,7.63,8.36,37.54,8.80,,6.75,7.04,13.79,15.69,0.0
3,US-AK,Alaska,2020-03-30,8.96,8.96,7.11,,8.53,,19.05,...,9.38,14.79,24.46,9.81,7.96,7.54,7.11,8.82,17.63,6.0
4,US-AK,Alaska,2020-04-06,10.08,7.35,,,10.80,7.06,6.77,...,8.64,13.83,16.28,10.08,,11.09,,9.07,14.12,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,US-WY,Wyoming,2020-08-24,4.42,3.31,3.42,5.19,3.42,,2.65,...,3.87,2.26,3.98,4.09,,3.64,2.48,,3.87,16.0
461,US-WY,Wyoming,2020-08-31,5.03,2.79,,5.75,4.19,,,...,3.07,2.57,2.68,,,5.08,,,3.63,8.0
462,US-WY,Wyoming,2020-09-07,3.36,3.25,2.20,3.63,,2.25,2.70,...,2.81,,3.19,3.19,,4.51,,,4.02,4.0
463,US-WY,Wyoming,2020-09-14,4.15,4.97,,5.14,5.79,,4.59,...,5.63,3.06,4.59,3.22,2.68,3.88,,,4.37,9.0


We may need to use the dataframe in another file, so we export it as a csv file

In [13]:
# Write the merged frame to the working directory, without the row index.
result_df.to_csv(path_or_buf='result.csv', index=False)

### The cleaning process is done