#### import the required libs (numpy and pandas)

In [1]:
import pandas as pd
import numpy as np
pd.set_option('expand_frame_repr', False)

#### load csvs to pandas dataframes, we will use online data for the best accuracy

In [2]:
search_trend_url ="https://raw.githubusercontent.com/google-research/open-covid-19-data/master/data/exports/search_trends_symptoms_dataset/United%20States%20of%20America/2020_US_weekly_symptoms_dataset.csv"
hospitalization_url ="https://raw.githubusercontent.com/google-research/open-covid-19-data/master/data/exports/cc_by/aggregated_cc_by.csv"
search_trend_df = pd.read_csv(search_trend_url)
hospitalization_df = pd.read_csv(hospitalization_url)


### We will try to clean the search trend dataset first

after loading the dataset to pandas dataframe, we will check how many parameters does the dataframe contains

In [3]:
len(search_trend_df.columns.values)

430

#### Since we have 430 parameters, we now examine the data, and try to determine if we can drop some of the columns to avoid over-fitting

In [4]:
search_trend_df.iloc[np.r_[0:10,-10:0]]

Unnamed: 0,open_covid_region_code,country_region_code,country_region,sub_region_1,sub_region_1_code,sub_region_2,sub_region_2_code,date,symptom:Abdominal obesity,symptom:Abdominal pain,...,symptom:Wart,symptom:Water retention,symptom:Weakness,symptom:Weight gain,symptom:Wheeze,symptom:Xeroderma,symptom:Xerostomia,symptom:Yawn,symptom:hyperhidrosis,symptom:pancreatitis
0,US-AK,US,United States,Alaska,US-AK,,,2020-01-06,,,...,,,,,,,,14.28,,
1,US-AK,US,United States,Alaska,US-AK,,,2020-01-13,,,...,,,,,,,,16.26,,
2,US-AK,US,United States,Alaska,US-AK,,,2020-01-20,,,...,,,,,,,,17.48,,
3,US-AK,US,United States,Alaska,US-AK,,,2020-01-27,,,...,,,,,,,,10.93,,
4,US-AK,US,United States,Alaska,US-AK,,,2020-02-03,,,...,,,,,,,,18.93,,
5,US-AK,US,United States,Alaska,US-AK,,,2020-02-10,,,...,,,,,,,,17.05,,
6,US-AK,US,United States,Alaska,US-AK,,,2020-02-17,,,...,,,,,,,,16.62,,
7,US-AK,US,United States,Alaska,US-AK,,,2020-02-24,,,...,,,,,,,,14.42,,
8,US-AK,US,United States,Alaska,US-AK,,,2020-03-02,,,...,,,,,,,,9.3,,
9,US-AK,US,United States,Alaska,US-AK,,,2020-03-09,,,...,,,,,,,,10.77,,


Since we have several NaN columns, we can drop the columns where all data in the column are NaN

And also, by looking at the dataset, columns named as `country_region_code`, `country_region`, and `sub_region_1_code` can be dropped.

In [5]:
# search_trend_df = search_trend_df.dropna(axis=1, how='all')
search_trend_df = search_trend_df.fillna(0)
search_trend_df = search_trend_df.drop(['country_region_code', 'country_region', 'sub_region_1_code','sub_region_2','sub_region_2_code'], axis=1)
search_trend_df


Unnamed: 0,open_covid_region_code,sub_region_1,date,symptom:Abdominal obesity,symptom:Abdominal pain,symptom:Acne,symptom:Actinic keratosis,symptom:Acute bronchitis,symptom:Adrenal crisis,symptom:Ageusia,...,symptom:Wart,symptom:Water retention,symptom:Weakness,symptom:Weight gain,symptom:Wheeze,symptom:Xeroderma,symptom:Xerostomia,symptom:Yawn,symptom:hyperhidrosis,symptom:pancreatitis
0,US-AK,Alaska,2020-01-06,0.0,0.0,0.0,0.0,0.0,12.69,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.28,0.0,0.0
1,US-AK,Alaska,2020-01-13,0.0,0.0,0.0,0.0,0.0,9.56,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.26,0.0,0.0
2,US-AK,Alaska,2020-01-20,0.0,0.0,0.0,0.0,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.48,0.0,0.0
3,US-AK,Alaska,2020-01-27,0.0,0.0,0.0,0.0,0.0,15.31,7.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.93,0.0,0.0
4,US-AK,Alaska,2020-02-03,0.0,0.0,0.0,0.0,0.0,8.81,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.93,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,US-WY,Wyoming,2020-08-31,0.0,0.0,0.0,0.0,0.0,5.03,2.79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.63,0.0,0.0
620,US-WY,Wyoming,2020-09-07,0.0,0.0,0.0,0.0,0.0,3.36,3.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.02,0.0,0.0
621,US-WY,Wyoming,2020-09-14,0.0,0.0,0.0,0.0,0.0,4.15,4.97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.37,0.0,0.0
622,US-WY,Wyoming,2020-09-21,0.0,0.0,0.0,0.0,0.0,5.64,3.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.76,0.0,0.0


now, let's check the dataset again

In [6]:
search_trend_df.iloc[np.r_[0:10,-10:0]]

Unnamed: 0,open_covid_region_code,sub_region_1,date,symptom:Abdominal obesity,symptom:Abdominal pain,symptom:Acne,symptom:Actinic keratosis,symptom:Acute bronchitis,symptom:Adrenal crisis,symptom:Ageusia,...,symptom:Wart,symptom:Water retention,symptom:Weakness,symptom:Weight gain,symptom:Wheeze,symptom:Xeroderma,symptom:Xerostomia,symptom:Yawn,symptom:hyperhidrosis,symptom:pancreatitis
0,US-AK,Alaska,2020-01-06,0.0,0.0,0.0,0.0,0.0,12.69,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.28,0.0,0.0
1,US-AK,Alaska,2020-01-13,0.0,0.0,0.0,0.0,0.0,9.56,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.26,0.0,0.0
2,US-AK,Alaska,2020-01-20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.48,0.0,0.0
3,US-AK,Alaska,2020-01-27,0.0,0.0,0.0,0.0,0.0,15.31,7.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.93,0.0,0.0
4,US-AK,Alaska,2020-02-03,0.0,0.0,0.0,0.0,0.0,8.81,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.93,0.0,0.0
5,US-AK,Alaska,2020-02-10,0.0,0.0,0.0,0.0,0.0,11.97,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.05,0.0,0.0
6,US-AK,Alaska,2020-02-17,0.0,0.0,0.0,0.0,0.0,9.16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.62,0.0,0.0
7,US-AK,Alaska,2020-02-24,0.0,0.0,0.0,0.0,0.0,10.44,6.63,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.42,0.0,0.0
8,US-AK,Alaska,2020-03-02,0.0,0.0,0.0,0.0,0.0,14.62,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.3,0.0,0.0
9,US-AK,Alaska,2020-03-09,0.0,0.0,0.0,0.0,0.0,10.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.77,0.0,0.0


Thus, we reduce the number of columns from 430 to 123

Yet, we still see a lot of NaNs, let's check the number of NaNs in each column

In [7]:
search_trend_df.isnull().sum()

open_covid_region_code       0
sub_region_1                 0
date                         0
symptom:Abdominal obesity    0
symptom:Abdominal pain       0
                            ..
symptom:Xeroderma            0
symptom:Xerostomia           0
symptom:Yawn                 0
symptom:hyperhidrosis        0
symptom:pancreatitis         0
Length: 425, dtype: int64

In case we need to drop several columns that contains too many NaNs later, we will modify code below to achieve that

In [8]:
# cols_drop = []
# for col in search_trend_df.columns.values:
#     if search_trend_df[col].isnull().sum()/ (float)len(search_trend_df) > .75
#         cols_drop.append(col)
# search_trend_df = search_trend_df.drop(cols_drop, axis=1)

### Now, it's time to clean the hospitalization dataset

We still need to have a look at the dataset first

In [9]:
hospitalization_df

Unnamed: 0,open_covid_region_code,region_name,date,cases_cumulative,cases_new,cases_cumulative_per_million,cases_new_per_million,deaths_cumulative,deaths_new,deaths_cumulative_per_million,...,stringency_index,stringency_index_for_display,stringency_legacy_index,stringency_legacy_index_for_display,government_response_index,government_response_index_for_display,containment_health_index,containment_health_index_for_display,economic_support_index,economic_support_index_for_display
0,AFG,Afghanistan,2019-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,AFG,Afghanistan,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
2,AFG,Afghanistan,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
3,AFG,Afghanistan,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
4,AFG,Afghanistan,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98941,VUT,Vanuatu,2020-10-05,,,,,,,,...,22.22,22.22,28.57,28.57,25.64,25.64,25.76,25.76,25.0,25.0
98942,VUT,Vanuatu,2020-10-06,,,,,,,,...,,22.22,,28.57,,25.64,,25.76,,25.0
98943,VUT,Vanuatu,2020-10-07,,,,,,,,...,,22.22,,28.57,,25.64,,25.76,,25.0
98944,VUT,Vanuatu,2020-10-08,,,,,,,,...,,22.22,,28.57,,25.64,,25.76,,25.0


Since we only need `open_covid_region_code`, `date`, and `hospitalized_new`, we will select only these columns

And in this mini-project only, we will use only data gathered from US


In [10]:
hospitalization_df = hospitalization_df[['open_covid_region_code','date','hospitalized_new']]
hospitalization_df = hospitalization_df[hospitalization_df['open_covid_region_code'].str.contains('^US-')]
hospitalization_df

Unnamed: 0,open_covid_region_code,date,hospitalized_new
85278,US-WY,2020-03-07,0.0
85279,US-WY,2020-03-08,0.0
85280,US-WY,2020-03-09,0.0
85281,US-WY,2020-03-10,0.0
85282,US-WY,2020-03-11,0.0
...,...,...,...
97526,US-AK,2020-10-04,0.0
97527,US-AK,2020-10-05,0.0
97528,US-AK,2020-10-06,0.0
97529,US-AK,2020-10-07,0.0


Since the data here are daily basis, and the data in search_trend_df are weekly basis, we need to convert these to weekly basis

In [11]:
hospitalization_df['date'] = pd.to_datetime(hospitalization_df['date'], format='%Y-%m-%d')
hospitalization_df = hospitalization_df.groupby(['open_covid_region_code',]).resample('W', on='date',loffset='1d').sum()
hospitalization_df = hospitalization_df.reset_index()
hospitalization_df

Unnamed: 0,open_covid_region_code,date,hospitalized_new
0,US-AK,2020-03-03,0.0
1,US-AK,2020-03-10,0.0
2,US-AK,2020-03-17,0.0
3,US-AK,2020-03-24,6.0
4,US-AK,2020-03-31,14.0
...,...,...,...
1795,US-WY,2020-09-08,9.0
1796,US-WY,2020-09-15,15.0
1797,US-WY,2020-09-22,19.0
1798,US-WY,2020-09-29,20.0


now the cleanning is done, we merge the datasets

We also drop column `open_covid_region_code` since we already have `sub_region_1` 

For future purpose, we rename `sub_region_1` to `region_name`

In [18]:
search_trend_df['date'] = pd.to_datetime(search_trend_df['date'], format='%Y-%m-%d')
result_df = search_trend_df.merge(hospitalization_df, on=['open_covid_region_code', 'date'])
result_df = result_df.rename({'sub_region_1': 'region_name'}, axis=1)
result_df = result_df.drop('open_covid_region_code',axis=1)
result_df

Unnamed: 0,region_name,date,symptom:Abdominal obesity,symptom:Abdominal pain,symptom:Acne,symptom:Actinic keratosis,symptom:Acute bronchitis,symptom:Adrenal crisis,symptom:Ageusia,symptom:Alcoholism,...,symptom:Water retention,symptom:Weakness,symptom:Weight gain,symptom:Wheeze,symptom:Xeroderma,symptom:Xerostomia,symptom:Yawn,symptom:hyperhidrosis,symptom:pancreatitis,hospitalized_new


We may need to use the dataframe in another file, so we export it as a CSV file

And also, we do not want to have index in our generated CSV file, so we use `index=False`

In [13]:
result_df.to_csv('result.csv', index=False)

### The cleaning process is done