### Exploratory analysis of covid_cases hospitalisation data set 

In [1]:
#   import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
from urllib.error import HTTPError
import numpy as np
import wget
import time
from datetime import datetime


In [2]:
#   create dataframes from csv's
df = pd.read_csv('../CSVs/covid_cases.csv', 
                 parse_dates=['date'])
df


Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active,new_cases,new_deaths,new_recovered
0,0,AFG,2020-01-22,0,0,0,0,0,0,0
1,1,AFG,2020-01-23,0,0,0,0,0,0,0
2,2,AFG,2020-01-24,0,0,0,0,0,0,0
3,3,AFG,2020-01-25,0,0,0,0,0,0,0
4,4,AFG,2020-01-26,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
142552,142552,ZWE,2022-03-06,239209,5399,0,233810,190,2,0
142553,142553,ZWE,2022-03-07,239710,5399,0,234311,501,0,0
142554,142554,ZWE,2022-03-08,240343,5400,0,234943,633,1,0
142555,142555,ZWE,2022-03-09,240343,5400,0,234943,0,0,0


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142557 entries, 0 to 142556
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   Id             142557 non-null  int64         
 1   country_id     142557 non-null  object        
 2   date           142557 non-null  datetime64[ns]
 3   confirmed      142557 non-null  int64         
 4   deaths         142557 non-null  int64         
 5   recovered      142557 non-null  int64         
 6   active         142557 non-null  int64         
 7   new_cases      142557 non-null  int64         
 8   new_deaths     142557 non-null  int64         
 9   new_recovered  142557 non-null  int64         
dtypes: datetime64[ns](1), int64(8), object(1)
memory usage: 10.9+ MB


In [4]:
df = df.reset_index(drop=True)
df


Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active,new_cases,new_deaths,new_recovered
0,0,AFG,2020-01-22,0,0,0,0,0,0,0
1,1,AFG,2020-01-23,0,0,0,0,0,0,0
2,2,AFG,2020-01-24,0,0,0,0,0,0,0
3,3,AFG,2020-01-25,0,0,0,0,0,0,0
4,4,AFG,2020-01-26,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
142552,142552,ZWE,2022-03-06,239209,5399,0,233810,190,2,0
142553,142553,ZWE,2022-03-07,239710,5399,0,234311,501,0,0
142554,142554,ZWE,2022-03-08,240343,5400,0,234943,633,1,0
142555,142555,ZWE,2022-03-09,240343,5400,0,234943,0,0,0


In [5]:
#   Drop new cases, deaths, recovered columns
df.drop('new_cases', axis=1, inplace=True)
df.drop('new_deaths', axis=1, inplace=True)
df.drop('new_recovered', axis=1, inplace=True)
df


Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active
0,0,AFG,2020-01-22,0,0,0,0
1,1,AFG,2020-01-23,0,0,0,0
2,2,AFG,2020-01-24,0,0,0,0
3,3,AFG,2020-01-25,0,0,0,0
4,4,AFG,2020-01-26,0,0,0,0
...,...,...,...,...,...,...,...
142552,142552,ZWE,2022-03-06,239209,5399,0,233810
142553,142553,ZWE,2022-03-07,239710,5399,0,234311
142554,142554,ZWE,2022-03-08,240343,5400,0,234943
142555,142555,ZWE,2022-03-09,240343,5400,0,234943


In [6]:
#   Save basic covid cases data
df.to_csv('../CSVs/covid_cases_basic.csv')


In [7]:
#   Create a DF for just those countries with hospitalisation data
hosp_data_df = df[df['country_id'].isin(["AUS","AUT","BEL","BOL","BGR","CAN","HRV","CYP","CZE","DNK","EST","FIN","FRA","HUN",
                                         "ISL","IRL","ISR","ITA","JPN","LVA","LTU","LUX","MYS","MLT","NLD","NOR","POL","PRT",
                                         "ROU","SRB","SVK","SVN","ZAF","ESP","SWE","CHE","GBR","USA"])]
hosp_data_df.dropna(axis=1, how="all")
hosp_data_df 


Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active
6232,6232,AUS,2020-01-22,0,0,0,0
6233,6233,AUS,2020-01-23,0,0,0,0
6234,6234,AUS,2020-01-24,0,0,0,0
6235,6235,AUS,2020-01-25,0,0,0,0
6236,6236,AUS,2020-01-26,4,0,0,4
...,...,...,...,...,...,...,...
136320,136320,USA,2022-03-06,79276278,958819,0,78317459
136321,136321,USA,2022-03-07,79339388,960505,0,78378883
136322,136322,USA,2022-03-08,79369007,961843,0,78407164
136323,136323,USA,2022-03-09,79406602,963819,0,78442783


In [8]:
#   Remove hospitalisation data that does not match the range of 04/01/2020 - 03/10/2022
hosp_data_filtered_df = hosp_data_df [hosp_data_df ['date'] > '2020-03-31']
hosp_data_filtered_df


Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active
6302,6302,AUS,2020-04-01,4862,20,422,4420
6303,6303,AUS,2020-04-02,5116,24,520,4572
6304,6304,AUS,2020-04-03,5330,28,649,4653
6305,6305,AUS,2020-04-04,5550,30,701,4819
6306,6306,AUS,2020-04-05,5687,35,757,4895
...,...,...,...,...,...,...,...
136320,136320,USA,2022-03-06,79276278,958819,0,78317459
136321,136321,USA,2022-03-07,79339388,960505,0,78378883
136322,136322,USA,2022-03-08,79369007,961843,0,78407164
136323,136323,USA,2022-03-09,79406602,963819,0,78442783


In [9]:
#   Save basic Covid cases for which Hospitalisation data is available
hosp_data_df.to_csv('../CSVs/covid_cases_hosp_data.csv')


In [10]:
#   Get the Hospitalisation data and create dataframe
df = pd.read_csv('../CSVs/current-covid-patients-hospital.csv', 
                 parse_dates=['date'])
df


Unnamed: 0,Entity,Code,date,Daily hospital occupancy
0,Australia,AUS,2020-04-01,441
1,Australia,AUS,2020-04-02,415
2,Australia,AUS,2020-04-03,457
3,Australia,AUS,2020-04-04,490
4,Australia,AUS,2020-04-05,457
...,...,...,...,...
25568,United States,USA,2022-04-18,10414
25569,United States,USA,2022-04-19,10675
25570,United States,USA,2022-04-20,10906
25571,United States,USA,2022-04-21,11083


In [11]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25573 entries, 0 to 25572
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Entity                    25573 non-null  object        
 1   Code                      25573 non-null  object        
 2   date                      25573 non-null  datetime64[ns]
 3   Daily hospital occupancy  25573 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 799.3+ KB


In [12]:
#   Remove hospitalisation data that does not match the range of 04/01/2020 - 03/10/2022
hosp_filtered_df = df[df['date'] < '2022-03-11']
hosp_filtered_df


Unnamed: 0,Entity,Code,date,Daily hospital occupancy
0,Australia,AUS,2020-04-01,441
1,Australia,AUS,2020-04-02,415
2,Australia,AUS,2020-04-03,457
3,Australia,AUS,2020-04-04,490
4,Australia,AUS,2020-04-05,457
...,...,...,...,...
25525,United States,USA,2022-03-06,27719
25526,United States,USA,2022-03-07,27074
25527,United States,USA,2022-03-08,25787
25528,United States,USA,2022-03-09,24766


In [13]:
hosp_filtered_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 24311 entries, 0 to 25529
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Entity                    24311 non-null  object        
 1   Code                      24311 non-null  object        
 2   date                      24311 non-null  datetime64[ns]
 3   Daily hospital occupancy  24311 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 949.6+ KB


In [14]:
#   Drop unwanted column with Country name and change 'Code' to 'country_id'
hosp_filtered_df.drop('Entity', axis=1, inplace=True)
hosp_filtered_df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Code,date,Daily hospital occupancy
0,AUS,2020-04-01,441
1,AUS,2020-04-02,415
2,AUS,2020-04-03,457
3,AUS,2020-04-04,490
4,AUS,2020-04-05,457
...,...,...,...
25525,USA,2022-03-06,27719
25526,USA,2022-03-07,27074
25527,USA,2022-03-08,25787
25528,USA,2022-03-09,24766


In [15]:
#   Change 'Code' to 'country_id'
hosp_filtered_df.rename(columns={'Code' : 'country_id'}, index={'ONE': 'Row_1'}, inplace=True)
hosp_filtered_df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,country_id,date,Daily hospital occupancy
0,AUS,2020-04-01,441
1,AUS,2020-04-02,415
2,AUS,2020-04-03,457
3,AUS,2020-04-04,490
4,AUS,2020-04-05,457
...,...,...,...
25525,USA,2022-03-06,27719
25526,USA,2022-03-07,27074
25527,USA,2022-03-08,25787
25528,USA,2022-03-09,24766


In [16]:
hosp_data_df

Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active
6232,6232,AUS,2020-01-22,0,0,0,0
6233,6233,AUS,2020-01-23,0,0,0,0
6234,6234,AUS,2020-01-24,0,0,0,0
6235,6235,AUS,2020-01-25,0,0,0,0
6236,6236,AUS,2020-01-26,4,0,0,4
...,...,...,...,...,...,...,...
136320,136320,USA,2022-03-06,79276278,958819,0,78317459
136321,136321,USA,2022-03-07,79339388,960505,0,78378883
136322,136322,USA,2022-03-08,79369007,961843,0,78407164
136323,136323,USA,2022-03-09,79406602,963819,0,78442783


In [17]:
hosp_filtered_df

Unnamed: 0,country_id,date,Daily hospital occupancy
0,AUS,2020-04-01,441
1,AUS,2020-04-02,415
2,AUS,2020-04-03,457
3,AUS,2020-04-04,490
4,AUS,2020-04-05,457
...,...,...,...
25525,USA,2022-03-06,27719
25526,USA,2022-03-07,27074
25527,USA,2022-03-08,25787
25528,USA,2022-03-09,24766


In [18]:
#    merge basic Covid cases for which Hospitalisation data is available and hospitalisation data
merged_df=hosp_data_df.merge(
  right=hosp_filtered_df, 
  how='left',
  on=['country_id', 'date']
)
merged_df


Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active,Daily hospital occupancy
0,6232,AUS,2020-01-22,0,0,0,0,
1,6233,AUS,2020-01-23,0,0,0,0,
2,6234,AUS,2020-01-24,0,0,0,0,
3,6235,AUS,2020-01-25,0,0,0,0,
4,6236,AUS,2020-01-26,4,0,0,4,
...,...,...,...,...,...,...,...,...
29597,136320,USA,2022-03-06,79276278,958819,0,78317459,27719.0
29598,136321,USA,2022-03-07,79339388,960505,0,78378883,27074.0
29599,136322,USA,2022-03-08,79369007,961843,0,78407164,25787.0
29600,136323,USA,2022-03-09,79406602,963819,0,78442783,24766.0


In [21]:
#   Replace NaNs with zeroes
merged_df['Daily hospital occupancy'] = merged_df['Daily hospital occupancy'].fillna(0)
merged_df


Unnamed: 0,Id,country_id,date,confirmed,deaths,recovered,active,Daily hospital occupancy
0,6232,AUS,2020-01-22,0,0,0,0,441.0
1,6233,AUS,2020-01-23,0,0,0,0,415.0
2,6234,AUS,2020-01-24,0,0,0,0,457.0
3,6235,AUS,2020-01-25,0,0,0,0,490.0
4,6236,AUS,2020-01-26,4,0,0,4,457.0
...,...,...,...,...,...,...,...,...
29597,136320,USA,2022-03-06,79276278,958819,0,78317459,0.0
29598,136321,USA,2022-03-07,79339388,960505,0,78378883,0.0
29599,136322,USA,2022-03-08,79369007,961843,0,78407164,0.0
29600,136323,USA,2022-03-09,79406602,963819,0,78442783,0.0


In [22]:
#   Save merged 'basic Covid cases for which Hospitalisation data is available' and 'Daily hospital occupancy'
merged_df.to_csv('../CSVs/merged_covid_cases_hosp_data.csv')


####  Next step is to do machine learning model with Dependent Variable Y set to 'Daily hospital occupancy' and Independent Variable X set to 'confirmed', 'deaths', 'recovered', and 'active'  