To view Plotly Graphs: https://nbviewer.jupyter.org/github/IcedLemonTea0/EDA-COVID-19-in-China/blob/master/EDA%20COVID-19%20Analysis%20in%20China.ipynb

**Goal**

Explore the effect of COVID-19 lockdown across Mainland China after Jan 23rd 2020

In [1]:
# Import
import numpy as np
import pandas as pd

#Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import __version__
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot

import plotly.graph_objs as go
import plotly.express as px


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use matplotlib's 'fivethirtyeight' style sheet for matplotlib figures.
plt.style.use('fivethirtyeight')

In [2]:
# Enable plotly's offline notebook rendering; connected=True loads plotly.js
# from a CDN instead of embedding the library in the notebook.
init_notebook_mode(connected=True)

In [3]:
# Load the COVID-19 observations; the path is relative to the working directory.
df = pd.read_csv('covid_19_data.csv')
# Preview the first rows to check the raw column names and values.
df.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


# Data Cleaning

We will begin by converting the column names to a more programming-friendly (snake_case) form.

In [4]:
# Normalise the column labels to snake_case: lower-case everything, turn '/'
# and spaces into underscores, then split the fused 'observationdate' label.
normalised_labels = df.columns.str.lower()
for old, new in (('/', '_'), (' ', '_'), ('observationdate', 'observation_date')):
    normalised_labels = normalised_labels.str.replace(old, new)
df.columns = normalised_labels
df.columns

Index(['sno', 'observation_date', 'province_state', 'country_region',
       'last_update', 'confirmed', 'deaths', 'recovered'],
      dtype='object')

Columns: 
* sno - Serial Number
* observation_date - Date of observation in MM/DD/YYYY
* province_state - Province or state of the observation
* country_region - Country of observation 
* last_update - Time in UTC at which the row is updated 
* confirmed - Cumulative number of confirmed cases till that date
* deaths - Cumulative number of deaths till that date
* recovered - Cumulative number of recovered cases till that date

We will only focus on China for this EDA.

In [5]:
# Keep only the observations whose country/region is Mainland China.
is_mainland_china = df['country_region'].eq('Mainland China')
df = df.loc[is_mainland_china]
print('Number of rows: ', len(df))
df.head()

Number of rows:  1672


Unnamed: 0,sno,observation_date,province_state,country_region,last_update,confirmed,deaths,recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [6]:
# Print the dtypes and non-null counts of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1672 entries, 0 to 5857
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sno               1672 non-null   int64  
 1   observation_date  1672 non-null   object 
 2   province_state    1672 non-null   object 
 3   country_region    1672 non-null   object 
 4   last_update       1672 non-null   object 
 5   confirmed         1672 non-null   float64
 6   deaths            1672 non-null   float64
 7   recovered         1672 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 117.6+ KB


# Exploratory Data Analysis in China

**Number of COVID-19 deaths in China**

The objective of this section is to find the number of deaths in each province and compare them across Mainland China.

In [7]:
# Deaths are cumulative, so each province's maximum is its latest total.
# Only the 'deaths' column is aggregated (instead of taking the max of every
# column, strings included, and discarding all but one afterwards).
# Stored in ascending order; the bar chart in the next cell reads it as-is.
total_deaths = df.groupby('province_state')['deaths'].max().sort_values(ascending=True)
# Display with the largest totals first.
total_deaths.sort_values(ascending=False)

province_state
Hubei             3085.0
Henan               22.0
Heilongjiang        13.0
Beijing              8.0
Guangdong            8.0
Shandong             7.0
Chongqing            6.0
Anhui                6.0
Hebei                6.0
Hainan               6.0
Hunan                4.0
Shanghai             3.0
Sichuan              3.0
Tianjin              3.0
Xinjiang             3.0
Guizhou              2.0
Yunnan               2.0
Shaanxi              2.0
Guangxi              2.0
Gansu                2.0
Fujian               1.0
Zhejiang             1.0
Inner Mongolia       1.0
Jiangxi              1.0
Jilin                1.0
Liaoning             1.0
Ningxia              0.0
Qinghai              0.0
Shanxi               0.0
Tibet                0.0
Jiangsu              0.0
Name: deaths, dtype: float64

In [8]:
# Horizontal bar chart: one bar per province, bar length = cumulative deaths.
deaths_bar = go.Bar(
    x=total_deaths.values,
    y=total_deaths.index,
    orientation='h',
)
fig = go.Figure(data=[deaths_bar])

fig.update_layout(
    title='Cumulative deaths in China',
    xaxis_title='Deaths',
    yaxis_title='Region',
)

fig.show()

**Insight**

* Approximately **96% of deaths (3085 of 3199 deaths)** are located in the region of **Hubei**
* The second highest (with 22 deaths) is located in the region of **Henan**



**Observe trends**

Explore how the trends develop by observation date

In [9]:
# National totals per observation date. The three count columns are selected
# before summing so the aggregation does not touch the remaining columns
# (serial number and the string columns), which were discarded anyway.
observe_date = df.groupby('observation_date')[['confirmed', 'deaths', 'recovered']].sum()
observe_date.head()

Unnamed: 0_level_0,confirmed,deaths,recovered
observation_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01/22/2020,547.0,17.0,28.0
01/23/2020,639.0,18.0,30.0
01/24/2020,916.0,26.0,36.0
01/25/2020,1399.0,42.0,39.0
01/26/2020,2062.0,56.0,49.0


In [10]:
def _cumulative_line_figure(dates, values, title, yaxis_title):
    """Return a line figure of one cumulative series plotted against the observation dates."""
    figure = go.Figure(go.Scatter(x=dates, y=values))
    figure.update_layout(
        title=title,
        xaxis_title='Observed dates after lockdown',
        yaxis_title=yaxis_title,
    )
    return figure


# One figure per cumulative series; only the column, title and y-axis label differ.
fig = _cumulative_line_figure(
    observe_date.index,
    observe_date['confirmed'].values,
    'Daily Cumulative Tested Positive Cases of COVID-19 in China',
    'Positive Cases',
)

fig1 = _cumulative_line_figure(
    observe_date.index,
    observe_date['recovered'].values,
    'Daily Cumulative Recovered Case of COVID-19 in China',
    'Recovered',
)

fig2 = _cumulative_line_figure(
    observe_date.index,
    observe_date['deaths'].values,
    'Daily Cumulative Confirmed Deaths of COVID-19 in China',
    'Deaths',
)

# Display order: confirmed, recovered, deaths.
fig.show()
fig1.show()
fig2.show()

**Insight:**

All 3 graphs have the characteristic shape of a logistic curve: they start off with few cases, then the number of cases increases exponentially, after which the cases level out.

**Rate of deaths**

We'll observe the rate of deaths and rate of recovery over time 

In [11]:
def create_death_rate(df):
    """Return the death rate, in percent, for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing numeric ``deaths`` and ``confirmed`` columns.

    Returns
    -------
    list of float
        ``deaths / confirmed * 100`` for each row, in row order.
    """
    # Vectorised column arithmetic replaces the per-row ``iloc`` lookups, each
    # of which built a full row Series just to read two values.
    return (df['deaths'] / df['confirmed'] * 100).tolist()
        
def create_recovery_rate(df):
    """Return the recovery rate, in percent, for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing numeric ``recovered`` and ``confirmed`` columns.

    Returns
    -------
    list of float
        ``recovered / confirmed * 100`` for each row, in row order.
    """
    # Vectorised column arithmetic replaces the per-row ``iloc`` lookups, each
    # of which built a full row Series just to read two values.
    return (df['recovered'] / df['confirmed'] * 100).tolist()

# Attach both percentage series to the per-date totals as new columns.
observe_date = observe_date.assign(
    death_rate=create_death_rate(observe_date),
    recovery_rate=create_recovery_rate(observe_date),
)

observe_date.head()

Unnamed: 0_level_0,confirmed,deaths,recovered,death_rate,recovery_rate
observation_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01/22/2020,547.0,17.0,28.0,3.107861,5.11883
01/23/2020,639.0,18.0,30.0,2.816901,4.694836
01/24/2020,916.0,26.0,36.0,2.838428,3.930131
01/25/2020,1399.0,42.0,39.0,3.002144,2.787706
01/26/2020,2062.0,56.0,49.0,2.71581,2.376334


In [12]:
def _rate_over_time_figure(rates, daily_name, average_name, title, yaxis_title):
    """Return a figure with the daily rate line and a dotted line at its mean.

    ``rates`` is a Series of percentages indexed by observation date.
    """
    figure = go.Figure()

    figure.add_trace(go.Scatter(
        x=rates.index,
        y=rates.values,
        name=daily_name,
    ))

    figure.add_trace(go.Scatter(
        x=rates.index,
        # Constant series so the mean renders as a horizontal reference line.
        y=[rates.mean()] * len(rates),
        name=average_name,
        line=dict(dash='dot'),
    ))

    figure.update_layout(
        title=title,
        xaxis_title="Observed dates after lockdown",
        yaxis_title=yaxis_title,
    )
    figure.update_xaxes(tickangle=45)
    return figure


fig = _rate_over_time_figure(
    observe_date['death_rate'],
    'Daily death rate',
    'Average death rate',
    'Death Rate Over Time',
    'Death Rate (%)',
)

fig1 = _rate_over_time_figure(
    observe_date['recovery_rate'],
    'Daily recovery rate',
    'Average recovery rate',
    'Recovery Rate Over Time',
    'Recovery Rate (%)',
)

fig.show()
fig1.show()

**Insight:**

* The latest death rate has risen to around 4%, while the recovery rate has risen to about 83%.
* The latest death rate sits above the overall average of 3% - a difference of about 1 percentage point.
* The recovery rate chart shows that China is making significant progress in recovery over time.

**Hubei**

Hubei leads the number of deaths in China. 
Let's compare Hubei with China's average rate.

In [13]:
# Subset of the rows whose province is Hubei.
in_hubei = df['province_state'].eq('Hubei')
hubei = df.loc[in_hubei]
hubei.head()

Unnamed: 0,sno,observation_date,province_state,country_region,last_update,confirmed,deaths,recovered
13,14,01/22/2020,Hubei,Mainland China,1/22/2020 17:00,444.0,17.0,28.0
51,52,01/23/2020,Hubei,Mainland China,1/23/20 17:00,444.0,17.0,28.0
84,85,01/24/2020,Hubei,Mainland China,1/24/20 17:00,549.0,24.0,31.0
125,126,01/25/2020,Hubei,Mainland China,1/25/20 17:00,761.0,40.0,32.0
169,170,01/26/2020,Hubei,Mainland China,1/26/20 16:00,1058.0,52.0,42.0


In [14]:
# Bar chart of Hubei's deaths column, one bar per observation date.
hubei_deaths_bar = go.Bar(
    x=hubei['observation_date'],
    y=hubei['deaths'],
)
fig = go.Figure(data=[hubei_deaths_bar])

fig.update_layout(
    title='Number of Recorded Deaths in Hubei',
    xaxis_title='Recorded Dates',
    yaxis_title='Deaths',
)
fig.update_xaxes(tickangle=75)

fig.show()

In [15]:
# Re-display the first rows of the Hubei subset (same call as in the cell
# that created `hubei`).
hubei.head()

Unnamed: 0,sno,observation_date,province_state,country_region,last_update,confirmed,deaths,recovered
13,14,01/22/2020,Hubei,Mainland China,1/22/2020 17:00,444.0,17.0,28.0
51,52,01/23/2020,Hubei,Mainland China,1/23/20 17:00,444.0,17.0,28.0
84,85,01/24/2020,Hubei,Mainland China,1/24/20 17:00,549.0,24.0,31.0
125,126,01/25/2020,Hubei,Mainland China,1/25/20 17:00,761.0,40.0,32.0
169,170,01/26/2020,Hubei,Mainland China,1/26/20 16:00,1058.0,52.0,42.0


In [16]:
# Per-date mean of the count columns across provinces. Selecting the numeric
# columns first keeps the string columns out of the aggregation; pandas >= 2.0
# raises TypeError when asked to average them.
observe_date_mean = df.groupby('observation_date')[['confirmed', 'deaths', 'recovered']].mean()

# Most recent national death rate, drawn below as a horizontal reference line.
latest_death_rate = observe_date['death_rate'].iloc[-1]

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=hubei['observation_date'],
    y=create_death_rate(hubei),
    name='Hubei Death Rate',
))

fig.add_trace(go.Scatter(
    # x and y are taken from the same frame so every rate stays paired with
    # its own date, rather than relying on Hubei having one row per date.
    x=observe_date_mean.index,
    y=create_death_rate(observe_date_mean),
    name='China\'s Average Death Rate',
    line=dict(dash='dot'),
))

fig.add_trace(go.Scatter(
    x=observe_date.index,
    y=[latest_death_rate] * len(observe_date),
    name='China\'s latest death rate',
    line=dict(dash='dot'),
))

fig.update_layout(
    title='Comparing Hubei\'s Death Rate',
    xaxis_title='Observed Dates',
    yaxis_title='Death Rate (%)',
)

fig.update_xaxes(tickangle=45)

fig.show()

In [17]:
# Most recent national recovery rate, drawn below as a horizontal reference line.
latest_recovery_rate = observe_date['recovery_rate'].iloc[-1]

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=hubei['observation_date'],
    y=create_recovery_rate(hubei),
    name='Hubei\'s Recovery Rate',
))

fig.add_trace(go.Scatter(
    # x and y are taken from the same frame so every rate stays paired with
    # its own date, rather than relying on Hubei having one row per date.
    x=observe_date_mean.index,
    y=create_recovery_rate(observe_date_mean),
    name='China\'s Average Recovery Rate',
    line=dict(dash='dot'),
))

fig.add_trace(go.Scatter(
    x=observe_date.index,
    y=[latest_recovery_rate] * len(observe_date),
    name='China\'s latest Recovery rate',
    line=dict(dash='dot'),
))

fig.update_layout(
    title='Comparing Hubei\'s Recovery Rate',
    xaxis_title='Observed Dates',
    yaxis_title='Recovery Rate (%)',
)

fig.update_xaxes(tickangle=45)

fig.show()

In [18]:
# Scatter of the per-date national average death rate (x) against Hubei's
# death rate (y); points are paired by position in the two series.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=create_death_rate(observe_date_mean),
    y=create_death_rate(hubei),
    mode='markers',
))

# Label the figure so it can be read on its own.
fig.update_layout(
    title='Hubei vs. China Average Death Rate',
    xaxis_title='China\'s average death rate (%)',
    yaxis_title='Hubei death rate (%)',
)

fig.show()

In [19]:
# Pearson correlation matrix between the national average and Hubei death-rate
# series; the off-diagonal entries are the correlation coefficient.
np.corrcoef(create_death_rate(observe_date_mean),create_death_rate(hubei))

array([[1.        , 0.77094411],
       [0.77094411, 1.        ]])

In [20]:
# Scatter of the per-date national average recovery rate (x) against Hubei's
# recovery rate (y); points are paired by position in the two series.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=create_recovery_rate(observe_date_mean),
    y=create_recovery_rate(hubei),
    mode='markers',
))

# Label the figure so it can be read on its own.
fig.update_layout(
    title='Hubei vs. China Average Recovery Rate',
    xaxis_title='China\'s average recovery rate (%)',
    yaxis_title='Hubei recovery rate (%)',
)

fig.show()

In [21]:
# Pearson correlation matrix between the national average and Hubei
# recovery-rate series; the off-diagonal entries are the correlation coefficient.
np.corrcoef(create_recovery_rate(observe_date_mean),create_recovery_rate(hubei))

array([[1.        , 0.99717218],
       [0.99717218, 1.        ]])