<h1>This notebook tracks the spread of the novel coronavirus, also known as SARS-CoV-2. It is a contagious respiratory virus that first emerged in Wuhan in December 2019. On 2/11/2020, the disease was officially named COVID-19 by the World Health Organization. 
    <br>Data: <a href='https://github.com/CSSEGISandData/COVID-19'>https://github.com/CSSEGISandData/COVID-19</a>. A big thank you to Johns Hopkins for providing the data.
    <br>
    <br>Learn more from the <a href='https://www.who.int/emergencies/diseases/novel-coronavirus-2019'>WHO</a>
    <br>Learn more from the <a href='https://www.cdc.gov/coronavirus/2019-ncov'>CDC</a>
    <br>Map Visualizations from  <a href='https://gisanddata.maps.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6'>Johns Hopkins</a>
    <br>Feel free to send me feedback. 
    <br> Last update: 3/5/2020 7:28 PM
    <br> Make sure you run the notebook to see the graphs better. Some diagrams are hard to see in the default view. 
    
</h1>

<center><img src='https://newsfortomorrow.com/wp-content/uploads/2020/01/1578562454_Wuhan-pneumonia-New-coronavirus-related-to-SARS-idenitified.jpg'>
* Source: https://newsfortomorrow.com/wp-content/uploads/2020/01/1578562454_Wuhan-pneumonia-New-coronavirus-related-to-SARS-idenitified.jpg </center>

<h1>Keep strong, world!</h1>






In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import pandas as pd 
import random
import math
import time
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
import datetime
%matplotlib inline 

Import the data (make sure you update this on a daily basis)

In [None]:
# Download the confirmed / deaths / recovered time-series tables from the Johns Hopkins
# CSSE GitHub repository, in that order.
# NOTE(review): the files are fetched live on every run — confirm these paths still exist upstream.
confirmed_df, deaths_df, recoveries_df = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv',
    )
)

In [None]:
confirmed_df

In [None]:
# Column labels of the confirmed-cases table; later cells use them to slice out the date columns.
cols = confirmed_df.columns
cols

Get all the dates for the outbreak

In [None]:
# Keep only the columns from the fifth label (cols[4]) through the last one in each table.
# The labels come from confirmed_df and are reused for the other two tables.
# NOTE(review): assumes all three tables share the same column labels — confirm.
first_date, last_date = cols[4], cols[-1]
confirmed = confirmed_df.loc[:, first_date:last_date]
deaths = deaths_df.loc[:, first_date:last_date]
recoveries = recoveries_df.loc[:, first_date:last_date]

In [None]:
confirmed

In [None]:
dates = confirmed.columns

# For every date column, sum the values over all rows of each table.
world_cases = [confirmed[day].sum() for day in dates]
total_deaths = [deaths[day].sum() for day in dates]
total_recovered = [recoveries[day].sum() for day in dates]

# Summed deaths divided by summed confirmed cases for the same date.
mortality_rate = [
    dead / infected for dead, infected in zip(total_deaths, world_cases)
]
    

In [None]:
# Comparison table of epidemics. The COVID-19 row uses the latest totals computed above;
# the figures for the other epidemics are hard-coded.
# NOTE(review): the source of the hard-coded figures is not shown in this notebook — TODO cite.
epidemics = pd.DataFrame({
    'epidemic' : ['COVID-19', 'SARS', 'EBOLA', 'MERS', 'H1N1'],
    'start_year' : [2019, 2003, 2014, 2012, 2009],
    'end_year' : [2020, 2004, 2016, 2017, 2010],
    'confirmed' : [world_cases[-1], 8096, 28646, 2494, 6724149],
    'deaths' : [total_deaths[-1], 774, 11323, 858, 19654]
})

# Deaths as a percentage of confirmed cases, rounded to two decimals.
death_share = epidemics['deaths'] / epidemics['confirmed']
epidemics['mortality'] = (death_share * 100).round(2)

epidemics.head()

In [78]:
import plotly.express as px
import plotly.graph_objs as go

# Long form: one row per (epidemic, metric) pair, so each metric can be drawn in its own facet.
temp = epidemics.melt(
    id_vars='epidemic',
    value_vars=['confirmed', 'deaths', 'mortality'],
    var_name='Case',
    value_name='Value',
)

fig = px.bar(
    temp,
    x="epidemic",
    y="Value",
    color='epidemic',
    text='Value',
    facet_col="Case",
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig.update_traces(textposition='outside')
fig.update_yaxes(showticklabels=False)
# Clear the `matches` setting on the second and third y-axes so they are not tied to another axis.
for axis_name in ('yaxis2', 'yaxis3'):
    fig.layout[axis_name].update(matches=None)
fig.show()

NameError: name 'init_notebook_mode' is not defined

In [None]:
# Day index 0..len(dates)-1 as a single-column 2-D array.
days_since_1_22 = np.arange(len(dates)).reshape(-1, 1)

# Convert the three worldwide series to single-column 2-D arrays (the names are rebound in place).
world_cases, total_deaths, total_recovered = (
    np.array(series).reshape(-1, 1)
    for series in (world_cases, total_deaths, total_recovered)
)


In [None]:
# Future forecasting: day indices covering every observed date plus `days_in_future` extra days,
# as a single-column 2-D array.
days_in_future = 14
future_forcast = np.arange(len(dates) + days_in_future).reshape(-1, 1)

In [None]:
# Convert integer day offsets into 'MM/DD' labels for better visualization on the x-axis.
start = '2020/1/22'
start_date = datetime.datetime.strptime(start, '%Y/%m/%d')
future_forcast_dates = [
    (start_date + datetime.timedelta(days=offset)).strftime('%m/%d')
    for offset in range(len(future_forcast))
]

# Labels for the observed period only (used when plotting current data). The cut-off is derived
# from `days_in_future` rather than a hard-coded 14, and is computed from the front so that a
# horizon of 0 keeps every label instead of producing an empty list.
adjusted_dates = future_forcast_dates[:len(future_forcast_dates) - days_in_future]

# Labels including the forecast horizon (used when plotting predictions).
future_forcast_dates

In [None]:
# Split day indices / worldwide confirmed counts without shuffling, holding out 20% as the test set.
(
    X_train_confirmed,
    X_test_confirmed,
    y_train_confirmed,
    y_test_confirmed,
) = train_test_split(
    days_since_1_22,
    world_cases,
    test_size=0.2,
    shuffle=False,
)

In [None]:
y_train_confirmed # start = '1/22/2020'

Model for predicting # of confirmed cases. I am using a support vector machine and linear regression in this example. 

In [None]:
# Hyper-parameter candidates for the support vector regressor.
kernel = ['linear', 'rbf']
c = [0.01, 0.1, 1, 10]
gamma = [0.01, 0.1, 1]
epsilon = [0.01, 0.1, 1]
shrinking = [True, False]
svm_grid = {'kernel': kernel, 'C': c, 'gamma' : gamma, 'epsilon': epsilon, 'shrinking' : shrinking}

svm = SVR()
# Randomly sample 30 parameter combinations, scored by negative MSE with 3-fold CV.
# random_state is fixed so the sampled combinations (and therefore the chosen model)
# are the same on every run.
svm_search = RandomizedSearchCV(
    svm,
    svm_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    return_train_score=True,
    n_jobs=-1,
    n_iter=30,
    verbose=1,
    random_state=42,
)
# SVR expects a 1-D target; the target here was reshaped to a column, so flatten it
# to avoid scikit-learn's DataConversionWarning.
svm_search.fit(X_train_confirmed, np.ravel(y_train_confirmed))

In [None]:
svm_search.best_params_

In [None]:
# Take the best estimator found by the search and predict for every day index in
# future_forcast (observed days plus the forecast horizon).
svm_confirmed = svm_search.best_estimator_
svm_pred = svm_confirmed.predict(future_forcast)
svm_pred

In [None]:
# Check the SVM against the held-out test data.
svm_test_pred = svm_confirmed.predict(X_test_confirmed)
plt.plot(svm_test_pred, color='red', ls='--', label='SVM prediction')
plt.plot(y_test_confirmed, label='Actual')
# Both series are plotted against their position in the test set, so the count belongs
# on the y-axis (it was previously used as the x-axis label).
plt.xlabel('Day index within test set')
plt.ylabel('Accumulated_Confirmed_Count')
plt.legend()
plt.box(False)

In [None]:
# Linear regression through the origin (no intercept).
# `normalize=True` was dropped: scikit-learn ignored it whenever fit_intercept=False and the
# parameter has since been removed from LinearRegression, so passing it raises a TypeError
# on current releases. The fitted model is unchanged.
linear_model = LinearRegression(fit_intercept=False)
linear_model.fit(X_train_confirmed, y_train_confirmed)
test_linear_pred = linear_model.predict(X_test_confirmed)
linear_pred = linear_model.predict(future_forcast)
# Metrics take (y_true, y_pred); MAE and MSE are symmetric, so the printed values are unchanged.
print('MAE:', mean_absolute_error(y_test_confirmed, test_linear_pred))
print('MSE:',mean_squared_error(y_test_confirmed, test_linear_pred))

In [None]:
linear_model.coef_

In [None]:
# Compare the linear model's test predictions with the held-out values.
plt.plot(y_test_confirmed, label='Actual')
plt.plot(test_linear_pred, color='red', ls='--', label='Linear regression prediction')
# Both series are plotted against their position in the test set, so the count belongs
# on the y-axis (it was previously used as the x-axis label).
plt.xlabel('Day index within test set')
plt.ylabel('Accumulated_Confirmed_Count')
plt.legend()
plt.box(False)

In [None]:
# Predict over the whole future_forcast range with both models.
# NOTE(review): this rebinds svm_test_pred / test_linear_pred, which earlier held predictions
# for X_test_confirmed only — cells that read these names now see the full-range arrays.
svm_test_pred = svm_confirmed.predict(future_forcast)
test_linear_pred = linear_model.predict(future_forcast)

In [None]:
# Observed worldwide confirmed counts together with both models' predictions.
plt.figure(figsize=(20, 12))
plt.plot(adjusted_dates, world_cases)
# The predictions are plotted against their integer position on the same axis.
plt.plot(test_linear_pred, color='red', ls='--')
plt.plot(svm_test_pred, color='orange', ls='--')
plt.title('Accumulated Confirmed Count in the World', size=30)
plt.xlabel('Time in Days', size=20)
plt.ylabel('Accumulated', size=20)
# Dotted vertical marker at the '02/12' label; ymax is a hard-coded plot height.
plt.vlines(x='02/12', ymin=0, ymax=160000, colors='red', linestyles='dotted')
plt.xticks(rotation=90, size=15)
# Legend labels follow the plotting order above (typo 'Regreasion' corrected).
plt.legend(['Confirmed Cases', 'Linear Regression', 'SVM'])
plt.show()

# 한국 데이터 정리하기.

In [None]:
# Confirmed-case date columns for the rows whose Country/Region is 'Korea, South'.
is_korea = confirmed_df['Country/Region'] == 'Korea, South'
korea_Cf = confirmed_df.loc[is_korea, cols[4]:cols[-1]]
# Relabel the row; this assignment requires the selection to contain exactly one row.
korea_Cf.index = ['Korea']
korea_Cf

In [None]:
# Death-count date columns for the rows whose Country/Region is 'Korea, South'.
is_korea = deaths_df['Country/Region'] == 'Korea, South'
korea_death = deaths_df.loc[is_korea, cols[4]:cols[-1]]
# Relabel the row; this assignment requires the selection to contain exactly one row.
korea_death.index = ['Korea']
korea_death

In [None]:
recoveries

In [None]:
# Recovery-count date columns for the rows whose Country/Region is 'Korea, South'.
is_korea = recoveries_df['Country/Region'] == 'Korea, South'
korea_Rc = recoveries_df.loc[is_korea, cols[4]:cols[-1]]
# Relabel the row; this assignment requires the selection to contain exactly one row.
korea_Rc.index = ['Korea']
korea_Rc

In [None]:
# Korea's confirmed counts as a single-column 2-D array.
Accumulated_in_Korea = korea_Cf.to_numpy().reshape(-1, 1)
Accumulated_in_Korea

In [None]:
# Korea's confirmed counts over the observed dates, with a dotted marker at '02/18'.
plt.figure(figsize=(10, 7))
plt.title('Accumulated Confirmed Count in the Korea', fontsize=30)
plt.ylabel('Accumulated', fontsize=20)
plt.xlabel('Time in Days', fontsize=20)
plt.plot(adjusted_dates, Accumulated_in_Korea)
# ymax is a hard-coded plot height for the marker line.
plt.vlines(x='02/18', ymin=0, ymax=8000, colors='red', linestyles='dotted')
# Tick labels are rotated after plotting so the rotation applies to the date labels.
plt.xticks(rotation=90)
# Labels follow the drawing order: the line first, then the vertical marker.
plt.legend(['Confirmed Cases', 'Confirmed day of 31 patient'])
plt.show()

In [None]:
# Split day indices / Korea's confirmed counts without shuffling, holding out 10% as the test set.
# This rebinds the four split variables that previously held the worldwide split.
(
    X_train_confirmed,
    X_test_confirmed,
    y_train_confirmed,
    y_test_confirmed,
) = train_test_split(
    days_since_1_22,
    Accumulated_in_Korea,
    test_size=0.1,
    shuffle=False,
)
y_train_confirmed

In [None]:
# Hyper-parameter candidates for the support vector regressor on the Korea series.
kernel = ['linear', 'rbf']
c = [0.01, 0.1, 1, 10]
# `degree` only affects SVR's 'poly' kernel, which is not among the kernels searched here.
# The name is kept, but it is left out of the grid: including it repeated every other
# combination four times without changing any model.
degree = [4,5,6,7]
gamma = [0.01, 0.1, 1, 10]
epsilon = [0.01, 0.1, 1]
shrinking = [True, False]
svm_grid = {'kernel': kernel, 'C': c, 'gamma' : gamma, 'epsilon': epsilon, 'shrinking' : shrinking}

from sklearn.model_selection import GridSearchCV

svm = SVR()
# Exhaustive search over the grid, scored by negative MSE with 5-fold CV.
svm_search = GridSearchCV(svm, svm_grid, scoring='neg_mean_squared_error', cv=5, return_train_score=True, n_jobs=-1, verbose=1)
# SVR expects a 1-D target; flatten the column vector to avoid scikit-learn's DataConversionWarning.
svm_search.fit(X_train_confirmed, np.ravel(y_train_confirmed))

In [None]:
svm_search.best_params_

In [None]:
svm_search.best_estimator_

In [None]:
# Predict with the best estimator from the Korea search for every day index in future_forcast.
# This rebinds svm_pred, which earlier held the worldwide prediction.
svm_pred = svm_search.best_estimator_.predict(future_forcast)
svm_pred

In [None]:
import statsmodels.api as sm

# statsmodels' OLS takes the dependent variable first (endog) and the regressors second (exog).
# The arguments were previously swapped, which regressed the day index on the case count.
# No constant column is added, so this is a fit through the origin.
lm = sm.OLS(y_train_confirmed, X_train_confirmed)
results = lm.fit()
results.summary()

In [None]:
# Linear regression through the origin on the Korea training split.
# LinearRegression is already imported in the notebook's import cell, so the duplicate
# in-cell imports were removed.
# `normalize=True` was dropped: scikit-learn ignored it whenever fit_intercept=False and the
# parameter has since been removed from LinearRegression, so passing it raises a TypeError
# on current releases. The fitted model is unchanged.
linear_model = LinearRegression(fit_intercept=False)
linear_model.fit(X_train_confirmed,y_train_confirmed)
# Predict for every day index in future_forcast (rebinds linear_pred from the worldwide model).
linear_pred = linear_model.predict(future_forcast)
linear_model.coef_


In [None]:
# Both models' predictions and the real Korea series, each plotted against its integer position.
plt.plot(
    svm_pred,
    color='green',
    ls='-.',
    label = 'Prediction by SVM',
)
plt.plot(
    linear_pred,
    color='red',
    ls='--',
    label='Prediction by Linear Regression',
)
plt.plot(Accumulated_in_Korea, label='Accumulated real count')
plt.xlabel('days')
plt.xticks(rotation=90, ha='left')
plt.legend()
plt.box(False)

In [None]:
# SVM predictions for the last three forecast dates, as (date label, value) pairs.
# A list is used instead of a set so the pairs are printed in chronological order.
print('SVM prediction: ', list(zip(future_forcast_dates[-3:], svm_pred[-3:])))

 Graphing the number of confirmed cases, deaths, and the mortality rate over time, as well as the number of recoveries

In [None]:
# Worldwide deceased count over the observed dates.
plt.figure(figsize=(20, 12))
plt.title('Deceased Count in the World', fontsize=30)
plt.ylabel('Deceased Count', fontsize=20)
plt.xlabel('Time', fontsize=20)
plt.plot(adjusted_dates, total_deaths, color='red')
# Tick labels are styled after plotting so the settings apply to the date labels.
plt.xticks(rotation=90, fontsize=15)
plt.show()

In [None]:
# Daily mortality rate with a dashed horizontal line at its mean over all observed dates.
mean_mortality_rate = np.mean(mortality_rate)
mean_label = 'y=' + str(mean_mortality_rate)

plt.figure(figsize=(20, 12))
plt.title('Mortality Rate of Coronavirus Over Time', fontsize=30)
plt.ylabel('Mortality Rate', fontsize=20)
plt.xlabel('Time', fontsize=20)
plt.plot(adjusted_dates, mortality_rate, color='orange')
plt.axhline(y=mean_mortality_rate, linestyle='--', color='black')
# Legend labels follow the drawing order: the rate curve, then the mean line.
plt.legend(['mortality rate', mean_label])
plt.xticks(rotation=90, fontsize=15)
plt.show()

In [None]:
# Worldwide recovered count over the observed dates.
plt.figure(figsize=(20, 12))
plt.title('Recovered Count in the World', fontsize=30)
plt.ylabel('Recovered', fontsize=20)
plt.xlabel('Time', fontsize=20)
plt.plot(adjusted_dates, total_recovered, color='green')
# Tick labels are styled after plotting so the settings apply to the date labels.
plt.xticks(rotation=90, fontsize=15)
plt.show()

Graphing the number of deaths and the number of recoveries

In [None]:
# Worldwide deaths (red) and recoveries (green) on one time axis.
plt.figure(figsize=(20, 12))
plt.title('Result of Coronavirus Cases', fontsize=30)
plt.ylabel('# of Cases', fontsize=20)
plt.xlabel('Time', fontsize=20)
for series, line_color in ((total_deaths, 'red'), (total_recovered, 'green')):
    plt.plot(adjusted_dates, series, color=line_color)
# Legend labels follow the plotting order above.
plt.legend(['death', 'recoveries'], loc='best', fontsize=20)
plt.xticks(rotation=90, fontsize=15)
plt.show()

Plotting the number of deaths against the number of recoveries

In [None]:
# Worldwide deaths (y) plotted against worldwide recoveries (x), one point per date.
plt.figure(figsize=(20, 12))
plt.title('Deaths vs Recovered', fontsize=30)
plt.ylabel('Deceased Count', fontsize=20)
plt.xlabel('Recovered Count', fontsize=20)
plt.plot(total_recovered, total_deaths)
plt.xticks(fontsize=15)
plt.show()

Getting the latest information about provinces/states that have confirmed coronavirus cases

In [None]:
# Pick the column for the most recent date (the last label in `dates`) from each full table.
latest_date = dates[-1]
latest_confirmed = confirmed_df[latest_date]
latest_deaths = deaths_df[latest_date]
latest_recoveries = recoveries_df[latest_date]
latest_confirmed

Getting information about countries/regions that have confirmed coronavirus cases

In [None]:
# Distinct Country/Region values, in order of first appearance.
unique_countries = confirmed_df['Country/Region'].unique().tolist()
unique_countries

In [None]:
# Latest confirmed total per country, summed over all rows with that Country/Region.
latest_by_country = [
    latest_confirmed[confirmed_df['Country/Region'] == name].sum()
    for name in unique_countries
]

# Keep the positive totals; remember which countries had none.
country_confirmed_cases = [total for total in latest_by_country if total > 0]
no_cases = [
    name
    for name, total in zip(unique_countries, latest_by_country)
    if not total > 0
]

# Drop the countries without cases so unique_countries lines up with country_confirmed_cases.
for name in no_cases:
    unique_countries.remove(name)

In [None]:
country_confirmed_cases

In [None]:
# number of cases per country/region
for i in range(len(unique_countries)):
    print(f'{unique_countries[i]}: {country_confirmed_cases[i]} cases')

Getting information about province/states that have confirmed coronavirus cases

In [None]:
# Distinct Province/State values, in order of first appearance.
unique_provinces = confirmed_df['Province/State'].unique().tolist()

In [None]:
# Latest confirmed total per province, summed over all rows with that Province/State.
latest_by_province = [
    latest_confirmed[confirmed_df['Province/State'] == name].sum()
    for name in unique_provinces
]

# Keep the positive totals; remember which areas had none.
province_confirmed_cases = [total for total in latest_by_province if total > 0]
no_cases = [
    name
    for name, total in zip(unique_provinces, latest_by_province)
    if not total > 0
]

# Remove areas with no confirmed cases so unique_provinces lines up with province_confirmed_cases.
for name in no_cases:
    unique_provinces.remove(name)

In [None]:
# number of cases per province/state/city

for i in range(len(unique_provinces)):
    print(f'{unique_provinces[i]}: {province_confirmed_cases[i]} cases')

In [None]:
# Handle NaN if there is any; a missing province name is usually a float: float('nan').
# Collect the positions of all float entries in the province list.
nan_indices = [
    position
    for position, province in enumerate(unique_provinces)
    if isinstance(province, float)
]

unique_provinces = list(unique_provinces)
province_confirmed_cases = list(province_confirmed_cases)

# Pop from the highest index down: popping in ascending order shifts the remaining
# elements left, so later indices would point at the wrong entries (or past the end).
for position in reversed(nan_indices):
    unique_provinces.pop(position)
    province_confirmed_cases.pop(position)

***Visual Representations*** (bar charts and pie charts)

In [None]:
# Horizontal bar per country with its latest confirmed total.
plt.figure(figsize=(32, 18))
plt.title('# of Coronavirus Confirmed Cases in Countries/Regions')
plt.xlabel('# of Covid19 Confirmed Cases')
plt.barh(unique_countries, country_confirmed_cases)
plt.show()

In [None]:
# Total of every entry except the first.
# NOTE(review): this treats the first entry of country_confirmed_cases as mainland China —
# verify against the order of unique_countries.
outside_mainland_china_confirmed = np.sum(country_confirmed_cases[1:])

plt.figure(figsize=(16, 9))
plt.title('# of Coronavirus Confirmed Cases')
# First ten countries in list order (the list is not sorted by case count), plus the combined bar.
plt.barh(unique_countries[:10], country_confirmed_cases[:10])
plt.barh('Outside Mainland China', outside_mainland_china_confirmed)
plt.show()

In [None]:
# lets look at it in a logarithmic scale 
log_country_confirmed_cases = [math.log10(i) for i in country_confirmed_cases]
plt.figure(figsize=(32, 18))
plt.barh(unique_countries, log_country_confirmed_cases)
plt.title('Common Log # of Coronavirus Confirmed Cases in Countries/Regions')
plt.xlabel('Log of # of Covid19 Confirmed Cases')
plt.show()

In [None]:
# Horizontal bar per province/state with its latest confirmed total.
plt.figure(figsize=(32, 18))
plt.title('# of Coronavirus Confirmed Cases in Provinces/States')
plt.barh(unique_provinces, province_confirmed_cases)
plt.show()

In [None]:
# One randomly chosen CSS4 colour per country (drawn with replacement, so colours may repeat).
# NOTE: `c` is also the name of the SVM C-value list defined in the model-search cells.
css4_palette = list(mcolors.CSS4_COLORS.values())
c = random.choices(css4_palette, k=len(unique_countries))

plt.figure(figsize=(20,20))
plt.pie(country_confirmed_cases, colors=c)
plt.title('Covid-19 Confirmed Cases per Country')
plt.legend(unique_countries, loc='best')
plt.show()

In [None]:
# One randomly chosen CSS4 colour per province (drawn with replacement, so colours may repeat).
# The count now comes from unique_provinces; it was previously taken from unique_countries,
# which does not match the number of wedges drawn here.
c = random.choices(list(mcolors.CSS4_COLORS.values()),k = len(unique_provinces))
plt.figure(figsize=(20,20))
plt.title('Covid-19 Confirmed Cases per State/Province/Region')
plt.pie(province_confirmed_cases, colors=c)
plt.legend(unique_provinces, loc='best')
plt.show()

In [None]:
# Random CSS4 colours (drawn with replacement); one per entry of unique_countries, i.e. one more
# than the number of wedges drawn below.
css4_palette = list(mcolors.CSS4_COLORS.values())
c = random.choices(css4_palette, k=len(unique_countries))

plt.figure(figsize=(20,20))
# Every country except the first entry.
# NOTE(review): assumes the first entry is mainland China — verify.
plt.pie(country_confirmed_cases[1:], colors=c)
plt.title('Covid-19 Confirmed Cases in Countries Outside of Mainland China')
plt.legend(unique_countries[1:], loc='best')
plt.show()

In [1]:

 # 중국본토내 데이터는 추후에 EDA 진행

In [None]:
confirmed_df[confirmed_df['Country/Region'] == 'China' ]

In [None]:
dates = confirmed.columns

# Date columns for the rows whose Country/Region is 'China'.
China = confirmed_df.loc[confirmed_df['Country/Region'] == 'China', cols[4]:cols[-1]]

# Per-date total over those rows, as a single-column 2-D array.
China_sum = np.array([China[day].sum() for day in dates]).reshape(-1, 1)