In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

In [None]:
# Load the ECDC COVID-19 export; `dateRep` is parsed as a day-first date.

ecdc_csv = 'data/ecdc_covid19_20200426.csv'
df = pd.read_csv(ecdc_csv, dayfirst=True, parse_dates=['dateRep'])
df.tail()

In [None]:
# Keep only the rows for China and France.
# reset_index() (without drop=True) renumbers the rows from 0 and keeps the
# previous index as an extra `index` column.

is_selected = df['countriesAndTerritories'].isin(['China', 'France'])
flt_df = df[is_selected].reset_index()
flt_df.head()

In [None]:
# DEPRECATED Running totals of cases and deaths, per country, in date order.
# The grouped results keep the original row labels, so assigning them back
# to flt_df aligns each value with its own row.

rows_by_date = flt_df.sort_values('dateRep')
rows_per_country = rows_by_date.groupby('countriesAndTerritories')

flt_df['cases_cum'] = rows_per_country['cases'].cumsum()
flt_df['deaths_cum'] = rows_per_country['deaths'].cumsum()
flt_df.tail(20)


In [None]:
# DEPRECATED Death rate: cumulative deaths divided by cumulative cases.

flt_df['death_rate'] = flt_df['deaths_cum'].div(flt_df['cases_cum'])

In [None]:
# DEPRECATED Day-over-day relative change of cumulative deaths (`deaths_cum`),
# computed per country in date order.

deaths_cum_by_country = (
    flt_df
    .sort_values('dateRep')
    .groupby('countriesAndTerritories')['deaths_cum']
)
flt_df['death_growth_rate'] = deaths_cum_by_country.transform(pd.Series.pct_change)
flt_df.head()


In [None]:
# Normalize daily cases to cases per million inhabitants.

# Population of each country in millions. The value is taken from the first
# row of that country rather than from a hard-coded row label, so the lookup
# keeps working when the number or order of rows in the extract changes.
# Assumes `popData2018` is constant within a country -- TODO confirm.
fr_norm = flt_df.loc[flt_df.countriesAndTerritories == 'France', 'popData2018'].iloc[0] / 10**6
ch_norm = flt_df.loc[flt_df.countriesAndTerritories == 'China', 'popData2018'].iloc[0] / 10**6

# Vectorised division: map every row to its country's population (in millions)
# and divide once, instead of evaluating a Python lambda per row.
pop_millions = flt_df['countriesAndTerritories'].map({'France': fr_norm, 'China': ch_norm})
flt_df['norm_cases'] = flt_df['cases'] / pop_millions
flt_df.head()

In [None]:
# Running total of cases per million, per country, accumulated in date order.
# The result keeps the original row labels, so the assignment realigns it.

norm_cases_by_country = (
    flt_df
    .sort_values('dateRep')
    .groupby('countriesAndTerritories')['norm_cases']
)
flt_df['norm_cases_cum'] = norm_cases_by_country.cumsum()
flt_df.head()



In [None]:
# Day-over-day relative change of the cumulative cases per million,
# computed per country in date order.

norm_cum_by_country = (
    flt_df
    .sort_values('dateRep')
    .groupby('countriesAndTerritories')['norm_cases_cum']
)
flt_df['norm_cases_growth'] = norm_cum_by_country.transform(pd.Series.pct_change)



In [None]:
# Take the growth of norm_cases over the 5 most recent days in France and
# compare it with China's growth day by day, to find the Chinese dates whose
# growth is statistically compatible with the current French one.
#
# For each Chinese day a one-sample t-test checks whether the French sample
# could have that day's growth as its mean; days with p-value > 0.05 are kept.

is_france = flt_df.countriesAndTerritories == 'France'
is_china = flt_df.countriesAndTerritories == 'China'

# Newest first; .iloc makes the "5 most recent rows" selection explicitly positional.
fr_rate = flt_df[is_france].sort_values('dateRep', ascending=False)['norm_cases_growth'].iloc[:5]
ch_df = flt_df[is_china].sort_values('dateRep', ascending=False)

match_dates = []

# Walk the sorted rows positionally. Indexing the columns with `[i]` would be
# a label lookup on the integer index: it ignores the sort order above and
# fails unless China's rows happen to be labelled 0..n-1.
for ch_date, ch_rate in zip(ch_df['dateRep'], ch_df['norm_cases_growth']):
    pvalue = stats.ttest_1samp(fr_rate, ch_rate)[1]

    # A NaN p-value (e.g. from a NaN growth value) compares False and is skipped.
    if pvalue > 0.05:
        match_dates.append(ch_date)

print(match_dates)


In [None]:
# Plot the day-by-day growth of cumulative cases per million for France (left
# panel) and China (right panel). On the China panel, the first entry of
# `match_dates` is marked with a dashed vertical line.

from pandas.plotting import register_matplotlib_converters
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

# Explicitly register pandas' datetime converters with matplotlib before
# plotting the `dateRep` column.
register_matplotlib_converters()

date_form = DateFormatter("%d-%m-%y")  # too many values on the x axis for dates, so format the ticks compactly

china_rows = flt_df[flt_df.countriesAndTerritories == 'China']
france_rows = flt_df[flt_df.countriesAndTerritories == 'France']

ch = china_rows['norm_cases_growth']
x_ch = china_rows['dateRep']
fr = france_rows['norm_cases_growth']
x_fr = france_rows['dateRep']

fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(17, 6))

ax1.plot(x_fr, fr)
ax1.xaxis.set_major_formatter(date_form)
ax1.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
ax1.set_title('France')

ax2.plot(x_ch, ch, color='orange')
# Only draw the marker (and its legend) when a matching date exists;
# indexing an empty `match_dates` would raise IndexError.
if match_dates:
    ax2.vlines(match_dates[0], ymin=0.00, ymax=1.75, colors='r', linestyles='dashed', label='date of current growth in France')
    ax2.legend()  # attach the legend to the China panel explicitly
ax2.xaxis.set_major_formatter(date_form)
ax2.xaxis.set_major_locator(mdates.WeekdayLocator(interval=3))
ax2.set_title('China')

fig.suptitle('Growth rate of cumulate cases per million', fontsize=16)
plt.show()


In [None]:
# Write the current `fig` to a PNG file under img/.
figure_path = 'img/hypothesis-testing.png'
fig.savefig(figure_path)