In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [2]:
# Load the four CSV inputs.  The patient file's 'confirmed_date' column is
# renamed to 'date' so it shares a key name with the other tables.
df_case = pd.read_csv("Korea/PatientInfo.csv").rename(
    columns={'confirmed_date': 'date'}
)
df_regions = pd.read_csv("Korea/Region.csv")
df_weather = pd.read_csv("Korea/Weather.csv")
df_float = pd.read_csv("Korea/SeoulFloating.csv")

In [3]:
# Count patient records per (date, city, age, sex); after count() every
# non-key column of `infected` holds a per-group non-null count.
infected = (
    df_case
    .groupby(['date', 'city', 'age', 'sex'])
    .count()
    .reset_index()
)

# Keep the keys plus the 'patient_id' count, turn the age label into an int by
# stripping its last character, and order rows so that each (city, age, sex)
# group is contiguous and sorted by date.
time_series = (
    infected[['date', 'city', 'age', 'sex', 'patient_id']]
    .assign(age=lambda frame: frame['age'].apply(lambda label: int(label[:-1])))
    .sort_values(by=['city', 'age', 'sex', 'date'])
    .reset_index(drop=True)
)

In [4]:
def construct_time_series(df):
    """Yield one gap-filled count series per (city, age, sex) group.

    ``df`` must be ordered so that rows of the same (city, age, sex) group are
    adjacent and sorted by date.  'date' values are ``%Y-%m-%d`` strings and
    'patient_id' holds the count for that date.

    Rows are addressed by position, so the yielded row number equals the index
    label only when ``df`` has the default 0..n-1 index.

    Yields
    ------
    tuple
        ``(counts, last_date, last_row)`` where ``counts`` is the list of the
        group's 'patient_id' values (as floats) with a ``0`` inserted for every
        calendar day missing between two consecutive rows, ``last_date`` is the
        'date' string of the group's final row and ``last_row`` is the position
        of that row in ``df``.

    Iteration ends at the end of the frame, or at the first group whose leading
    'patient_id' is NaN.  An empty frame yields nothing.
    """
    # Pull the columns out once: the scan below is then a single O(n) pass
    # instead of copying the whole frame for every field access.
    dates = df['date'].tolist()
    cities = df['city'].tolist()
    ages = df['age'].tolist()
    sexes = df['sex'].tolist()
    counts = df['patient_id'].tolist()
    n_rows = len(counts)

    start = 0
    while start < n_rows and not np.isnan(counts[start]):
        series = [float(counts[start])]
        last = start
        while last + 1 < n_rows:
            nxt = last + 1
            # Field-by-field comparison so that a NaN key never matches.
            same_group = (
                cities[nxt] == cities[last]
                and ages[nxt] == ages[last]
                and sexes[nxt] == sexes[last]
            )
            if not same_group:
                break
            gap_days = (
                datetime.strptime(dates[nxt], '%Y-%m-%d')
                - datetime.strptime(dates[last], '%Y-%m-%d')
            ).days
            # One zero for each day strictly between the two observations.
            series.extend([0] * (gap_days - 1))
            series.append(float(counts[nxt]))
            last = nxt
        yield series, dates[last], last
        start = last + 1

In [5]:
from statsmodels.tsa.ar_model import AR
# Lazy generator over the per-group series of `time_series`.
time_generator = construct_time_series(time_series)
# Give every row its own empty list as a placeholder in 'list_cases'.
time_series['list_cases'] = [[] for _ in range(len(time_series))]

In [6]:
# Extend every group's series up to 2020-04-02 and store the extended list in
# 'list_cases' on the row number yielded for that group.
# NOTE(review): the warning printed under this cell says `AR` is deprecated in
# favour of `AutoReg`; migrating needs an explicit lag choice, so it is left
# as-is here.
for series, last_date, row in time_generator:
    # Days from the date yielded for the group to the cut-off.  Computed
    # outside the try so the fallback below never sees a stale or undefined
    # value.
    horizon = (
        datetime.strptime('2020-04-02', '%Y-%m-%d')
        - datetime.strptime(last_date, '%Y-%m-%d')
    ).days
    try:
        if horizon > 0:
            model_fit = AR(series).fit()
            # predict() takes absolute, inclusive start/end positions, so
            # `horizon` out-of-sample steps end at len(series) + horizon - 1.
            first_step = len(series)
            forecast = list(model_fit.predict(first_step, first_step + horizon - 1))
        else:
            forecast = []
    except ValueError:
        # The model could not be fitted or evaluated: pad with zeros instead.
        forecast = [0] * horizon
    series.extend(forecast)
    time_series.at[row, 'list_cases'] = series

statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
statsmodels.tsa.SARIMAX.

AutoReg adds the ability to specify exogenous variables, include time trends,
and add seasonal dummies. The AutoReg API differs from AR since the model is
treated as immutable, and so the entire specification including the lag
length must be specified when creating the model. This change is too
substantial to incorporate into the existing AR api. The function
ar_select_order performs lag length selection for AutoReg models.

AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
estimate ARX and related models using full MLE via the Kalman Filter.





In [7]:
# Keep only rows whose 'list_cases' is not an empty list, unroll each list into
# one row per element, and discard the 'patient_id' count column.
time_series = (
    time_series
    .loc[lambda frame: frame['list_cases'].astype(str) != '[]']
    .explode('list_cases')
    .drop(columns=['patient_id'])
    .reset_index(drop=True)
)

In [8]:
def change_dates(df):
    """Turn repeated dates inside each (city, age, sex) run into consecutive days.

    Rows are scanned top to bottom.  Whenever a row has the same 'city', 'age'
    and 'sex' as the row before it, its 'date' is replaced by the previous
    row's (already updated) date plus one day, so a run of n identical dates
    becomes n consecutive calendar days starting at the run's first date.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date' (``%Y-%m-%d`` strings), 'city', 'age' and 'sex' columns.
        Rows are handled by position, so any index is accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its 'date' column is modified in place.
    """
    # Work on plain lists: building a row Series for every comparison is far
    # slower, and the update is inherently sequential anyway.
    dates = df['date'].tolist()
    cities = df['city'].tolist()
    ages = df['age'].tolist()
    sexes = df['sex'].tolist()

    for prev in range(len(dates) - 1):
        nxt = prev + 1
        if cities[prev] == cities[nxt] and ages[prev] == ages[nxt] and sexes[prev] == sexes[nxt]:
            following_day = datetime.strptime(dates[prev], '%Y-%m-%d') + timedelta(days=1)
            dates[nxt] = following_day.strftime('%Y-%m-%d')

    df['date'] = dates
    return df

In [9]:
# `change_dates` returns the frame it was given, so `df_case` and
# `time_series` refer to the same object from here on.
df_case = change_dates(time_series)

# Number of non-null 'list_cases' rows per (date, city).
case_city = (
    df_case
    .groupby(['date', 'city'])['list_cases']
    .count()
    .reset_index()
)

# Summed 'fp_num' per (date, city) and per (date, city, sex, birth_year).
fp_city = (
    df_float
    .groupby(['date', 'city'])['fp_num']
    .sum()
    .reset_index()
)
fp_age = (
    df_float
    .groupby(['date', 'city', 'sex', 'birth_year'])['fp_num']
    .sum()
    .reset_index()
)

In [10]:
# Right join keeps every (date, city) of fp_city; gaps are filled with 0, the
# case count column is discarded again and the rows are ordered by date.
df_corona = (
    case_city
    .merge(fp_city, how='right', on=['date', 'city'])
    .fillna(0)
    .drop(columns=['list_cases'])
    .sort_values(['date'])
)

In [11]:
def symptom_apparition(date, onset, first_day=2, last_day=12):
    """Return the calendar days that fall a fixed number of days before ``date``.

    Parameters
    ----------
    date : str
        Reference day in ``%Y-%m-%d`` format.
    onset : bool
        Kept for backward compatibility.  It has no effect: the same window is
        returned whichever value is passed.
        NOTE(review): presumably onset dates and confirmation dates were meant
        to use different windows -- confirm the intended offsets.
    first_day, last_day : int, optional
        Half-open range ``[first_day, last_day)`` of day offsets subtracted
        from ``date``.  The defaults give offsets 2 through 11.

    Returns
    -------
    list of str
        One ``%Y-%m-%d`` string per offset, closest day first.
    """
    # Parse once; every entry is derived from the same reference day.
    reference = datetime.strptime(date, '%Y-%m-%d')
    return [
        (reference - timedelta(days=lag)).strftime('%Y-%m-%d')
        for lag in range(first_day, last_day)
    ]

In [12]:
# Attach to every row the list of dates returned by symptom_apparition for its
# 'date'; rows with a null date get the literal string 'NaN' instead.
df_case['contamination_period'] = df_case['date'].apply(
    lambda day: 'NaN' if pd.isnull(day) else symptom_apparition(day, False)
)

# One row per (row, contamination date), then the number of non-null
# 'list_cases' entries per (contamination date, city, sex, age).
case_daily = (
    df_case[['contamination_period', 'city', 'sex', 'age', 'list_cases']]
    .explode('contamination_period')
    .groupby(['contamination_period', 'city', 'sex', 'age'])
    .count()
    .reset_index()
)

In [13]:
# Inner-join the per-day counts onto df_corona (date matched against the
# contamination date), rename the count to 'total_number' and flag every
# resulting row with contaminated = 1.0.
df_corona = (
    df_corona
    .merge(case_daily, left_on=['date', 'city'], right_on=['contamination_period', 'city'])
    .drop(columns=['contamination_period'])
    .rename(columns={'list_cases': 'total_number'})
    .assign(contaminated=lambda frame: np.ones(len(frame)))
)

In [14]:
# Right-join onto fp_age so every (date, city, sex, birth_year) row of fp_age
# is kept; case rows are matched where 'age' equals 'birth_year'.  Both frames
# carry an 'fp_num' column, hence the 'fp_num_x' / 'fp_num_y' names used below.
df_corona = df_corona.merge(fp_age, how = 'right', left_on = ['date', 'city', 'sex', 'age'], right_on = ['date', 'city', 'sex', 'birth_year']) \
.drop(['age'], axis = 1)
# Pack [total_number, fp_num_y] into one list per row, then explode so that
# each row becomes two consecutive rows: first total_number, then fp_num_y.
df_corona['combined'] = df_corona[['total_number', 'fp_num_y']].values.tolist()
df_corona = df_corona.explode('combined').drop(['total_number', 'fp_num_y'], axis = 1)
# Re-attach fp_city's per-(date, city) 'fp_num', replace every NaN with 0 and
# drop the earlier 'fp_num_x' column.
df_corona = df_corona.merge(fp_city, how='right', on = ['date', 'city']).fillna(0).drop(['fp_num_x'], axis = 1)
# The lambda is evaluated per column and yields 1 - (row index % 2); only the
# 'contaminated' result is kept.
# NOTE(review): presumably this marks even rows 1 and odd rows 0, relying on
# the merge above producing a 0..n-1 index and keeping the two-row pairing
# created by explode -- confirm.
df_corona['contaminated'] = df_corona.apply(lambda x : 1-x.index%2)['contaminated']
# Difference between each 'combined' value and the one on the previous row.
df_corona['new_combined'] = df_corona['combined'].diff()

In [15]:
# Even-indexed rows take 'combined' as their group size; odd-indexed rows take
# the difference to the previous row ('new_combined').
df_corona['group_number'] = np.where(
    df_corona.index % 2 == 0,
    df_corona['combined'],
    df_corona['new_combined'],
)

# Drop the helper columns, rename 'fp_num' to 'total_number', then attach the
# region attributes (by city) and the weather rows (by date and province).
df_corona = (
    df_corona
    .drop(columns=['combined', 'new_combined'])
    .rename(columns={'fp_num': 'total_number'})
    .merge(df_regions, on=['city'])
    .drop(columns=['code'])
    .merge(df_weather, on=['date', 'province'])
)

In [16]:
# Column sums per (contaminated, date, city).
infected = (
    df_corona
    .groupby(['contaminated', 'date', 'city'])
    .sum()
    .reset_index()
)

# Summed 'group_number' of the contaminated == 1 rows, per date and city.
nb_infected = infected.loc[
    infected['contaminated'] == 1,
    ['group_number', 'date', 'city'],
]

In [17]:
# Attach the per-(date, city) summed 'group_number' of contaminated rows.  Both
# frames have a 'group_number' column, so the merge yields 'group_number_x'
# (row level) and 'group_number_y' (the per-date/city sum).
# 'proportion_case' is that sum divided by 'total_number'; the helper column is
# dropped afterwards.
# The sort uses 'birth_year': the 'age' column was dropped when df_corona was
# merged with fp_age, so sorting by 'age' raises KeyError.
df_corona = (
    df_corona
    .merge(nb_infected, on=['date', 'city'])
    .assign(proportion_case=lambda frame: frame['group_number_y'] / frame['total_number'])
    .drop(columns=['group_number_y'])
    .sort_values(by=['date', 'city', 'birth_year', 'sex', 'contaminated'])
)

In [24]:
# Persist the assembled table.  The index is written as well (index=False is
# not passed), so reading the file back yields an extra unnamed first column.
df_corona.to_csv('corona.csv')

In [26]:
# Rows flagged contaminated == 1, copied so that later edits to this subset do
# not write back into df_corona.
df_contaminated = df_corona[df_corona['contaminated'] == 1].copy()

Unnamed: 0,date,city,sex,contaminated,birth_year,total_number,group_number_x,province,latitude,longitude,...,nursing_home_count,code,avg_temp,min_temp,max_temp,precipitation,max_wind_speed,most_wind_direction,avg_relative_humidity,proportion_case
30720,2020-01-01,Dobong-gu,female,1,20,6285430,0.0,Seoul,37.668952,127.047082,...,485,10000,-2.2,-6.5,0.3,0.1,2.6,50.0,64.4,0.0
30732,2020-01-01,Dobong-gu,male,1,20,6285430,0.0,Seoul,37.668952,127.047082,...,485,10000,-2.2,-6.5,0.3,0.1,2.6,50.0,64.4,0.0
30722,2020-01-01,Dobong-gu,female,1,30,6285430,0.0,Seoul,37.668952,127.047082,...,485,10000,-2.2,-6.5,0.3,0.1,2.6,50.0,64.4,0.0
30734,2020-01-01,Dobong-gu,male,1,30,6285430,0.0,Seoul,37.668952,127.047082,...,485,10000,-2.2,-6.5,0.3,0.1,2.6,50.0,64.4,0.0
30724,2020-01-01,Dobong-gu,female,1,40,6285430,0.0,Seoul,37.668952,127.047082,...,485,10000,-2.2,-6.5,0.3,0.1,2.6,50.0,64.4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30234,2020-02-29,Yongsan-gu,male,1,50,4852260,0.0,Seoul,37.532768,126.990021,...,435,10000,7.7,4.5,13.0,0.5,4.2,50.0,68.6,0.0
30224,2020-02-29,Yongsan-gu,female,1,60,4852260,0.0,Seoul,37.532768,126.990021,...,435,10000,7.7,4.5,13.0,0.5,4.2,50.0,68.6,0.0
30236,2020-02-29,Yongsan-gu,male,1,60,4852260,0.0,Seoul,37.532768,126.990021,...,435,10000,7.7,4.5,13.0,0.5,4.2,50.0,68.6,0.0
30226,2020-02-29,Yongsan-gu,female,1,70,4852260,0.0,Seoul,37.532768,126.990021,...,435,10000,7.7,4.5,13.0,0.5,4.2,50.0,68.6,0.0
