In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df_case = pd.read_csv("Korea/PatientInfo.csv")
df_case = df_case.rename(columns = {'confirmed_date':'date'})
df_float = pd.read_csv("Korea/SeoulFloating.csv")
df_regions = pd.read_csv("Korea/Region.csv")
df_weather = pd.read_csv("Korea/Weather.csv")

In [3]:
case_city = pd.DataFrame(df_case.groupby(['date', 'city']).count()['patient_id']).reset_index()
fp_city = pd.DataFrame(df_float.groupby(['date', 'city']).sum()['fp_num']).reset_index()
fp_age = pd.DataFrame(df_float.groupby(['date', 'city', 'sex', 'birth_year']).sum()['fp_num']).reset_index()

In [4]:
df_corona = case_city.merge(fp_city, how='right', on = ['date', 'city']).fillna(0)
df_corona = df_corona.drop(['patient_id'], axis = 1).sort_values(['date'])

In [5]:
from datetime import datetime, timedelta

def symptom_apparition(date, onset):
    if onset == True:
        contamination = [(datetime.strptime(date, '%Y-%m-%d') - timedelta(days=x)).strftime('%Y-%m-%d') for x in range(2,12)]
    else : 
        contamination = [(datetime.strptime(date, '%Y-%m-%d') - timedelta(days=x)).strftime('%Y-%m-%d') for x in range(4,14)]
    return contamination

In [6]:
df_case['contamination_period'] = df_case[['symptom_onset_date', 'date']] \
.apply(lambda x : symptom_apparition(x['symptom_onset_date'], True)  if pd.notnull(x['symptom_onset_date']) \
else (symptom_apparition(x['date'], False) if pd.notnull(x['date']) \
else 'NaN'), axis = 1)
case_daily = df_case[['contamination_period', 'city', 'sex', 'age', 'patient_id']].explode('contamination_period').groupby(['contamination_period', 'city', 'sex', 'age']).count().reset_index()

In [7]:
df_corona = df_corona.merge(case_daily, left_on = ['date', 'city'], right_on = ['contamination_period', 'city']) \
.drop(['contamination_period'], axis = 1)
df_corona = df_corona.rename(columns = {'patient_id':'total_number'})
df_corona['contaminated'] = np.ones(len(df_corona))
df_corona['age'] = df_corona['age'].apply(lambda x : int(x[:-1]))

In [8]:
df_corona = df_corona.merge(fp_age, how = 'right', left_on = ['date', 'city', 'sex', 'age'], right_on = ['date', 'city', 'sex', 'birth_year']) \
.drop(['age'], axis = 1)
df_corona['combined'] = df_corona[['total_number', 'fp_num_y']].values.tolist()
df_corona = df_corona.explode('combined').drop(['total_number', 'fp_num_y'], axis = 1)
df_corona = df_corona.merge(fp_city, how='right', on = ['date', 'city']).fillna(0).drop(['fp_num_x'], axis = 1)
df_corona['contaminated'] = df_corona.apply(lambda x : 1-x.index%2)['contaminated']
df_corona['new_combined'] = df_corona['combined'].diff()

In [9]:
df_corona['group_number'] = np.where(df_corona.index % 2 == 0, df_corona.combined, df_corona.new_combined)
df_corona = df_corona.drop(['combined', 'new_combined'], axis = 1)
df_corona = df_corona.rename(columns = {'fp_num':'total_number'})
df_corona = df_corona.merge(df_regions, on = ['city']).drop(['code'], axis = 1)
df_corona = df_corona.merge(df_weather, on = ['date', 'province'])

In [10]:
infected = df_corona.groupby(['contaminated', 'date', 'city']).sum().reset_index()
nb_infected = infected[infected['contaminated'] == 1][['group_number', 'date', 'city']]

In [11]:
df_corona = df_corona.merge(nb_infected, on = ['date', 'city'])
df_corona['proportion_case'] = df_corona['group_number_y']/df_corona['total_number']
df_corona = df_corona.drop(['group_number_y'], axis = 1)

In [12]:
df_corona.to_csv('corona.csv')

In [39]:
infected = df_case.groupby(['date', 'city', 'age', 'sex']).count().reset_index()
time_series = infected[['date', 'city', 'age', 'sex', 'patient_id']]
time_series['age'] = time_series['age'].apply(lambda x : int(x[:-1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  time_series['age'] = time_series['age'].apply(lambda x : int(x[:-1]))


In [131]:
time_series.sort_values(by=['city', 'age', 'sex', 'date'], inplace = True)
cities = time_series['city'].unique().tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  time_series.sort_values(by=['city', 'age', 'sex', 'date'], inplace = True).reset_index()


AttributeError: 'NoneType' object has no attribute 'reset_index'

In [133]:
time_series = time_series.reset_index()

In [134]:
time_series[time_series['city']=='Bucheon-si']

Unnamed: 0,index,date,city,age,sex,patient_id
22,979,2020-03-07,Bucheon-si,20,female,1
23,1120,2020-03-10,Bucheon-si,20,female,1
24,100,2020-02-22,Bucheon-si,20,male,1
25,1020,2020-03-08,Bucheon-si,20,male,1
26,101,2020-02-22,Bucheon-si,30,female,1
27,1021,2020-03-08,Bucheon-si,30,female,1
28,204,2020-02-24,Bucheon-si,30,male,1
29,1059,2020-03-09,Bucheon-si,40,female,1
30,1121,2020-03-10,Bucheon-si,40,female,1
31,1191,2020-03-11,Bucheon-si,40,female,1


In [102]:
time_series.groupby(['city']).count()

Unnamed: 0_level_0,date,age,sex,patient_id
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ansan-si,4,4,4,4
Anseong-si,1,1,1,1
Anyang-si,10,10,10,10
Asan-si,7,7,7,7
Bucheon-si,24,24,24,24
...,...,...,...,...
Yongin-si,18,18,18,18
Yongsan-gu,5,5,5,5
Yuseong-gu,11,11,11,11
etc,25,25,25,25


In [114]:
list_time = [3]
list_time.extend([0]*(1-3))

In [135]:
def construct_time_series(df):
    overall_shift = 0
    while np.isnan([df.shift(-overall_shift).loc[0]['patient_id']]) == False:
        list_time = [df.shift(-overall_shift).loc[0]['patient_id']]
        old_shift = overall_shift
        new_shift = overall_shift+1
        while (new_shift != 0) :
            if (df.shift(-old_shift).loc[0]['city'] == df.shift(-new_shift).loc[0]['city'] and df.shift(-old_shift).loc[0]['age'] == df.shift(-new_shift).loc[0]['age'] and df.shift(-old_shift).loc[0]['sex'] == df.shift(-new_shift).loc[0]['sex']):
                interval = (datetime.strptime(df.shift(-new_shift).loc[0]['date'], '%Y-%m-%d') - datetime.strptime(df.shift(-old_shift).loc[0]['date'], '%Y-%m-%d')).days
                list_time.extend([0]*(interval-1))
                list_time.append(df.shift(-new_shift).loc[0]['patient_id'])
                new_shift += 1
                old_shift +=1
                overall_shift += 1
            else :
                new_shift = 0
                overall_shift += 1
        yield list_time, overall_shift

In [136]:
time_generator = construct_time_series(time_series)

In [137]:
for cases in time_generator:
    print(cases)

([1], 1)
([1.0], 2)
([1.0], 3)
([1.0], 4)
([1.0], 5)
([1.0, 0, 0, 1.0], 7)
([1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 10)
([1.0], 11)
([1.0], 12)
([1.0], 13)
([3.0], 14)
([1.0], 15)
([1.0], 16)
([2.0, 1.0], 18)
([1.0, 0, 2.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 21)
([1.0], 22)
([1.0, 0, 0, 1.0], 24)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 26)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 28)
([1.0], 29)
([1.0, 1.0, 1.0, 2.0], 33)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 36)
([1.0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 0, 2.0, 0, 1.0, 1.0], 42)
([1.0], 43)
([1.0], 44)
([1.0, 0, 0, 1.0], 46)
([1.0], 47)
([1.0], 48)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 50)
([1.0, 0, 1.0], 52)
([1.0], 53)
([1.0], 54)
([1.0, 0, 1.0], 56)
([1.0, 0, 0, 0, 0, 0, 1.0], 58)
([1.0], 59)
([1.0], 60)
([1.0], 61)
([1.0], 62)
([1.0], 63)
([1.0], 64)
([1.0], 65)
([1.0, 0, 0, 0, 0, 0, 0, 0,

([1.0, 3.0, 2.0, 0, 1.0, 0, 1.0, 2.0, 0, 2.0, 4.0, 4.0, 9.0, 9.0, 5.0, 3.0, 0, 3.0, 1.0], 672)
([1.0, 0, 0, 2.0, 0, 1.0, 2.0, 0, 1.0, 0, 0, 0, 2.0, 1.0, 0, 0, 1.0], 680)
([1.0, 0, 1.0, 3.0, 3.0, 2.0, 9.0, 3.0, 4.0, 4.0, 9.0, 10.0, 8.0, 10.0, 5.0, 6.0, 1.0, 3.0, 1.0, 0, 2.0, 0, 0, 0, 1.0, 0, 1.0], 701)
([2.0, 0, 0, 0, 1.0, 1.0, 1.0, 1.0, 0, 5.0, 0, 2.0, 3.0, 0, 2.0, 6.0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 1.0], 713)
([1.0, 2.0, 3.0, 3.0, 2.0, 4.0, 3.0, 2.0, 0, 1.0, 2.0, 3.0, 5.0, 1.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 728)
([1.0, 0, 0, 1.0, 2.0, 1.0, 1.0, 0, 4.0, 0, 3.0, 1.0, 1.0, 0, 1.0, 3.0, 0, 0, 0, 0, 0, 1.0], 740)
([1.0, 0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 5.0, 5.0, 1.0, 0, 0, 1.0, 0, 0, 0, 0, 1.0, 1.0], 753)
([1.0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 0, 1.0, 0, 0, 1.0, 1.0, 1.0], 760)
([2.0, 1.0, 3.0, 2.0, 3.0, 0, 4.0, 7.0, 1.0, 0, 0, 1.0, 0, 1.0, 1.0, 0, 1.0, 0, 1.0], 773)
([1.0, 0, 0, 1.0, 0, 1.0, 0, 0, 0, 1.0, 2.0, 1.0, 0, 1.0, 3.0, 0, 0, 0, 1.0, 0, 1.0], 783)
([1.0, 0, 0, 1.

([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 1270)
([1.0, 0, 0, 1.0, 0, 1.0, 1.0], 1274)
([1.0, 1.0], 1276)
([1.0, 0, 0, 0, 0, 1.0, 0, 1.0, 1.0, 0, 0, 0, 1.0], 1281)
([1.0, 0, 0, 0, 0, 1.0], 1283)
([1.0, 0, 1.0, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 1.0, 0, 0, 0, 0, 1.0], 1289)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 1291)
([1.0], 1292)
([1.0, 0, 1.0], 1294)
([1.0], 1295)
([1.0], 1296)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 1298)
([1.0], 1299)
([4.0], 1300)
([1.0], 1301)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 1303)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0], 1305)
([1.0], 1306)
([1.0], 1307)
([1.0], 1308)
([1.0], 1309)
([1.0], 1310)
([1.0], 1311)
([1.0], 1312)
([1.0], 1313)
([1.0], 1314)
([1.0], 1315)
([1.0], 1316)
([1.0], 1317)
([1.0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 1.0], 1320)
([1.0], 1321)
([1.0], 1322)
([1.0], 1323)
([1.0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0, 1.0], 1327)
([1.0, 0, 1.0, 0, 0, 0, 0, 0, 1.0], 1330)
([1.0, 0, 0, 0, 0, 0, 1.0, 0, 1.0], 1333)
([1.0], 1334)
([1.0, 0, 