# 01PatientCOVID19

# Libraries

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from IPython.display import display

# Make seaborn's "pastel" palette the default colour cycle for the plots in this notebook.
sns.set_palette("pastel")

# Loading Data

In [2]:
# Load the raw patient table.
# The Windows path is written as a raw string: in a normal string literal sequences such as
# "\p", "\P", "\T", "\C" and "\D" are invalid escapes, which Python only tolerates with a
# warning. The raw string yields exactly the same path text without relying on that.
# NOTE(review): this is an absolute, machine-specific location — adjust it for your checkout.
patient = pd.read_csv(
    r"C:\py\Projects\TuringCollege\COVID19\DataSets\patient.csv",
    index_col=False,  # do not use any CSV column as the index
    skipinitialspace=True,  # ignore spaces that follow a delimiter
)

In [3]:
# Show every row when a Series/DataFrame is displayed (None disables truncation).
# The fully qualified key is required: the short pattern "max_rows" also matches
# "styler.render.max_rows" on recent pandas and is rejected as ambiguous.
pd.set_option("display.max_rows", None)

# Size

In [4]:
# (number of rows, number of columns) of the table as loaded.
patient.shape

(5165, 14)

# Delete Unneeded Columns

In [5]:
# Remove the listed columns by name; everything else is kept unchanged.
patient = patient.drop(
    columns=[
        "ID",
        "city",
        "infected_by",
        "contact_number",
        "symptom_onset_date",
        "released_date",
        "deceased_date",
        "state",
    ]
)

# Rename Columns

In [6]:
# Give the remaining columns presentation-style names and preview the result.
# The former "city" -> "City" entry is gone: "city" is removed in the
# "Delete Unneeded Columns" cell, so that mapping could never match a column.
# rename() returns a new frame, which is bound back to `patient`.
patient = patient.rename(
    columns={
        "sex": "Gender",
        "age": "Age",
        "infection_case": "Infection Reason",
        "country": "Country",
        "province": "Province",
        "confirmed_date": "Confirmed Date",
    }
)
patient.head()

Unnamed: 0,Gender,Age,Country,Province,Infection Reason,Confirmed Date
0,male,50s,Korea,Seoul,overseas inflow,1/23/2020
1,male,30s,Korea,Seoul,overseas inflow,1/30/2020
2,male,50s,Korea,Seoul,contact with patient,1/30/2020
3,male,20s,Korea,Seoul,overseas inflow,1/30/2020
4,female,20s,Korea,Seoul,contact with patient,1/31/2020


# Missing Data

In [7]:
# Number of NaN entries in each column.
patient.isna().sum()

Gender              1122
Age                 1380
Country                0
Province               0
Infection Reason     919
Confirmed Date         3
dtype: int64

In [8]:
# Discard, in place, every row whose "Infection Reason" is NaN.
patient.dropna(subset=["Infection Reason"], inplace=True)

In [9]:
# Put the literal string "MISSING" wherever a NaN remains (all columns, in place),
# then re-count NaNs per column to confirm none are left.
patient.fillna("MISSING", inplace=True)
patient.isna().sum()

Gender              0
Age                 0
Country             0
Province            0
Infection Reason    0
Confirmed Date      0
dtype: int64

In [10]:
# Preview the first rows after the missing-value handling above.
patient.head()

Unnamed: 0,Gender,Age,Country,Province,Infection Reason,Confirmed Date
0,male,50s,Korea,Seoul,overseas inflow,1/23/2020
1,male,30s,Korea,Seoul,overseas inflow,1/30/2020
2,male,50s,Korea,Seoul,contact with patient,1/30/2020
3,male,20s,Korea,Seoul,overseas inflow,1/30/2020
4,female,20s,Korea,Seoul,contact with patient,1/31/2020


# Delete Unneeded Data

In [11]:
# Keep only rows whose infection reason is something other than the catch-all 'etc'.
patient = patient.loc[patient["Infection Reason"].ne('etc')]

In [12]:
# Keep only the patients whose country is not China.
# A bare `patient.loc[patient.Country == 'China']` used to precede this statement; because it
# was not the last expression of the cell, its result was discarded and nothing was shown.
patient = patient[patient.Country != "China"]

In [13]:
# Same China filter as the preceding cell; on an already-filtered frame it keeps every row.
patient = patient.loc[patient["Country"].ne("China")]

In [14]:
# Drop "Country" only.
# "Confirmed Date" has to survive this cell: the following "Date as Index" cell sorts by it
# and promotes it to the index, and raised KeyError when the column had been dropped here.
patient = patient.drop(columns=["Country"])

# Date as Index

In [15]:
# Order the rows chronologically, then use the confirmation date as the index, named "Date".
# "Confirmed Date" holds M/D/YYYY strings (e.g. "1/23/2020"), so a plain string sort would put
# "2/10/2020" ahead of "2/2/2020". The sort key parses the strings as dates for ordering only:
# the stored values stay strings, and entries that do not parse (such as the "MISSING" filler
# written by fillna earlier) become NaT and are placed last.
patient = (
    patient
    .sort_values(
        "Confirmed Date",
        key=lambda dates: pd.to_datetime(dates, format="%m/%d/%Y", errors="coerce"),
    )
    .set_index("Confirmed Date")
    .rename_axis("Date")
)

# Column Value Modifications 

In [16]:
# Capitalise the category labels in place. Keys are compared with whole cell values
# (in every column), so 'male' does not affect 'female'.
patient.replace(
    {
        'overseas inflow': 'Overseas Inflow',
        'contact with patient': 'Contact with Patient',
        'male': 'Male',
        'female': 'Female',
    },
    inplace=True,
)

# Column Order Modifications

In [17]:
# Fix the column order (all rows kept) and preview the result.
patient = patient.loc[:, ["Age", "Gender", "Province", "Infection Reason"]]
patient.head()

Unnamed: 0_level_0,Province,Infection Reason,Gender,Age
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1/23/2020,Seoul,Overseas Inflow,Male,50s
1/26/2020,Gyeonggi-do,Overseas Inflow,Male,50s
1/27/2020,Gyeonggi-do,Overseas Inflow,Male,50s
1/30/2020,Seoul,Overseas Inflow,Male,30s
1/30/2020,Seoul,Contact with Patient,Male,50s


## Infection Case

In [18]:
# Two-column working frame for the infection-reason analysis.
df_infection = patient.loc[:, ['Province', 'Infection Reason']]

In [19]:
# Case count per infection reason inside each province (most frequent first per province).
(
    df_infection
    .groupby(['Province'])['Infection Reason']
    .value_counts()
)

Province           Infection Reason                             
Busan              Contact with Patient                              45
                   Overseas Inflow                                   40
                   Onchun Church                                     32
                   Shincheonji Church                                 5
                   Suyeong-gu Kindergarten                            3
                   Cheongdo Daenam Hospital                           1
Chungcheongbuk-do  Contact with Patient                              24
                   Overseas Inflow                                   11
                   Itaewon Clubs                                      1
Chungcheongnam-do  Contact with Patient                             101
                   gym facility in Cheonan                           27
                   Overseas Inflow                                   16
                   Richway                                            2

In [20]:
# Overall case count per infection reason across all provinces, most frequent first.
df_infection['Infection Reason'].value_counts()

Contact with Patient                             1607
Overseas Inflow                                   834
Itaewon Clubs                                     162
Richway                                           128
Guro-gu Call Center                               112
Shincheonji Church                                106
Coupang Logistics Center                           80
Yangcheon Table Tennis Club                        44
Day Care Center                                    43
SMR Newly Planted Churches Group                   36
Onchun Church                                      33
Bonghwa Pureun Nursing Home                        31
gym facility in Cheonan                            30
Ministry of Oceans and Fisheries                   28
Wangsung Church                                    24
Cheongdo Daenam Hospital                           21
Dongan Church                                      17
Eunpyeong St. Mary's Hospital                      15
Gyeongsan Seorin Nursing Hom

In [21]:
# Count the (Province, Infection Reason) rows whose reason is the "MISSING" placeholder.
# An empty result means no such rows exist.
df_infection.loc[df_infection['Infection Reason'].eq('MISSING')].value_counts()

Series([], dtype: int64)

## Busan

In [22]:
# Busan rows only, followed by their case count per infection reason.
df_Busan = df_infection[df_infection['Province'].eq('Busan')]
(
    df_Busan
    .groupby(['Province'])['Infection Reason']
    .value_counts()
)

Province  Infection Reason        
Busan     Contact with Patient        45
          Overseas Inflow             40
          Onchun Church               32
          Shincheonji Church           5
          Suyeong-gu Kindergarten      3
          Cheongdo Daenam Hospital     1
Name: Infection Reason, dtype: int64

## Seoul

In [23]:
# Seoul rows only.
df_Seoul = df_infection[df_infection['Province'].eq('Seoul')]

In [24]:
# Case count per infection reason for Seoul, stored so a later cell can slice it.
df_groupby_Seoul = (
    df_Seoul
    .groupby(['Province'])['Infection Reason']
    .value_counts()
)

## Daegu

In [25]:
# Daegu rows only.
df_Daegu = df_infection[df_infection['Province'].eq('Daegu')]

In [26]:
# Case count per infection reason for Daegu.
(
    df_Daegu
    .groupby(['Province'])['Infection Reason']
    .value_counts()
)

Province  Infection Reason    
Daegu     Shincheonji Church      13
          Overseas Inflow         10
          Contact with Patient     4
Name: Infection Reason, dtype: int64

## Gyeongsangbuk-do

In [27]:
# Gyeongsangbuk-do rows only.
df_Gyeongsangbukdo = df_infection[df_infection['Province'].eq('Gyeongsangbuk-do')]

In [28]:
# Case count per infection reason for Gyeongsangbuk-do.
(
    df_Gyeongsangbukdo
    .groupby(['Province'])['Infection Reason']
    .value_counts()
)

Province          Infection Reason                     
Gyeongsangbuk-do  Contact with Patient                     129
                  Bonghwa Pureun Nursing Home               31
                  Shincheonji Church                        25
                  Overseas Inflow                           23
                  Cheongdo Daenam Hospital                  20
                  Gyeongsan Seorin Nursing Home             15
                  Gyeongsan Jeil Silver Town                12
                  Milal Shelter                             11
                  Gyeongsan Cham Joeun Community Center     10
                  Pilgrimage to Israel                       2
Name: Infection Reason, dtype: int64

## Gyeonggi-do

In [29]:
# Gyeonggi-do rows only.
df_Gyeonggido = df_infection[df_infection['Province'].eq('Gyeonggi-do')]

In [30]:
# Case count per infection reason for Gyeonggi-do.
(
    df_Gyeonggido
    .groupby(['Province'])['Infection Reason']
    .value_counts()
)

Province     Infection Reason           
Gyeonggi-do  Contact with Patient           774
             Overseas Inflow                236
             Coupang Logistics Center        23
             Itaewon Clubs                   12
             Shincheonji Church              10
             Richway                          5
             Yangcheon Table Tennis Club      1
             gym facility in Cheonan          1
Name: Infection Reason, dtype: int64

## Gender

In [31]:
# Two-column working frame for the gender analysis.
df_gender = patient.loc[:, ['Province', 'Gender']]

In [32]:
# Count of each gender value (including "MISSING") inside each province.
(
    df_gender
    .groupby(['Province'])['Gender']
    .value_counts()
)

Province           Gender 
Busan              Male        66
                   Female      58
                   MISSING      2
Chungcheongbuk-do  Female      19
                   Male        17
Chungcheongnam-do  Female      94
                   Male        52
                   MISSING      1
Daegu              Female      16
                   Male        11
Daejeon            Male        40
                   Female      39
Gangwon-do         Male        26
                   Female      22
Gwangju            Female      21
                   Male        13
Gyeonggi-do        Female     389
                   Male       345
                   MISSING    328
Gyeongsangbuk-do   Female     164
                   Male       114
Gyeongsangnam-do   Male        53
                   Female      42
                   MISSING      1
Incheon            Female     191
                   Male       138
Jeju-do            Female       8
                   MISSING      4
                   Ma

In [33]:
# Overall count of each gender value; the "MISSING" placeholder is counted as its own category.
df_gender['Gender'].value_counts()

Female     1375
Male       1183
MISSING     974
Name: Gender, dtype: int64

In [34]:
# Where the gender is unknown: count of "MISSING" rows per province.
df_gender.loc[df_gender['Gender'].eq('MISSING')].value_counts()

Province           Gender 
Seoul              MISSING    636
Gyeonggi-do        MISSING    328
Jeju-do            MISSING      4
Busan              MISSING      2
Chungcheongnam-do  MISSING      1
Gyeongsangnam-do   MISSING      1
Sejong             MISSING      1
Ulsan              MISSING      1
dtype: int64

## Age

In [35]:
# Two-column working frame for the age analysis.
df_age = patient.loc[:, ['Province', 'Age']]

In [36]:
# Count of each age band (including "MISSING") inside each province.
(
    df_age
    .groupby(['Province'])['Age']
    .value_counts()
)

Province           Age    
Busan              20s         42
                   30s         15
                   60s         15
                   40s         13
                   50s         13
                   10s         11
                   70s          7
                   0s           6
                   80s          2
                   MISSING      2
Chungcheongbuk-do  50s          8
                   30s          6
                   10s          5
                   20s          4
                   40s          4
                   70s          4
                   0s           2
                   60s          2
                   80s          1
Chungcheongnam-do  40s         46
                   30s         27
                   50s         18
                   60s         17
                   20s         15
                   10s         12
                   0s           6
                   70s          4
                   80s          2
Daegu              40

In [37]:
# Where the age is unknown: count of "MISSING" rows per province.
df_age[df_age['Age'].eq('MISSING')].value_counts()

Province          Age    
Seoul             MISSING    636
Gyeonggi-do       MISSING    328
Incheon           MISSING    246
Gyeongsangbuk-do  MISSING      4
Jeju-do           MISSING      4
Busan             MISSING      2
Gyeongsangnam-do  MISSING      1
Jeollabuk-do      MISSING      1
Ulsan             MISSING      1
dtype: int64

In [38]:
# Share of each age band: normalize=True returns fractions of all rows instead of counts.
# The "MISSING" placeholder is counted as its own category.
df_age['Age'].value_counts(normalize=True)

MISSING    0.346263
20s        0.155436
50s        0.111835
30s        0.103341
40s        0.095696
60s        0.081540
10s        0.034541
70s        0.030294
80s        0.020385
0s         0.014723
90s        0.005663
100s       0.000283
Name: Age, dtype: float64

In [44]:
# Total Seoul cases for every infection reason after the first four entries of
# df_groupby_Seoul. value_counts() orders the reasons by descending count, so positions 0-3
# are the four most frequent reasons and this sums everything else.
# NOTE(review): the cutoff 4 is hard-coded, and this cell's execution count (44) is out of
# sequence with the cells above — confirm the result on a fresh Restart & Run All.
df_groupby_Seoul.iloc[4:].sum()

392