# Covid-19 Data Analysis

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



### Import dataset

In [2]:
df = pd.read_csv('./covid19.csv')

In [3]:
df.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,female,returned home,03/05/2020,,1.0,65,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,
1,2,1,male,returned home,03/06/2020,,1.0,72,,2.0,...,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,5,
2,2,1,male,hospitalization,09/06/2020,1.0,2.0,55,,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
3,2,1,female,returned home,12/06/2020,,2.0,53,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7,
4,2,1,male,returned home,21/06/2020,,2.0,68,,1.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,


In [4]:
df.describe()

Unnamed: 0,USMER,MEDICAL_UNIT,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
count,1048575.0,1048575.0,192706.0,1032572.0,1048575.0,521310.0,1045237.0,1045572.0,1045596.0,1045171.0,1045471.0,1043530.0,1045499.0,1045543.0,1045569.0,1045355.0,1048575.0,192543.0
mean,1.632194,8.980565,1.825351,1.864379,41.7941,1.984403,1.88042,1.985594,1.969805,1.986442,1.844349,1.97313,1.980135,1.847145,1.98192,1.919285,5.305653,1.912446
std,0.4822084,3.723278,0.379668,0.3423854,16.90739,0.123911,0.3244694,0.1191554,0.1711242,0.1156451,0.3625247,0.1617045,0.1395369,0.3598474,0.1332413,0.2723973,1.881165,0.282647
min,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,4.0,2.0,2.0,30.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0
50%,2.0,12.0,2.0,2.0,40.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,6.0,2.0
75%,2.0,12.0,2.0,2.0,53.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,2.0
max,2.0,13.0,2.0,2.0,121.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,2.0


In [5]:
col_names = df.columns
col_names

Index(['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'DATE_DIED', 'INTUBED',
       'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
       'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
       'RENAL_CHRONIC', 'TOBACCO', 'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')

In [6]:
sex_values = df['SEX'].unique()
sex_values

array(['female', 'male'], dtype=object)

### Change 'SEX' to numerical value

In [7]:
sex_mapping = {
    'female': 0,
    'male': 1
}

In [8]:
df['SEX'] = df['SEX'].map(sex_mapping)

In [9]:
df.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,0,returned home,03/05/2020,,1.0,65,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,
1,2,1,1,returned home,03/06/2020,,1.0,72,,2.0,...,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,5,
2,2,1,1,hospitalization,09/06/2020,1.0,2.0,55,,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
3,2,1,0,returned home,12/06/2020,,2.0,53,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7,
4,2,1,1,returned home,21/06/2020,,2.0,68,,1.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,


In [10]:
ptype_values = df['PATIENT_TYPE'].unique()
ptype_values

array(['returned home', 'hospitalization'], dtype=object)

### Change 'PATIENT_TYPE' to numerical value

In [11]:
ptype_mapping = {
    'returned home': 0,
    'hospitalization': 1
}

In [12]:
df['PATIENT_TYPE'] = df['PATIENT_TYPE'].map(ptype_mapping)

In [13]:
df.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,0,0,03/05/2020,,1.0,65,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,
1,2,1,1,0,03/06/2020,,1.0,72,,2.0,...,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,5,
2,2,1,1,1,09/06/2020,1.0,2.0,55,,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
3,2,1,0,0,12/06/2020,,2.0,53,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7,
4,2,1,1,0,21/06/2020,,2.0,68,,1.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,


### Check NaN values

In [14]:
NaN_intubed_count = df['INTUBED'].isna().sum()
NaN_intubed_count

855869

In [15]:
NaN_pneumonia_count = df['PNEUMONIA'].isna().sum()
NaN_pneumonia_count

16003

In [16]:
NaN_pregnant_count = df['PREGNANT'].isna().sum()
NaN_pregnant_count

527265

In [44]:
NaN_diabetes_count = df['DIABETES'].isna().sum()
NaN_diabetes_count

3338

### Drop NaN Diabetes Values

In [45]:
diabetes_wNaN = df.dropna(subset=['DIABETES'])
diabetes_wNaN

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,0,0,03/05/2020,,1.0,65,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,
1,2,1,1,0,03/06/2020,,1.0,72,,2.0,...,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,5,
2,2,1,1,1,09/06/2020,1.0,2.0,55,,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
3,2,1,0,0,12/06/2020,,2.0,53,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7,
4,2,1,1,0,21/06/2020,,2.0,68,,1.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,2,13,1,0,9999-99-99,,2.0,40,,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7,
1048571,1,13,1,1,9999-99-99,2.0,2.0,51,,2.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,7,2.0
1048572,2,13,1,0,9999-99-99,,2.0,55,,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7,
1048573,2,13,1,0,9999-99-99,,2.0,28,,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7,


In [46]:
NaN_diabetes_recount = df['DIABETES'].isna().sum()
NaN_diabetes_recount

3338

In [18]:
NaN_copd_count = df['COPD'].isna().sum()
NaN_copd_count

3003

In [19]:
NaN_asthma_count = df['ASTHMA'].isna().sum()
NaN_asthma_count

2979

In [20]:
NaN_inmsupr_count = df['INMSUPR'].isna().sum()
NaN_inmsupr_count

3404

In [21]:
NaN_hipertension_count = df['HIPERTENSION'].isna().sum()
NaN_hipertension_count

3104

In [22]:
NaN_other_count = df['OTHER_DISEASE'].isna().sum()
NaN_other_count

5045

In [23]:
NaN_cardio_count = df['CARDIOVASCULAR'].isna().sum()
NaN_cardio_count

3076

In [24]:
NaN_obs_count = df['OBESITY'].isna().sum()
NaN_obs_count

3032

In [25]:
NaN_renal_count = df['RENAL_CHRONIC'].isna().sum()
NaN_renal_count

3006

In [26]:
NaN_tobacco_count = df['TOBACCO'].isna().sum()
NaN_tobacco_count

3220

In [31]:
NaN_icu_count = df['I'].isna().sum()
NaN_icu_count

856032