In [1]:
import pandas as pd 
import numpy as np 

# Load the raw dataset from covid.csv (path is relative to the working directory).
covid = pd.read_csv('covid.csv')

## Data Exploration

In [2]:
# Preview the first five rows of the raw data.
covid.head()

Unnamed: 0,id,sex,patient_type,entry_date,date_symptoms,date_died,intubed,pneumonia,age,pregnancy,...,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,16169f,2,1,04-05-2020,02-05-2020,9999-99-99,97,2,27,97,...,2,2,2,2,2,2,2,2,1,97
1,1009bf,2,1,19-03-2020,17-03-2020,9999-99-99,97,2,24,97,...,2,2,2,2,2,2,2,99,1,97
2,167386,1,2,06-04-2020,01-04-2020,9999-99-99,2,2,54,2,...,2,2,2,2,1,2,2,99,1,2
3,0b5948,2,2,17-04-2020,10-04-2020,9999-99-99,2,1,30,97,...,2,2,2,2,2,2,2,99,1,2
4,0d01b5,1,2,13-04-2020,13-04-2020,22-04-2020,2,2,60,2,...,2,1,2,1,2,2,2,99,1,2


In [3]:
# (rows, columns) of the raw data.
covid.shape

(566602, 23)

In [4]:
# Column dtypes and non-null counts for the raw data.
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566602 entries, 0 to 566601
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   id                   566602 non-null  object
 1   sex                  566602 non-null  int64 
 2   patient_type         566602 non-null  int64 
 3   entry_date           566602 non-null  object
 4   date_symptoms        566602 non-null  object
 5   date_died            566602 non-null  object
 6   intubed              566602 non-null  int64 
 7   pneumonia            566602 non-null  int64 
 8   age                  566602 non-null  int64 
 9   pregnancy            566602 non-null  int64 
 10  diabetes             566602 non-null  int64 
 11  copd                 566602 non-null  int64 
 12  asthma               566602 non-null  int64 
 13  inmsupr              566602 non-null  int64 
 14  hypertension         566602 non-null  int64 
 15  other_disease        566602 non-nu

## Data Cleanup: Deleting Irrelevant Columns

In [5]:
# Remove the identifier and the three date columns from the working frame.
unused_columns = ['id', 'entry_date', 'date_symptoms', 'date_died']
covid = covid.drop(columns=unused_columns)

In [6]:
# Preview the frame after the id/date columns were dropped.
covid.head()

Unnamed: 0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,2,1,97,2,27,97,2,2,2,2,2,2,2,2,2,2,2,1,97
1,2,1,97,2,24,97,2,2,2,2,2,2,2,2,2,2,99,1,97
2,1,2,2,2,54,2,2,2,2,2,2,2,2,1,2,2,99,1,2
3,2,2,2,1,30,97,2,2,2,2,2,2,2,2,2,2,99,1,2
4,1,2,2,2,60,2,1,2,2,2,1,2,1,2,2,2,99,1,2


## Data Cleanup: Dealing with "missing" data

This dataset is super sloppy: values of 97, 98, or 99 indicate missing data.

In [7]:
# Columns coded 1/2; any other code (97, 98, 99) is treated as missing data.
binary_columns = [
    'pneumonia', 'diabetes', 'copd', 'asthma', 'inmsupr', 'hypertension',
    'other_disease', 'cardiovascular', 'obesity', 'renal_chronic', 'tobacco',
    'contact_other_covid', 'covid_res',
]

# Keep only the rows where every one of these columns holds 1 or 2.
# A single vectorised mask replaces thirteen copy-pasted filters and selects
# the same rows; .copy() gives later cells an independent frame to assign into.
has_valid_codes = covid[binary_columns].isin([1, 2]).all(axis=1)
covid = covid.loc[has_valid_codes].copy()

In [8]:
# Get rid of all rows containing info other than 1 or 2 (except in the age column)
# columns = []
# col = covid.columns
# for column in columns:
#     col_list.append(column)

# def fix_missing(covid, columns):
#     for column in covid[columns]:
#         if column != 'age':
#             covid = covid.loc[(covid[column] == 1) | (covid[column] == 2)]
#     return covid

# fix_missing(covid, columns)

In [9]:
# Preview the rows that survived the missing-data filters (original index labels are kept).
covid.head()

Unnamed: 0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,2,1,97,2,27,97,2,2,2,2,2,2,2,2,2,2,2,1,97
7,1,1,97,2,56,2,2,2,2,2,1,2,2,2,1,1,1,1,97
13,1,1,97,2,34,2,2,2,2,2,2,2,2,2,2,2,1,1,97
15,1,1,97,2,34,2,2,2,2,2,2,2,2,2,2,1,2,1,97
16,1,1,97,2,49,2,1,2,2,2,2,2,2,2,2,2,1,1,97


In [10]:
# Show the pregnancy code counts next to the sex split so they can be compared.
print(covid['pregnancy'].value_counts())

covid['sex'].value_counts()

97    174281
2     166018
1       2396
98      1029
Name: pregnancy, dtype: int64


2    174281
1    169443
Name: sex, dtype: int64

In this case, the 97 values (174281 rows) are all males: the count matches the rows with sex == 2 exactly. So rather than dropping every row where pregnancy is 97, I changed the value to 2.

In [11]:
# Keep rows whose pregnancy code is 1, 2 or 97; any other code (e.g. 98) is dropped.
# .copy() makes the filtered frame independent, so the assignment below does
# not write into a slice of the previous frame (no SettingWithCopyWarning).
covid = covid.loc[covid.pregnancy.isin([1, 2, 97])].copy()

# Recode everything that is not 1 to 2, which folds the 97 rows into 2.
# Vectorised equivalent of .apply(lambda x: x if x == 1 else 2).
covid['pregnancy'] = covid.pregnancy.where(covid.pregnancy == 1, 2)

covid.pregnancy.value_counts()

2    340299
1      2396
Name: pregnancy, dtype: int64

In [12]:
# Print the code counts for intubed and icu, then display the patient_type split.
for column in ('intubed', 'icu'):
    print(covid[column].value_counts())

covid['patient_type'].value_counts()

97    290925
2      44679
1       7000
99        91
Name: intubed, dtype: int64
97    290925
2      44955
1       6724
99        91
Name: icu, dtype: int64


1    290925
2     51770
Name: patient_type, dtype: int64

In this case, 290925 patients share the same patient_type (1), which exactly matches the number of 97 values for intubed and icu, so those 97s are probably "not applicable" rather than truly missing. To prevent wrongfully dropping these rows, I'll change every 97 to null.

In [13]:
# icu
# Keep rows whose icu code is 1, 2 or 97; any other code (e.g. 99) is dropped.
covid = covid.loc[covid.icu.isin([1, 2, 97])].copy()

# Recode 2 -> 0 and 97 -> NaN in one pass and assign the result back.
# Calling replace(..., inplace=True) on `covid.icu` is chained assignment: it
# is not guaranteed to update `covid` and has no effect under pandas
# Copy-on-Write, so the column is reassigned explicitly instead.
covid['icu'] = covid.icu.replace({2: 0, 97: np.nan})

covid.icu.value_counts()

0.0    44955
1.0     6724
Name: icu, dtype: int64

In [14]:
# intubed
# Keep rows whose intubed code is 1, 2 or 97; any other code (e.g. 99) is dropped.
covid = covid.loc[covid.intubed.isin([1, 2, 97])].copy()

# Recode 2 -> 0 and 97 -> NaN in one pass and assign the result back.
# Calling replace(..., inplace=True) on `covid.intubed` is chained assignment:
# it is not guaranteed to update `covid` and has no effect under pandas
# Copy-on-Write, so the column is reassigned explicitly instead.
covid['intubed'] = covid.intubed.replace({2: 0, 97: np.nan})

covid.intubed.value_counts()

0.0    44679
1.0     7000
Name: intubed, dtype: int64

In [15]:
# (rows, columns) after the missing-data cleanup.
covid.shape

(342604, 19)

In [16]:
# fixing the binary format

# for column in covid[columns]:
#     if column != 'age':
#         covid[column] = covid[column].apply(lambda x: x if x == 1 else 0)
#         covid = covid.loc[covid[column]]
#         print(covid.column.unique())

# covid.shape

In [17]:
# Preview the frame after the 97 codes in intubed and icu were turned into NaN.
covid.head()

Unnamed: 0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,2,1,,2,27,2,2,2,2,2,2,2,2,2,2,2,2,1,
7,1,1,,2,56,2,2,2,2,2,1,2,2,2,1,1,1,1,
13,1,1,,2,34,2,2,2,2,2,2,2,2,2,2,2,1,1,
15,1,1,,2,34,2,2,2,2,2,2,2,2,2,2,1,2,1,
16,1,1,,2,49,2,1,2,2,2,2,2,2,2,2,2,1,1,


## Data Cleanup: Fixing the Binary Numbers

In [18]:
# Recode every 1/2 flag to 1/0: 1 stays 1, any other value becomes 0.
# intubed and icu are left out (their lines were commented out in the
# original): they hold NaN for the former 97 rows, and this rule would turn
# those NaNs into 0.
flag_columns = [
    'sex', 'patient_type', 'pneumonia', 'pregnancy', 'diabetes', 'copd',
    'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular',
    'obesity', 'renal_chronic', 'tobacco', 'contact_other_covid', 'covid_res',
]

# One vectorised comparison per column replaces sixteen copy-pasted
# .apply(lambda x: x if x == 1 else 0) calls. assign() returns a new frame with
# the columns overwritten in their original positions, so nothing is written
# into a slice of an earlier frame.
covid = covid.assign(
    **{flag: (covid[flag] == 1).astype('int64') for flag in flag_columns}
)

In [19]:
# Preview the frame after the flags were recoded to 1/0.
covid.head()

Unnamed: 0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,0,1,,0,27,0,0,0,0,0,0,0,0,0,0,0,0,1,
7,1,1,,0,56,0,0,0,0,0,1,0,0,0,1,1,1,1,
13,1,1,,0,34,0,0,0,0,0,0,0,0,0,0,0,1,1,
15,1,1,,0,34,0,0,0,0,0,0,0,0,0,0,1,0,1,
16,1,1,,0,49,0,1,0,0,0,0,0,0,0,0,0,1,1,


## Final Touches

In [20]:
# Count the remaining nulls per column.
covid.isnull().sum()

sex                         0
patient_type                0
intubed                290925
pneumonia                   0
age                         0
pregnancy                   0
diabetes                    0
copd                        0
asthma                      0
inmsupr                     0
hypertension                0
other_disease               0
cardiovascular              0
obesity                     0
renal_chronic               0
tobacco                     0
contact_other_covid         0
covid_res                   0
icu                    290925
dtype: int64

In [21]:
# Renumber the rows from 0 and discard the old index labels left over from filtering.
covid = covid.reset_index(drop=True)
covid.head()

Unnamed: 0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
0,0,1,,0,27,0,0,0,0,0,0,0,0,0,0,0,0,1,
1,1,1,,0,56,0,0,0,0,0,1,0,0,0,1,1,1,1,
2,1,1,,0,34,0,0,0,0,0,0,0,0,0,0,0,1,1,
3,1,1,,0,34,0,0,0,0,0,0,0,0,0,0,1,0,1,
4,1,1,,0,49,0,1,0,0,0,0,0,0,0,0,0,1,1,


In [22]:
# Write the cleaned frame to disk. to_csv also writes the index by default, so
# the file starts with an unnamed index column.
output_path = 'clean_covid.csv'
covid.to_csv(output_path)