In [138]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Read the CSV file
df = pd.read_csv('dataset/Covid Data.csv')

# Print the first 10 rows of the DataFrame
df.head(10)

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97
5,2,1,1,2,9999-99-99,2,1,40,2,2,...,2,2,2,2,2,2,2,2,3,2
6,2,1,1,1,9999-99-99,97,2,64,2,2,...,2,2,2,2,2,2,2,2,3,97
7,2,1,1,1,9999-99-99,97,1,64,2,1,...,2,1,1,2,2,2,1,2,3,97
8,2,1,1,2,9999-99-99,2,2,37,2,1,...,2,2,1,2,2,1,2,2,3,2
9,2,1,1,2,9999-99-99,2,2,25,2,2,...,2,2,2,2,2,2,2,2,3,2


In [11]:
# Print the last 10 rows of the DataFrame
print(df.shape)

(1048575, 21)


In [12]:
# Print the data type of each column
print(df.columns)

Index(['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'DATE_DIED', 'INTUBED',
       'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
       'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
       'RENAL_CHRONIC', 'TOBACCO', 'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')


In [13]:
# Print the summary statistics of the DataFrame
df.describe()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,1.632194,8.980565,1.499259,1.190765,79.52288,3.346831,41.7941,49.76558,2.186404,2.260569,2.242626,2.298132,2.128989,2.435143,2.26181,2.125176,2.25718,2.214333,5.305653,79.55397
std,0.4822084,3.723278,0.4999997,0.3929041,36.86889,11.91288,16.90739,47.51073,5.424242,5.132258,5.114089,5.462843,5.236397,6.646676,5.19485,5.175445,5.135354,5.323097,1.881165,36.82307
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,4.0,1.0,1.0,97.0,2.0,30.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,97.0
50%,2.0,12.0,1.0,1.0,97.0,2.0,40.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,6.0,97.0
75%,2.0,12.0,2.0,1.0,97.0,2.0,53.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,97.0
max,2.0,13.0,2.0,2.0,99.0,99.0,121.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,7.0,99.0


# Handling Missing Values

In [139]:
# show the num of 97/99 in each column
missing_data = df[df.isin([97, 99])].sum()
print(missing_data)
# Since the missing data for INTUBED and ICU is too much, we just drop these two features.
df = df.drop(columns=['INTUBED', 'ICU'])

USMER                          0.0
MEDICAL_UNIT                   0.0
SEX                            0.0
PATIENT_TYPE                   0.0
DATE_DIED                        0
INTUBED                 83033943.0
PNEUMONIA                1584297.0
AGE                        21609.0
PREGNANT                50780567.0
DIABETES                       0.0
COPD                           0.0
ASTHMA                         0.0
INMSUPR                        0.0
HIPERTENSION                   0.0
OTHER_DISEASE                  0.0
CARDIOVASCULAR                 0.0
OBESITY                        0.0
RENAL_CHRONIC                  0.0
TOBACCO                        0.0
CLASIFFICATION_FINAL           0.0
ICU                     83050080.0
dtype: object


# Data Preparation for Make classification based on label 'DIED' (Predicting the patient will be dead) 

In [140]:
# Replacing missing value with nan
df.loc[:, df.columns != 'AGE'] = df.loc[:, df.columns != 'AGE'].replace([97, 98, 99], np.nan)

# For pregnant column, replace NaN with 2 if sex == 2 (male)

#Male cannot pregnant for sure, so we will fill in the missing data for male with 2 (no)
df.loc[(df['SEX'] == 2) & (df['PREGNANT'].isna()), 'PREGNANT'] = 2

# For age column, replace NaN with median age
df.loc[df['AGE'].isna(), 'AGE'] = df['AGE'].median()

# For date_died column, change to 1 if date_died == 9999-99-99, 0 otherwise
# df.loc[df['DATE_DIED'] == '9999-99-99', 'DATE_DIED'] = 0
# df.loc[df['DATE_DIED'] != 0, 'DATE_DIED'] = 1

df['DIED'] = np.where(df['DATE_DIED'] == '9999-99-99', 0, 1)
df.drop(columns='DATE_DIED', inplace=True)

# Impute missing values with mode
df = df.fillna(df.mode().iloc[0])

# For USMER SEX PATIENT_TYPE DATE_DIED INTUBED PNEUMONIA PREGNANT DIABETES COPD ASTHMA INMSUPR HIPERTENSION OTHER_DISEASE OBESITY RENAL_CHRONIC TOBACCO ICU columns, replace 2 with 0
columns_to_replace = ['USMER', 'SEX', 'PATIENT_TYPE', 'DIED', 'PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION', 'OTHER_DISEASE', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO', 'CARDIOVASCULAR']
df[columns_to_replace] = df[columns_to_replace].replace(2, 0)
# Normalization
scaler = StandardScaler()

numerical_cols = ['AGE']

df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df.head(10)

encoder = OneHotEncoder(sparse_output=False)

categorical_cols = ["MEDICAL_UNIT","CLASIFFICATION_FINAL"]
df_categorical = encoder.fit_transform(df[categorical_cols])
categorical_names = encoder.get_feature_names_out(categorical_cols)
df_categorical = pd.DataFrame(df_categorical, columns=categorical_names)



df = df.drop(categorical_cols, axis=1)
df = df.reset_index(drop=True)
df = pd.concat([df, df_categorical], axis=1)

df_output = df['DIED']
df_output.to_csv('dataset/covid_y_died.csv', index=False)

df = df.drop(columns=["DIED"])
df.to_csv('dataset/covid_x_died.csv', index=False)
df.head()

Unnamed: 0,USMER,SEX,PATIENT_TYPE,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,...,MEDICAL_UNIT_11,MEDICAL_UNIT_12,MEDICAL_UNIT_13,CLASIFFICATION_FINAL_1,CLASIFFICATION_FINAL_2,CLASIFFICATION_FINAL_3,CLASIFFICATION_FINAL_4,CLASIFFICATION_FINAL_5,CLASIFFICATION_FINAL_6,CLASIFFICATION_FINAL_7
0,0,1,1,1.0,1.372531,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0,0,1,1.0,1.786551,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,0,0,0.0,0.781073,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0,1,1,0.0,0.662781,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,0,1,0.0,1.549968,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Data Preparation for Make classification based on label classification_final (Predicting the severity of the COVID-19 patient) 

In [131]:

df.loc[:, df.columns != 'AGE'] = df.loc[:, df.columns != 'AGE'].replace([97, 98, 99], np.nan)

# For pregnant column, replace NaN with 2 if sex == 2 (male)
df.loc[(df['SEX'] == 2) & (df['PREGNANT'].isna()), 'PREGNANT'] = 2

# For age column, replace NaN with median age
df.loc[df['AGE'].isna(), 'AGE'] = df['AGE'].median()

# For date_died column, change to 1 if date_died == 9999-99-99, 0 otherwise
df.loc[df['DATE_DIED'] == '9999-99-99', 'DATE_DIED'] = 0
df.loc[df['DATE_DIED'] != 0, 'DATE_DIED'] = 1


# Impute missing values with mode
df = df.fillna(df.mode().iloc[0])

# For USMER SEX PATIENT_TYPE DATE_DIED INTUBED PNEUMONIA PREGNANT DIABETES COPD ASTHMA INMSUPR HIPERTENSION OTHER_DISEASE OBESITY RENAL_CHRONIC TOBACCO ICU columns, replace 2 with 0
columns_to_replace = ['USMER', 'SEX', 'PATIENT_TYPE', 'DATE_DIED', 'PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION', 'OTHER_DISEASE', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO', 'CARDIOVASCULAR']
df[columns_to_replace] = df[columns_to_replace].replace(2, 0)

# Normalization
scaler = StandardScaler()

numerical_cols = ['AGE']

df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df.head(10)


encoder = OneHotEncoder(sparse_output=False)

categorical_cols = ["MEDICAL_UNIT"]
df_categorical = encoder.fit_transform(df[categorical_cols])
categorical_names = encoder.get_feature_names_out(categorical_cols)
df_categorical = pd.DataFrame(df_categorical, columns=categorical_names)



df = df.drop(categorical_cols, axis=1)
df = df.reset_index(drop=True)
df = pd.concat([df, df_categorical], axis=1)


df_output = df['CLASIFFICATION_FINAL']
# i want to change all the value of CLASIFFICATION_FINAL that are higher than 3 to be 4
df_output = df_output.replace([4, 5, 6, 7], 4)
df_output.to_csv('dataset/covid_y_classification.csv', index=False)
df = df.drop(columns=["CLASIFFICATION_FINAL"])
df.to_csv('dataset/covid_x_csv_classification.csv', index=False)

df.head(10)


Unnamed: 0,USMER,SEX,PATIENT_TYPE,DATE_DIED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,...,MEDICAL_UNIT_4,MEDICAL_UNIT_5,MEDICAL_UNIT_6,MEDICAL_UNIT_7,MEDICAL_UNIT_8,MEDICAL_UNIT_9,MEDICAL_UNIT_10,MEDICAL_UNIT_11,MEDICAL_UNIT_12,MEDICAL_UNIT_13
0,0,1,1,1,1.0,1.372531,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,1,1.0,1.786551,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,1,0.0,0.781073,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,1,1,0.0,0.662781,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,1,1,0.0,1.549968,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,1,0,0,1.0,-0.106114,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,1,1,0,0.0,1.313385,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,1,1,0,1.0,1.313385,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,1,0,0,0.0,-0.283551,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,1,0,0,0.0,-0.9933,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
