# COVID-19 Mortality Risk Prediction Using Machine Learning

## Environment Setup

In [None]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score

## Data Loading

In [None]:
# Read the raw CSV into a DataFrame and preview its first rows.
csv_path = '../data/Covid19Data.csv'
df = pd.read_csv(csv_path)
df.head(5)

## Initial Data Inspection

In [None]:
# Print a concise summary of the DataFrame (column dtypes and non-null counts).
df.info()

In [None]:
# Dimensions of the loaded table as (rows, columns).
df.shape

In [None]:
# Column labels of the loaded table.
df.columns

In [None]:
# Count missing (NaN/None) entries in each column.
df.isna().sum()

In [None]:
# Number of distinct values per column (missing values are not counted by default).
df.nunique()

In [None]:
# Descriptive summary statistics using the pandas defaults.
df.describe()

In [None]:
# Frequency of each code recorded in the PNEUMONIA column.
df['PNEUMONIA'].value_counts()

In [None]:
# Frequency of each code recorded in the TOBACCO column.
df['TOBACCO'].value_counts()

## Data Cleaning and Preprocessing

### Filtering Uncertain Categories

In [None]:
# Condition columns that must hold a definite 1/2 answer for a row to be kept.
cols = ['PNEUMONIA', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION',
        'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO']

# True only for rows where every listed column is coded 1 or 2; any other
# code in any of these columns drops the whole row.
mask = df[cols].isin([1, 2]).all(axis=1)

# .copy() makes the filtered frame independent of the original, so the column
# assignments made later in this notebook (e.g. df['PREGNANT'] = ...) do not
# write into a slice and do not trigger pandas' SettingWithCopyWarning.
df = df[mask].copy()
df.shape

### Pregnancy Variable Preprocessing

In [None]:
# Frequency of each code recorded in the PREGNANT column after row filtering.
df['PREGNANT'].value_counts()

In [None]:
# PREGNANT code frequencies restricted to rows where SEX equals 2.
df.loc[df['SEX'] == 2, 'PREGNANT'].value_counts()

In [None]:
# PREGNANT code frequencies restricted to rows where SEX equals 1.
df.loc[df['SEX'] == 1, 'PREGNANT'].value_counts()

In [None]:
# Fold the codes 97 and 98 into 2 so PREGNANT only holds
# 1 (pregnant) or 2 (not pregnant).
pregnant_recode = {97: 2, 98: 2}
df['PREGNANT'] = df['PREGNANT'].replace(pregnant_recode)
df['PREGNANT'].value_counts()

### Binary Feature Identification

In [None]:
# Collect every column whose non-missing values are all drawn from {1, 2}.
binary_cols_auto = []
for column_name in df.columns:
    observed_values = set(df[column_name].dropna().unique())
    if observed_values <= {1, 2}:
        binary_cols_auto.append(column_name)
binary_cols_auto

In [None]:
# Hand-picked 1/2-coded columns that will be re-encoded to 0/1.
binary_cols = [
    'PNEUMONIA',
    'PREGNANT',
    'DIABETES',
    'COPD',
    'ASTHMA',
    'INMSUPR',
    'HIPERTENSION',
    'OTHER_DISEASE',
    'CARDIOVASCULAR',
    'OBESITY',
    'RENAL_CHRONIC',
    'TOBACCO',
]

# Side-by-side value counts, one column of the result per feature.
binary_frame = df[binary_cols]
binary_frame.apply(pd.Series.value_counts)

### Convert Binary Variables to 0/1 Encoding

In [None]:
# Re-encode the selected columns from 1/2 to 1/0: every 2 becomes 0, 1 is untouched.
df[binary_cols] = df[binary_cols].replace({2: 0})

# Check the resulting value counts per column.
recoded_frame = df[binary_cols]
recoded_frame.apply(pd.Series.value_counts)

## Target Variable Construction

In [None]:
# Any DATE_DIED value other than the placeholder '9999-99-99' marks a death.
# IS_DEAD: 1 = Deceased, 0 = Alive.
died_flag = df['DATE_DIED'].ne('9999-99-99')
df['IS_DEAD'] = died_flag.astype(int)

# The raw date column is no longer needed once the target exists.
df = df.drop('DATE_DIED', axis='columns')
df['IS_DEAD'].value_counts()

## Exploratory Data Analysis (EDA)

### Remove Potential Data Leakage Features

In [None]:
# Remove the INTUBED and ICU columns, which this notebook treats as potential leakage.
leakage_columns = ['INTUBED', 'ICU']
df = df.drop(leakage_columns, axis='columns')
df.columns

In [None]:
# Distinct-value count per remaining column (missing values are not counted by default).
df.nunique()

### Correlation Analysis

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt='.1f', cmap='coolwarm', linewidths=.5)
plt.title('Correlation Heatmap', fontsize=16)

# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

### Age Distribution

In [None]:
# Histogram of AGE (30 bins) with a kernel density estimate overlaid.
plt.figure(figsize=(8, 6))
sns.histplot(df['AGE'], bins=30, kde=True, color='skyblue')
plt.title('Age Distribution', fontsize=16)
plt.xlabel('Age')
plt.ylabel('Number of Patients')

# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
os.makedirs('../figures', exist_ok=True)
plt.savefig("../figures/age_distribution.png", dpi=300, bbox_inches="tight")
plt.show()

### Age Distribution by Mortality Status and Gender

In [None]:
# Box plot of AGE for each IS_DEAD value, split by SEX.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='IS_DEAD', y='AGE', hue='SEX', palette='Set2', width=0.45, ax=ax)
ax.set_title('Age Distribution by Mortality Status and Gender', fontsize=16)
ax.set_xlabel('Mortality Status (0: Alive, 1: Deceased)')
ax.set_ylabel('Age')
ax.legend(title='Gender (1: Female, 2: Male)')
plt.show()

### Age Distribution by Pneumonia Status

In [None]:
# Box plot of AGE for each PNEUMONIA value.
plt.figure(figsize=(8, 6))
pneumonia_ax = sns.boxplot(data=df, x='PNEUMONIA', y='AGE', color='lightcoral', width=0.45)
pneumonia_ax.set_title('Age Distribution by Pneumonia Status', fontsize=16)
pneumonia_ax.set_xlabel('Pneumonia (0: No, 1: Yes)')
pneumonia_ax.set_ylabel('Age')
plt.show()

### Mortality Distribution by Patient Type

In [None]:
# Row counts per PATIENT_TYPE, with one bar per IS_DEAD value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='PATIENT_TYPE', hue='IS_DEAD', palette='Set2', ax=ax)
ax.set_title('Mortality Distribution by Patient Type', fontsize=16)
ax.set_xlabel('Patient Type (1: Outpatient, 2: Inpatient)')
ax.set_ylabel('Number of Patients')
ax.legend(title='Mortality Status (0: Alive, 1: Deceased)')
plt.show()

### Mortality Rates by Cardiovascular Disease Status

In [None]:
# Mean of IS_DEAD within each CARDIOVASCULAR group, expressed as a percentage.
cv_death_rate = df.groupby('CARDIOVASCULAR')['IS_DEAD'].mean().mul(100).reset_index()

plt.figure(figsize=(8, 6))
cv_ax = sns.barplot(data=cv_death_rate, x='CARDIOVASCULAR', y='IS_DEAD', color='lightcoral')
cv_ax.set_title('Mortality Rates by Cardiovascular Disease Status', fontsize=16)
cv_ax.set_xlabel('Cardiovascular Disease (0: No, 1: Yes)')
cv_ax.set_ylabel('Mortality Rate (%)')
plt.show()

### Mortality Rates by Diabetes Status

In [None]:
# Mean of IS_DEAD within each DIABETES group, expressed as a percentage.
diabetes_death_rate = df.groupby('DIABETES')['IS_DEAD'].mean().mul(100).reset_index()

plt.figure(figsize=(8, 6))
diabetes_ax = sns.barplot(data=diabetes_death_rate, x='DIABETES', y='IS_DEAD', color='lightcoral')
diabetes_ax.set_title('Mortality Rates by Diabetes Status', fontsize=16)
diabetes_ax.set_xlabel('Diabetes (0: No, 1: Yes)')
diabetes_ax.set_ylabel('Mortality Rate (%)')
plt.show()

### Mortality Rates by Hypertension Status

In [None]:
# Mean of IS_DEAD within each HIPERTENSION group, expressed as a percentage.
hypertension_death_rate = df.groupby('HIPERTENSION')['IS_DEAD'].mean().mul(100).reset_index()

plt.figure(figsize=(8, 6))
hypertension_ax = sns.barplot(data=hypertension_death_rate, x='HIPERTENSION', y='IS_DEAD', color='lightcoral')
hypertension_ax.set_title('Mortality Rates by Hypertension Status', fontsize=16)
hypertension_ax.set_xlabel('Hypertension (0: No, 1: Yes)')
hypertension_ax.set_ylabel('Mortality Rate (%)')
plt.show()

### Mortality Rates by Immunosuppression Status

In [None]:
# Mean of IS_DEAD within each INMSUPR group, expressed as a percentage.
inmsupr_death_rate = df.groupby('INMSUPR')['IS_DEAD'].mean().mul(100).reset_index()

plt.figure(figsize=(8, 6))
inmsupr_ax = sns.barplot(data=inmsupr_death_rate, x='INMSUPR', y='IS_DEAD', color='lightcoral')
inmsupr_ax.set_title('Mortality Rates by Immunosuppression Status', fontsize=16)
inmsupr_ax.set_xlabel('Immunosuppression (0: No, 1: Yes)')
inmsupr_ax.set_ylabel('Mortality Rate (%)')
plt.show()

## Data Preparation for Modeling

### Class Imbalance Analysis

In [None]:
# Share of each IS_DEAD class, as a percentage of all rows.
df['IS_DEAD'].value_counts(normalize=True).mul(100)

### Feature–Target Correlation

In [None]:
# Correlation of every numeric feature with IS_DEAD (the target's own entry is removed).
corr_matrix = df.corr(numeric_only=True)
death_corr = corr_matrix['IS_DEAD'].drop(labels='IS_DEAD')

# Show only the features whose absolute correlation exceeds 0.4.
death_corr.loc[death_corr.abs().gt(0.4)]

### Feature–Target Split

In [None]:
# Separate the target column from the remaining columns, which become the features.
y = df['IS_DEAD']
X = df.drop('IS_DEAD', axis='columns')

print("Feature Matrix Shape:", X.shape)
print("Target Variable Shape:", y.shape)

### Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the size of each partition.
for shape_label, partition in (
    ("X_train Shape:", X_train),
    ("X_test Shape:", X_test),
    ("y_train Shape:", y_train),
    ("y_test Shape:", y_test),
):
    print(shape_label, partition.shape)

## Pipeline-Based Baseline Modeling (Logistic Regression)

In [None]:
# Standardize AGE only; every other column is passed through unchanged.
age_scaling = ('age_scaler', StandardScaler(), ['AGE'])
preprocessor = ColumnTransformer(transformers=[age_scaling], remainder='passthrough')

# Logistic regression with balanced class weights as the baseline classifier.
classifier = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('classifier', classifier),
    ]
)

# Fit on the training partition, then score the held-out partition.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

### Model Evaluation (Confusion Matrix & ROC-AUC)

In [None]:
# Confusion matrix of held-out labels (rows) against predictions (columns).
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted')
plt.ylabel('Actual')

# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# predict_proba returns one column per class; column index 1 is taken as the
# score for the positive class (IS_DEAD == 1 for this 0/1 target).
class_probabilities = pipeline.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

# Area under the ROC curve on the held-out partition.
roc_auc = roc_auc_score(y_test, y_prob)

print("ROC-AUC Score:", roc_auc)