# **Feature Engineering Notebook**

This notebook handles:


  1. Encode categorical variables

  2. Perform feature engineering on date columns




## Load Dataset

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [4]:
df = pd.read_csv('../data/Cleaned_Data.csv')
df.head()

Unnamed: 0,Age,Gender,Region,Preexisting_Condition,Date_of_Infection,COVID_Strain,Symptoms,Severity,Hospitalized,Hospital_Admission_Date,...,Date_of_Reinfection,Vaccination_Status,Vaccine_Type,Doses_Received,Date_of_Last_Dose,Long_COVID_Symptoms,Occupation,Smoking_Status,BMI,Recovery_Classification
0,69,Male,Hovedstaden,Obesity,2022-06-21,Delta,Mild,Moderate,Yes,2025-01-13,...,2022-12-15,Yes,,1,2022-09-22,,Healthcare,Never,27.7,Delayed Recovery
1,38,Male,Sjælland,Asthma,2024-02-02,XBB.1.5,Mild,Moderate,No,2024-02-03,...,2024-06-08,No,,0,2023-08-21,,Healthcare,Never,21.9,Typical Recovery
2,41,Female,Syddanmark,Hypertension,2023-05-28,Beta,Mild,High,Yes,2025-03-07,...,2023-12-19,Yes,Janssen,3,2024-05-14,,Unemployed,Never,22.7,Delayed Recovery
3,81,Female,Hovedstaden,Asthma,2023-08-13,Delta,Severe,High,No,2023-08-15,...,2024-08-24,Yes,AstraZeneca,1,2024-10-31,,Office Worker,Never,27.7,Delayed Recovery
4,50,Female,Syddanmark,Cardiovascular,2023-03-10,Delta,Mild,High,No,2023-03-12,...,2023-09-08,Yes,,2,2023-07-05,,Student,Never,11.9,Delayed Recovery


In [5]:
df.isnull().sum()

Age                           0
Gender                        0
Region                        0
Preexisting_Condition       434
Date_of_Infection             0
COVID_Strain                  0
Symptoms                      0
Severity                      0
Hospitalized                  0
Hospital_Admission_Date       0
Hospital_Discharge_Date       0
ICU_Admission                 0
Ventilator_Support            0
Recovered                     0
Date_of_Recovery              0
Reinfection                   0
Date_of_Reinfection           0
Vaccination_Status            0
Vaccine_Type               1708
Doses_Received                0
Date_of_Last_Dose             0
Long_COVID_Symptoms        2612
Occupation                    0
Smoking_Status                0
BMI                           0
Recovery_Classification       0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2832 entries, 0 to 2831
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      2832 non-null   int64  
 1   Gender                   2832 non-null   object 
 2   Region                   2832 non-null   object 
 3   Preexisting_Condition    2398 non-null   object 
 4   Date_of_Infection        2832 non-null   object 
 5   COVID_Strain             2832 non-null   object 
 6   Symptoms                 2832 non-null   object 
 7   Severity                 2832 non-null   object 
 8   Hospitalized             2832 non-null   object 
 9   Hospital_Admission_Date  2832 non-null   object 
 10  Hospital_Discharge_Date  2832 non-null   object 
 11  ICU_Admission            2832 non-null   object 
 12  Ventilator_Support       2832 non-null   object 
 13  Recovered                2832 non-null   object 
 14  Date_of_Recovery        

In [7]:
date_cols = [col for col in df.columns if "Date" in col]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2832 entries, 0 to 2831
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Age                      2832 non-null   int64         
 1   Gender                   2832 non-null   object        
 2   Region                   2832 non-null   object        
 3   Preexisting_Condition    2398 non-null   object        
 4   Date_of_Infection        2832 non-null   datetime64[ns]
 5   COVID_Strain             2832 non-null   object        
 6   Symptoms                 2832 non-null   object        
 7   Severity                 2832 non-null   object        
 8   Hospitalized             2832 non-null   object        
 9   Hospital_Admission_Date  2832 non-null   datetime64[ns]
 10  Hospital_Discharge_Date  2832 non-null   datetime64[ns]
 11  ICU_Admission            2832 non-null   object        
 12  Ventilator_Support       2832 non-

# Encoding Features

## Convert Binary Columns to Numeric Format

In [11]:
binary_cols = ['Hospitalized', 'ICU_Admission', 'Ventilator_Support', 'Recovered', 'Vaccination_Status']
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

## Encode Reinfection Column (Yes/No → 1/0)

In [13]:
df['Reinfection'] = df['Reinfection'].map({'Yes': 1, 'No': 0})

## Encode Remaining Categorical Features Using Label Encoding

In [15]:
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

## Infection & Recovery Durations


In [17]:
df["Recovery_Duration"] = (df["Date_of_Recovery"] - df["Date_of_Infection"]).dt.days
df["Time_to_Reinfection"] = (df["Date_of_Reinfection"] - df["Date_of_Recovery"]).dt.days
df["Reinfected_Later"] = df["Time_to_Reinfection"].apply(lambda x: 1 if x > 0 else 0)


## Vaccine and Infection Timing


In [19]:
df["Vaccine_to_Infection_Days"] = (df["Date_of_Infection"] - df["Date_of_Last_Dose"]).dt.days
df["Infected_soon_after_vaccine"] = df["Vaccine_to_Infection_Days"].apply(lambda x: 1 if 0 <= x <= 14 else 0)


## Hospital Stay Information

In [21]:
df["Hospital_Stay_Duration"] = (df["Hospital_Discharge_Date"] - df["Hospital_Admission_Date"]).dt.days
df["Hospital_Stay_Duration"] = df["Hospital_Stay_Duration"].apply(lambda x: x if x >= 0 else 0)


## Feature Engineering – Pre-Train/Test Split


In [23]:
df["Age_Preexisting"] = df["Age"] * df["Preexisting_Condition"]
df["Vaccine_Infection"] = df["Vaccination_Status"] * df["Vaccine_to_Infection_Days"]
df['Smoke_Preexist'] = df['Smoking_Status'] * df['Preexisting_Condition']


## Vaccine and Reinfection Relations

In [25]:
df['Vaccine_to_Reinfection'] = df['Time_to_Reinfection'] - df['Vaccine_to_Infection_Days']
df['Vaccinated_Before_Infection'] = (df['Vaccine_to_Infection_Days'] > 0).astype(int)


## Ratio-Based Features

In [27]:
df['Recovery_per_Stay'] = df['Recovery_Duration'] / (df['Hospital_Stay_Duration'] + 1)
df['Time_to_Reinfection_per_Recovery'] = df['Time_to_Reinfection'] / (df['Recovery_Duration'] + 1)


## Drop Date Columns

In [29]:
df.drop(columns=[
    "Date_of_Infection", "Date_of_Recovery", "Date_of_Reinfection",
    "Hospital_Admission_Date", "Hospital_Discharge_Date", "Date_of_Last_Dose"
], inplace=True)

In [30]:
df.shape

(2832, 33)

In [31]:
df.head()

Unnamed: 0,Age,Gender,Region,Preexisting_Condition,COVID_Strain,Symptoms,Severity,Hospitalized,ICU_Admission,Ventilator_Support,...,Vaccine_to_Infection_Days,Infected_soon_after_vaccine,Hospital_Stay_Duration,Age_Preexisting,Vaccine_Infection,Smoke_Preexist,Vaccine_to_Reinfection,Vaccinated_Before_Infection,Recovery_per_Stay,Time_to_Reinfection_per_Recovery
0,69,1,0,4,2,0,3,1,0,0,...,-93,0,13,276,-93,8,-32,0,21.571429,-0.412541
1,38,1,3,0,4,0,3,0,0,0,...,165,0,4,0,0,0,-61,1,4.6,4.333333
2,41,0,4,3,1,0,1,1,1,1,...,-352,0,50,123,-352,6,509,0,0.941176,3.204082
3,81,0,0,0,2,2,1,0,0,0,...,-445,0,9,0,-445,0,276,0,54.6,-0.308958
4,50,0,4,1,2,0,1,0,0,0,...,-117,0,3,50,-117,2,258,0,10.25,3.357143


In [32]:
df.to_csv("../data/Reinfection Engineered Dataset.csv", index=False)
