# **Data Splitting Notebook**

This notebook handles:


 1. Splitting the features and the target column into training and test sets
 2. Handling the class imbalance in the target column
 3. Scaling the features

# Load Data

In [3]:
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the engineered reinfection dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../data/Reinfection Engineered Dataset.csv")
df.head(n=15)

Unnamed: 0,Age,Gender,Region,Preexisting_Condition,COVID_Strain,Symptoms,Severity,Hospitalized,ICU_Admission,Ventilator_Support,...,Vaccine_to_Infection_Days,Infected_soon_after_vaccine,Hospital_Stay_Duration,Age_Preexisting,Vaccine_Infection,Smoke_Preexist,Vaccine_to_Reinfection,Vaccinated_Before_Infection,Recovery_per_Stay,Time_to_Reinfection_per_Recovery
0,69,1,0,4,2,0,3,1,0,0,...,-93,0,13,276,-93,8,-32,0,21.571429,-0.412541
1,38,1,3,0,4,0,3,0,0,0,...,165,0,4,0,0,0,-61,1,4.6,4.333333
2,41,0,4,3,1,0,1,1,1,1,...,-352,0,50,123,-352,6,509,0,0.941176,3.204082
3,81,0,0,0,2,2,1,0,0,0,...,-445,0,9,0,-445,0,276,0,54.6,-0.308958
4,50,0,4,1,2,0,1,0,0,0,...,-117,0,3,50,-117,2,258,0,10.25,3.357143
5,66,1,3,1,3,1,3,0,0,0,...,-977,0,5,66,-977,2,590,0,103.333333,-0.623188
6,76,0,3,4,3,1,0,1,1,0,...,200,0,122,304,0,4,-147,1,0.349593,1.204545
7,77,0,3,2,4,1,2,0,0,0,...,-409,0,6,154,-409,2,331,0,34.142857,-0.325
8,79,0,2,3,4,0,2,0,0,0,...,63,0,4,237,0,3,114,1,3.0,11.0625
9,72,0,3,1,0,2,1,0,0,0,...,-588,0,6,72,-588,0,122,0,88.428571,-0.751613


# Split Data

In [6]:
# Separate the target from the features.
# The CSV carries an 'Unnamed: 0' column (a row index written out by an
# earlier to_csv call). It is only a row counter, so it must not be used as a
# feature: it would be interpolated by SMOTE and leak row order into the model.
# errors='ignore' applies to that artifact only, so a missing 'Reinfection'
# column still raises a KeyError.
y = df['Reinfection']
X = (
    df.drop(columns=['Reinfection'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [7]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test-set information is used in fitting.
# NOTE(review): X_train_scaled / X_test_scaled are not read by any later cell
# in this notebook (resampling and saving use X_train / X_test) - confirm
# whether the scaled versions were meant to be resampled and saved instead.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Oversample the minority class of the training split only; the test split is
# left untouched. sampling_strategy=0.5 targets a minority:majority ratio of 1:2.
# Resampling is done on the unscaled X_train.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
resampled_split = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_split

In [10]:
# Report the class balance of the training labels after oversampling.
class_counts = y_train_resampled.value_counts()
print("After SMOTE:", class_counts)

After SMOTE: Reinfection
0    2052
1    1026
Name: count, dtype: int64


# Save the split data

In [12]:
# Write the resampled training split and the untouched test split to CSV.
# to_csv does not create missing directories, so the output folder is created
# first; this keeps Restart & Run All working on a fresh checkout.
output_dir = Path("../data/splitted_data")
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the files, so reloading them does not
# produce an extra unnamed index column.
X_train_resampled.to_csv(output_dir / "X_train.csv", index=False)
y_train_resampled.to_csv(output_dir / "y_train.csv", index=False)
X_test.to_csv(output_dir / "X_test.csv", index=False)
y_test.to_csv(output_dir / "y_test.csv", index=False)