In [None]:
# Imports libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import sys

sys.path.append("../../")

# Import user-defined packages
from scripts.outlier_detector import OutlierDetector as old

In [None]:
# Step 1: Load data
# Read the raw training CSV (path is relative to this notebook's directory).
raw_csv_path = "../../data/raw/data_train.csv"
df = pd.read_csv(raw_csv_path)

# Display the loaded frame as the cell output.
df

In [None]:
# Step 2: Inspect data
# Print the full, non-truncated per-column summary of the frame.
# Note (author's observation from a previous run): all features showed 202944
# non-null counts and the dataset had 202944 rows, so it is likely that there
# are no null values present.
df.info(verbose=True)

In [None]:
# Step 3: Remove the unused identifier column and duplicate rows.
# The result is reassigned instead of mutated in place so that this cell can be
# re-run safely: errors="ignore" turns the drop into a no-op once "Id" has
# already been removed (the in-place version raised KeyError on a second run).
# The index is not reset, so row labels keep gaps where duplicates were removed.
df = (
    df
    # Step 3.1: Remove unused column(s)
    .drop(columns=["Id"], errors="ignore")
    # Step 3.2: Remove duplicate row(s)
    .drop_duplicates()
)

In [None]:
# Step 4: Handle invalid data (non-numerical)

# Convert every feature to a numeric dtype, column by column; values that
# cannot be parsed become NaN because of errors="coerce".
df = df.apply(lambda feature: pd.to_numeric(feature, errors="coerce"))

# Summarize invalid data per feature: number of NaN entries in each column.
invalid_per_feature = df.isna().sum()
invalid_per_feature

In [None]:
# # Step 5: Feature scaling

# # Min-max scaling is applied to maintain uniformity within the data range.
# # Formula: X = (X - min) / (max - min)
# min_max_scaler = MinMaxScaler()

# columns_to_scale = ["BMI", "ExtraMedTest", "ExtraAlcoholTest", "MentHlth", "PhysHlth"]
# df[columns_to_scale] = min_max_scaler.fit_transform(df[columns_to_scale])

# # Inspect data after scaling
# df.describe().round(3)

In [None]:
# Step 6: Handle Outliers
# Draw one box plot per feature so that outliers can be inspected visually.
# This cell only visualises the data; it does not remove or modify any rows.

# Grid initialization: use ceiling division so that a trailing, partially
# filled row still gets axes. Floor division silently left the last
# len(df.columns) % 4 features unplotted and produced a 0-row grid for
# frames with fewer than 4 columns.
cols = 4
rows = max(1, -(-len(df.columns) // cols))

# squeeze=False guarantees a 2-D array of axes regardless of the grid shape.
fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 3), squeeze=False)
axes_flat = axes.flatten()

for column, ax in zip(df.columns, axes_flat):
    sns.boxplot(data=df[column], ax=ax)
    # The chart is a box plot; the previous title wrongly said "KDE Plot".
    ax.set_title(f"Box Plot of {column}")
    ax.set_xlabel(column)

# Hide the axes left over in the last row when the grid is not fully used.
for ax in axes_flat[len(df.columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Step 7: Handle categorical data


def _one_hot(series, labels):
    """One-hot encode `series` as 0/1 integers and relabel the columns.

    Labels are assigned by position to the dummy columns in the order
    `pd.get_dummies` produces them, so `labels` must contain exactly one
    entry per distinct value in `series`; otherwise the column assignment
    raises ValueError.
    """
    dummies = pd.get_dummies(series).astype(int)
    dummies.columns = labels
    return dummies


# Create the one-hot encodings with their final column names.
age_dummies = _one_hot(
    df["Age"],
    [
        "Age_18_24",
        "Age_25_29",
        "Age_30_34",
        "Age_35_39",
        "Age_40_44",
        "Age_45_49",
        "Age_50_54",
        "Age_55_59",
        "Age_60_64",
        "Age_65_69",
        "Age_70_74",
        "Age_75_79",
        "Age_80_or_older",
    ],
)

gen_health_dummies = _one_hot(
    df["GenHlth"],
    ["Genhlth_1", "Genhlth_2", "Genhlth_3", "Genhlth_4", "Genhlth_5"],
)

education_dummies = _one_hot(
    df["Education"],
    [
        "Educ_Never",
        "Educ_G1_8",
        "Educ_G9_11",
        "Educ_G12_GED",
        "Educ_Col_1_3",
        "Educ_Col_4_more",
    ],
)

income_dummies = _one_hot(
    df["Income"],
    ["Inc_1", "Inc_2", "Inc_3", "Inc_4", "Inc_5", "Inc_6", "Inc_7", "Inc_8"],
)

# Replace the original categorical columns with their encoded counterparts:
# drop the four source columns, then append the dummy frames column-wise.
df = pd.concat(
    [
        df.drop(columns=["Education", "Income", "Age", "GenHlth"]),
        age_dummies,
        gen_health_dummies,
        education_dummies,
        income_dummies,
    ],
    axis=1,
)

In [None]:
# Save the processed DataFrame to CSV.
# NOTE(review): `index` defaults to True, so the row index is written as an
# unnamed first column; consider index=False if readers of this file do not
# expect it — confirm against the consumers before changing.
processed_csv_path = "../../data/processed/data_train_processed.csv"
df.to_csv(processed_csv_path)