In [None]:
import pandas as pd
from src.utils import get_kfold_data, print_statistics, plot_distribution
from src.constants import *

Load data

In [None]:
# Read the dataset CSV into a DataFrame. DATA_PATH is presumably supplied by the
# star import of src.constants -- verify there.
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Show the column labels of the loaded data.
data.columns

In [None]:
# Collect the column names: every column, plus the numeric feature columns
# (numeric dtypes only, with the target column excluded).
all_columns = data.columns.tolist()
print(all_columns)

# Fail early, with a clear message, if the target is absent from the dataset.
if "outcome" not in all_columns:
    raise ValueError("Target column 'outcome' not found in the loaded data")

# Filter instead of calling list.remove(): remove() raises ValueError when the
# target is not among the numeric columns (e.g. it is stored as strings or
# booleans), which would crash this cell for a perfectly valid dataset.
numeric_columns = [
    column
    for column in data.select_dtypes(include=["number"]).columns
    if column != "outcome"
]
print(numeric_columns)

Checking for missing values

In [None]:
# Report the number of missing entries in every column.
# pandas' isnull() is an alias of isna(): both flag NaN and None alike, so
# separate "NaN" and "None" counts would always be the same number. A single
# missing-value count is reported instead.
for column in all_columns:
    num_missing = data[column].isna().sum()
    print(f"{column} | Number of missing values (NaN/None): {num_missing}")

Data statistics

In [None]:
# for column in numeric_columns:
#     print_statistics(data[column], column)

Data visualisation

In [None]:
# for column in numeric_columns:
#     plot_distribution(data[column], column, f"Distribution of {column}")

In [None]:
# Plot the distribution of the target column of the loaded data.
plot_distribution(
    data["outcome"],
    "outcome",
    "Distribution of outcome (target variable)",
)

Data splitting (K-Fold Cross Validation)

In [None]:
# Build the K-fold splits of the dataset. The result is indexed by fold number
# and then by split name (the "train" split is used further down).
kfold_data = get_kfold_data(
    data=data,
    k=NUM_FOLDS,
    reproducibility_seed=REPRODUCIBILITY_SEED,
)

Data normalisation (standardisation using the training-set mean and standard deviation)

In [None]:
# Standardise the numeric columns of each fold's training split, printing the
# statistics and plotting the distribution of every column before and after.
for fold in range(NUM_FOLDS):
    training_data = kfold_data[fold]["train"]

    for column in numeric_columns:
        # Take the statistics from the training split itself, before the
        # column is overwritten with its standardised values below.
        raw_values = training_data[column]
        mean, std = raw_values.mean(), raw_values.std()

        print("Before standardization:")
        plot_distribution(raw_values, column, f"Distribution of column '{column}'")
        print_statistics(raw_values, column)

        print("After standardization:")
        # The 1e-8 term avoids a division by zero when std is 0.
        scale = std + 1e-8
        training_data[column] = (raw_values - mean) / scale
        plot_distribution(training_data[column], column, f"Standardized distribution of column '{column}'")
        print_statistics(training_data[column], column)

    break  # Temporary break to only show the first fold