In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's default theme so every figure below shares one look.
sns.set_theme()

In [None]:
# Read the training features (X) and their labels (y) from the data folder.
X = pd.read_csv("../data/train_values.csv")
y = pd.read_csv("../data/train_labels.csv")

# Group the feature names by kind:
#   * binary_cols  - flag columns, recognised by their "has" name prefix
#   * cat_cols     - object-dtype columns
#   * numeric_cols - int64 columns that are not one of the binary flags
binary_cols = [name for name in X.columns if name.startswith("has")]
cat_cols = X.select_dtypes(include="object").columns
numeric_cols = [
    name
    for name in X.select_dtypes(include="int64").columns
    if name not in binary_cols
]

In [None]:
# One histogram per non-binary numeric feature, drawn on a single large figure.
X.hist(column=numeric_cols, figsize=(20, 20))

In [None]:
# One box plot per non-binary numeric feature, each on its own axes in a 4x4 grid.
X[numeric_cols].plot.box(subplots=True, layout=(4, 4), figsize=(20, 20))

In [None]:
from sklearn.preprocessing import (
    PowerTransformer,
    RobustScaler,
)

In [None]:
def compare_distribution(original, scaled, title="Effects of scaling"):
    """Show box plots of a feature before and after a transformation.

    Two panels are drawn side by side on one figure: the left panel holds
    ``original`` and is titled "Original Data", the right panel holds
    ``scaled`` and is titled "Normalized Data". The figure is displayed
    immediately and nothing is returned.

    Parameters
    ----------
    original
        Values passed to ``sns.boxplot`` for the left panel.
    scaled
        Values passed to ``sns.boxplot`` for the right panel.
    title : str
        Title placed above both panels.
    """
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(10, 5))

    panels = (
        (left_ax, original, "Original Data"),
        (right_ax, scaled, "Normalized Data"),
    )
    for axis, values, label in panels:
        sns.boxplot(values, ax=axis)
        axis.set_title(label)

    fig.suptitle(title)
    plt.tight_layout()
    plt.show()

In [None]:
# Feature under study; the cells that follow keep reading `col`.
col = "age"

# Fit a RobustScaler on the single-column frame, transform that same frame,
# and flatten the (n, 1) result to 1-D for plotting.
scaler = RobustScaler()
scaled = scaler.fit(X[[col]]).transform(X[[col]]).ravel()

compare_distribution(original=X[col], scaled=scaled, title=f"RobustScaler on {col}")

In [None]:
# Empirical CDF of the selected feature, to see where its long tail begins.
sns.ecdfplot(data=X[col])

In [None]:
# Upper-tail percentiles of the selected feature, used to pick a clipping threshold.
np.percentile(X[col], q=[90, 95, 98, 99, 99.5])

In [None]:
# Upper clipping bound: the 99th percentile of the selected feature.
clip_value = np.percentile(X[col], q=99)

In [None]:
# Cap the feature to the range [0, clip_value] and compare against the raw values.
scaled = X[col].clip(lower=0, upper=clip_value)
compare_distribution(original=X[col], scaled=scaled, title=f"Clipped {col}")

In [None]:
# Yeo-Johnson power transform of the selected feature; the (n, 1) output is
# flattened to 1-D before plotting.
scaler = PowerTransformer(method="yeo-johnson")
scaled = scaler.fit_transform(X[[col]]).reshape(-1)
compare_distribution(
    original=X[col],
    scaled=scaled,
    title=f"PowerTransformer on {col}",
)

In [None]:
# Alternative: turn age into an ordered categorical variable.
#
# Bin edges are one decade wide up to 100; anything older lands in a single
# top bin that reaches the observed maximum. That top edge is only appended
# when the maximum really exceeds 100 -- otherwise the edges would be
# duplicated or non-monotonic and `pd.cut` would raise a ValueError.
age_bins = list(range(0, 101, 10))
age_max = X["age"].max()
if age_max > age_bins[-1]:
    age_bins.append(age_max)

age_cat = pd.cut(
    X["age"],
    bins=age_bins,
    # `pd.cut` bins are right-closed, so without this flag the first bin is
    # (0, 10] and every row with age == 0 becomes NaN and silently drops out
    # of the counts plotted below. With it, the first bin also covers 0
    # (pandas labels it with a left edge slightly below 0).
    include_lowest=True,
    ordered=True,
)

# Number of rows per age bin, in bin order.
age_cat.value_counts().sort_index().plot(kind="bar")