In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported packages are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pathlib
import os

import polars as pl
import sklearn

import kego.plotting

In [None]:
# Competition data folder (relative to this notebook) and the CSV files inside it.
PATH_COMPETITION = pathlib.Path("../../data/playground/playground-series-s5e9")
PATH_TRAIN = PATH_COMPETITION.joinpath("train.csv")
PATH_TEST = PATH_COMPETITION.joinpath("test.csv")
# List the folder contents as the cell output (raises if the folder is missing).
os.listdir(PATH_COMPETITION)

In [None]:
# Read the training CSV into a Polars DataFrame; the bare `train` on the last
# line makes the frame the cell's displayed output.
train = pl.read_csv(PATH_TRAIN)
train

In [None]:
# Plot the "BeatsPerMinute" column of `train` with the project-local kego helper
# (presumably a histogram, going by the helper's name - confirm in kego.plotting).
kego.plotting.plot_histogram("BeatsPerMinute", df=train)

In [None]:
# Pairwise correlations between all columns except the row identifier "id".
correlation_matrix = train.select(pl.exclude("id")).corr()
correlation_matrix

In [None]:
# Heat map of the correlation matrix, drawn through the Axes object.
import matplotlib.pyplot as plt
import numpy as np

column_names = correlation_matrix.columns
tick_positions = range(len(column_names))

fig, ax = plt.subplots(figsize=(10, 8))
# The colour scale is clipped to +/-0.1, so any stronger correlation
# (including the diagonal) is drawn at the saturated end of the colour map.
cax = ax.matshow(correlation_matrix.to_numpy(), cmap="bwr", vmin=-0.1, vmax=0.1)
ax.set_xticks(tick_positions)
ax.set_xticklabels(column_names, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(column_names)
fig.colorbar(cax)
ax.set_title("Feature Correlation Matrix")
plt.show()

In [None]:
# Column to predict, and the model inputs: every column that is neither the
# row identifier nor the target.
TARGET = "BeatsPerMinute"
FEATURES = [name for name in train.columns if name != "id" and name != TARGET]
FEATURES

In [None]:
# Reproducible 70/15/15 split of `train` into train / validation / test frames.
#
# Each row gets a unique random rank in [0, n) by shuffling the row positions
# 0..n-1. Shuffling positions rather than the values of the "id" column keeps
# the split fractions correct even when ids are not a contiguous 0-based range.
n = train.height
n_train_end = int(0.7 * n)
n_validate_end = int(0.85 * n)

train_with_rand = train.with_columns(
    pl.int_range(0, n, eager=True).shuffle(seed=42).alias("_rand")
)
train_split = train_with_rand.filter(pl.col("_rand") < n_train_end).drop("_rand")
validate_split = train_with_rand.filter(
    (pl.col("_rand") >= n_train_end) & (pl.col("_rand") < n_validate_end)
).drop("_rand")
test_split = train_with_rand.filter(pl.col("_rand") >= n_validate_end).drop("_rand")

# The three parts must partition the data: no row lost, none counted twice.
assert train_split.height + validate_split.height + test_split.height == n
train_split.shape, validate_split.shape, test_split.shape

In [None]:
def validate_model(model, validate_split, features=FEATURES, target=TARGET):
    """Score a fitted regressor on a data frame.

    Args:
        model: Fitted estimator exposing ``predict``.
        validate_split: Frame holding the ``features`` columns and the
            ``target`` column; indexed by column name and converted with
            ``to_numpy()``.
        features: Names of the input columns (defaults to ``FEATURES`` as it
            was when this function was defined).
        target: Name of the column with the true values.

    Returns:
        Tuple ``(mse, r2)``: mean squared error and R^2 of the predictions.
    """
    from sklearn.metrics import mean_squared_error, r2_score

    y_true = validate_split[target].to_numpy()
    predictions = model.predict(validate_split[features].to_numpy())
    return mean_squared_error(y_true, predictions), r2_score(y_true, predictions)

In [None]:
# Baseline model: scikit-learn's GradientBoostingRegressor with default
# hyperparameters (note: not the histogram-based HistGradientBoostingRegressor).
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()
# Fitting and training-set scoring are currently disabled; `model` is left unfitted.
# model.fit(train_split.select(FEATURES).to_numpy(), train_split[TARGET].to_numpy())
# sklearn.metrics.mean_squared_error(model.predict(train_split[FEATURES].to_numpy()), train_split[TARGET])

In [None]:
# Successive-halving grid search over GradientBoostingRegressor hyperparameters.
# The experimental import has to run first: it is what makes
# HalvingGridSearchCV importable from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)
base_model = GradientBoostingRegressor()
halving_cv = HalvingGridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    factor=2,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
# The search itself is disabled below; only the search object is constructed.
# halving_cv.fit(
#     train_split.select(FEATURES).to_numpy(), train_split[TARGET].to_numpy()
# )
# print("Best parameters:", halving_cv.best_params_)
# print("Best score:", halving_cv.best_score_)
# validate_model(halving_cv.best_estimator_, validate_split)
# validate_model(halving_cv.best_estimator_, train_split)

In [None]:
# pl.DataFrame(halving_cv.cv_results_).select(
#     [
#         "param_n_estimators",
#         "param_max_depth",
#         "param_learning_rate",
#         "mean_test_score",
#         "rank_test_score",
#     ]
# ).sort("mean_test_score")

In [None]:
# # Try RandomForestRegressor for comparison
# from sklearn.ensemble import RandomForestRegressor
# rf_model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=42, n_jobs=-1)
# rf_model.fit(train_split.select(FEATURES).to_numpy(), train_split[TARGET].to_numpy())
# rf_mse, rf_r2 = validate_model(rf_model, validate_split)
# print(f"RandomForestRegressor MSE: {rf_mse:.2f}, R2: {rf_r2:.3f}")

In [None]:
# # Feature scaling with StandardScaler
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(train_split.select(FEATURES).to_numpy())
# X_validate_scaled = scaler.transform(validate_split.select(FEATURES).to_numpy())
# y_train = train_split[TARGET].to_numpy()
# y_validate = validate_split[TARGET].to_numpy()
# model_scaled = GradientBoostingRegressor()
# model_scaled.fit(X_train_scaled, y_train)
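# # NOTE: validate_model() would pass *unscaled* features to a model fit on scaled ones,
# # so score the already-scaled validation matrix directly instead.
# y_pred_scaled = model_scaled.predict(X_validate_scaled)
# mse_scaled = sklearn.metrics.mean_squared_error(y_validate, y_pred_scaled)
# r2_scaled = sklearn.metrics.r2_score(y_validate, y_pred_scaled)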
# print(f"Scaled GradientBoostingRegressor MSE: {mse_scaled:.2f}, R2: {r2_scaled:.3f}")

In [None]:
# # Train and evaluate XGBoost regressor
# import xgboost as xgb
# xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.1, random_state=42, n_jobs=-1)
# xgb_model.fit(train_split.select(FEATURES).to_numpy(), train_split[TARGET].to_numpy())
# xgb_mse, xgb_r2 = validate_model(xgb_model, validate_split)
# print(f"XGBoostRegressor MSE: {xgb_mse:.2f}, R2: {xgb_r2:.3f}")

In [None]:
# Visualize the target distribution and pairwise relationships on a row sample.
import seaborn as sns
import matplotlib.pyplot as plt

# At most 1000 rows keep the pair plot fast; the fixed seed makes the sampled
# rows (and therefore both figures) reproducible across runs.
df_sample = train.sample(n=min(1000, train.height), seed=42)

plt.figure(figsize=(12, 6))
sns.histplot(df_sample[TARGET], bins=30, kde=True)
plt.title(f"Distribution of {TARGET}")
plt.show()

# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (one would only produce an empty extra figure).
sns.pairplot(df_sample.to_pandas(), vars=FEATURES + [TARGET])
plt.suptitle("Pairplot of Features and Target", y=1.02)
plt.show()

In [None]:
# Data quality checks: missing values, duplicates, outliers
print("Missing values per column:")
print(train.null_count())
print("\nNumber of duplicate rows:")
# Compare rows without the "id" column: when ids are unique, including them
# would make every row distinct and hide repeated feature/target rows.
# is_duplicated() marks every member of a duplicate group, so the sum is the
# number of rows that have at least one identical twin.
print(train.drop("id").is_duplicated().sum())
print("\nSummary statistics:")
print(train.describe())
# Rescale features to max value of 1 for boxplot, so columns with very
# different magnitudes fit on a shared axis.
train_rescaled = train.with_columns(
    [pl.col(col) / pl.col(col).max() for col in FEATURES + [TARGET]]
)
plt.figure(figsize=(12, 6))
sns.boxplot(data=train_rescaled.to_pandas()[FEATURES + [TARGET]])
plt.title("Boxplot of Features and Target (Rescaled)")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Remove outliers using IQR method for all features and target
def remove_outliers(df, columns, factor=1.5):
    """Drop rows with a value outside the IQR fences of any listed column.

    For each column the fences are ``[q1 - factor * iqr, q3 + factor * iqr]``
    with ``iqr = q3 - q1``. All fences are computed from the *input* frame
    before any row is removed, so the result does not depend on the order of
    ``columns``.

    Args:
        df: Polars DataFrame to filter.
        columns: Names of the numeric columns to check.
        factor: Fence width as a multiple of the IQR (1.5 by default).

    Returns:
        A DataFrame holding only the rows that lie inside every fence. Rows
        with a null in a checked column are dropped, because the comparison
        evaluates to null. A column whose quartiles cannot be computed (for
        example on an empty frame) is skipped.
    """
    keep = None
    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        if q1 is None or q3 is None:
            # No values to take quartiles of; nothing to fence for this column.
            continue
        margin = factor * (q3 - q1)
        inside = (pl.col(col) >= q1 - margin) & (pl.col(col) <= q3 + margin)
        keep = inside if keep is None else keep & inside
    # A single filter pass applies the combined condition of all columns.
    return df if keep is None else df.filter(keep)


# Apply the IQR filter to every feature and the target, then report how many
# rows survived.
train_no_outliers = remove_outliers(train, FEATURES + [TARGET])
print(
    f"Original shape: {train.shape}, After outlier removal: {train_no_outliers.shape}"
)
# Rescale each column by its maximum, exactly as the pre-removal boxplot does,
# so the "before" and "after" figures share a scale and a single
# large-magnitude column cannot flatten all the other boxes.
train_no_outliers_rescaled = train_no_outliers.with_columns(
    [pl.col(col) / pl.col(col).max() for col in FEATURES + [TARGET]]
)
plt.figure(figsize=(12, 6))
sns.boxplot(data=train_no_outliers_rescaled.to_pandas()[FEATURES + [TARGET]])
plt.title("Boxplot After Outlier Removal (Rescaled)")
plt.xticks(rotation=90)
plt.show()

In [None]:
def train_validate_gradient_boost(df, validate_df=None, random_state=42):
    """Fit a default GradientBoostingRegressor on ``df`` and report its MSE.

    Args:
        df: Frame with the ``FEATURES`` columns and the ``TARGET`` column; the
            model is fit on it.
        validate_df: Optional frame with the same columns to score on. When
            omitted, the score is computed on ``df`` itself, i.e. it is a
            *training* error and says nothing about generalisation.
        random_state: Seed passed to the regressor so repeated runs give the
            same model.

    Returns:
        Tuple ``(model, mse)``: the fitted regressor and the mean squared
        error on the scored frame.
    """
    # Imported here so the function does not rely on another cell having
    # loaded these sklearn submodules first.
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_squared_error

    model = GradientBoostingRegressor(random_state=random_state)
    model.fit(df.select(FEATURES).to_numpy(), df[TARGET].to_numpy())

    scored = df if validate_df is None else validate_df
    y_true = scored[TARGET].to_numpy()
    y_pred = model.predict(scored.select(FEATURES).to_numpy())
    # scikit-learn metrics take (y_true, y_pred), in that order.
    return model, mean_squared_error(y_true, y_pred)

In [None]:
# Fit on the outlier-filtered data; the returned MSE is measured on the same
# rows the model was fit on (training error).
train_validate_gradient_boost(train_no_outliers)

In [None]:
# Same fit on the unfiltered data for comparison; again a training-set MSE,
# computed over a different set of rows than the filtered run.
train_validate_gradient_boost(train)

In [None]:
# Feature engineering: add polynomial features and interactions
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion: the original features, their squares and all pairwise
# products (no constant column).
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(train_no_outliers.select(FEATURES).to_numpy())
y_poly = train_no_outliers[TARGET].to_numpy()
print(
    f"Original feature count: {len(FEATURES)}, After polynomial expansion: {X_poly.shape[1]}"
)

# Seeded so repeated runs of this cell give the same model and metrics.
model_poly = GradientBoostingRegressor(random_state=42)
model_poly.fit(X_poly, y_poly)
print("Trained GradientBoostingRegressor with polynomial features")

# The predictions below are for the rows the model was fit on, so these are
# training-set metrics; they are labelled as such and are not an estimate of
# performance on unseen data.
y_pred_poly = model_poly.predict(X_poly)
mse_poly = mean_squared_error(y_poly, y_pred_poly)
r2_poly = r2_score(y_poly, y_pred_poly)
print(
    f"GradientBoostingRegressor with polynomial features (training set) MSE: {mse_poly:.2f}, R2: {r2_poly:.3f}"
)