## Imports

In [None]:
from nb_utils import set_root

# Resolve the project root through the local `nb_utils.set_root` helper and keep the result.
# NOTE(review): presumably `4` is the number of directory levels to walk up and "src" the
# folder that gets made importable so the `mlpr` imports below resolve — confirm in nb_utils.
PROJECT_ROOT = set_root(4, "src")

In [None]:
from collections.abc import Callable
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.base import BaseEstimator
from sklearn.datasets import fetch_california_housing
# from sklearn.decomposition import PCA
# from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression  # LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from mlpr.ml.supervisioned.regression import metrics, plots
from mlpr.ml.supervisioned.surrogates.surrogates import Surrogate
from mlpr.ml.supervisioned.tunning.grid_search import GridSearch

## Methods

In [None]:
def custom_rmse(y_test, y_pred, **kwargs):
    """Return the negated root-mean-squared error of ``y_pred`` against ``y_test``.

    The sign is flipped, so a smaller error yields a larger (less negative) value.
    Any extra keyword arguments are forwarded unchanged to ``mean_squared_error``.
    """
    mse = mean_squared_error(y_test, y_pred, **kwargs)
    rmse = np.sqrt(mse)
    return -rmse


def mean_absolute_percentage_error(y_true, y_pred):
    """Return the negated mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true : array-like
        Reference values. Each one is used as a divisor, so a zero entry
        produces ``inf``/``nan`` according to NumPy's division rules.
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.

    Returns
    -------
    float
        ``-mean(|y_true - y_pred| / |y_true|) * 100``. The sign is flipped,
        so a smaller error yields a larger (less negative) value.
    """
    # Coerce to float arrays so lists/tuples work and pandas objects are paired
    # by position instead of being aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return -np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Parameters

In [None]:
# Value passed as the `cv` argument when fitting (e.g. `surrogate.fit`).
cv: int = 5
# Seed reused by the black-box parameter grids and the subsampling split.
random_state: int = 42

In [None]:
# Arguments for the train/test split. The seed is taken from the notebook-level
# `random_state` so that it cannot drift from the one used by the estimators.
params_split: dict[str, Any] = {'test_size': 0.2, 'random_state': random_state}
# Normalisation switches handed to the mlpr helpers.
# NOTE(review): these look like StandardScaler arguments — confirm in mlpr.
params_norm: dict[str, bool] = {'with_mean': True, 'with_std': True}
# Metric callables grouped by task, then keyed by the name used to refer to them
# (e.g. `scoring='custom_rmse'`). The two custom metrics return negated errors.
model_metrics: dict[str, dict[str, Callable[..., float]]] = {
    'regression': {
        "custom_rmse": custom_rmse,
        "custom_mape": mean_absolute_percentage_error,
        'mse': mean_squared_error,
        'mae': mean_absolute_error,
        'r2': r2_score,
    }
}

In [None]:
# Candidate "black-box" models: estimator class -> parameter grid (each parameter maps to
# the list of values to try). The keys are classes, not instances, hence `type[...]`.
black_box: dict[str, dict[type[BaseEstimator], dict[str, list]]] = {
    "regression": {
        # RandomForestRegressor: {
        #     'n_estimators': [200, 500],
        #     'max_depth': [None, 20],
        #     'min_samples_split': [2, 10],
        #     'min_samples_leaf': [1, 4],
        #     'random_state': [random_state]
        # },
        # GradientBoostingRegressor: {
        #     'n_estimators': [200, 500],
        #     'learning_rate': [0.1, 0.01],
        #     'subsample': [0.5, 1.0],
        #     'random_state': [random_state]
        # },
        # SVR: {
        #     'C': [1.0, 100.0],
        #     'kernel': ['linear', 'rbf'],
        #     'degree': [2, 4],
        #     'gamma': ['scale']
        # },
        LGBMRegressor: {
            'num_leaves': [31, 127],
            'learning_rate': [0.1, 0.01],
            'n_estimators': [100, 200],
            'random_state': [random_state],
            'verbose': [-1]
        },
        XGBRegressor: {
            'max_depth': [6, 10],
            'learning_rate': [0.3, 0.1],
            'n_estimators': [100, 200],
            'random_state': [random_state],
            'verbosity': [0]
        }
    }
}

# Candidate "white-box" models, in the same class -> grid layout. An empty grid means the
# estimator is used with its default parameters only.
white_box: dict[str, dict[type[BaseEstimator], dict[str, list]]] = {
    "regression": {
        DecisionTreeRegressor: {
            # Shared seed, consistent with the black-box grids above.
            'random_state': [random_state],
            'max_depth': [1, 5, 10],
            'min_samples_split': [2, 5, 10]
        },
        LinearRegression: {}
    }
}

## Read dataset

In [None]:
# Load the California housing dataset and pull out the pieces used below.
content: dict = fetch_california_housing()
# Feature matrix.
data: np.ndarray = content['data']
# Column names for `data`; reused later to label the training DataFrame.
features: list = content["feature_names"]
# Regression target.
target: np.ndarray = content["target"]
# Name(s) of the target as reported by the dataset.
target_name: str | list | list[str] = content["target_names"]

In [None]:
# Draw a seeded 10,000-row subsample of the features and target; the complementary rows are
# discarded. It is only used if the `_sample` suffix is enabled in the next cell.
data_sample, _, target_sample, _ = train_test_split(data, target, train_size=10000, random_state=random_state)

In [None]:
# Work on the full dataset; append the `_sample` suffix noted on each line to switch to the
# 10,000-row subsample (`data_sample` / `target_sample`) instead.
X = data  # _sample
y = target  # _sample

## Plot dataset

In [None]:
# Two side-by-side panels: the target values in dataset order (left) and their
# distribution (right), each with the mean drawn as a dashed reference line.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
target_mean = y.mean()

# Left panel: raw series, with the frame and ticks removed.
ax[0].plot(y, color="#FF4B3E")
ax[0].axhline(target_mean, color="black", linestyle="--", label="mean")
ax[0].set_title("Dataset")
ax[0].set_frame_on(False)
ax[0].set_xticks([])
ax[0].set_yticks([])

# Right panel: 60-bin histogram.
ax[1].hist(y, color="#FF4B3E", bins=60)
ax[1].axvline(target_mean, color="black", linestyle="--", label="mean")
ax[1].set_title("Histogram")

for _ax in ax:
    _ax.legend()

fig.tight_layout()

## Grid search and tuning

In [None]:
# Set up the search over every black-box candidate, scored with the negated RMSE and
# reporting all regression metrics.
grid_search = GridSearch(
    X,
    y,
    params_split=params_split,
    models_params=black_box["regression"],
    normalize=True,
    scoring='custom_rmse',
    metrics=model_metrics["regression"],
    params_norm=params_norm
)
# Use the notebook-level `cv` rather than a literal, so this search and the surrogate fit
# always run with the same fold setting.
grid_search.search(cv=cv, n_jobs=-1)

best_model, best_params = grid_search.get_best_model()

## Results

In [None]:
# Training features as a labelled frame, with the true target and the best model's
# predictions on those same rows appended as `y_true` / `y_pred`.
data_train = pd.DataFrame(grid_search.X_train, columns=features).assign(
    y_true=grid_search.y_train,
    y_pred=grid_search.best_model.predict(grid_search.X_train),
)

## Metrics

In [None]:
# `k` is passed as `n_bins` to the binned metrics computed in the next cell.
k = 3
# Metrics helper bound to the training frame and its true/predicted column names.
rm = metrics.RegressionMetrics(data_train, "y_true", "y_pred")

In [None]:
# Keyword arguments for each requested metric; the binned ones share the same `k`.
metric_kwargs = {
    "mape": {},
    "rmse": {},
    "kolmogorov_smirnov": {},
    "confusion_matrix": {"n_bins": k},
    "calculate_kappa": {"n_bins": k},
}
# The metric names are the mapping's keys, in insertion order.
results: dict = rm.calculate_metrics(list(metric_kwargs), metric_kwargs)

## Plots

In [None]:
rp = plots.RegressionPlots(data_train, color_palette=["#FF4B3E", "#1C2127"])

# Column-name arguments shared by most panels.
fitted_cols = {'y_true_col': 'y_true', 'y_pred_col': 'y_pred'}

# Boolean mask of the rows whose true value lies inside the interval stored in
# `rm._worst_interval_kappa` (both bounds inclusive).
in_worst_interval = (
    (rm._worst_interval_kappa[0] <= data_train["y_true"])
    & (data_train["y_true"] <= rm._worst_interval_kappa[1])
)

# Grid of panel names.
# NOTE(review): presumably one inner list per grid row, with '' marking an unused slot —
# confirm in mlpr's `grid_plot`.
grid_layout = [
    ['graph11', 'graph12', 'graph13'],
    ['graph21', 'graph22', ''],
    ['graph23', '', ''],
]

# Plot kind and arguments for each named panel.
grid_panels = {
    'graph11': {
        "plot": "scatter",
        "params": {
            **fitted_cols,
            'linecolor': '#1C2127',
            'worst_interval': True,
            'metrics': rm.metrics["calculate_kappa"],
            'class_interval': rm._class_intervals,
            'method': 'recall',
            'positive': True,
        },
    },
    'graph12': {
        "plot": "plot_ecdf",
        "params": {**fitted_cols},
    },
    'graph21': {
        "plot": "plot_kde",
        "params": {'columns': ['y_true', 'y_pred']},
    },
    'graph22': {
        "plot": "plot_error_hist",
        "params": {**fitted_cols, 'linecolor': '#1C2127'},
    },
    # Fitted plot restricted to the rows selected by the mask above.
    'graph13': {
        "plot": "plot_fitted",
        "params": {
            **fitted_cols,
            'condition': in_worst_interval,
            'sample_size': None,
        },
    },
    # Fitted plot with no row filter.
    'graph23': {
        "plot": "plot_fitted",
        "params": {
            **fitted_cols,
            'condition': None,
            'sample_size': None,
        },
    },
}

fig, axs = rp.grid_plot(
    plot_functions=grid_layout,
    plot_args=grid_panels,
    show_inline=True,
)

## Surrogates

In [None]:
# Configure the surrogate workflow with the same split, normalisation, scoring and metric
# settings as the grid search above, pairing the black-box candidates with the white-box ones.
# NOTE(review): presumably the white-box model is trained to mimic the tuned black-box
# model — confirm in mlpr's Surrogate.
surrogate = Surrogate(
    normalize=True,
    scoring="custom_rmse",
    white_box=white_box["regression"],
    black_box=black_box["regression"],
    params_split=params_split,
    params_norm=params_norm,
    metrics=model_metrics["regression"]
)

In [None]:
# Fit the surrogate on the full X / y using the notebook-level `cv` setting.
# NOTE(review): `n_jobs=-1` presumably requests all available cores — confirm in mlpr.
surrogate.fit(X, y, cv=cv, n_jobs=-1)

In [None]:
# Show the metrics stored on the black-box grid search as a transposed table.
# Note: `_metrics` is a private attribute of the search object.
pd.DataFrame(surrogate.grid_search_black._metrics).T

In [None]:
# Show the metrics stored on the white-box grid search as a transposed table.
# Note: `_metrics` is a private attribute of the search object.
pd.DataFrame(surrogate.grid_search_white._metrics).T

In [None]:
# Display the surrogate's `best_model_black` attribute (the selected black-box model).
surrogate.best_model_black

In [None]:
# Display the surrogate's `best_model_white` attribute (the selected white-box model).
surrogate.best_model_white

In [None]:
# Predict on the black-box search's `X_test`; `predict` returns a pair that is unpacked as
# the black-box predictions followed by the white-box predictions.
black_box_predictions, white_box_predictions = surrogate.predict(surrogate.grid_search_black.X_test)

In [None]:
# Compare the two models point by point: one marker per `X_test` row, black-box prediction
# on the x-axis and white-box prediction on the y-axis.
fig_n2, ax_n2 = plt.subplots(1, 1, figsize=(20, 6))
# Draw on the axes created above instead of relying on pyplot's implicit "current axes".
ax_n2.plot(
    black_box_predictions,
    white_box_predictions,
    "*",
    color="#FF4B3E"
)
# Label the figure so it can be read on its own.
ax_n2.set_xlabel("Black-box predictions")
ax_n2.set_ylabel("White-box predictions")
ax_n2.set_title("White-box vs. black-box predictions");