# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
    roc_auc_score,
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import optuna


In [None]:
# Seed reused as `random_state` by estimators further down the notebook.
RANDOM_SEED = 42

# Loading Data

In [None]:
# True when any environment variable name mentions "KAGGLE". Each name is
# checked on its own so a match cannot straddle two adjacent names, which
# could happen when all names were concatenated into a single string.
kaggle = any("KAGGLE" in name for name in os.environ)

In [None]:
# Choose the CSV locations for the current environment, then read them in one go.
if kaggle:
    csv_paths = (
        "/kaggle/input/icr-identify-age-related-conditions/train.csv",
        "/kaggle/input/icr-identify-age-related-conditions/test.csv",
        "/kaggle/input/icr-identify-age-related-conditions/greeks.csv",
    )
else:
    csv_paths = ("data/train.csv", "data/test.csv", "data/greeks.csv")

train, test, greeks = (pd.read_csv(csv_path) for csv_path in csv_paths)

# Data Pipeline

## Preliminaries

Label encode the `EJ` column

In [None]:
# Assign the encoded column back instead of calling `replace(..., inplace=True)`
# on the selected column: that is chained assignment, which does not update the
# parent frame when pandas Copy-on-Write is active. Re-running this cell is a
# no-op because 0/1 are not keys of the mapping.
ej_encoding = {"A": 0, "B": 1}
train["EJ"] = train["EJ"].replace(ej_encoding)
test["EJ"] = test["EJ"].replace(ej_encoding)

Drop extra columns:

In [None]:
# Set the identifiers and the target aside before removing them from the feature frames.
train_id, test_id = train["Id"], test["Id"]
y = train["Class"]

train = train.drop(columns=["Class", "Id"])
test = test.drop(columns=["Id"])

Create the columns for imputation:

In [None]:
# Names of the columns that contain at least one missing value, per frame.
train_null_columns = train.columns[train.isna().any()].tolist()
test_null_columns = test.columns[test.isna().any()].tolist()

In [None]:
# Columns with at most `thresh` missing values are mean-imputed; the others go to KNN.
thresh = 2
null_count = train[train_null_columns].isna().sum()

train_columns_to_fill_via_mean = [
    name for name in train_null_columns if null_count[name] <= thresh
]
train_columns_to_fill_via_knn = [
    name for name in train_null_columns if null_count[name] > thresh
]

train_columns_to_fill_via_mean, train_columns_to_fill_via_knn

In [None]:
# Same rule as for the training frame, applied to the test frame's missing-value counts.
thresh = 2
null_count = test[test_null_columns].isna().sum()

test_columns_to_fill_via_mean = [
    name for name in test_null_columns if null_count[name] <= thresh
]
test_columns_to_fill_via_knn = [
    name for name in test_null_columns if null_count[name] > thresh
]

test_columns_to_fill_via_mean, test_columns_to_fill_via_knn

## Pipelines

Now, the column transformations:

In [None]:
# The train and test preprocessing objects each get their own estimator
# instances. A Pipeline fits the step objects it is handed in place, so a
# scaler shared by both scaling pipelines would have its fitted state
# overwritten by whichever pipeline is fitted last.
simple_imputer = SimpleImputer(strategy="mean")
knn_imputer = KNNImputer(n_neighbors=5)

train_imputer = ColumnTransformer(
    [
        ("mean_imputer", simple_imputer, train_columns_to_fill_via_mean),
        ("knn_imputer", knn_imputer, train_columns_to_fill_via_knn),
    ],
    remainder="passthrough",
)

test_imputer = ColumnTransformer(
    [
        ("mean_imputer", SimpleImputer(strategy="mean"), test_columns_to_fill_via_mean),
        ("knn_imputer", KNNImputer(n_neighbors=5), test_columns_to_fill_via_knn),
    ],
    remainder="passthrough",
)

standard_scaler = StandardScaler()
power_transformer = PowerTransformer()

train_scaling_pipe = Pipeline(
    [
        ("standard_scaler", standard_scaler),
        ("power_transformer", power_transformer),
    ],
)

# Fresh instances: fitting this pipeline must not touch the train pipeline's steps.
test_scaling_pipe = Pipeline(
    [
        ("standard_scaler", StandardScaler()),
        ("power_transformer", PowerTransformer()),
    ],
)

And the final data preprocessor:

In [None]:
# Full preprocessing = imputation followed by scaling, one pipeline per frame.
train_final_preprocessing_pipe = Pipeline(
    steps=[
        ("imputer", train_imputer),
        ("scaling_pipe", train_scaling_pipe),
    ]
)

test_final_preprocessing_pipe = Pipeline(
    steps=[
        ("imputer", test_imputer),
        ("scaling_pipe", test_scaling_pipe),
    ]
)

train_final_preprocessing_pipe

Excellent! Now, we will create the dataset to be trained:

In [None]:
# Fit the preprocessing on the training frame only, then apply that same fitted
# transformation to the test frame. Fitting a second pipeline on the test frame
# would scale it with different statistics and, because each ColumnTransformer
# emits its imputed columns before the passthrough columns, could also put the
# features in a different order than in X.
# NOTE(review): a test column with missing values that is complete in `train`
# is passed through un-imputed here — confirm this cannot occur for the real test set.
X = train_final_preprocessing_pipe.fit_transform(train)
test_final = train_final_preprocessing_pipe.transform(test)

In [None]:
# Number of NaN entries left in X after preprocessing.
np.isnan(X).sum()

We will make sure that both the training and test sets have the same number of columns:

In [None]:
# Compares column counts only; it does not verify column identity or order.
assert X.shape[1] == test_final.shape[1]

## Train Test Split

Great! Let's do a train/test split:

In [None]:
# Stratify so both splits keep the class ratio of `y`, and reuse the notebook
# seed instead of a separate literal.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED
)

# Helper Functions  

We will create some helper functions to help us train and evaluate our models:

In [None]:
def balanced_log_loss(y_true, y_hat):
    """
    Compute the balanced log loss between y_true and y_hat.

    The log loss is averaged within each class first and the per-class
    averages are then averaged, so both classes carry the same weight
    regardless of how many samples each one has.

    Parameters:
    y_true (array-like): True binary labels (0 or 1) of shape (n_samples,).
    y_hat (array-like): Predicted probabilities, either of shape
        (n_samples, 2) with columns [P(class 0), P(class 1)], or of shape
        (n_samples,) holding P(class 1) only.

    Returns:
    float: The balanced log loss between y_true and y_hat. A class that has
        no samples in y_true is left out of the average (instead of causing a
        division by zero); NaN is returned when y_true holds neither class.
    """
    eps = 1e-15
    # np.asarray makes Series, DataFrames and plain lists behave alike and
    # discards any pandas index so rows are matched purely by position.
    y_true = np.asarray(y_true).astype(int).ravel()
    y_hat = np.clip(np.asarray(y_hat, dtype=float), eps, 1 - eps)

    if y_hat.ndim == 1:
        yhat1 = y_hat
        yhat0 = 1 - y_hat
    else:
        yhat0 = y_hat[:, 0]
        yhat1 = y_hat[:, 1]

    class_losses = []
    for label, prob in ((0, yhat0), (1, yhat1)):
        members = y_true == label
        if members.any():
            class_losses.append(-np.mean(np.log(prob[members])))

    if not class_losses:
        return float("nan")
    return float(np.mean(class_losses))

In [None]:
# def balanced_log_loss(y_true, y_pred):
#     # calculate the number of observations for each class
#     N_0 = np.sum(1 - y_true)
#     N_1 = np.sum(y_true)
#     # calculate the weights for each class
#     w_0 = 1 / N_0
#     w_1 = 1 / N_1
#     # calculate the predicted probabilities for each class
#     p_0 = np.clip(y_pred[:, 0], 1e-15, 1 - 1e-15)
#     p_1 = np.clip(y_pred[:, 1], 1e-15, 1 - 1e-15)
#     # calculate the log loss for each class
#     log_loss_0 = -w_0 * np.sum((y_true) * np.log(p_0))
#     log_loss_1 = -w_1 * np.sum(y_true * np.log(p_1))
#     # calculate the balanced logarithmic loss
#     balanced_log_loss = (log_loss_0 + log_loss_1) / (w_0 + w_1)
#     return balanced_log_loss

In [None]:
def cm_to_metrics(cm):
    """Calculate accuracy, precision, recall and f1 score from confusion matrix.

    A metric whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as 0.0 rather than NaN.

    Parameters
    ----------
    cm : array-like
        2x2 confusion matrix laid out as [[tn, fp], [fn, tp]].

    Returns
    -------
    accuracy : float
        Accuracy score.
    precision : float
        Precision score.
    recall : float
        Recall score.
    f1 : float
        F1 score.
    """
    tn, fp, fn, tp = (float(count) for count in np.asarray(cm).ravel())

    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * precision * recall, precision + recall)
    return accuracy, precision, recall, f1


def evaluate_model(
    model,
    on="train",
    plot_cmat=False,
    verbose=True,
):
    """
    This function evaluates a model and returns the metrics.
    It can be used to evaluate the model on the training set or the test set.
    It can also plot the confusion matrix.

    The data comes from the notebook-level splits `X_train`/`y_train` and
    `X_test`/`y_test`.

    Parameters
    ----------
    model : object
        The fitted model to be evaluated. It must provide `predict` and
        `predict_proba`.
    on : str, optional
        The set on which the model will be evaluated, "train" or "test".
        The default is "train".
    plot_cmat : bool, optional
        Whether to plot the confusion matrix. The default is False.
    verbose : bool, optional
        Whether to print the metrics. The default is True.

    Returns
    -------
    result : dict
        A dictionary with the metrics.

    Raises
    ------
    ValueError
        If `on` is neither "train" nor "test".

    Example
    -------
    >>> result = evaluate_model(model)
    >>> print(result)
    {'accuracy': 0.8, 'log_loss': 0.5, 'precision': 0.8, 'recall': 0.8, 'f1': 0.8, 'auc': 0.8}

    >>> result = evaluate_model(model, on="test", plot_cmat=True)
    >>> print(result)
    {'accuracy': 0.8, 'log_loss': 0.5, 'precision': 0.8, 'recall': 0.8, 'f1': 0.8, 'auc': 0.8}
    """
    if on == "train":
        X, y = X_train, y_train
    elif on == "test":
        X, y = X_test, y_test
    else:
        # A typo such as "Train" used to fall through to the test set silently.
        raise ValueError(f"`on` must be 'train' or 'test', got {on!r}")

    y_pred = model.predict(X)
    y_pred_prob = model.predict_proba(X)

    # Fixing the labels keeps the matrix 2x2 even if one class is never
    # observed or predicted, so cm_to_metrics can always unpack four cells.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    accuracy, precision, recall, f1 = cm_to_metrics(cm)
    # AUC is a ranking metric: score it on the class-1 probabilities rather
    # than on the thresholded labels.
    auc_score = roc_auc_score(y, y_pred_prob[:, 1])
    balanced_ll = balanced_log_loss(y, y_pred_prob)

    if plot_cmat:
        disp = ConfusionMatrixDisplay(
            confusion_matrix=cm, display_labels=["0", "1"]
        )
        disp.plot()
        plt.show()
    if verbose:
        model_name = type(model).__name__
        print(f"Accuracy on {on} set of the model {model_name}: {accuracy:.4f}")
        print(f"Log Loss on {on} set of the model {model_name}: {balanced_ll:.4f}")
        print(f"Precision on {on} set of the model {model_name}: {precision:.4f}")
        print(f"Recall on {on} set of the model {model_name}: {recall:.4f}")
        print(f"F1 on {on} set of the model {model_name}: {f1:.4f}")
        print(f"AUC on {on} set of the model {model_name}: {auc_score:.4f}\n")
        cr = classification_report(y, y_pred)
        print(cr)
    result = {
        "accuracy": accuracy,
        "log_loss": balanced_ll,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc_score,
    }
    return result

# Models

In [None]:
# Inverse class frequencies of `y`, expressed relative to class 0 (weight 1).
positive_rate = y.mean()
one_weight = 1 / positive_rate
zero_weight = 1 / (1 - positive_rate)
class_weight = {0: 1, 1: one_weight / zero_weight}
class_weight

In [None]:
# Number of samples per class in the full target.
y.value_counts()

## Base Model

Let's train a base model. This will be a simple logistic regression model:

In [None]:
# Baseline classifier; `class_weight` gives class 1 a larger weight than class 0.
lr = LogisticRegression(class_weight=class_weight)
lr.fit(X_train, y_train)

In [None]:
# Metrics of the baseline on the data it was fitted on.
lr_result_train = evaluate_model(lr, on="train", plot_cmat=True)

In [None]:
# Metrics of the baseline on the hold-out split.
lr_result_test = evaluate_model(lr, on="test", plot_cmat=True)

## Random Forest

In [None]:
# Seed the forest so its bootstrap samples and feature subsets, and therefore
# the reported metrics, are the same on every run.
rf_base = RandomForestClassifier(
    class_weight=class_weight,
    max_depth=5,
    n_estimators=100,
    random_state=RANDOM_SEED,
)
rf_base.fit(X_train, y_train)
evaluate_model(rf_base, plot_cmat=True)

In [None]:
# Metrics of the baseline forest on the hold-out split.
evaluate_model(rf_base, on="test", plot_cmat=True)

### RF Tuning Round 1

In [None]:
def custom_scorere_func(model, X, y):
    """Scorer for GridSearchCV based on the balanced log loss.

    Parameters
    ----------
    model : object
        Fitted estimator providing `predict_proba`.
    X : array-like
        Features to score on.
    y : array-like
        True labels for X.

    Returns
    -------
    float
        The negated balanced log loss. GridSearchCV keeps the candidate with
        the highest score, so the loss is negated to make a lower loss rank
        higher.
    """
    y_pred = model.predict_proba(X)
    return -balanced_log_loss(y, y_pred)
    

In [None]:
def grid_search(params, base_model, cv = 5, verbose = 1, **kwargs):
    """Run a grid search over `params` for `base_model` on the training split.

    The search is scored with `custom_scorere_func` and fitted on
    `X_train`/`y_train`. The best parameters and the best score are printed,
    and the fitted search object is returned. Extra keyword arguments are
    forwarded to `GridSearchCV`.
    """
    search = GridSearchCV(
        base_model,
        params,
        cv=cv,
        scoring=custom_scorere_func,
        verbose=verbose,
        **kwargs,
    )
    search.fit(X_train, y_train)
    for summary in (search.best_params_, search.best_score_):
        print(summary)
    return search

In [None]:
# rfc_param_grid = {
#     "n_estimators": [100, 200, 300],
#     "max_depth": [5, 10, 15],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 2, 5],
#     "max_features": ["sqrt", "log2"],
# }

# rfc_grid = grid_search(rfc_param_grid, RandomForestClassifier(random_state=RANDOM_SEED), verbose = 1, n_jobs = -1)

### RF Tuning Round 2

In [None]:
# rfc_param_grid = {
#     "n_estimators": [170, 200, 230],
#     "max_depth": [4, 5, 7],
#     "min_samples_split": [4, 5, 6],
#     "min_samples_leaf": [1],
#     "max_features": ["log2"],
# }

# rfc_grid = grid_search(rfc_param_grid, RandomForestClassifier(random_state=RANDOM_SEED), verbose = 1, n_jobs = -1)

### RF Tuning Round 3

In [None]:
# rfc_param_grid = {
#     "n_estimators": [190, 200, 210],
#     "max_depth": [3, 4],
#     "min_samples_split": [6, 7, 8],
#     "min_samples_leaf": [1],
#     "max_features": ["log2"],
# }

# rfc_grid = grid_search(rfc_param_grid, RandomForestClassifier(random_state=RANDOM_SEED), verbose = 1, n_jobs = -1)

These are the same parameters as we got in round 1 and hence we will not proceed to a round 3 of fine tuning. Here are the best parameters:
```python
{'max_depth': 17, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 190}
```

In [None]:
# # rf_best_params = rfc_grid.best_params_
# # rf_best_params = {'max_depth': 17, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 190}
# rf_best_params = {'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 190}
# rf_best = RandomForestClassifier(class_weight=class_weight, random_state=RANDOM_SEED, **rf_best_params)
# rf_best.fit(X_train, y_train)

In [None]:
# evaluate_model(rf_best, plot_cmat=True, on = "train")
# evaluate_model(rf_best, plot_cmat=True, on = "test")

## Catboost

Next, we will consider the Catboost model.

In [None]:
# Untuned CatBoost model with the same class weighting and seed as the rest of the notebook.
cat_base = CatBoostClassifier(
    class_weights=class_weight,
    random_state=RANDOM_SEED,
)

In [None]:
# Fit quietly, then report metrics on the training split and on the hold-out split.
cat_base.fit(X_train, y_train, verbose=0)
evaluate_model(cat_base, on="train", plot_cmat=True)
evaluate_model(cat_base, on="test", plot_cmat=True)

### Cat Tuning Round 1

In [None]:
# cat_param_grid = {
#     "iterations": [50, 100, 150],
#     "depth": [5, 8,  10],
#     "learning_rate": [0.01, 0.05, 0.1],
#     "l2_leaf_reg": [1, 3, 5],
# }

# loss_function = "Logloss"

# cat_grid = grid_search(cat_param_grid, CatBoostClassifier(loss_function=loss_function, class_weights=class_weight, random_state=RANDOM_SEED, verbose = 0), verbose = 10)

### Cat Tuning Round 2

In [None]:
# cat_param_grid = {
#     "iterations": [40, 50, 70],
#     "depth": [8, 10, 12],
#     "learning_rate": [0.005, 0.01, 0.02],
#     "l2_leaf_reg": [4, 5, 6],
# }

# loss_function = "Logloss"

# cat_grid = grid_search(cat_param_grid, CatBoostClassifier(loss_function=loss_function, class_weights=class_weight, random_state=RANDOM_SEED, verbose = 0),
#                        verbose = 10)

### Cat Tuning Round 3

In [None]:
# cat_param_grid = {
#     "iterations": [30, 40, 50],
#     "depth": [12, 15],
#     "learning_rate": [0.005],
#     "l2_leaf_reg": [6],
# }

# loss_function = "Logloss"

# cat_grid = grid_search(cat_param_grid, CatBoostClassifier(loss_function=loss_function, class_weights=class_weight, random_state=RANDOM_SEED, verbose = 0),
#                        verbose = 10)

In [None]:
# evaluate_model(cat_best, plot_cmat=True, on = "train")
# evaluate_model(cat_best, plot_cmat=True, on = "test")

In [None]:
# cat_param_grid = {
#     "iterations": [40, 50, 70],
#     "depth": [8, 10, 12],
#     "learning_rate": [0.005, 0.01, 0.02],
#     "l2_leaf_reg": [4, 5, 6],
# }

# loss_function = "Logloss"

# cat_grid = grid_search(cat_param_grid, CatBoostClassifier(loss_function=loss_function, class_weights=class_weight, random_state=RANDOM_SEED, verbose = 0),
#                        verbose = 10)

In [None]:
from catboost.metrics import Logloss

# Metric object handed to CatBoostClassifier as `loss_function` in the Optuna objective below.
log_loss = Logloss(use_weights=True)

In [None]:
def objective(trial):
    """Optuna objective: balanced log loss of a CatBoost model on a validation split.

    The validation rows are carved out of the training split, so the hold-out
    test set (`X_test`, `y_test`) is never used to choose hyperparameters and
    stays an unbiased check of the tuned model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `iterations`, `depth`, `learning_rate` and
        `l2_leaf_reg`.

    Returns
    -------
    float
        Balanced log loss on the validation split (lower is better).
    """
    param = {
        "iterations": trial.suggest_int("iterations", 30, 100),
        "depth": trial.suggest_int("depth", 5, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 10),
    }
    # Fixed seed: every trial is fitted and scored on the same rows, so trial
    # values are comparable with each other.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.25,
        stratify=y_train,
        random_state=RANDOM_SEED,
    )
    cat = CatBoostClassifier(loss_function=log_loss, class_weights=class_weight, random_state=RANDOM_SEED,
                             verbose = 0, **param)
    cat.fit(X_fit, y_fit)
    return balanced_log_loss(y_val, cat.predict_proba(X_val))

# Seed the sampler so the sequence of suggested hyperparameters, and hence the
# best trial, is reproducible across runs.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study.optimize(objective, n_trials=100)