In [None]:
%cd /src/code
from typing import Dict, Any, List, Optional, Union
import subprocess
import hashlib
import os
import logging
import json
import random
import time
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, STATUS_FAIL
from hyperopt import hp
from hyperopt.pyll import scope
from hyperopt import space_eval

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

import mlflow
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

from libs import (
    PandasStandardScaler,
    PandasPCA,
    RemoveUncorrelated,
    calculate_params_hash,
    train_model,
    objective
)

# Seed the global RNGs so reruns are comparable. Seeding only the stdlib
# generator leaves NumPy's legacy global generator unseeded.
# NOTE(review): components that own their generator (e.g. estimators given an
# explicit `random_state`) are not affected by these calls.
random.seed(42)
np.random.seed(42)

logging.basicConfig(level=logging.INFO)
pd.set_option('display.max_columns', None)  # show every column when displaying frames

# Kaggle client, authenticated up front; the submission cell calls it.
api = KaggleApi()
api.authenticate()

target_col: str = 'failure_prone'  # column selected from y_train.csv as the target
submission_id: str = 'U8AXLZ'  # suffix used in the submission file name

# EDA

In [None]:
# Load the training features and the target column (cast to int), both indexed by `id`.
X = pd.read_csv('../assets/X_train.csv', index_col='id')
y = pd.read_csv('../assets/y_train.csv', index_col='id')[target_col].astype(int)

print(X.shape)
# DataFrame.info() writes its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
X.info()
X.head()

In [None]:
# Series.info() prints its summary directly and returns None; calling it bare
# avoids the extra "None" line that print(y.info()) produced.
y.info()
# Class distribution of the target (shown as the cell output).
y.value_counts()

In [None]:
%%time
# Build a profiling report on a fixed random sample of features + target.
profile = ProfileReport(
    df=(
        pd.concat([X, y], axis=1)
        # Reduce size. The cap keeps sample() from raising when the data set
        # has fewer rows than requested.
        .sample(n=min(10000, len(X)), random_state=1)
        # Use the configured target name rather than repeating the literal.
        .rename(columns={target_col: "Target"})
    ),
    title="Profiling Report",
    # minimal=True,  # Reduces the size by showing less graphs; Not needed in our case
)
profile.config.interactions.targets = ["Target"]  # Reduce size by disabling most interactions
profile.to_file("../assets/profiling_report.html")

In [None]:
%%time
# Build the explorative profiling report on a larger fixed random sample.
profile = ProfileReport(
    df=(
        pd.concat([X, y], axis=1)
        # The cap keeps sample() from raising when the data set has fewer rows
        # than requested.
        .sample(n=min(30000, len(X)), random_state=1)
        # Use the configured target name rather than repeating the literal.
        .rename(columns={target_col: "Target"})
    ),
    title="Profiling Report - Full",
    explorative=True,
)

profile.to_file("../assets/profiling_report_full.html")

# Hyperparameter tuning

Tricks for defining conditional search spaces in hyperopt: https://stackoverflow.com/questions/43859465/problems-setting-up-conditional-search-space-in-hyperopt

The MLflow code was copied from: https://github.com/LeonardoSanBenitez/tutorial-mlflow/blob/main/code/2.0%20-%20AutoML%20with%20hyperopt.ipynb



In [None]:
%%time
# Run 3 (recorded result: 0.8876) - the only active candidate.
# XGBoost model settings plus the preprocessing knobs that are tuned with it.
# The preprocessing labels keep the `gb_` prefix that run 2 also used.
xgb_run3 = {
    'model': XGBClassifier,
    'framework': 'xgboost~=2.1',
    'learning_rate': hp.uniform('xgb_learning_rate', 0.4, 0.5),
    'n_estimators': scope.int(hp.quniform('xgb_n_estimators', 1400, 1600, 25)),
    'max_depth': scope.int(hp.quniform('xgb_max_depth', 10, 20, 2)),
    'random_state': 0,
    'preprocessing_remove_uncorrelated': hp.uniform('gb_preprocessing_remove_uncorrelated', 0.1, 0.15),
    'preprocessing_scale': True,
    'preprocessing_pca': hp.uniform('gb_preprocessing_pca', 0.8, 0.9),
}

# The top-level choice is labelled 'classifier'; earlier candidates are kept
# below as commented-out history and can be re-enabled by uncommenting them.
space = hp.choice('classifier', [
    xgb_run3,

    ################
    # Run 2
    # 0.8828
    #{
    #    'model': XGBClassifier,
    #    'framework': 'xgboost~=2.1',
    #    'learning_rate': 0.4928067035150168,
    #    'n_estimators': 200,
    #    'max_depth': 10,
    #    'random_state': 0,
    #    'preprocessing_remove_uncorrelated': hp.uniform('gb_preprocessing_remove_uncorrelated', 0.0, 0.2),
    #    'preprocessing_scale': True,
    #    'preprocessing_pca': hp.uniform('gb_preprocessing_pca', 0.1, 1.0),
    #}

    ################
    # Run 1
    # 0.8813
    #{
    #    'model': RandomForestClassifier,
    #    'framework': 'scikit-learn~=1.4',
    #    'n_estimators': scope.int(hp.quniform('rf_n_estimators', 50, 200, 50)),  # discrete values from 50 to 200, every 50
    #    'max_depth': scope.int(hp.quniform('rf_max_depth', 10, 100, 10)),
    #    'random_state': 0,
    #},
    #{
    #    'model': GradientBoostingClassifier,
    #    'framework': 'scikit-learn~=1.4',
    #    'learning_rate': hp.uniform('gb_learning_rate', 0.1, 0.5),  # continuous range from 0.1 to 0.5
    #    'n_estimators': scope.int(hp.quniform('gb_n_estimators', 100, 200, 50)),
    #    'max_depth': 10, #scope.int(hp.quniform('gb_max_depth', 3, 30, 2)),
    #    'random_state': 0,
    #},
    #{
    #    'model': KNeighborsClassifier,
    #    'framework': 'scikit-learn~=1.4',
    #    'n_neighbors': scope.int(hp.quniform('knn_n_neighbors', 5, 50, 2)),
    #},
    #{
    #    'model': SVC,
    #    'framework': 'scikit-learn~=1.4',
    #    'C': hp.uniform('svm_C', 0.1, 10),  # continuous range from 0.1 to 10
    #    'kernel': hp.choice('svm_kernel', ['poly', 'rbf']),
    #    #'degree': scope.int(hp.quniform('svm_degree', 2, 5, 1)),  # only used for 'poly' kernel
    #    #'gamma': hp.choice('svm_gamma', ['scale', 'auto']),
    #    'random_state': 0,
    #},
    #{
    #    'model': MLPClassifier,
    #    'framework': 'scikit-learn~=1.4',
    #    'hidden_layer_sizes': scope.int(hp.quniform('mlp_hidden_layer_sizes', 50, 200, 50)),
    #    'activation': hp.choice('mlp_activation', ['tanh', 'relu']),
    #    #'solver': hp.choice('mlp_solver', ['sgd', 'adam']),
    #    #'alpha': hp.uniform('mlp_alpha', 0.0001, 0.01),
    #    #'learning_rate': hp.choice('mlp_learning_rate', ['invscaling', 'adaptive']),
    #    'random_state': 0,
    #},
    #{
    #    'model': XGBClassifier,
    #    'framework': 'xgboost~=2.1',
    #    'learning_rate': hp.uniform('xgb_learning_rate', 0.3, 0.5),
    #    'n_estimators': scope.int(hp.quniform('xgb_n_estimators', 100, 200, 50)),
    #    'max_depth': 10, #scope.int(hp.quniform('xgb_max_depth', 10, 30, 2)),
    #    'random_state': 0,
    #}
])

mlflow.set_tracking_uri('http://mlflow:5000')
mlflow.set_experiment("round_3")

# Keep a handle on the trial history: passing an anonymous Trials() to fmin
# discarded it as soon as the search finished, so it could not be inspected.
trials = Trials()

# NOTE(review): fmin is called without `rstate`, so the search itself is not
# covered by the seeds set in the setup cell - confirm whether that matters.
with mlflow.start_run(description='Parent run for hyperopt', nested=True):
    best_point = fmin(
        fn=lambda params: objective(params, X, y),  # Objective function to minimize
        space=space,                                # Hyperparameter space
        algo=tpe.suggest,                           # Optimization algorithm (Tree of Parzen Estimators)
        max_evals=700,                              # Number of evaluations
        trials=trials,                              # Trial history, kept for later inspection
    )
    # space_eval resolves the point returned by fmin back into the concrete
    # parameter dictionary described by `space`.
    best_hyperparameters: Dict[str, Any] = space_eval(space, best_point)

    # isinstance is the idiomatic type check and also accepts dict subclasses.
    assert isinstance(best_hyperparameters, dict)
    print("Best hyperparameters found:", best_hyperparameters)

# Train final model
Expects a dictionary `best_hyperparameters`, as produced by the hyperparameter tuning section above.

In [None]:
%%time
# TODO: register at mlflow
# Hash of the selected configuration, used as a search filter in the tracking UI.
best_hash = calculate_params_hash(best_hyperparameters)
print(
    'Search experiment by param hash in the UI: '
    'http://localhost:5001/#/experiments/127862172636814249'
    f'?searchFilter=params.hyperparameters_hash+%3D"{best_hash}"'
)

# Fit the final model with the selected configuration; the bare `model`
# expression below renders it as the cell output.
model = train_model(best_hyperparameters, X, y, analyze=True)
model

In [None]:
# Raw importances come from the final estimator; the matching feature names are
# read from the `columns` attribute of the 'pca' step.
importances = model.named_steps['model'].feature_importances_
columns = model.named_steps['pca'].columns

# Express each importance as a share (in %) of the total.
importances_total = importances.sum()
importances_percentage = (importances / importances_total) * 100

# One row per feature, most important first.
feature_importances_df = pd.DataFrame({
    'Feature': columns,
    'Importance (%)': importances_percentage
})
feature_importances_df = feature_importances_df.sort_values(
    by='Importance (%)',
    ascending=False,
)

# Show the ten most important features.
feature_importances_df.head(10)

# Submit best model
Expects a variable `model` with a `predict` method that follows a sklearn-like interface, and a dictionary `best_hyperparameters`.

In [None]:
# The fitted model must exist and expose a predict method.
assert model is not None
assert hasattr(model, 'predict')

# The test features must have the same columns, in the same order, as the training features.
X_test = pd.read_csv('../assets/X_test.csv', index_col='id')
assert X_test.shape[1] == X.shape[1]
assert (X_test.columns == X.columns).all()

# Predictions: one label per test row, as a flat array.
y_test_hat = model.predict(X_test)
assert y_test_hat.ndim == 1
assert len(y_test_hat) == len(X_test)

# Both classes (0 and 1) must be present, stored as 64-bit integers.
assert (y_test_hat.min(), y_test_hat.max()) == (0, 1)
assert y_test_hat.dtype == np.dtype(np.int64)

In [None]:
# Submission params
competition = 'ppcu-data-mining-and-machine-learning-2024'
file_name = f'../assets/y_test_submission_{submission_id}.csv'

# The message records the current git commit and the hyperparameter hash so the
# submission can be traced back to the code and configuration that produced it.
commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip()
message = "test trained submission"  # plain string: there was nothing to interpolate
message += f"; commit_hash: {commit_hash}"
message += f"; hyperparameters_hash: {calculate_params_hash(best_hyperparameters)}"

# Write the predictions. Any previous file is removed first, so the existence
# check below confirms that this run actually wrote it.
if os.path.exists(file_name):
    os.remove(file_name)
# Name the column via `target_col` so it stays in sync with the training target.
pd.DataFrame(y_test_hat, index=X_test.index, columns=[target_col]).to_csv(file_name, index=True)
assert os.path.exists(file_name)

# Submit the file to the competition
api.competition_submit(file_name, message, competition)