In [1]:
!python -V

Python 3.8.19


In [2]:
# Standard libraries
import os
import datetime
import warnings
from copy import deepcopy
from utils import plot_metrics
import json

# Data manipulation and analysis
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

# Machine learning and optimization
import optuna
from optuna.trial import TrialState
from optuna.storages import JournalStorage, JournalFileStorage
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    RobustScaler,
    QuantileTransformer
)
from sklearn.impute import SimpleImputer
from category_encoders import WOEEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import mlflow

# Rich console output
from rich.console import Console

# Ignore warnings
warnings.filterwarnings("ignore")

# Initialize console
console = Console()

In [3]:
# Set general directory of the project
def general_directory():
    """
    Move up one level in the directory hierarchy.
    """
    # Get the current directory
    current_dir = os.getcwd()

    # Get the parent directory
    parent_dir = os.path.dirname(current_dir)

    # Change to the parent directory
    os.chdir(parent_dir)

# Set actual directory of the project
def actual_directory():
    """
    Move down to the 'experiments' subdirectory from the current directory.
    """
    # Get the current directory
    current_dir = os.getcwd()

    # Name of the subdirectory to move into
    sub_dir_name = "experiments"  # Replace with the name of your subdirectory

    # Construct the full path to the subdirectory
    sub_dir_path = os.path.join(current_dir, sub_dir_name)

    # Change to the subdirectory
    os.chdir(sub_dir_path)

# Get Baseline

In [4]:
general_directory()

df_features = pd.read_parquet(os.getcwd() + '\\data\\silver\\BRA_baseline.csv')
df_features['ratio_odds'] = df_features['AvgH']/df_features['AvgA']

print(df_features.shape)
df_features.head()

(4559, 22)


Unnamed: 0,Country,League,Season,Date,Time,Home,Away,HG,AG,Res,...,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,Target,is_test,ratio_odds
0,Brazil,Serie A,2012,19/05/2012,22:30,Palmeiras,Portuguesa,1.0,1.0,D,...,5.25,1.76,3.87,5.31,1.69,3.5,4.9,0,0,0.344898
1,Brazil,Serie A,2012,19/05/2012,22:30,Sport Recife,Flamengo RJ,1.0,1.0,D,...,2.68,2.83,3.42,2.7,2.59,3.23,2.58,0,1,1.003876
2,Brazil,Serie A,2012,20/05/2012,01:00,Figueirense,Nautico,2.0,1.0,H,...,6.72,1.67,4.05,7.22,1.59,3.67,5.64,1,0,0.281915
3,Brazil,Serie A,2012,20/05/2012,20:00,Botafogo RJ,Sao Paulo,4.0,2.0,H,...,3.15,2.49,3.39,3.15,2.35,3.26,2.84,1,0,0.827465
4,Brazil,Serie A,2012,20/05/2012,20:00,Corinthians,Fluminense,0.0,1.0,A,...,4.41,1.96,3.53,4.41,1.89,3.33,3.89,0,0,0.485861


In [5]:
class RegLogOptimizator:
    def __init__(
        self,
        x,
        y,
        target,
        features,
        categorical_features=[],
        niter=10,
        metric_eval="AUC",
        metric_method="default",
        thr=0.5,
        col_safra=None,
        early_stopping_rounds=3,
        eval_n_features=False,
        filename_storage="reg_log_search",
        save_in_txt=True,
        verbose=True,
    ):
        """
        metric_eval: 'AUC' or 'KS'
        metric_method: 'default', 'min', 'range'
        """
        self.x = x
        self.y = y
        self.features = features
        self.categorical_features = categorical_features
        self.target = target
        self.col_safra = col_safra
        self.niter = niter
        self.metric_eval = metric_eval
        self.metric_method = metric_method
        self.thr = thr
        self.early_stopping_rounds = early_stopping_rounds
        self.eval_n_features = eval_n_features
        self.verbose = verbose
        self.best_auc = 0.0
        self.score = 0.0
        self.iterations_not_improving = 0
        self.iterations = 0
        self.filename_storage = filename_storage
        self.save_in_txt = save_in_txt

    def get_optimal_params(self):
        if self.save_in_txt:
            self.create_log()
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        objective = self.generate_objective_function(
            self.x,
            self.y,
            self.target,
            self.features,
            self.categorical_features,
        )
        storage = JournalStorage(JournalFileStorage(f"{self.filename_storage}.log"))
        study = optuna.create_study(
            direction="maximize",
            study_name="Hyperparameter search",
            storage=storage,
            load_if_exists=True,
        )
        study.optimize(
            objective,
            n_trials=self.niter,
            callbacks=[self.early_stopping_fn],
            n_jobs=1,
        )
        best_trial = study.best_trial
        return best_trial, study

    def early_stopping_fn(
        self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial
    ):
        if self.iterations_not_improving >= self.early_stopping_rounds:
            study.stop()

    def update_best_params(self, score, test_metrics):
        self.iterations += 1
        if score > self.best_auc:
            self.iterations_not_improving = 0
            self.best_auc = score
            if self.verbose:
                console.print(
                    f"|Iteration {self.iterations}| New parameters found - {self.metric_eval} of {np.mean(test_metrics):.4f} ({self.best_auc:.4f})"
                )
        else:
            self.iterations_not_improving += 1

    def ks(self, y, y_pred):
        return ks_2samp(y_pred[y == 1], y_pred[y != 1]).statistic

    def predict(self, model, x):
        return model.predict_proba(x)[:, 1]

    def get_metric_min(self, df, target, metric_eval, col_safra):
        if metric_eval == "KS":
            score_min = (
                df.groupby(col_safra)
                .apply(lambda x: self.ks(x[target], x["prob"]) * 100)
                .min()
            )
        else:
            score_min = (
                df.groupby(col_safra)
                .apply(lambda x: metrics.roc_auc_score(x[target], x["prob"]))
                .min()
            )
        return score_min

    def get_metric_range(self, df, target, metric_eval, col_safra):
        if metric_eval == "KS":
            score_min = (
                df.groupby(col_safra)
                .apply(lambda x: self.ks(x[target], x["prob"]) * 100)
                .min()
            )
            score_max = (
                df.groupby(col_safra)
                .apply(lambda x: self.ks(x[target], x["prob"]) * 100)
                .max()
            )
        else:
            score_min = (
                df.groupby(col_safra)
                .apply(lambda x: metrics.roc_auc_score(x[target], x["prob"]))
                .min()
            )
            score_max = (
                df.groupby(col_safra)
                .apply(lambda x: metrics.roc_auc_score(x[target], x["prob"]))
                .max()
            )
        range_ = score_max - score_min
        return range_

    def get_metric(self, df, target, metric_eval):
        if metric_eval == "KS":
            score = self.ks(df[target], df["prob"]) * 100
        else:
            score = metrics.roc_auc_score(df[target], df["prob"])
        return score

    def decision(self, metric_train, metric_test, metric_otm, thr=5):
        return 0 if np.abs(metric_train - metric_test) > thr else metric_otm

    def generate_objective_function(self, x, y, target, features, categorical_features):
        def objective(
            trial,
            x=x,
            y=y,
            target=target,
            features=features,
            categorical_features=categorical_features,
        ):
            numerical_features = [x for x in features if x not in categorical_features]

            parameters = {
                "penalty": trial.suggest_categorical(
                    "penalty", ["l1", "l2", "elasticnet"]
                ),
                "C": trial.suggest_loguniform("C", 0.001, 10),
                "class_weight": trial.suggest_categorical(
                    "class_weight", ["balanced"]
                ),
                "max_iter": trial.suggest_int("max_iter", 100, 1000, 50),
            }

            if parameters["penalty"] == "l1":
                parameters["solver"] = trial.suggest_categorical(
                    "solver_l1", ["liblinear", "saga"]
                )
            elif parameters["penalty"] == "l2":
                parameters["solver"] = trial.suggest_categorical(
                    "solver_l2",
                    [
                        "lbfgs",
                        "liblinear",
                        "newton-cg",
                        "newton-cholesky",
                        "sag",
                        "saga",
                    ],
                )
            elif parameters["penalty"] == "elasticnet":
                parameters["solver"] = trial.suggest_categorical(
                    "solver_elasticnet", ["saga"]
                )
                parameters["l1_ratio"] = trial.suggest_float(
                    name="l1_ratio", low=0.0, high=1.0, step=0.05
                )
            else:
                parameters["solver"] = trial.suggest_categorical(
                    "solver_none",
                    [
                        "lbfgs",
                        "liblinear",
                        "newton-cg",
                        "newton-cholesky",
                        "sag",
                        "saga",
                    ],
                )


            # Transformer das categóricas
            encoder = trial.suggest_categorical(
                "encoder_categorical", ["woe", "onehot", "ordenc"]
            )

            if encoder == "woe":
                encoder_sel = WOEEncoder()
            if encoder == "onehot":
                encoder_sel = OneHotEncoder()
            if encoder == "ordenc":
                encoder_sel = OrdinalEncoder()

            cat_transformer = Pipeline(steps=[("encoder", encoder_sel)])

            # Transformer das numéricas
            imputer = trial.suggest_categorical(
                "imputer_missing",
                [
                    "simple_med",
                    "simple_cte",
                ]  
            )

            scaler = trial.suggest_categorical(
                "scaler_val",
                [
                    "standard",
                    "robust"
                ]
            )

            if imputer == "simple_med":
                imputer_sel = SimpleImputer(strategy="median")
            if imputer == "simple_cte":
                imputer_sel = SimpleImputer(strategy="constant", fill_value=-999)
            if scaler == "standard":
                scaler_sel = StandardScaler()
            if scaler == "robust":
                scaler_sel = RobustScaler()

            num_transformer = Pipeline(steps=[("imputer", imputer_sel), ("scaler", scaler_sel)])

            # Compondo os pré-processadores
            preprocessor = ColumnTransformer(
                transformers=[
                    ("num", num_transformer, numerical_features),
                    ("cat", cat_transformer, categorical_features),
                ]
            )

            # Definição do modelo
            model = LogisticRegression(**parameters)

            pipeline = Pipeline([("preprocessor", preprocessor), ("RegLog", model)])

            if self.eval_n_features:
                
                pipeline.fit(x[features], y[[target]])
                
                if encoder == "onehot":
                    features_ = [
                        x for x in features if x not in categorical_features
                    ] + list(
                        pipeline["preprocessor"]
                        .transformers_[1][1]["encoder"]
                        .get_feature_names_out()
                    )
                    coef = pipeline.steps[-1][1].coef_[0]
                    feature_importance = pd.DataFrame(
                        {"var": features_, "Importance": np.abs(coef)}
                    )
                    feature_importance = feature_importance.sort_values(
                        "Importance", ascending=False
                    ).reset_index(drop=True)
                
                else:
                    coef = pipeline.steps[-1][1].coef_[0]
                    feature_importance = pd.DataFrame(
                        {"var": features, "Importance": np.abs(coef)}
                    )
                    feature_importance = feature_importance.sort_values(
                        "Importance", ascending=False
                    ).reset_index(drop=True)
                
                features_fi = list(feature_importance["var"])
                
                for i in range(len(features_fi)):
                    for k in range(len(features)):
                        if features[k] in features_fi[i]:
                            features_fi[i] = features[k]
                
                features_fi = list(dict.fromkeys(features_fi))
                
                n_features = trial.suggest_int("n_features", 1, len(features))
                
                selected_features = features_fi[:n_features]
                
                trial.set_user_attr("selected_features", selected_features)
                
                features = [x for x in selected_features]
                
                categorical_features = [x for x in self.categorical_features if x in selected_features]
                
                preprocessor = ColumnTransformer(
                    transformers=[
                        (
                            "num",
                            num_transformer,
                            [x for x in features if x not in categorical_features],
                        ),
                        ("cat", cat_transformer, categorical_features),
                    ]
                )
                
                pipeline = Pipeline([("preprocessor", preprocessor), ("RegLog", model)])

            train_metrics = np.zeros(5)
            test_metrics = np.zeros(5)

            kf = StratifiedKFold(shuffle=True, random_state=42)

            for i, (idx_train, idx_test) in enumerate(kf.split(x, y)):
                x_train, y_train = x.iloc[idx_train], y.iloc[idx_train]
                x_test, y_test = x.iloc[idx_test], y.iloc[idx_test]

                pipeline.fit(x_train, y_train)
                y_pred_train = self.predict(pipeline, x_train)
                y_pred_test = self.predict(pipeline, x_test)
                y_train["prob"] = y_pred_train
                y_test["prob"] = y_pred_test

                if self.metric_method == "min":
                    train_metrics[i] = self.get_metric_min(
                        y_train, target, self.metric_eval, self.col_safra
                    )
                    test_metrics[i] = self.get_metric_min(
                        y_test, target, self.metric_eval, self.col_safra
                    )
                elif self.metric_method == "range":
                    train_metrics[i] = self.get_metric_range(
                        y_train, target, self.metric_eval, self.col_safra
                    )
                    test_metrics[i] = self.get_metric_range(
                        y_test, target, self.metric_eval, self.col_safra
                    )
                else:
                    train_metrics[i] = self.get_metric(
                        y_train, target, self.metric_eval
                    )
                    test_metrics[i] = self.get_metric(
                        y_test, target, self.metric_eval
                    )

            metrics = {
                "train_metrics": list(train_metrics),
                "test_metrics": list(test_metrics),
                "train_metric": np.mean(train_metrics),
                "test_metric": np.mean(test_metrics),
            }

            for key in metrics:
                trial.set_user_attr(key, metrics[key])

            metric_otm = np.mean(test_metrics) - np.std(test_metrics)
            value = self.decision(
                np.mean(train_metrics), np.mean(test_metrics), metric_otm, thr=self.thr
            )
            self.update_best_params(value, test_metrics)
            if self.save_in_txt:
                f = open(f"{self.log_file}.txt", "a")
                f.write(
                    f"{self.iterations};{parameters};{list(metrics['train_metrics'])};{list(metrics['test_metrics'])}\n"
                )
                f.close()
            return value

        return objective

    def create_log(self):
        time_ref = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
        self.log_file = f"resume_reg_log_opt_{time_ref}"
        f = open(f"{self.log_file}.txt", "w")
        f = open(f"{self.log_file}.txt", "a")
        f.write(f"iter;parameters;train_metrics;test_metrics\n")
        f.close()

In [6]:
actual_directory()

features = ['ratio_odds']
target = ['Target']

features_dictionary = {
    "ratio_odds": "Razão entre as odds do mandante e do visitante",
}

with open("1st_run_features_dictionary.json", "w") as f:
    json.dump(features_dictionary, f)

model_opt = RegLogOptimizator (

            x = df_features.loc[df_features['is_test'] == 0,features],
            y = df_features.loc[df_features['is_test'] == 0,target],
            target = 'Target',
            features = features,
            categorical_features = [],
            niter = 1000,
            metric_eval = "AUC",
            metric_method = "default",
            thr = 0.05, 
            col_safra = None,
            early_stopping_rounds = 200,
            filename_storage = "1st_run_experiment",
            save_in_txt = True

        )
    
best_trial, study = model_opt.get_optimal_params()

In [7]:
X_train, y_train = df_features.loc[df_features['is_test'] == 0, features], df_features.loc[df_features['is_test'] == 0, target]
X_test, y_test = df_features.loc[df_features['is_test'] == 1, features], df_features.loc[df_features['is_test'] == 1, target]

df_results = {

    'id': [],
    'params': [],
    'qtd_features': [],
    'features': [],
    'transformer': [],
    'auc_train_cv': [],
    'auc_test_cv': [],
    'auc_train': [],
    'auc_test': [],

}

for j, study_trial in enumerate(study.trials):

    if study_trial.state != TrialState.COMPLETE or study_trial.value == 0:
        continue  # Skip the trials that did not complete successfully

    model_params = {k: v for k, v in study_trial.params.items() if k not in ['encoder_categorical','imputer_missing','scaler_val','n_features']}
    solver_key = next((key for key in model_params if 'solver' in key), None)
    if solver_key:
        model_params['solver'] = model_params.pop(solver_key)
    model_params['random_state'] = 42

    scaler_dict = {k: v for k, v in study_trial.params.items() if k == 'scaler_val'}
    
    if scaler_dict['scaler_val'] == 'standard':
        transformer = Pipeline(steps=[("scaler", StandardScaler())])
    elif scaler_dict['scaler_val'] == 'robust':
        transformer = Pipeline(steps=[("scaler", RobustScaler())])
    
    preprocessor = ColumnTransformer(
                transformers=[
                    ("features", transformer, features),
                ]
            )
    
    clf = LogisticRegression(**model_params)
    pipeline = Pipeline([("preprocessor", preprocessor), ("RegLog", clf)])

    pipeline.fit(X_train[features], y_train)
    y_prob_train = pipeline.predict_proba(X_train[features])[:, 1]
    auc_train = metrics.roc_auc_score(y_train, y_prob_train)

    y_prob_test = pipeline.predict_proba(X_test[features])[:, 1]
    auc_test = metrics.roc_auc_score(y_test, y_prob_test)

    num_features = len(features)
    
    for i, k in enumerate(df_results.keys()): df_results[k].append([f"{j:03d}", 
                                                                    model_params, 
                                                                    num_features, 
                                                                    features, 
                                                                    transformer,
                                                                    round(study_trial.user_attrs['train_metric'] * 100, 2),
                                                                    round(study_trial.user_attrs['test_metric'] * 100, 2),
                                                                    round(auc_train * 100, 2), 
                                                                    round(auc_test * 100, 2)][i])

df_results = pd.DataFrame(df_results)

In [8]:
df_results.sort_values(by = 'auc_test', ascending = False)

Unnamed: 0,id,params,qtd_features,features,transformer,auc_train_cv,auc_test_cv,auc_train,auc_test
0,000,"{'penalty': 'l1', 'C': 1.279196107764435, 'cla...",1,[ratio_odds],(StandardScaler()),66.33,66.29,66.33,65.34
126,126,"{'penalty': 'elasticnet', 'C': 0.0088201286385...",1,[ratio_odds],(StandardScaler()),66.33,66.29,66.33,65.34
128,128,"{'penalty': 'l1', 'C': 0.006345123324989716, '...",1,[ratio_odds],(RobustScaler()),66.33,66.29,66.33,65.34
129,129,"{'penalty': 'elasticnet', 'C': 0.0043378089022...",1,[ratio_odds],(StandardScaler()),66.33,66.29,66.33,65.34
130,130,"{'penalty': 'l2', 'C': 3.4710950696276552, 'cl...",1,[ratio_odds],(RobustScaler()),66.33,66.29,66.33,65.34
...,...,...,...,...,...,...,...,...,...
71,071,"{'penalty': 'l1', 'C': 3.785112498367953, 'cla...",1,[ratio_odds],(StandardScaler()),66.33,66.29,66.33,65.34
72,072,"{'penalty': 'l1', 'C': 6.567960149863006, 'cla...",1,[ratio_odds],(StandardScaler()),66.33,66.29,66.33,65.34
115,115,"{'penalty': 'l1', 'C': 0.0013032337543173276, ...",1,[ratio_odds],(RobustScaler()),50.00,50.00,50.00,50.00
161,161,"{'penalty': 'elasticnet', 'C': 0.0010009752338...",1,[ratio_odds],(RobustScaler()),50.00,50.00,50.00,50.00


In [9]:
params = df_results.loc[df_results['id'] == '000', ['params']].values[0][0]

transformer = Pipeline(steps=[("scaler", StandardScaler())])

preprocessor = ColumnTransformer(
                transformers=[
                    ("features", transformer, features),
                ]
            )
X_train, y_train = df_features.loc[df_features['is_test'] == 0, features], df_features.loc[df_features['is_test'] == 0, target]
X_test, y_test = df_features.loc[df_features['is_test'] == 1, features], df_features.loc[df_features['is_test'] == 1, target]

clf = LogisticRegression(**params)

pipeline = Pipeline([("preprocessor", preprocessor), ("RegLog", clf)])

pipeline.fit(X_train, y_train)

y_prob_train = clf.predict_proba(X_train)[:, 1]
auc_train = metrics.roc_auc_score(y_train, y_prob_train)

y_prob_test = clf.predict_proba(X_test)[:, 1]
auc_test = metrics.roc_auc_score(y_test, y_prob_test)

In [10]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("ML_Brasileirao_A")

2024/07/31 13:46:03 INFO mlflow.tracking.fluent: Experiment with name 'ML_Brasileirao_A' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1722444363702, experiment_id='1', last_update_time=1722444363702, lifecycle_stage='active', name='ML_Brasileirao_A', tags={}>

In [11]:
with mlflow.start_run(run_name="1st-run-model-study", description=""):
    
    mlflow.log_params(model_params)

    mlflow.log_param("features", features)

    mlflow.log_metric("train_auc", auc_train)

    mlflow.log_metric("test_auc", auc_test)

    mlflow.sklearn.log_model(pipeline, "classifier")

    mlflow.log_artifact("1st_run_features_dictionary.json", artifact_path="classifier")
    
    plot_metrics(df = df_features[df_features['is_test'] == 1], clf = clf, features = features, run = 1, experiment = 'ML_Brasileirao_A')