In [None]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this may differ from the interpreter the kernel is running on - confirm if it matters.
!python -V

In [None]:
# Standard libraries
import os
import datetime
import warnings
from copy import deepcopy
from utils import plot_metrics
import json

# Data manipulation and analysis
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Machine learning and optimization
import optuna
from optuna.trial import TrialState
from optuna.storages import JournalStorage, JournalFileStorage
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    RobustScaler,
    QuantileTransformer
)
from sklearn.impute import SimpleImputer
from category_encoders import WOEEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import mlflow

# Rich console output
from rich.console import Console
from rich.progress import track

# Ignore warnings
warnings.filterwarnings("ignore")

# Initialize console
console = Console()

In [None]:
# Set general directory of the project
def general_directory():
    """
    Switch the process working directory to the parent of the current one.

    Not idempotent: every call climbs one more level in the hierarchy.
    """
    os.chdir(os.path.dirname(os.getcwd()))

# Set actual directory of the project
def actual_directory():
    """
    Switch the working directory to the 'experiments' folder that sits
    directly under the current working directory.
    """
    experiments_path = os.path.join(os.getcwd(), "experiments")
    os.chdir(experiments_path)

# Get Baseline

In [None]:
# Move the working directory one level up so the data path below resolves.
# NOTE: general_directory() is not idempotent - re-running this cell climbs one more level.
general_directory()

# Build the path with os.path.join so it resolves on any OS, not only with
# Windows-style backslashes.
# NOTE(review): the file carries a .csv extension but is read as Parquet - confirm the real format.
df = pd.read_parquet(os.path.join(os.getcwd(), 'data', 'silver', 'BRA_baseline.csv'))

print(df.shape)
df.head()

In [None]:
# Reshape the match-level frame into a club-level frame: every match yields one
# row for the home side (mando = 1) and one row for the away side (mando = 0).
df['Date'] = pd.to_datetime(df['Date'])

# Final column layout shared by the home and away views. It is applied
# positionally, so it must follow the groupby keys, then the agg order,
# then the four derived columns.
colunas_clube = [
    'clube',
    'temporada',
    'data',
    'gols_marcados',
    'gols_sofridos',
    'odd_vitoria',
    'odd_derrota',
    'mando',
    'saldo_de_gols',
    'flag_vitoria',
    'flag_favorito',
]

# Collect the per-date frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
frames_por_data = []

for i in list(df['Date'].unique()):

    jogos_da_data = df[df['Date'] == i]

    df_mandante = (jogos_da_data
                   .groupby(['Home', 'Season'])
                   .agg({'Date': 'first',
                         'HG': 'first',
                         'AG': 'first',
                         'AvgH': 'first',
                         'AvgA': 'first'})).reset_index()

    # Away view: goals and odds are listed from the away side's perspective.
    df_visitante = (jogos_da_data
                    .groupby(['Away', 'Season'])
                    .agg({'Date': 'first',
                          'AG': 'first',
                          'HG': 'first',
                          'AvgA': 'first',
                          'AvgH': 'first'})).reset_index()

    df_mandante['mando'] = 1
    df_mandante['saldo_de_gols'] = df_mandante['HG'] - df_mandante['AG']
    df_mandante['flag_vitoria'] = np.where(df_mandante['HG'] > df_mandante['AG'], 1, 0)
    # The side with the lower average odd is flagged as the favourite.
    df_mandante['flag_favorito'] = np.where(df_mandante['AvgH'] < df_mandante['AvgA'], 1, 0)

    df_visitante['mando'] = 0
    df_visitante['saldo_de_gols'] = df_visitante['AG'] - df_visitante['HG']
    df_visitante['flag_vitoria'] = np.where(df_visitante['AG'] > df_visitante['HG'], 1, 0)
    df_visitante['flag_favorito'] = np.where(df_visitante['AvgA'] < df_visitante['AvgH'], 1, 0)

    df_mandante.columns = colunas_clube
    df_visitante.columns = colunas_clube

    # Same row order as before: home rows first, then away rows, per date.
    frames_por_data.extend([df_mandante, df_visitante])

# pd.concat raises on an empty list, so fall back to an empty frame when df has no rows.
df_clubes = pd.concat(frames_por_data) if frames_por_data else pd.DataFrame()

df_clubes = df_clubes.reset_index(drop = True)

In [None]:
def trendline(valores, order=1):
    """
    Fit a polynomial of degree `order` to `valores` against the positions
    1..len(valores) and return its first-degree coefficient as a float.

    With the default order=1 that coefficient is the slope of the fitted line.
    """
    posicoes = np.arange(len(valores)) + 1
    # np.polyfit returns coefficients from highest degree down, so index -2 is the linear term.
    coef_linear = np.polyfit(posicoes, valores, order)[-2]
    return float(coef_linear)

df_estats = df_clubes.copy()

janelas = [5, 38]

# Per-window settings: the suffix used in the generated column names and the
# minimum number of previous matches required before a value is produced.
config_janelas = {
    5: {'sufixo': 'nas_ultimas_5_partidas', 'min_periods': 5},
    38: {'sufixo': 'nas_ultimas_partidas', 'min_periods': 1},
}

# (output prefix, source column, rolling aggregation).
# The order is the column creation order, which downstream feature lists inherit.
estatisticas = [
    ('sum_odd', 'odd_vitoria', 'sum'),
    ('median_odd', 'odd_vitoria', 'median'),
    ('mean_odd', 'odd_vitoria', 'mean'),
    ('std_odd', 'odd_vitoria', 'std'),
    ('max_odd', 'odd_vitoria', 'max'),
    ('min_odd', 'odd_vitoria', 'min'),
    ('prop_jogos_como_favorito', 'flag_favorito', 'mean'),
    ('prop_jogos_vitoriosos', 'flag_vitoria', 'mean'),
]


def _estatistica_movel(grupos, coluna, agregacao, janela, min_periods):
    """
    Rolling `agregacao` of `coluna` inside each group of `grupos`, computed
    over the matches *before* the current one (shift(1) excludes the current row).

    The result keeps the index of the grouped frame, so it aligns with
    df_estats on assignment.
    """
    return grupos[coluna].transform(
        lambda x: getattr(
            x.shift(1).rolling(window = janela, min_periods = min_periods),
            agregacao,
        )()
    )


# The chronological ordering and the grouping only depend on columns that exist
# from the start, so they are built once instead of once per statistic.
grupos_clube_temporada = (
    df_estats
    .sort_values(by = 'data')
    .groupby(['clube', 'temporada'])
)

for janela in janelas:

    # Windows without a configuration are skipped.
    if janela not in config_janelas:
        continue

    sufixo = config_janelas[janela]['sufixo']
    min_periods = config_janelas[janela]['min_periods']

    for prefixo, coluna, agregacao in estatisticas:
        df_estats[f'{prefixo}_{sufixo}'] = _estatistica_movel(
            grupos_clube_temporada, coluna, agregacao, janela, min_periods
        )

    # Range of the odds in the window.
    df_estats[f'delta_odd_{sufixo}'] = (
        df_estats[f'max_odd_{sufixo}'] - df_estats[f'min_odd_{sufixo}']
    )

    # Coefficient of variation of the odds in the window.
    df_estats[f'cv_odd_{sufixo}'] = (
        df_estats[f'std_odd_{sufixo}'] / df_estats[f'mean_odd_{sufixo}']
    )

In [None]:
# Bring the club-level rolling statistics back to match level (one row per
# match) and derive home-vs-away ratio / difference features.
base_cols = ['clube','temporada','data']

target = ['Target']

public = ['is_test']

# Every rolling-statistic column created above contains 'partidas' in its name.
features = [x for x in df_estats.columns if 'partidas' in x]

mandantes = (
    df_estats
    .loc[df_estats['mando'] == 1, base_cols+features]
    .rename(columns={'clube': 'Home', 'temporada': 'Season', 'data': 'Date'})
)
visitantes = (
    df_estats
    .loc[df_estats['mando'] == 0, base_cols+features]
    .rename(columns={'clube': 'Away', 'temporada': 'Season', 'data': 'Date'})
)

relational_mandante = ['Season', 'Date', 'Home']
relational_visitante = ['Season', 'Date', 'Away']

# NOTE: this name shadows the built-in filter(); it is kept unchanged in case
# other cells reference it.
filter = ['Season', 'Date', 'Home', 'Away', 'Res']

renamed_estat_features_mandante = [coluna + '_mandante' for coluna in features]
renamed_estat_features_visitante = [coluna + '_visitante' for coluna in features]

merge_origin_mandante = (
    pd.merge(df[filter + target + public], mandantes, on = relational_mandante, how = 'left')
    .rename(columns=dict(zip(features, renamed_estat_features_mandante)))
)

merge_origin_visitante = (
    pd.merge(df[filter + target + public], visitantes, on = relational_visitante, how = 'left')
    .rename(columns=dict(zip(features, renamed_estat_features_visitante)))
)

df_features = merge_origin_mandante.merge(merge_origin_visitante, on = filter + target + public, how = 'left')

for feature in features:
    valor_mandante = df_features[f'{feature}_mandante']
    valor_visitante = df_features[f'{feature}_visitante']

    # Proportion features ('jogos') only get a difference; the others also get a ratio.
    if 'jogos' not in feature:
        # A zero denominator yields +/-inf, which dropna() would keep and the
        # scalers downstream reject; turn it into NaN so the row is dropped.
        df_features[f'ratio_{feature}'] = (
            (valor_mandante / valor_visitante).replace([np.inf, -np.inf], np.nan)
        )

    df_features[f'diff_{feature}'] = valor_mandante - valor_visitante

# Keep only the derived features and discard matches without a complete history.
df_features = (
    df_features
    .drop(columns = renamed_estat_features_mandante + renamed_estat_features_visitante)
    .dropna()
)

pd.set_option('display.max_columns', None)

print(df_features.shape)
df_features.head()

In [None]:
class RegLogOptimizator:
    """
    Optuna-driven hyperparameter search for a LogisticRegression pipeline.

    Each trial samples the regularisation settings, solver, categorical encoder
    and numeric scaler, optionally a number of top features to keep, and scores
    the resulting pipeline with 5-fold stratified cross-validation.
    """

    def __init__(
        self,
        x,
        y,
        target,
        features,
        categorical_features=None,
        niter=10,
        metric_eval="AUC",
        metric_method="default",
        thr=0.5,
        col_safra=None,
        early_stopping_rounds=3,
        eval_n_features=False,
        filename_storage="reg_log_search",
        save_in_txt=True,
        verbose=True,
    ):
        """
        x: DataFrame with the candidate feature columns.
        y: DataFrame holding the `target` column (and `col_safra` when
            metric_method is 'min' or 'range').
        target: name of the target column in `y`.
        features: list of feature column names to use.
        categorical_features: subset of `features` to encode as categorical
            (defaults to none).
        niter: maximum number of Optuna trials.
        metric_eval: 'AUC' or 'KS' (any value other than 'KS' is scored as AUC).
        metric_method: 'default', 'min', 'range'
        thr: maximum accepted absolute gap between mean train and mean test
            metric; trials above it are scored 0.
        col_safra: column used to group rows for the 'min' / 'range' methods.
        early_stopping_rounds: stop after this many consecutive trials without
            improvement.
        eval_n_features: when True, also tune how many top-ranked features to keep.
        filename_storage: base name of the Optuna journal file ('.log' is appended).
        save_in_txt: when True, append one line per trial to a text log.
        verbose: when True, print a message whenever a better trial is found.
        """
        self.x = x
        self.y = y
        self.features = features
        # A fresh list per instance: a mutable default argument would be shared
        # between every instance created without this parameter.
        self.categorical_features = (
            [] if categorical_features is None else list(categorical_features)
        )
        self.target = target
        self.col_safra = col_safra
        self.niter = niter
        self.metric_eval = metric_eval
        self.metric_method = metric_method
        self.thr = thr
        self.early_stopping_rounds = early_stopping_rounds
        self.eval_n_features = eval_n_features
        self.verbose = verbose
        self.best_auc = 0.0
        self.score = 0.0
        self.iterations_not_improving = 0
        self.iterations = 0
        self.filename_storage = filename_storage
        self.save_in_txt = save_in_txt

    def get_optimal_params(self):
        """
        Run the Optuna study and return `(best_trial, study)`.

        The study is persisted in a journal file named after
        `filename_storage`; an existing study with the same name is resumed.
        """
        if self.save_in_txt:
            self.create_log()
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        objective = self.generate_objective_function(
            self.x,
            self.y,
            self.target,
            self.features,
            self.categorical_features,
        )
        storage = JournalStorage(JournalFileStorage(f"{self.filename_storage}.log"))
        study = optuna.create_study(
            direction="maximize",
            study_name="Hyperparameter search",
            storage=storage,
            load_if_exists=True,
        )
        study.optimize(
            objective,
            n_trials=self.niter,
            callbacks=[self.early_stopping_fn],
            n_jobs=1,
        )
        best_trial = study.best_trial
        return best_trial, study

    def early_stopping_fn(
        self, study: "optuna.study.Study", trial: "optuna.trial.FrozenTrial"
    ):
        """Optuna callback: stop the study once the non-improving streak reaches the limit."""
        if self.iterations_not_improving >= self.early_stopping_rounds:
            study.stop()

    def update_best_params(self, score, test_metrics):
        """
        Record the outcome of a trial: bump the iteration counter and either
        store the new best score (resetting the non-improving streak) or
        extend the streak.
        """
        self.iterations += 1
        if score > self.best_auc:
            self.iterations_not_improving = 0
            self.best_auc = score
            if self.verbose:
                console.print(
                    f"|Iteration {self.iterations}| New parameters found - {self.metric_eval} of {np.mean(test_metrics):.4f} ({self.best_auc:.4f})"
                )
        else:
            self.iterations_not_improving += 1

    def ks(self, y, y_pred):
        """Two-sample KS statistic between the scores of class 1 and of every other class."""
        return ks_2samp(y_pred[y == 1], y_pred[y != 1]).statistic

    def predict(self, model, x):
        """Return the probability of the second class (column 1 of predict_proba)."""
        return model.predict_proba(x)[:, 1]

    def _metric_by_safra(self, df, target, metric_eval, col_safra):
        """Compute the chosen metric separately for each `col_safra` group of `df`."""
        grupos = df.groupby(col_safra)
        if metric_eval == "KS":
            return grupos.apply(lambda x: self.ks(x[target], x["prob"]) * 100)
        return grupos.apply(lambda x: metrics.roc_auc_score(x[target], x["prob"]))

    def get_metric_min(self, df, target, metric_eval, col_safra):
        """Worst (minimum) per-group value of the metric."""
        return self._metric_by_safra(df, target, metric_eval, col_safra).min()

    def get_metric_range(self, df, target, metric_eval, col_safra):
        """Spread (maximum minus minimum) of the per-group metric values."""
        por_safra = self._metric_by_safra(df, target, metric_eval, col_safra)
        return por_safra.max() - por_safra.min()

    def get_metric(self, df, target, metric_eval):
        """Metric over the whole frame: KS scaled by 100, or ROC AUC."""
        if metric_eval == "KS":
            score = self.ks(df[target], df["prob"]) * 100
        else:
            score = metrics.roc_auc_score(df[target], df["prob"])
        return score

    def decision(self, metric_train, metric_test, metric_otm, thr=5):
        """Return 0 when train and test metrics differ by more than `thr`, else `metric_otm`."""
        return 0 if np.abs(metric_train - metric_test) > thr else metric_otm

    def generate_objective_function(self, x, y, target, features, categorical_features):
        """Build and return the Optuna objective bound to the given data and column lists."""

        def objective(
            trial,
            x=x,
            y=y,
            target=target,
            features=features,
            categorical_features=categorical_features,
        ):
            numerical_features = [f for f in features if f not in categorical_features]

            parameters = {
                "penalty": trial.suggest_categorical(
                    "penalty", ["l1", "l2", "elasticnet"]
                ),
                # suggest_float(log=True) replaces the deprecated suggest_loguniform.
                "C": trial.suggest_float("C", 0.001, 10, log=True),
                "class_weight": trial.suggest_categorical(
                    "class_weight", ["balanced"]
                ),
                "max_iter": trial.suggest_int("max_iter", 100, 1000, step=50),
            }

            # Each penalty has its own parameter name because the set of
            # compatible solvers differs between penalties.
            if parameters["penalty"] == "l1":
                parameters["solver"] = trial.suggest_categorical(
                    "solver_l1", ["liblinear", "saga"]
                )
            elif parameters["penalty"] == "l2":
                parameters["solver"] = trial.suggest_categorical(
                    "solver_l2",
                    [
                        "lbfgs",
                        "liblinear",
                        "newton-cg",
                        "newton-cholesky",
                        "sag",
                        "saga",
                    ],
                )
            elif parameters["penalty"] == "elasticnet":
                parameters["solver"] = trial.suggest_categorical(
                    "solver_elasticnet", ["saga"]
                )
                parameters["l1_ratio"] = trial.suggest_float(
                    name="l1_ratio", low=0.0, high=1.0, step=0.05
                )
            else:
                # Not reachable with the penalty choices listed above.
                parameters["solver"] = trial.suggest_categorical(
                    "solver_none",
                    [
                        "lbfgs",
                        "liblinear",
                        "newton-cg",
                        "newton-cholesky",
                        "sag",
                        "saga",
                    ],
                )

            # Transformer for the categorical features
            encoder = trial.suggest_categorical(
                "encoder_categorical",
                [
                    "woe",
                    "onehot",
                    "ordenc"
                ]
            )

            encoder_classes = {
                "woe": WOEEncoder,
                "onehot": OneHotEncoder,
                "ordenc": OrdinalEncoder,
            }
            encoder_sel = encoder_classes[encoder]()

            cat_transformer = Pipeline(steps=[("encoder", encoder_sel)])

            # Transformer for the numerical features
            imputer = trial.suggest_categorical(
                "imputer_missing",
                [
                    "simple_med",
                ]
            )

            scaler = trial.suggest_categorical(
                "scaler_val",
                [
                    "standard",
                    "robust"
                ]
            )

            if imputer == "simple_med":
                imputer_sel = SimpleImputer(strategy="median")
            elif imputer == "simple_cte":
                # Only used if "simple_cte" is added to the choices above.
                imputer_sel = SimpleImputer(strategy="constant", fill_value=-999)

            scaler_classes = {"standard": StandardScaler, "robust": RobustScaler}
            scaler_sel = scaler_classes[scaler]()

            num_transformer = Pipeline(steps=[("imputer", imputer_sel), ("scaler", scaler_sel)])

            # Combine the preprocessors
            preprocessor = ColumnTransformer(
                transformers=[
                    ("num", num_transformer, numerical_features),
                    ("cat", cat_transformer, categorical_features),
                ]
            )

            # Model definition
            model = LogisticRegression(**parameters)

            pipeline = Pipeline([("preprocessor", preprocessor), ("RegLog", model)])

            if self.eval_n_features:

                # Rank the features by absolute coefficient of a model fitted on all rows.
                pipeline.fit(x[features], y[target])

                coef = pipeline.steps[-1][1].coef_[0]
                # NOTE(review): coefficients follow the ColumnTransformer output
                # (numerical columns first, then encoded categoricals); pairing
                # them with `features` assumes the same order and length, which
                # holds when there are no categorical features - confirm otherwise.
                feature_importance = pd.DataFrame(
                    {"var": features, "Importance": np.abs(coef)}
                )
                feature_importance = feature_importance.sort_values(
                    "Importance", ascending=False
                ).reset_index(drop=True)

                features_fi = list(feature_importance["var"])

                # Map every ranked name back to an original feature whose name it contains.
                for i in range(len(features_fi)):
                    for k in range(len(features)):
                        if features[k] in features_fi[i]:
                            features_fi[i] = features[k]

                # Drop duplicates while keeping the ranking order.
                features_fi = list(dict.fromkeys(features_fi))

                n_features = trial.suggest_int("n_features", 1, len(features))

                selected_features = features_fi[:n_features]

                trial.set_user_attr("selected_features", selected_features)

                features = list(selected_features)

                categorical_features = [
                    f for f in self.categorical_features if f in selected_features
                ]

                preprocessor = ColumnTransformer(
                    transformers=[
                        (
                            "num",
                            num_transformer,
                            [f for f in features if f not in categorical_features],
                        ),
                        ("cat", cat_transformer, categorical_features),
                    ]
                )

                pipeline = Pipeline([("preprocessor", preprocessor), ("RegLog", model)])

            n_splits = 5
            train_metrics = np.zeros(n_splits)
            test_metrics = np.zeros(n_splits)

            kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

            # Stratify and fit on the target column only, so `y` may also carry
            # `col_safra` for the 'min' / 'range' metric methods.
            for i, (idx_train, idx_test) in enumerate(kf.split(x, y[target])):
                x_train, y_train = x.iloc[idx_train], y.iloc[idx_train]
                x_test, y_test = x.iloc[idx_test], y.iloc[idx_test]

                pipeline.fit(x_train, y_train[target])

                # assign() returns new frames, leaving the slices of `y` untouched.
                scored_train = y_train.assign(prob=self.predict(pipeline, x_train))
                scored_test = y_test.assign(prob=self.predict(pipeline, x_test))

                if self.metric_method == "min":
                    train_metrics[i] = self.get_metric_min(
                        scored_train, target, self.metric_eval, self.col_safra
                    )
                    test_metrics[i] = self.get_metric_min(
                        scored_test, target, self.metric_eval, self.col_safra
                    )
                elif self.metric_method == "range":
                    train_metrics[i] = self.get_metric_range(
                        scored_train, target, self.metric_eval, self.col_safra
                    )
                    test_metrics[i] = self.get_metric_range(
                        scored_test, target, self.metric_eval, self.col_safra
                    )
                else:
                    train_metrics[i] = self.get_metric(
                        scored_train, target, self.metric_eval
                    )
                    test_metrics[i] = self.get_metric(
                        scored_test, target, self.metric_eval
                    )

            # Named trial_metrics so the sklearn `metrics` module is not shadowed here.
            trial_metrics = {
                "train_metrics": list(train_metrics),
                "test_metrics": list(test_metrics),
                "train_metric": np.mean(train_metrics),
                "test_metric": np.mean(test_metrics),
            }

            for key in trial_metrics:
                trial.set_user_attr(key, trial_metrics[key])

            # Penalise unstable trials: mean test metric minus its standard deviation.
            metric_otm = np.mean(test_metrics) - np.std(test_metrics)
            value = self.decision(
                np.mean(train_metrics), np.mean(test_metrics), metric_otm, thr=self.thr
            )
            self.update_best_params(value, test_metrics)
            if self.save_in_txt:
                # Context manager closes the file even if the write fails.
                with open(f"{self.log_file}.txt", "a") as f:
                    f.write(
                        f"{self.iterations};{parameters};{list(trial_metrics['train_metrics'])};{list(trial_metrics['test_metrics'])}\n"
                    )
            return value

        return objective

    def create_log(self):
        """Create a timestamped text log in the working directory and write its header line."""
        time_ref = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
        self.log_file = f"resume_reg_log_opt_{time_ref}"
        # A single handle, closed by the context manager.
        with open(f"{self.log_file}.txt", "w") as f:
            f.write("iter;parameters;train_metrics;test_metrics\n")

In [None]:
# Step into the 'experiments' folder so the Optuna journal and the text log
# are written there.
actual_directory()

# Every column that is not an identifier, the result, the target or the
# train/test flag is a model feature.
non_feature_cols = ['Season','Date','Home','Away', 'Res', 'Target','is_test']
features = df_features.drop(columns = non_feature_cols).columns.tolist()
target = ['Target']

# Rows flagged with is_test == 0 form the training set used by the search.
train_rows = df_features['is_test'] == 0

model_opt = RegLogOptimizator(
    x = df_features.loc[train_rows, features],
    y = df_features.loc[train_rows, target],
    target = 'Target',
    features = features,
    categorical_features = [],
    niter = 1000,
    metric_eval = "AUC",
    metric_method = "default",
    thr = 0.05,
    col_safra = None,
    early_stopping_rounds = 200,
    eval_n_features = True,
    filename_storage = "1st_run_experiment_1",
    save_in_txt = True,
)

best_trial, study = model_opt.get_optimal_params()

In [None]:
# Refit every successful trial on the full training set and score it on the
# hold-out rows (is_test == 1), next to its cross-validation metrics.
X_train, y_train = df_features.loc[df_features['is_test'] == 0, features], df_features.loc[df_features['is_test'] == 0, target]
X_test, y_test = df_features.loc[df_features['is_test'] == 1, features], df_features.loc[df_features['is_test'] == 1, target]

# Column order of the results table.
result_columns = [
    'id',
    'params',
    'qtd_features',
    'features',
    'auc_train_cv',
    'auc_test_cv',
    'auc_train',
    'auc_test',
]

# Trial parameters that configure preprocessing / feature selection and must
# not be forwarded to LogisticRegression.
non_model_keys = ['encoder_categorical','imputer_missing','scaler_val','n_features']

scaler_classes = {'standard': StandardScaler, 'robust': RobustScaler}

# One dict per evaluated trial; the frame is built once at the end.
records = []

for j, study_trial in enumerate(study.trials):

    if study_trial.state != TrialState.COMPLETE or study_trial.value == 0:
        continue  # Skip trials that did not complete or were scored 0

    model_params = {k: v for k, v in study_trial.params.items() if k not in non_model_keys}
    # The solver is stored under a penalty-specific name (solver_l1, solver_l2, ...).
    solver_key = next((key for key in model_params if 'solver' in key), None)
    if solver_key:
        model_params['solver'] = model_params.pop(solver_key)

    num_features = study_trial.params['n_features']
    model_features = study_trial.user_attrs['selected_features']

    # Dict lookup fails loudly on an unknown scaler name instead of silently
    # reusing the transformer left over from a previous iteration.
    scaler_cls = scaler_classes[study_trial.params['scaler_val']]
    transformer = Pipeline(steps=[("scaler", scaler_cls())])

    preprocessor = ColumnTransformer(
                transformers=[
                    ("features", transformer, model_features),
                ]
            )

    clf = LogisticRegression(**model_params)
    pipeline = Pipeline([("preprocessor", preprocessor), ("RegLog", clf)])

    pipeline.fit(X_train[model_features], y_train)
    y_prob_train = pipeline.predict_proba(X_train[model_features])[:, 1]
    auc_train = metrics.roc_auc_score(y_train, y_prob_train)

    y_prob_test = pipeline.predict_proba(X_test[model_features])[:, 1]
    auc_test = metrics.roc_auc_score(y_test, y_prob_test)

    records.append({
        'id': f"{j:03d}",
        'params': model_params,
        'qtd_features': num_features,
        'features': model_features,
        'auc_train_cv': round(study_trial.user_attrs['train_metric'] * 100, 2),
        'auc_test_cv': round(study_trial.user_attrs['test_metric'] * 100, 2),
        'auc_train': round(auc_train * 100, 2),
        'auc_test': round(auc_test * 100, 2),
    })

# Passing `columns` keeps the expected layout even when no trial qualified.
df_results = pd.DataFrame(records, columns = result_columns)

In [None]:
# Show the five trials with the highest hold-out AUC.
df_results.sort_values(by = 'auc_test', ascending = False).head(5)

In [None]:
# List the features selected by the trial whose id is '555'.
# NOTE(review): the id is hard-coded, presumably picked from a previous run - confirm it still exists.
df_results.loc[df_results['id'] == '555', 'features'].to_list()

In [None]:
# Pairwise correlation matrix of the training features.
X_train.corr()

In [None]:
# Hyperparameters
params = deepcopy(best_trial.params)
params['random_state'] = 42

params

In [None]:
# LogisticRegression arguments taken from the best trial. The solver is stored
# under a penalty-specific key (solver_l1, solver_l2, solver_elasticnet).
model_params = {
    'penalty': params['penalty'],
    'C': params['C'],
    'class_weight': params['class_weight'],
    'max_iter': params['max_iter'],
    'solver': params[f'solver_{params["penalty"]}'],
    'random_state': params['random_state'],
}

# The elasticnet penalty needs its l1_ratio; it is only present in the trial
# parameters when that penalty was sampled.
if 'l1_ratio' in params:
    model_params['l1_ratio'] = params['l1_ratio']

# Use the scaler chosen by the best trial instead of a hard-coded one
# (falls back to StandardScaler when the trial did not record a choice).
scaler_classes = {'standard': StandardScaler, 'robust': RobustScaler}
scaler = scaler_classes[params.get('scaler_val', 'standard')]()

transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", scaler)])

sel_features = best_trial.user_attrs['selected_features']

# preprocessor
preprocessor = ColumnTransformer(
                transformers=[
                    ("features", transformer, sel_features),
                ]
            )

In [None]:
# Fit and Prediction
# NOTE: X_train / X_test are redefined here with the selected features only.
X_train, y_train = df_features.loc[df_features['is_test'] == 0, sel_features], df_features.loc[df_features['is_test'] == 0, target]
X_test, y_test = df_features.loc[df_features['is_test'] == 1, sel_features], df_features.loc[df_features['is_test'] == 1, target]

clf = LogisticRegression(**model_params)

pipeline = Pipeline([("preprocessor", preprocessor), ("RegLog", clf)])

pipeline.fit(X_train, y_train)

# Score through the pipeline so the inputs go through the same imputing and
# scaling the classifier was trained on; calling clf.predict_proba directly
# would feed it raw, unscaled features.
y_prob_train = pipeline.predict_proba(X_train)[:, 1]
auc_train = metrics.roc_auc_score(y_train, y_prob_train)

y_prob_test = pipeline.predict_proba(X_test)[:, 1]
auc_test = metrics.roc_auc_score(y_test, y_prob_test)

print("Auc de treino:", round(auc_train * 100, 2))
print("Auc de teste: ", round(auc_test * 100, 2))

In [None]:
# Coefficients of the fitted logistic regression step (first row of coef_)
coefficients = pipeline.named_steps['RegLog'].coef_[0]

# Rank the selected features by the absolute value of their coefficient
importance_df = (
    pd.DataFrame({
        'var': sel_features,
        'importance': np.abs(coefficients)
    })
    .sort_values(by = 'importance', ascending = False)
)

importance_df

In [None]:
# Point MLflow at the tracking server on localhost port 5000 and select the
# experiment that the run below is logged under.
# NOTE(review): assumes a tracking server is already listening on that address - confirm before running.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("model_experiment")

In [None]:
# Log the final model, its hyperparameters, inputs and metrics as one MLflow run.
with mlflow.start_run(run_name="1st-run-experiment-model-study", description="Model to predict home win in Brazilian Soccer Championship of first division using the informations of the last 5 matches"):

    mlflow.log_params(model_params)

    # Log the features the fitted pipeline actually uses, not the full
    # candidate list the search started from.
    mlflow.log_param("features", sel_features)

    mlflow.log_metric("train_auc", auc_train)

    mlflow.log_metric("test_auc", auc_test)

    mlflow.sklearn.log_model(pipeline, "classifier")

    # NOTE(review): plot_metrics receives the bare classifier and the full
    # candidate feature list; if it scores `df[features]` with `clf`, the inputs
    # would be unscaled and may not match the selected features - verify
    # against utils.plot_metrics.
    plot_metrics(df = df_features[df_features['is_test'] == 1], clf = clf, features = features, run = 1, experiment = 'model_experiment', mlflow_path="classifier")