In [1]:
import os
import pandas as pd
import function_library as fl
from datetime import datetime


In [5]:
import os
import glob
import pandas as pd

def sort_files_by_precision(directory: str, inplace: bool = True) -> None:
    """
    Reads all CSV files in `directory`, sorts each by the 'Precision_Test' column
    in descending order, and writes the sorted DataFrame back.

    Args:
        directory (str): Path to the folder containing CSVs.
        inplace (bool):
            - If True, overwrite each original file.
            - If False, write to new files prefixed with 'sorted_'.
    """
    pattern = os.path.join(directory, "*.csv")
    for filepath in glob.glob(pattern):
        df = pd.read_csv(filepath)
        if 'Precision_Test' not in df.columns:
            print(f"Skipping {os.path.basename(filepath)}: no 'Precision_Test' column")
            continue

        # sort so highest Precision_Test is at the top
        df_sorted = df.sort_values(by='Precision_Test', ascending=False)

        if inplace:
            df_sorted.to_csv(filepath, index=False)
            print(f"✔ Sorted (in‐place): {os.path.basename(filepath)}")
        else:
            dirname, filename = os.path.split(filepath)
            new_filename = f"sorted_{filename}"
            df_sorted.to_csv(os.path.join(dirname, new_filename), index=False)
            print(f"✔ Written sorted file: {new_filename}")

directory = r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\2H_goal\best_models_by_ht_scoreline"
sort_files_by_precision(directory, inplace=True)


✔ Sorted (in‐place): model_metrics_('0-0',)_20250413_143631.csv
✔ Sorted (in‐place): model_metrics_('0-1',)_20250414_235442.csv
✔ Sorted (in‐place): model_metrics_('0-2',)_20250416_123947.csv
✔ Sorted (in‐place): model_metrics_('1-0',)_20250414_114908.csv
✔ Sorted (in‐place): model_metrics_('1-1',)_20250415_075613.csv
✔ Sorted (in‐place): model_metrics_('1-2',)_20250415_113409.csv
✔ Sorted (in‐place): model_metrics_('2-0',)_20250415_212648.csv
✔ Sorted (in‐place): model_metrics_('2-1',)_20250416_190132.csv
✔ Sorted (in‐place): model_metrics_('3-0',)_20250415_142122.csv


In [21]:
# Directory containing the CSV files
directory = r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\2H_goal\best_models_by_ht_scoreline"

# List to collect each top row's data
top_rows = []

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.startswith("model_metrics_(") and filename.endswith(".csv"):
        ht_score = filename.split("_")[2]  # Extract league name from filename
        file_path = os.path.join(directory, filename)

        try:
            df = pd.read_csv(file_path)
            top_row = df.iloc[0]  # Get only the first row
            top_rows.append({
                'HT_Score': ht_score,
                'Model': top_row['Model'],
                'SMOTE': top_row.get('SMOTE'),
                'Precision_Test': top_row.get('Precision_Test'),
                'Precision_Test/Train_Ratio': top_row.get('Precision_Test/Train_Ratio'),
                'Probability_Threshold': top_row.get('Probability_Threshold'),
                'Params': top_row.get('Params')
            })
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Combine all results into one DataFrame
results_df = pd.DataFrame(top_rows)

# Save or display results
#results_df.to_csv("top_row_model_params.csv", index=False)



In [22]:
results_df

Unnamed: 0,HT_Score,Model,SMOTE,Precision_Test,Precision_Test/Train_Ratio,Probability_Threshold,Params
0,"('0-0',)",XGBoost,,0.8641,0.935,0.8,"{'classifier__colsample_bytree': 0.7, 'classif..."
1,"('0-1',)",XGBoost,0.62,0.8711,0.9283,0.81,"{'classifier__colsample_bytree': 0.8, 'classif..."
2,"('0-2',)",MLP,,0.8475,0.9745,0.81,"{'classifier__activation': 'relu', 'classifier..."
3,"('1-0',)",XGBoost,0.63,0.8791,0.9422,0.81,"{'classifier__colsample_bytree': 0.8, 'classif..."
4,"('1-1',)",XGBoost,,0.9028,0.9166,0.81,"{'classifier__colsample_bytree': 0.7, 'classif..."
5,"('1-2',)",XGBoost,0.9,0.8564,0.9653,0.6,"{'classifier__colsample_bytree': 0.7, 'classif..."
6,"('2-0',)",XGBoost,0.41,0.9268,0.9914,0.76,"{'classifier__colsample_bytree': 0.8, 'classif..."
7,"('2-1',)",XGBoost,0.47,0.9028,0.9053,0.8,"{'classifier__colsample_bytree': 0.7, 'classif..."
8,"('3-0',)",XGBoost,0.73,0.87,0.9321,0.65,"{'classifier__colsample_bytree': 0.8, 'classif..."


In [23]:
features = [
    # 'Unnamed: 0',
    # 'country',
    # 'season',
    # 'date',
    # 'ko_time',
    'round',
    # 'home_team',
    # 'away_team',
    # 'home_goals_ft',
    # 'away_goals_ft',
    # 'home_goals_ht',
    # 'away_goals_ht',
    'home_team_place_total',
    'home_team_place_home',
    'away_team_place_total',
    'away_team_place_away',
    'home_odds',
    'draw_odds',
    'away_odds',
    'over_25_odds',
    'under_25_odds',
    'elo_home',
    'elo_away',
    'form_home',
    'form_away',
    # 'shots_home',
    # 'shots_home_1h',
    # 'shots_home_2h',
    # 'shots_away',
    # 'shots_away_1h',
    # 'shots_away_2h',
    # 'shots_on_target_home',
    # 'shots_on_target_home_1h',
    # 'shots_on_target_home_2h',
    # 'shots_on_target_away',
    # 'shots_on_target_away_1h',
    # 'shots_on_target_away_2h',
    # 'corners_home',
    # 'corners_home_1h',
    # 'corners_home_2h',
    # 'corners_away',
    # 'corners_away_1h',
    # 'corners_away_2h',
    # 'fouls_home',
    # 'fouls_home_1h',
    # 'fouls_home_2h',
    # 'fouls_away',
    # 'fouls_away_1h',
    # 'fouls_away_2h',
    # 'yellow_cards_home',
    # 'yellow_cards_home_1h',
    # 'yellow_cards_home_2h',
    # 'yellow_cards_away',
    # 'yellow_cards_away_1h',
    # 'yellow_cards_away_2h',
    # 'possession_home',
    # 'possession_home_1h',
    # 'possession_home_2h',
    # 'possession_away',
    # 'possession_away_1h',
    # 'possession_away_2h',
    # 'goals_scored_total_home',
    # 'goals_conceded_total_home',
    # 'goals_scored_total_away',
    # 'goals_conceded_total_away',
    # 'points_home',
    # 'points_away',
    # 'is_home_x',
    'home_Overall_Rolling_GoalsScored_Mean',
    'home_Overall_Rolling_GoalsScored_Std',
    'home_Overall_Rolling_GoalsScored_Mean_Short',
    'home_Overall_Momentum_GoalsScored',
    'home_Overall_Trend_Slope_GoalsScored',
    'home_Overall_Rolling_FirstHalfGoalsScored_Mean',
    'home_Overall_Rolling_FirstHalfGoalsScored_Std',
    'home_Overall_Rolling_FirstHalfGoalsScored_Mean_Short',
    'home_Overall_Momentum_FirstHalfGoalsScored',
    'home_Overall_Trend_Slope_FirstHalfGoalsScored',
    'home_Overall_Rolling_Shots_Mean',
    'home_Overall_Rolling_Shots_Std',
    'home_Overall_Rolling_Shots_Mean_Short',
    'home_Overall_Momentum_Shots',
    'home_Overall_Trend_Slope_Shots',
    'home_Overall_Rolling_Shots_1h_Mean',
    'home_Overall_Rolling_Shots_1h_Std',
    'home_Overall_Rolling_Shots_1h_Mean_Short',
    'home_Overall_Momentum_Shots_1h',
    'home_Overall_Trend_Slope_Shots_1h',
    'home_Overall_Rolling_Corners_Mean',
    'home_Overall_Rolling_Corners_Std',
    'home_Overall_Rolling_Corners_Mean_Short',
    'home_Overall_Momentum_Corners',
    'home_Overall_Trend_Slope_Corners',
    'home_Overall_Rolling_Corners_1h_Mean',
    'home_Overall_Rolling_Corners_1h_Std',
    'home_Overall_Rolling_Corners_1h_Mean_Short',
    'home_Overall_Momentum_Corners_1h',
    'home_Overall_Trend_Slope_Corners_1h',
    'home_Overall_Rolling_ShotsOnTarget_Mean',
    'home_Overall_Rolling_ShotsOnTarget_Std',
    'home_Overall_Rolling_ShotsOnTarget_Mean_Short',
    'home_Overall_Momentum_ShotsOnTarget',
    'home_Overall_Trend_Slope_ShotsOnTarget',
    'home_Overall_Rolling_ShotsOnTarget_1h_Mean',
    'home_Overall_Rolling_ShotsOnTarget_1h_Std',
    'home_Overall_Rolling_ShotsOnTarget_1h_Mean_Short',
    'home_Overall_Momentum_ShotsOnTarget_1h',
    'home_Overall_Trend_Slope_ShotsOnTarget_1h',
    'home_Rolling_GoalsScored_Mean',
    'home_Rolling_GoalsScored_Std',
    'home_Rolling_GoalsScored_Mean_Short',
    'home_Momentum_GoalsScored',
    'home_Trend_Slope_GoalsScored',
    'home_Rolling_FirstHalfGoalsScored_Mean',
    'home_Rolling_FirstHalfGoalsScored_Std',
    'home_Rolling_FirstHalfGoalsScored_Mean_Short',
    'home_Momentum_FirstHalfGoalsScored',
    'home_Trend_Slope_FirstHalfGoalsScored',
    'home_Rolling_Shots_Mean',
    'home_Rolling_Shots_Std',
    'home_Rolling_Shots_Mean_Short',
    'home_Momentum_Shots',
    'home_Trend_Slope_Shots',
    'home_Rolling_Shots_1h_Mean',
    'home_Rolling_Shots_1h_Std',
    'home_Rolling_Shots_1h_Mean_Short',
    'home_Momentum_Shots_1h',
    'home_Trend_Slope_Shots_1h',
    'home_Rolling_Corners_Mean',
    'home_Rolling_Corners_Std',
    'home_Rolling_Corners_Mean_Short',
    'home_Momentum_Corners',
    'home_Trend_Slope_Corners',
    'home_Rolling_Corners_1h_Mean',
    'home_Rolling_Corners_1h_Std',
    'home_Rolling_Corners_1h_Mean_Short',
    'home_Momentum_Corners_1h',
    'home_Trend_Slope_Corners_1h',
    'home_Rolling_ShotsOnTarget_Mean',
    'home_Rolling_ShotsOnTarget_Std',
    'home_Rolling_ShotsOnTarget_Mean_Short',
    'home_Momentum_ShotsOnTarget',
    'home_Trend_Slope_ShotsOnTarget',
    'home_Rolling_ShotsOnTarget_1h_Mean',
    'home_Rolling_ShotsOnTarget_1h_Std',
    'home_Rolling_ShotsOnTarget_1h_Mean_Short',
    'home_Momentum_ShotsOnTarget_1h',
    'home_Trend_Slope_ShotsOnTarget_1h',
    'home_Overall_Percent_Over_1.5',
    'home_Overall_Rolling5_Percent_Over_1.5',
    'home_Percent_Over_1.5',
    'home_Rolling5_Percent_Over_1.5',
    'home_Overall_Percent_Over_2.5',
    'home_Overall_Rolling5_Percent_Over_2.5',
    'home_Percent_Over_2.5',
    'home_Rolling5_Percent_Over_2.5',
    'home_Overall_Percent_Over_3.5',
    'home_Overall_Rolling5_Percent_Over_3.5',
    'home_Percent_Over_3.5',
    'home_Rolling5_Percent_Over_3.5',
    'home_TeamPct_Over_0.5',
    'home_TeamPct_Over_1.5',
    'home_TeamPct_Over_2.5',
    'home_TeamPct_Over_3.5',
    'home_CornersPct_Over_3.5',
    'home_CornersRolling5Pct_Over_3.5',
    'home_CornersPct_Over_4.5',
    'home_CornersRolling5Pct_Over_4.5',
    'home_CornersPct_Over_5.5',
    'home_CornersRolling5Pct_Over_5.5',
    'home_CornersPct_Over_6.5',
    'home_CornersRolling5Pct_Over_6.5',
    'home_SeasonPct_Over_9.5',
    'home_Rolling5Pct_Over_9.5',
    'home_SeasonPct_Over_10.5',
    'home_Rolling5Pct_Over_10.5',
    'home_SeasonPct_Over_11.5',
    'home_Rolling5Pct_Over_11.5',
    # 'is_home_y',
    'away_Overall_Rolling_GoalsScored_Mean',
    'away_Overall_Rolling_GoalsScored_Std',
    'away_Overall_Rolling_GoalsScored_Mean_Short',
    'away_Overall_Momentum_GoalsScored',
    'away_Overall_Trend_Slope_GoalsScored',
    'away_Overall_Rolling_FirstHalfGoalsScored_Mean',
    'away_Overall_Rolling_FirstHalfGoalsScored_Std',
    'away_Overall_Rolling_FirstHalfGoalsScored_Mean_Short',
    'away_Overall_Momentum_FirstHalfGoalsScored',
    'away_Overall_Trend_Slope_FirstHalfGoalsScored',
    'away_Overall_Rolling_Shots_Mean',
    'away_Overall_Rolling_Shots_Std',
    'away_Overall_Rolling_Shots_Mean_Short',
    'away_Overall_Momentum_Shots',
    'away_Overall_Trend_Slope_Shots',
    'away_Overall_Rolling_Shots_1h_Mean',
    'away_Overall_Rolling_Shots_1h_Std',
    'away_Overall_Rolling_Shots_1h_Mean_Short',
    'away_Overall_Momentum_Shots_1h',
    'away_Overall_Trend_Slope_Shots_1h',
    'away_Overall_Rolling_Corners_Mean',
    'away_Overall_Rolling_Corners_Std',
    'away_Overall_Rolling_Corners_Mean_Short',
    'away_Overall_Momentum_Corners',
    'away_Overall_Trend_Slope_Corners',
    'away_Overall_Rolling_Corners_1h_Mean',
    'away_Overall_Rolling_Corners_1h_Std',
    'away_Overall_Rolling_Corners_1h_Mean_Short',
    'away_Overall_Momentum_Corners_1h',
    'away_Overall_Trend_Slope_Corners_1h',
    'away_Overall_Rolling_ShotsOnTarget_Mean',
    'away_Overall_Rolling_ShotsOnTarget_Std',
    'away_Overall_Rolling_ShotsOnTarget_Mean_Short',
    'away_Overall_Momentum_ShotsOnTarget',
    'away_Overall_Trend_Slope_ShotsOnTarget',
    'away_Overall_Rolling_ShotsOnTarget_1h_Mean',
    'away_Overall_Rolling_ShotsOnTarget_1h_Std',
    'away_Overall_Rolling_ShotsOnTarget_1h_Mean_Short',
    'away_Overall_Momentum_ShotsOnTarget_1h',
    'away_Overall_Trend_Slope_ShotsOnTarget_1h',
    'away_Rolling_GoalsScored_Mean',
    'away_Rolling_GoalsScored_Std',
    'away_Rolling_GoalsScored_Mean_Short',
    'away_Momentum_GoalsScored',
    'away_Trend_Slope_GoalsScored',
    'away_Rolling_FirstHalfGoalsScored_Mean',
    'away_Rolling_FirstHalfGoalsScored_Std',
    'away_Rolling_FirstHalfGoalsScored_Mean_Short',
    'away_Momentum_FirstHalfGoalsScored',
    'away_Trend_Slope_FirstHalfGoalsScored',
    'away_Rolling_Shots_Mean',
    'away_Rolling_Shots_Std',
    'away_Rolling_Shots_Mean_Short',
    'away_Momentum_Shots',
    'away_Trend_Slope_Shots',
    'away_Rolling_Shots_1h_Mean',
    'away_Rolling_Shots_1h_Std',
    'away_Rolling_Shots_1h_Mean_Short',
    'away_Momentum_Shots_1h',
    'away_Trend_Slope_Shots_1h',
    'away_Rolling_Corners_Mean',
    'away_Rolling_Corners_Std',
    'away_Rolling_Corners_Mean_Short',
    'away_Momentum_Corners',
    'away_Trend_Slope_Corners',
    'away_Rolling_Corners_1h_Mean',
    'away_Rolling_Corners_1h_Std',
    'away_Rolling_Corners_1h_Mean_Short',
    'away_Momentum_Corners_1h',
    'away_Trend_Slope_Corners_1h',
    'away_Rolling_ShotsOnTarget_Mean',
    'away_Rolling_ShotsOnTarget_Std',
    'away_Rolling_ShotsOnTarget_Mean_Short',
    'away_Momentum_ShotsOnTarget',
    'away_Trend_Slope_ShotsOnTarget',
    'away_Rolling_ShotsOnTarget_1h_Mean',
    'away_Rolling_ShotsOnTarget_1h_Std',
    'away_Rolling_ShotsOnTarget_1h_Mean_Short',
    'away_Momentum_ShotsOnTarget_1h',
    'away_Trend_Slope_ShotsOnTarget_1h',
    'away_Overall_Percent_Over_1.5',
    'away_Overall_Rolling5_Percent_Over_1.5',
    'away_Percent_Over_1.5',
    'away_Rolling5_Percent_Over_1.5',
    'away_Overall_Percent_Over_2.5',
    'away_Overall_Rolling5_Percent_Over_2.5',
    'away_Percent_Over_2.5',
    'away_Rolling5_Percent_Over_2.5',
    'away_Overall_Percent_Over_3.5',
    'away_Overall_Rolling5_Percent_Over_3.5',
    'away_Percent_Over_3.5',
    'away_Rolling5_Percent_Over_3.5',
    'away_TeamPct_Over_0.5',
    'away_TeamPct_Over_1.5',
    'away_TeamPct_Over_2.5',
    'away_TeamPct_Over_3.5',
    'away_CornersPct_Over_3.5',
    'away_CornersRolling5Pct_Over_3.5',
    'away_CornersPct_Over_4.5',
    'away_CornersRolling5Pct_Over_4.5',
    'away_CornersPct_Over_5.5',
    'away_CornersRolling5Pct_Over_5.5',
    'away_CornersPct_Over_6.5',
    'away_CornersRolling5Pct_Over_6.5',
    'away_SeasonPct_Over_9.5',
    'away_Rolling5Pct_Over_9.5',
    'away_SeasonPct_Over_10.5',
    'away_Rolling5Pct_Over_10.5',
    'away_SeasonPct_Over_11.5',
    'away_Rolling5Pct_Over_11.5'
]

In [24]:
def pre_prepared_data(file_path):
    data = pd.read_csv(file_path,
                       low_memory=False)
    # Convert 'date' column to datetime object
    data['date'] = pd.to_datetime(data['date'], format="%Y-%m-%d", errors='coerce')
    data = data.sort_values(by='date')

    # Convert today's date to a pandas Timestamp for compatibility.
    today = pd.Timestamp(datetime.today().date())
    data = data[data['date'] <= today]

    # Clean up and finalise the match-level DataFrame
    data.dropna(inplace=True)
    data['ht_score'] = data['home_goals_ht'].astype(str) + '-' + data['away_goals_ht'].astype(str)
    data['total_goals'] = data['home_goals_ft'] + data['away_goals_ft']
    data['target'] = ((data['home_goals_ft'] > data['home_goals_ht']) | (data['away_goals_ft'] > data['away_goals_ht'])).astype(int)
    return data
matches = pre_prepared_data(r"/engineered_master_data_ALL_2017+.csv")

# Process each league separately
ht_score = matches[['ht_score']].drop_duplicates().apply(tuple, axis=1)

In [25]:
matches_filtered = matches[matches['ht_score']=='0-0']
data =matches_filtered.copy()
matches_filtered

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5,ht_score,total_goals,target
0,Pol1,18,2017-08-04,1930.0,4,Wisla Plock,Wisla Krakow,0,1,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0-0,1,1
1,Den1,18,2017-08-06,1500.0,4,Odense,Aalborg,0,0,0,...,0.5,0.666667,0.666667,0.333333,0.333333,0.333333,0.333333,0-0,0,0
4,Rom1,18,2017-08-08,1645.0,5,Constanta,Voluntari,0,0,0,...,0.0,0.250000,0.250000,0.000000,0.000000,0.000000,0.000000,0-0,0,0
8,Rom1,18,2017-08-11,1900.0,6,Timisoara,Constanta,0,0,0,...,0.0,0.600000,0.600000,0.400000,0.400000,0.400000,0.400000,0-0,0,0
9,Slo1,18,2017-08-11,1920.0,5,Maribor,Domzale,0,0,0,...,0.5,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0-0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96429,Ned2,25,2025-04-01,1900.0,26,Jong Utrecht,Venlo,0,0,0,...,0.4,0.645161,0.600000,0.516129,0.400000,0.354839,0.200000,0-0,0,0
96430,Ned1,25,2025-04-02,1900.0,25,Feyenoord,Groningen,0,0,0,...,0.2,0.384615,0.600000,0.230769,0.200000,0.192308,0.200000,0-0,0,0
96431,Ned2,25,2025-04-07,1900.0,26,Jong Alkmaar,Jong Eindhoven,0,0,0,...,0.2,0.718750,0.400000,0.468750,0.400000,0.375000,0.400000,0-0,0,0
96432,Eng4,25,2025-04-08,1945.0,25,Chesterfield,Gillingham,0,0,0,...,0.2,0.225000,0.000000,0.125000,0.000000,0.100000,0.000000,0-0,0,0


In [26]:
import ast
import numpy as np
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, precision_score, recall_score,
                             f1_score, roc_auc_score, matthews_corrcoef, accuracy_score)

# def replicate_run_from_csv_row(data, features, row):
#     """
#     Replicates a model run from a single row in a metrics CSV for one of three supported models:
#       - MLPClassifier (sklearn)
#       - XGBClassifier (xgboost)
#       - RandomForestClassifier (sklearn)
#
#     The CSV row is assumed to have columns 'SMOTE', 'Probability_Threshold', 'Params', and 'Model'.
#     The function applies SMOTE only to the training data, builds a pipeline with a scaler and the classifier,
#     fits the model, applies a custom probability threshold for classification, and then computes evaluation metrics.
#     """
#     # Extract parameters from the CSV row.
#     smote_level = row['SMOTE']
#     threshold = row['Probability_Threshold']
#     param_dict = ast.literal_eval(row['Params']) if isinstance(row['Params'], str) else row['Params']
#     model_name = row['Model']
#
#     # Prepare data: separate features and target, then perform time-series split.
#     X = data[features]
#     y = data['target']
#     train_size = int(len(data) * 0.8)
#     X_train = X.iloc[:train_size]
#     X_test = X.iloc[train_size:]
#     y_train = y.iloc[:train_size]
#     y_test = y.iloc[train_size:]
#
#     # Apply SMOTE to the training set.
#     smote = SMOTE(sampling_strategy=float(smote_level), random_state=42)
#     X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
#
#     # Build a classifier based on the model name.
#     if model_name == "MLP":
#         from sklearn.neural_network import MLPClassifier
#         classifier = MLPClassifier(random_state=42, max_iter=500)
#     elif model_name == "XGBoost":
#         from xgboost import XGBClassifier
#         classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
#     elif model_name in ["Random Forest", "RandomForest", "Radnom forrest"]:
#         from sklearn.ensemble import RandomForestClassifier
#         classifier = RandomForestClassifier(random_state=42)
#     else:
#         raise ValueError(f"Unsupported model: {model_name}")
#
#     # Build the pipeline with a scaler and the appropriate classifier.
#     pipeline = ImbPipeline([
#         ('scaler', StandardScaler()),
#         ('classifier', classifier)
#     ])
#
#     # Set the classifier parameters from the CSV row.
#     pipeline.set_params(**param_dict)
#
#     # Fit the pipeline on the resampled training data.
#     pipeline.fit(X_train_res, y_train_res)
#
#     # Predict probabilities on the test set and apply the custom threshold.
#     y_proba = pipeline.predict_proba(X_test)[:, 1]
#     y_pred = (y_proba >= float(threshold)).astype(int)
#
#     # Compute evaluation metrics.
#     metrics = {
#         'Precision': precision_score(y_test, y_pred, zero_division=0),
#         'Recall': recall_score(y_test, y_pred, zero_division=0),
#         'F1': f1_score(y_test, y_pred, zero_division=0),
#         'AUC': roc_auc_score(y_test, y_proba),
#         'MCC': matthews_corrcoef(y_test, y_pred),
#         'Accuracy': accuracy_score(y_test, y_pred),
#         'Test_Sample_Size': np.sum(y_pred),
#         'Confusion_Matrix': confusion_matrix(y_test, y_pred)
#     }
#
#     return metrics

def replicate_run_from_csv_row(data, features, row):
    """
    Replicates a model run from a single row in a metrics CSV for one of three supported models:
      - MLPClassifier (sklearn)
      - XGBClassifier (xgboost)
      - RandomForestClassifier (sklearn)

    The CSV row is assumed to have columns 'SMOTE', 'Probability_Threshold', 'Params', and 'Model'.
    The function applies SMOTE only to the training data (if SMOTE is specified), builds a pipeline
    with a scaler and the classifier, fits the model, applies a custom probability threshold for
    classification, and then computes evaluation metrics.
    """
    # 1) Extract parameters
    smote_level = row['SMOTE']
    threshold = float(row['Probability_Threshold'])
    param_dict = ast.literal_eval(row['Params']) if isinstance(row['Params'], str) else row['Params']
    model_name = row['Model']

    # 2) Split data
    X = data[features]
    y = data['target']
    split = int(len(data) * 0.8)
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    # 3) Conditionally apply SMOTE
    if pd.notna(smote_level):
        smote = SMOTE(sampling_strategy=float(smote_level), random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    else:
        X_train_res, y_train_res = X_train, y_train

    # 4) Choose model
    if model_name == "MLP":
        from sklearn.neural_network import MLPClassifier
        classifier = MLPClassifier(random_state=42, max_iter=500)
    elif model_name == "XGBoost":
        from xgboost import XGBClassifier
        classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    elif model_name in ["Random Forest", "RandomForest", "Radnom forrest"]:
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier(random_state=42)
    else:
        raise ValueError(f"Unsupported model: {model_name}")

    # 5) Build pipeline
    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('classifier', classifier)
    ])
    pipeline.set_params(**param_dict)

    # 6) Fit
    pipeline.fit(X_train_res, y_train_res)

    # 7) Predict with custom threshold
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    y_pred  = (y_proba >= threshold).astype(int)

    # 8) Metrics
    return {
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall':    recall_score(y_test, y_pred, zero_division=0),
        'F1':        f1_score(y_test, y_pred, zero_division=0),
        'AUC':       roc_auc_score(y_test, y_proba),
        'MCC':       matthews_corrcoef(y_test, y_pred),
        'Accuracy':  accuracy_score(y_test, y_pred),
        'Test_Sample_Size': int(y_test.sum()),
        'Confusion_Matrix': confusion_matrix(y_test, y_pred)
    }

In [27]:
row = results_df.iloc[0]
metrics = replicate_run_from_csv_row(data, features, row)
print(metrics)

Parameters: { "use_label_encoder" } are not used.



{'Precision': np.float64(0.8640776699029126), 'Recall': np.float64(0.03377609108159393), 'F1': np.float64(0.06501095690284879), 'AUC': np.float64(0.5863180211200053), 'MCC': np.float64(0.04757650504599813), 'Accuracy': 0.277040384072296, 'Test_Sample_Size': 5270, 'Confusion_Matrix': array([[1784,   28],
       [5092,  178]])}


In [32]:
import os
import ast
import joblib
import numpy as np
import pandas as pd
from collections import Counter
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

def train_and_save_model(data, features, row, output_dir):
    """
    Trains a model for a specific league using the parameters provided in a row from results_df,
    and saves the trained pipeline for future predictions only if the 'Precision_Test' value is > 0.8.

    Expected CSV row columns:
      - 'League'                : Identifier for the league (used for naming the saved file)
      - 'SMOTE'                 : The sampling strategy for SMOTE (or NaN for no SMOTE)
      - 'Probability_Threshold' : The custom threshold for predictions
      - 'Precision_Test'        : The precision metric (the model is only saved if this > 0.8)
      - 'Params'                : A string/dict of classifier parameters (e.g. hyperparameters)
      - 'Model'                 : The model type ("MLP", "XGBoost", or "Random Forest")

    The function performs a time-series split (first 80% for training), applies SMOTE only to the training set
    if the desired SMOTE ratio is greater than the current ratio, and fits the model.

    The saved dictionary includes:
      - 'pipeline': the trained model pipeline,
      - 'threshold': the probability threshold (float),
      - 'league': the league identifier,
      - 'smote_level': the SMOTE level used (or None).

    The model is only saved if Precision_Test > 0.8.
    """
    import os
    import ast
    from collections import Counter
    from imblearn.pipeline import Pipeline as ImbPipeline
    from imblearn.over_sampling import SMOTE
    from sklearn.preprocessing import StandardScaler
    from sklearn.neural_network import MLPClassifier
    from xgboost import XGBClassifier
    from sklearn.ensemble import RandomForestClassifier
    import joblib

    # Ensure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # Extract parameters from the row.
    smote_level = row['SMOTE']
    threshold = row['Probability_Threshold']
    precision_test = row['Precision_Test']
    param_dict = ast.literal_eval(row['Params']) if isinstance(row['Params'], str) else row['Params']
    model_name = row['Model']
    ht_score = row.get('HT_Score', 'unknown')

    # Prepare data: separate features and target, then perform a time-series split.
    X = data[features]
    y = data['target']
    train_size = int(len(data) * 0.8)
    X_train = X.iloc[:train_size]
    y_train = y.iloc[:train_size]

    # Apply SMOTE if specified.
    if pd.isna(smote_level):
        X_train_res, y_train_res = X_train, y_train
        used_smote_level = None
    else:
        # Determine the current minority-to-majority ratio.
        counts = Counter(y_train)
        minority_class = min(counts, key=counts.get)
        majority_class = max(counts, key=counts.get)
        current_ratio = counts[minority_class] / counts[majority_class]
        desired_ratio = float(smote_level)

        # Apply SMOTE only if desired_ratio is greater than the current ratio.
        if desired_ratio <= current_ratio:
            print(f"Warning: Specified SMOTE level ({desired_ratio:.2f}) is less than or equal to current ratio ({current_ratio:.2f}) for league {league}. Skipping SMOTE.")
            X_train_res, y_train_res = X_train, y_train
            used_smote_level = None
        else:
            smote = SMOTE(sampling_strategy=desired_ratio, random_state=42)
            X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            used_smote_level = desired_ratio

    # Instantiate the classifier based on the model name.
    if model_name == "MLP":
        classifier = MLPClassifier(random_state=42, max_iter=500)
    elif model_name == "XGBoost":
        classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    elif model_name in ["Random Forest", "RandomForest", "Radnom forrest"]:
        classifier = RandomForestClassifier(random_state=42)
    else:
        raise ValueError(f"Unsupported model: {model_name}")

    # Build the pipeline with a scaler and the classifier.
    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('classifier', classifier)
    ])
    pipeline.set_params(**param_dict)

    # Fit the pipeline on the (potentially resampled) training data.
    pipeline.fit(X_train_res, y_train_res)

    # Check if the precision test meets the requirement.
    if precision_test > 0.8:
        # Save the trained pipeline along with the threshold, league, and SMOTE level.
        model_dict = {
            'pipeline': pipeline,
            'threshold': float(threshold),
            'ht_score': ht_score,
            'smote_level': used_smote_level
        }
        file_name = f"trained_model_{ht_score}.pkl"
        file_path = os.path.join(output_dir, file_name)
        joblib.dump(model_dict, file_path)
        print(f"Saved trained model for league '{ht_score}' to {file_path}")
        return file_path
    else:
        print(f"Model for league '{ht_score}' not saved because Precision_Test ({precision_test}) is not greater than 0.8.")
        return None




def train_and_save_all_models(data, features, results_df, output_dir):
    """
    Iterates over each row in results_df (each representing parameters for a league model),
    trains the model using those parameters, and saves the model pipeline for future predictions.

    Returns a list of file paths of the saved models.
    """
    saved_files = []
    for idx, row in results_df.iterrows():
        ht_score = row['HT_Score'].strip("(',);")
        data = matches[matches['ht_score']==ht_score]
        try:
            file_path = train_and_save_model(data, features, row, output_dir)
            saved_files.append(file_path)
        except Exception as e:
            print(f"Error training model for row index {idx} (HT Score: {row.get('HT_Score','unknown')}): {e}")
    return saved_files

# Example usage:
# Assuming `data` is your DataFrame containing a 'target' column and the feature columns,
# `features` is your list of feature column names, and `results_df` is the DataFrame
# with each row containing model parameters for a league model (with 'League', 'SMOTE',
# 'Probability_Threshold', 'Params', and 'Model' columns).
#
output_directory = r"path_ht_score\to\save\models"
saved_model_files = train_and_save_all_models(data, features, results_df, output_directory)
#print("Saved model files:", saved_model_files)


Parameters: { "use_label_encoder" } are not used.



Saved trained model for league '('0-0',)' to path_ht_score\to\save\models\trained_model_('0-0',).pkl


Parameters: { "use_label_encoder" } are not used.



Saved trained model for league '('0-1',)' to path_ht_score\to\save\models\trained_model_('0-1',).pkl
Saved trained model for league '('0-2',)' to path_ht_score\to\save\models\trained_model_('0-2',).pkl


Parameters: { "use_label_encoder" } are not used.



Saved trained model for league '('1-0',)' to path_ht_score\to\save\models\trained_model_('1-0',).pkl


Parameters: { "use_label_encoder" } are not used.



Saved trained model for league '('1-1',)' to path_ht_score\to\save\models\trained_model_('1-1',).pkl


Parameters: { "use_label_encoder" } are not used.



Saved trained model for league '('1-2',)' to path_ht_score\to\save\models\trained_model_('1-2',).pkl


Parameters: { "use_label_encoder" } are not used.



Saved trained model for league '('2-0',)' to path_ht_score\to\save\models\trained_model_('2-0',).pkl


Parameters: { "use_label_encoder" } are not used.



Saved trained model for league '('2-1',)' to path_ht_score\to\save\models\trained_model_('2-1',).pkl


Parameters: { "use_label_encoder" } are not used.



Saved trained model for league '('3-0',)' to path_ht_score\to\save\models\trained_model_('3-0',).pkl


In [37]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import (precision_score, recall_score, f1_score, roc_auc_score,
                             matthews_corrcoef, accuracy_score, confusion_matrix)

def test_saved_model(saved_model_path, data, features):
    """
    Loads a saved model (including its pipeline, probability threshold, SMOTE level, and league)
    and evaluates it on the test set.

    Assumes the data is to be split using a time-series approach (first 80% used for training, last 20% for testing)
    and that the saved model dictionary contains keys:
      - 'pipeline': the trained model pipeline,
      - 'threshold': the probability threshold (float) used for converting probabilities to predictions,
      - 'league': the league identifier,
      - 'smote_level': the SMOTE level used during training (or None if SMOTE was not applied).

    Returns a dictionary of evaluation metrics computed on the test set, including the SMOTE level.
    """
    # Load the saved model dictionary
    model_dict = joblib.load(saved_model_path)
    pipeline = model_dict['pipeline']
    threshold = model_dict['threshold']
    ht_score = model_dict['ht_score']
    smote_level = model_dict.get('smote_level')

    print(f"Testing model for league '{ht_score}' using threshold: {threshold}")
    if smote_level is not None:
        print(f"Using SMOTE level: {smote_level}")
    else:
        print("No SMOTE was applied during training.")

    # Prepare the test set (using 80/20 time-series split as in training)
    X = data[features]
    y = data['target']
    train_size = int(len(data) * 0.8)
    X_test = X.iloc[train_size:]
    y_test = y.iloc[train_size:]

    # Generate predicted probabilities and apply the custom threshold to get predictions
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= float(threshold)).astype(int)

    # Compute evaluation metrics on the test set
    metrics = {
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1': f1_score(y_test, y_pred, zero_division=0),
        'AUC': roc_auc_score(y_test, y_proba),
        'MCC': matthews_corrcoef(y_test, y_pred),
        'Accuracy': accuracy_score(y_test, y_pred),
        'Test_Sample_Size': int(np.sum(y_pred)),
        'Confusion_Matrix': confusion_matrix(y_test, y_pred),
        'SMOTE_Level': smote_level
    }

    print("Test metrics:")
    for key, value in metrics.items():
         print(f"{key}: {value}")

    return metrics

# Example usage:
# Assuming the saved model for league Bel1 is in the file specified below, and you have your
# `data` DataFrame and list of `features` already defined:
test_model_path = r"/Goals/2H_goal/ht_scoreline/path_ht_score\to\save\models\trained_model_('2-0',).pkl"
ht_score = test_model_path.split("trained_model_")[1].split(".pkl")[0].strip("(',)")
print(ht_score)
data = matches[matches['ht_score']==ht_score]
test_metrics = test_saved_model(test_model_path, data, features)


2-0
Testing model for league '('2-0',)' using threshold: 0.76
Using SMOTE level: 0.41
Test metrics:
Precision: 0.926829268292683
Recall: 0.1956745623069001
F1: 0.3231292517006803
AUC: 0.5654623346453546
MCC: 0.13051495189118867
Accuracy: 0.3310924369747899
Test_Sample_Size: 205
Confusion_Matrix: [[204  15]
 [781 190]]
SMOTE_Level: 0.41


In [36]:
results_df

Unnamed: 0,HT_Score,Model,SMOTE,Precision_Test,Precision_Test/Train_Ratio,Probability_Threshold,Params
0,"('0-0',)",XGBoost,,0.8641,0.935,0.8,"{'classifier__colsample_bytree': 0.7, 'classif..."
1,"('0-1',)",XGBoost,0.62,0.8711,0.9283,0.81,"{'classifier__colsample_bytree': 0.8, 'classif..."
2,"('0-2',)",MLP,,0.8475,0.9745,0.81,"{'classifier__activation': 'relu', 'classifier..."
3,"('1-0',)",XGBoost,0.63,0.8791,0.9422,0.81,"{'classifier__colsample_bytree': 0.8, 'classif..."
4,"('1-1',)",XGBoost,,0.9028,0.9166,0.81,"{'classifier__colsample_bytree': 0.7, 'classif..."
5,"('1-2',)",XGBoost,0.9,0.8564,0.9653,0.6,"{'classifier__colsample_bytree': 0.7, 'classif..."
6,"('2-0',)",XGBoost,0.41,0.9268,0.9914,0.76,"{'classifier__colsample_bytree': 0.8, 'classif..."
7,"('2-1',)",XGBoost,0.47,0.9028,0.9053,0.8,"{'classifier__colsample_bytree': 0.7, 'classif..."
8,"('3-0',)",XGBoost,0.73,0.87,0.9321,0.65,"{'classifier__colsample_bytree': 0.8, 'classif..."
