In [1]:
import ast
import glob
import os
import re
from collections import Counter
from datetime import datetime
import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, matthews_corrcoef,
    accuracy_score, confusion_matrix
)
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [113]:
def sort_files_by_precision(directory: str, inplace: bool = True) -> None:
    """
    Sort every CSV file in `directory` by its 'Precision_Test' column, highest first,
    and write the sorted table back to disk.

    Files that are empty or that have no 'Precision_Test' column are reported and
    skipped, so a single bad file does not abort the whole run.

    Args:
        directory (str): Path to the folder containing CSVs.
        inplace (bool):
            - If True, overwrite each original file.
            - If False, write to new files prefixed with 'sorted_'. Files that
              already carry that prefix are treated as earlier output and skipped,
              so repeated runs do not pile up 'sorted_sorted_...' copies.
    """
    sorted_prefix = "sorted_"
    pattern = os.path.join(directory, "*.csv")
    # glob gives no ordering guarantee; sort so the progress log is reproducible
    for filepath in sorted(glob.glob(pattern)):
        filename = os.path.basename(filepath)

        if not inplace and filename.startswith(sorted_prefix):
            # produced by a previous inplace=False run
            continue

        try:
            df = pd.read_csv(filepath)
        except pd.errors.EmptyDataError:
            # zero-byte / header-less file: nothing to sort
            print(f"Skipping {filename}: file is empty")
            continue

        if 'Precision_Test' not in df.columns:
            print(f"Skipping {filename}: no 'Precision_Test' column")
            continue

        # sort so highest Precision_Test is at the top
        df_sorted = df.sort_values(by='Precision_Test', ascending=False)

        if inplace:
            df_sorted.to_csv(filepath, index=False)
            print(f"✔ Sorted (in‐place): {filename}")
        else:
            new_filename = f"{sorted_prefix}{filename}"
            df_sorted.to_csv(os.path.join(os.path.dirname(filepath), new_filename), index=False)
            print(f"✔ Written sorted file: {new_filename}")


# Sort every metrics CSV in this folder in place (highest Precision_Test first).
# NOTE(review): this path has no 'ht_scoreline' component, whereas the next cell reads
# from ...\2H_goal\ht_scoreline\best_models_by_ht_scoreline — confirm which folder is intended.
directory = r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\2H_goal\best_models_by_ht_scoreline"
sort_files_by_precision(directory, inplace=True)


In [114]:
# Directory containing the per-scoreline model-metrics CSV files
directory = r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\2H_goal\ht_scoreline\best_models_by_ht_scoreline"

# One summary dict per CSV, built from that file's first row
top_rows = []

# os.listdir() returns names in arbitrary order; sort them so results_df has a
# stable row order (a later cell selects a row by position).
for filename in sorted(os.listdir(directory)):
    if filename.startswith("model_metrics_(") and filename.endswith(".csv"):
        # Names look like "model_metrics_('0-2',)_20250516_003952.csv": the third
        # '_'-separated token is the half-time score key, e.g. "('0-2',)".
        ht_score = filename.split("_")[2]
        file_path = os.path.join(directory, filename)

        try:
            df = pd.read_csv(file_path)
            top_row = df.iloc[0]  # first row exactly as stored in the file
            top_rows.append({
                'HT_Score': ht_score,
                'Model': top_row['Model'],
                'SMOTE': top_row.get('SMOTE'),
                'Precision_Test': top_row.get('Precision_Test'),
                'Precision_Test/Train_Ratio': top_row.get('Precision_Test/Train_Ratio'),
                'Probability_Threshold': top_row.get('Probability_Threshold'),
                'Params': top_row.get('Params')
            })
        except Exception as e:
            # best effort: report the unreadable/empty file and carry on with the rest
            print(f"Error processing {filename}: {e}")

# Combine all results into one DataFrame
results_df = pd.DataFrame(top_rows)

# Save or display results
#results_df.to_csv("top_row_model_params.csv", index=False)



Error processing model_metrics_('0-2',)_20250516_003952.csv: No columns to parse from file


In [115]:
# Show the first-row summary collected from each scoreline's metrics file
results_df

Unnamed: 0,HT_Score,Model,SMOTE,Precision_Test,Precision_Test/Train_Ratio,Probability_Threshold,Params
0,"('0-0',)",XGBoost,,0.8513,0.9226,0.72,"{'classifier__colsample_bytree': 0.7, 'classif..."
1,"('0-1',)",XGBoost,0.82,0.8525,0.9492,0.77,"{'classifier__colsample_bytree': 0.7, 'classif..."
2,"('1-0',)",XGBoost,0.33,0.8525,0.9049,0.78,"{'classifier__colsample_bytree': 0.7, 'classif..."
3,"('1-1',)",XGBoost,0.6,0.8502,0.9383,0.76,"{'classifier__colsample_bytree': 0.7, 'classif..."
4,"('1-2',)",XGBoost,0.9,0.8531,0.938,0.65,"{'classifier__colsample_bytree': 0.8, 'classif..."
5,"('2-0',)",XGBoost,0.61,0.8508,0.9627,0.71,"{'classifier__colsample_bytree': 0.8, 'classif..."
6,"('2-1',)",XGBoost,0.47,0.8522,0.911,0.73,"{'classifier__colsample_bytree': 0.8, 'classif..."
7,"('3-0',)",XGBoost,0.68,0.8528,0.9066,0.62,"{'classifier__colsample_bytree': 0.7, 'classif..."


In [116]:
# Model input columns. The order is significant (it fixes the column order of
# data[features]) and matches the original explicit list exactly.
#
# Deliberately NOT included: 'Unnamed: 0', 'country', 'season', 'date', 'ko_time',
# 'home_team', 'away_team', the full-/half-time goal columns, the per-match
# shots / shots_on_target / corners / fouls / yellow_cards / possession columns
# (total, _1h and _2h, home and away), the goals_scored_total_* / goals_conceded_total_*
# and points_* columns, and the 'is_home_x' / 'is_home_y' flags.


def _team_feature_names(side):
    """Return the engineered per-team column names for `side` ('home' or 'away') in fixed order."""
    stats = (
        'GoalsScored', 'FirstHalfGoalsScored',
        'Shots', 'Shots_1h',
        'Corners', 'Corners_1h',
        'ShotsOnTarget', 'ShotsOnTarget_1h',
    )
    names = []

    # Rolling / momentum / trend columns: the 'Overall_' variants first, then the plain ones.
    for scope in ('Overall_', ''):
        for stat in stats:
            names += [
                f'{side}_{scope}Rolling_{stat}_Mean',
                f'{side}_{scope}Rolling_{stat}_Std',
                f'{side}_{scope}Rolling_{stat}_Mean_Short',
                f'{side}_{scope}Momentum_{stat}',
                f'{side}_{scope}Trend_Slope_{stat}',
            ]

    # Match-goal "over X" percentages.
    for line in ('1.5', '2.5', '3.5'):
        names += [
            f'{side}_Overall_Percent_Over_{line}',
            f'{side}_Overall_Rolling5_Percent_Over_{line}',
            f'{side}_Percent_Over_{line}',
            f'{side}_Rolling5_Percent_Over_{line}',
        ]

    # Team "over X" percentages.
    names += [f'{side}_TeamPct_Over_{line}' for line in ('0.5', '1.5', '2.5', '3.5')]

    # Corner "over X" percentages.
    for line in ('3.5', '4.5', '5.5', '6.5'):
        names += [
            f'{side}_CornersPct_Over_{line}',
            f'{side}_CornersRolling5Pct_Over_{line}',
        ]

    # Season / rolling-5 "over X" percentages for the 9.5-11.5 lines.
    for line in ('9.5', '10.5', '11.5'):
        names += [
            f'{side}_SeasonPct_Over_{line}',
            f'{side}_Rolling5Pct_Over_{line}',
        ]

    return names


features = [
    'round',
    'home_team_place_total',
    'home_team_place_home',
    'away_team_place_total',
    'away_team_place_away',
    'home_odds',
    'draw_odds',
    'away_odds',
    'over_25_odds',
    'under_25_odds',
    'elo_home',
    'elo_away',
    'form_home',
    'form_away',
] + _team_feature_names('home') + _team_feature_names('away')

In [125]:
def pre_prepared_data(file_path):
    """
    Load the engineered match CSV and add the scoreline key and the model target.

    Processing, in order:
      1. parse 'date' as %Y-%m-%d (unparseable values become NaT),
      2. sort by date,
      3. keep only rows dated today or earlier (NaT rows fail the comparison and drop out),
      4. drop every row that has a missing value in any column,
      5. add the derived columns below.

    Added columns:
      - 'ht_score': "<home_goals_ht>-<away_goals_ht>", e.g. "1-1". The goal counts are
        cast to int first so that float-typed columns do not produce "1.0-1.0".
      - 'total_goals': home_goals_ft + away_goals_ft.
      - 'target': 1 if either side's full-time goals exceed its half-time goals, else 0.

    Args:
        file_path: Path of the CSV to read.

    Returns:
        A new DataFrame (independent copy) with the rows kept and the columns added.
    """
    data = pd.read_csv(file_path, low_memory=False)
    data['date'] = pd.to_datetime(data['date'], format="%Y-%m-%d", errors='coerce')

    # Today's date as a pandas Timestamp so it compares cleanly with the 'date' column.
    today = pd.Timestamp(datetime.today().date())

    # Chain without inplace=True and finish with an explicit copy, so the column
    # assignments below never write into a slice of the frame that was read.
    data = (
        data.sort_values(by='date')
            .loc[lambda frame: frame['date'] <= today]
            .dropna()
            .copy()
    )

    home_ht = data['home_goals_ht'].astype(int).astype(str)
    away_ht = data['away_goals_ht'].astype(int).astype(str)
    data['ht_score'] = home_ht + '-' + away_ht
    data['total_goals'] = data['home_goals_ft'] + data['away_goals_ft']
    scored_after_ht = (
        (data['home_goals_ft'] > data['home_goals_ht'])
        | (data['away_goals_ft'] > data['away_goals_ht'])
    )
    data['target'] = scored_after_ht.astype(int)
    return data


matches = pre_prepared_data(r"C:\Users\leere\PycharmProjects\Football_ML3\engineered_master_data_ALL_2017+.csv")
# One-hot-encode 'country' so it can be used as a model input.
matches = pd.get_dummies(matches, columns=['country'], prefix='country')
dummy_cols = [col for col in matches.columns if col.startswith('country_')]
# Add only the dummy columns that are not listed yet: re-running this cell must not
# duplicate names in `features` (duplicates would select the same column twice in
# data[features]).
features = features + [col for col in dummy_cols if col not in features]
# Distinct half-time scorelines, each wrapped in a 1-tuple, e.g. ('1-1',)
ht_score = matches[['ht_score']].drop_duplicates().apply(tuple, axis=1)

In [118]:
# Work on the matches whose half-time score was 1-1; `data` is an independent copy,
# so later changes to it do not touch `matches`.
matches_filtered = matches[matches['ht_score'] == '1-1']
data = matches_filtered.copy()
#matches_filtered

In [120]:
def replicate_run_from_csv_row(data, base_features, row):
    """
    Re-run one model configuration (MLP, XGBoost or Random Forest) described by a
    metrics-CSV row and return its hold-out metrics.

    No encoding is performed here: every name in `base_features` must already be a
    column of `data` (including any one-hot 'country_*' columns).

    Args:
        data: DataFrame holding the `base_features` columns and a 'target' column.
            Rows are split positionally (first 80% train, last 20% test); the
            function does not sort, so the caller is responsible for row order.
        base_features: List of column names used as model inputs.
        row: Mapping / Series with the keys 'SMOTE', 'Probability_Threshold',
            'Params' (a dict or its string literal) and 'Model'.

    Returns:
        dict with 'Precision', 'Recall', 'F1', 'AUC', 'MCC', 'Accuracy',
        'Test_Predicted_Positives' and 'Confusion_Matrix' for the test split.

    Raises:
        ValueError: If row['Model'] is not one of the supported model names.
    """
    # 1) Extract parameters from the CSV row
    smote_level = row['SMOTE']
    threshold = float(row['Probability_Threshold'])
    param_dict = ast.literal_eval(row['Params']) if isinstance(row['Params'], str) else row['Params']
    model_name = row['Model']

    # 2) Positional train/test split, using the columns the caller asked for
    X = data[base_features]
    y = data['target']
    split = int(len(data) * 0.8)
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    # 3) SMOTE on the training set only
    if smote_level not in [None, 'None'] and pd.notna(smote_level):
        sm = SMOTE(sampling_strategy=float(smote_level), random_state=42)
        X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    else:
        X_train_res, y_train_res = X_train, y_train

    # 4) Instantiate the classifier (classes come from the notebook's import cell)
    if model_name == "MLP":
        clf = MLPClassifier(
            random_state=42,
            max_iter=10000,
            early_stopping=param_dict.get('classifier__early_stopping', True)
        )
    elif model_name == "XGBoost":
        clf = XGBClassifier(
            random_state=42,
            eval_metric='logloss'
        )
    elif model_name in ["Random Forest", "RandomForest"]:
        clf = RandomForestClassifier(
            random_state=42,
            class_weight='balanced'
        )
    else:
        raise ValueError(f"Unsupported model: {model_name}")

    # 5) Build the pipeline and apply the stored 'classifier__*' / 'scaler__*' parameters
    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('classifier', clf)
    ])
    pipeline.set_params(**param_dict)

    # 6) Fit, then turn positive-class probabilities into labels with the stored threshold
    pipeline.fit(X_train_res, y_train_res)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    # 7) Compute metrics on the hold-out split
    return {
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1': f1_score(y_test, y_pred, zero_division=0),
        'AUC': roc_auc_score(y_test, y_proba),
        'MCC': matthews_corrcoef(y_test, y_pred),
        'Accuracy': accuracy_score(y_test, y_pred),
        'Test_Predicted_Positives': int(y_pred.sum()),
        'Confusion_Matrix': confusion_matrix(y_test, y_pred)
    }


In [126]:
# Re-run the stored configuration for the 1-1 scoreline.
# NOTE(review): iloc[3] is positional — it is the "('1-1',)" row only for the row order
# displayed above, and `data` was filtered to '1-1' in an earlier cell; keep both in sync.
row = results_df.iloc[3]
metrics = replicate_run_from_csv_row(data, features, row)
print(metrics)

{'Precision': np.float64(0.8500881834215167), 'Recall': np.float64(0.33967582804792107), 'F1': np.float64(0.48539778449144005), 'AUC': np.float64(0.5866516059489835), 'MCC': np.float64(0.12641126486429052), 'Accuracy': 0.44516829533116176, 'Test_Predicted_Positives': 567, 'Confusion_Matrix': array([[338,  85],
       [937, 482]])}


In [122]:
def train_and_save_model(data, base_features, row, output_dir, min_precision=0.8):
    """
    Fit the model configuration described by `row` for one half-time scoreline and
    save it with joblib, provided its recorded test precision is high enough.

    No encoding is performed here: every name in `base_features` must already be a
    column of `data` (including any one-hot 'country_*' columns).

    Args:
        data: DataFrame holding the `base_features` columns and a 'target' column.
            The first 80% of rows (by position) are used for training; the function
            does not sort, so the caller is responsible for row order.
        base_features: List of column names used as model inputs.
        row: Mapping / Series with 'SMOTE', 'Probability_Threshold', 'Precision_Test',
            'Params' (a dict or its string literal), 'Model', and optionally
            'HT_Score' (or 'ht_score') used for messages and the file name.
        output_dir: Directory the .pkl file is written to (created if missing).
        min_precision: The row's 'Precision_Test' must be strictly greater than this
            for the model to be trained and saved. Defaults to 0.8.

    Returns:
        Path of the written .pkl file, or None when the precision gate was not met.

    Raises:
        ValueError: If row['Model'] is not MLP, XGBoost or a Random Forest variant.
    """
    # 1) Extract parameters
    ht_score = row.get('HT_Score', row.get('ht_score', 'unknown'))
    smote_level = row['SMOTE']
    threshold = float(row['Probability_Threshold'])
    precision_test = float(row['Precision_Test'])
    param_dict = (ast.literal_eval(row['Params'])
                  if isinstance(row['Params'], str)
                  else row['Params'])
    model_name = row['Model']

    # 2) Gate first: the fitted pipeline is only ever used for saving, so there is no
    #    point in training a model that will be thrown away. (A NaN precision fails
    #    the comparison and is therefore not saved.)
    if not precision_test > min_precision:
        print(f"Did not save model for ht_score '{ht_score}' (Precision_Test = {precision_test:.2f})")
        return None

    # 3) Training part of the positional split (first 80%), using the caller's columns
    X = data[base_features]
    y = data['target']
    split = int(len(X) * 0.8)
    X_train = X.iloc[:split]
    y_train = y.iloc[:split]

    # 4) Apply SMOTE only if requested AND it would actually add minority samples
    used_smote = None
    if smote_level not in (None, 'None') and pd.notna(smote_level):
        desired = float(smote_level)
        counts = Counter(y_train)
        # current minority/majority ratio of the training labels
        current = min(counts.values()) / max(counts.values())
        if desired > current:
            sm = SMOTE(sampling_strategy=desired, random_state=42)
            X_train, y_train = sm.fit_resample(X_train, y_train)
            used_smote = desired
        else:
            print(f"SMOTE {desired:.2f} ≤ current {current:.2f} for '{ht_score}', skipping SMOTE")

    # 5) Instantiate the classifier
    if model_name == "MLP":
        # same early_stopping default as replicate_run_from_csv_row, so a saved MLP
        # matches the run that is replicated there
        clf = MLPClassifier(random_state=42,
                            max_iter=10000,
                            early_stopping=param_dict.get('classifier__early_stopping', True))
    elif model_name == "XGBoost":
        clf = XGBClassifier(random_state=42,
                            eval_metric='logloss')
    elif "RandomForest" in model_name or "Random Forest" in model_name:
        clf = RandomForestClassifier(random_state=42,
                                     class_weight='balanced')
    else:
        raise ValueError(f"Unsupported model: {model_name!r}")

    # 6) Build the pipeline and apply the stored parameters
    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),
        ('classifier', clf)
    ])
    pipeline.set_params(**param_dict)

    # 7) Fit on (possibly resampled) training data
    pipeline.fit(X_train, y_train)

    # 8) Persist the pipeline together with what is needed to use it later
    os.makedirs(output_dir, exist_ok=True)
    model_obj = {
        'pipeline': pipeline,
        'threshold': threshold,
        'ht_score': ht_score,
        'smote_level': used_smote,
        'params': param_dict
    }
    # File-name format is kept as-is: extract_ht_score() parses it back.
    ht_clean = str(ht_score).replace('/', '-').replace(' ', '')
    fname = f"trained_model_{ht_clean}_thr{threshold:.2f}_sm{(used_smote or 0):.2f}.pkl"
    path = os.path.join(output_dir, fname)
    joblib.dump(model_obj, path)
    print(f"Saved model for ht_score '{ht_score}' to: {path}")
    return path


def train_and_save_all_models(data, features, results_df, output_dir):
    """
    Train and persist one model per row of `results_df`.

    Each row's 'HT_Score' is stored as the text of a 1-tuple such as "('1-1',)". The
    wrapping characters are stripped and the resulting key (e.g. "1-1") selects the
    rows of `data` whose 'ht_score' column equals it; that subset is handed to
    train_and_save_model together with the row's parameters.

    Args:
        data: DataFrame covering ALL half-time scorelines (it is filtered per row
            here), with an 'ht_score' column, a 'target' column and the feature columns.
        features: List of feature column names.
        results_df: One row per scoreline with 'HT_Score', 'SMOTE', 'Precision_Test',
            'Probability_Threshold', 'Params' and 'Model'.
        output_dir: Directory the model files are written to.

    Returns:
        List of file paths of the models that were actually saved. Rows that failed,
        had no matching data, or did not pass the precision gate add no entry.
    """
    saved_files = []
    for idx, row in results_df.iterrows():
        ht_score = row['HT_Score'].strip("(',);")
        subset = data[data['ht_score'] == ht_score]
        if subset.empty:
            print(f"No rows with ht_score '{ht_score}' in the supplied data (row index {idx}), skipping")
            continue
        try:
            file_path = train_and_save_model(subset, features, row, output_dir)
        except Exception as e:
            # best effort: report the failing scoreline and continue with the others
            print(f"Error training model for row index {idx} (HT Score: {row.get('HT_Score', 'unknown')}): {e}")
            continue
        if file_path is not None:
            saved_files.append(file_path)
    return saved_files


# Example usage:
# `matches` is the full DataFrame (all half-time scorelines) containing the 'ht_score'
# and 'target' columns plus the feature columns, `features` is the list of feature
# column names, and `results_df` holds one row of model parameters per scoreline
# (with 'HT_Score', 'SMOTE', 'Precision_Test', 'Probability_Threshold', 'Params'
# and 'Model' columns).
#
output_directory = r"path_ht_score\to\save\models"
saved_model_files = train_and_save_all_models(matches, features, results_df, output_directory)
#print("Saved model files:", saved_model_files)


Saved model for ht_score '('0-0',)' to: path_ht_score\to\save\models\trained_model_('0-0',)_thr0.72_sm0.00.pkl
Saved model for ht_score '('0-1',)' to: path_ht_score\to\save\models\trained_model_('0-1',)_thr0.77_sm0.82.pkl
Saved model for ht_score '('1-0',)' to: path_ht_score\to\save\models\trained_model_('1-0',)_thr0.78_sm0.33.pkl
Saved model for ht_score '('1-1',)' to: path_ht_score\to\save\models\trained_model_('1-1',)_thr0.76_sm0.60.pkl
Saved model for ht_score '('1-2',)' to: path_ht_score\to\save\models\trained_model_('1-2',)_thr0.65_sm0.90.pkl
Saved model for ht_score '('2-0',)' to: path_ht_score\to\save\models\trained_model_('2-0',)_thr0.71_sm0.61.pkl
Saved model for ht_score '('2-1',)' to: path_ht_score\to\save\models\trained_model_('2-1',)_thr0.73_sm0.47.pkl
Saved model for ht_score '('3-0',)' to: path_ht_score\to\save\models\trained_model_('3-0',)_thr0.62_sm0.68.pkl


In [123]:
def test_saved_model(saved_model_path: str,
                     data: pd.DataFrame,
                     features: list) -> dict:
    """
    Evaluate a saved model bundle on the tail of `data`.

    The bundle is the dict written at save time; its 'pipeline', 'threshold' and
    'ht_score' entries are required and 'smote_level' is optional. The last 20% of
    rows (by position) of `data[features]` / `data['target']` form the test set; no
    encoding or sorting is done here.

    Args:
        saved_model_path: Path of the joblib file to load.
        data: DataFrame with every column in `features` plus 'target'.
        features: Column names fed to the pipeline.

    Returns:
        dict of test metrics ('Precision', 'Recall', 'F1', 'AUC', 'MCC', 'Accuracy',
        'Test_Predicted_Positives', 'Confusion_Matrix') plus 'SMOTE_Level'.
    """
    # joblib.load unpickles the file, so only point this at files you trust.
    bundle = joblib.load(saved_model_path)
    fitted_pipeline = bundle['pipeline']
    cutoff = float(bundle['threshold'])
    score_label = bundle['ht_score']
    smote_used = bundle.get('smote_level')

    print(f"Testing ht_score='{score_label}', threshold={cutoff:.2f}, SMOTE={smote_used}")

    # Hold-out portion: everything from the 80% mark onwards.
    feature_frame = data[features]
    labels = data['target']
    holdout_start = int(len(feature_frame) * 0.8)
    holdout_X = feature_frame.iloc[holdout_start:]
    holdout_y = labels.iloc[holdout_start:]

    # Positive-class probability, then hard labels via the stored threshold.
    positive_proba = fitted_pipeline.predict_proba(holdout_X)[:, 1]
    predicted = (positive_proba >= cutoff).astype(int)

    metrics = {}
    metrics['Precision'] = precision_score(holdout_y, predicted, zero_division=0)
    metrics['Recall'] = recall_score(holdout_y, predicted, zero_division=0)
    metrics['F1'] = f1_score(holdout_y, predicted, zero_division=0)
    metrics['AUC'] = roc_auc_score(holdout_y, positive_proba)
    metrics['MCC'] = matthews_corrcoef(holdout_y, predicted)
    metrics['Accuracy'] = accuracy_score(holdout_y, predicted)
    metrics['Test_Predicted_Positives'] = int(predicted.sum())
    metrics['Confusion_Matrix'] = confusion_matrix(holdout_y, predicted)
    metrics['SMOTE_Level'] = smote_used

    print("Test metrics:")
    for metric_name in metrics:
        print(f"  {metric_name}: {metrics[metric_name]}")

    return metrics


def extract_ht_score(model_path: str) -> str:
    """
    Recover the half-time score key from a saved-model file name.

    The file name must start with "trained_model_" and contain "_thr"; whatever sits
    between the two is the key. For example
      .../trained_model_('2-0',)_thr0.71_sm0.61.pkl  ->  "2-0"
      .../trained_model_0-0_thr0.72_sm0.00.pkl       ->  "0-0"

    Raises:
        ValueError: If the file name does not follow that pattern.
    """
    file_name = os.path.basename(model_path)
    found = re.match(r"trained_model_(.+?)_thr", file_name)
    if found is None:
        raise ValueError(f"Filename {file_name!r} doesn’t match expected pattern")

    token = found.group(1)  # e.g. "('2-0',)" or "0-0"
    wrapped_in_parens = token.startswith("(") and token.endswith(")")
    if not wrapped_in_parens:
        return token

    # Parenthesised token: parse it as a Python literal and take its first element;
    # if that is not possible, fall back to trimming the surrounding punctuation.
    try:
        return ast.literal_eval(token)[0]
    except Exception:
        return token.strip("()'\",")


# 'features' here already includes your country dummy columns
base_features = features

# Pick out ht_score from filename
model_path = r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\2H_goal\ht_scoreline\path_ht_score\to\save\models\trained_model_('0-0',)_thr0.72_sm0.00.pkl"
ht_score = extract_ht_score(model_path)

# Subset your matches DataFrame
df_20 = matches[matches['ht_score'] == ht_score]

# Run the test
results = test_saved_model(model_path, df_20, base_features)
results


Testing ht_score='('0-0',)', threshold=0.72, SMOTE=None
Test metrics:
  Precision: 0.8512720156555773
  Recall: 0.0825426944971537
  F1: 0.15049299429164503
  AUC: 0.5870722696256456
  MCC: 0.06846761707240423
  Accuracy: 0.3065518215193448
  Test_Predicted_Positives: 511
  Confusion_Matrix: [[1736   76]
 [4835  435]]
  SMOTE_Level: None


{'Precision': np.float64(0.8512720156555773),
 'Recall': np.float64(0.0825426944971537),
 'F1': np.float64(0.15049299429164503),
 'AUC': np.float64(0.5870722696256456),
 'MCC': np.float64(0.06846761707240423),
 'Accuracy': 0.3065518215193448,
 'Test_Predicted_Positives': 511,
 'Confusion_Matrix': array([[1736,   76],
        [4835,  435]]),
 'SMOTE_Level': None}

In [127]:
# Display the per-scoreline summary table again for reference
results_df

Unnamed: 0,HT_Score,Model,SMOTE,Precision_Test,Precision_Test/Train_Ratio,Probability_Threshold,Params
0,"('0-0',)",XGBoost,,0.8513,0.9226,0.72,"{'classifier__colsample_bytree': 0.7, 'classif..."
1,"('0-1',)",XGBoost,0.82,0.8525,0.9492,0.77,"{'classifier__colsample_bytree': 0.7, 'classif..."
2,"('1-0',)",XGBoost,0.33,0.8525,0.9049,0.78,"{'classifier__colsample_bytree': 0.7, 'classif..."
3,"('1-1',)",XGBoost,0.6,0.8502,0.9383,0.76,"{'classifier__colsample_bytree': 0.7, 'classif..."
4,"('1-2',)",XGBoost,0.9,0.8531,0.938,0.65,"{'classifier__colsample_bytree': 0.8, 'classif..."
5,"('2-0',)",XGBoost,0.61,0.8508,0.9627,0.71,"{'classifier__colsample_bytree': 0.8, 'classif..."
6,"('2-1',)",XGBoost,0.47,0.8522,0.911,0.73,"{'classifier__colsample_bytree': 0.8, 'classif..."
7,"('3-0',)",XGBoost,0.68,0.8528,0.9066,0.62,"{'classifier__colsample_bytree': 0.7, 'classif..."
