In [1]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    HistGradientBoostingClassifier,
    VotingClassifier,
    StackingClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import joblib
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

In [2]:

import pandas as pd
import requests
import logging
from io import StringIO
import concurrent.futures
from tqdm import tqdm

# Set logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
DOWNLOADING_OPTION = 1  # 1 for web scraping, 0 for using saved data
SEASONS = range(2000, 2025)
TO_KEEP = [
    'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',
    'HS', 'AS', 'HC', 'AC', 'HF', 'AF', 'HST', 'AST', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A',
    'B365>2.5', 'B365<2.5'
]

DICT_COUNTRIES = {
    "Spanish La Liga": "SP1", "Spanish Segunda Division": "SP2", "German Bundesliga": "D1",
    "German Bundesliga 2": "D2", "Italian Serie A": "I1", "Italian Serie B": "I2",
    "English Premier League": "E0", "English Championship": "E1", "English League 1": "E2",
    "English League C": "EC", "English League 2": "E3", "French Ligue 1": "F1",
    "French Ligue 2": "F2", "Dutch Eredivisie": "N1", "Belgian First Division A": "B1",
    "Portuguese Primeira Liga": "P1", "Turkish Super League": "T1", "Greek Super League": "G1",
    "Scottish Premier League": "SC0", "Scottish League1": "SC1", "Scottish League2": "SC2",
    "Scottish League3": "SC3"
}

DICT_OTHERS = {
    "Argentina": "ARG", "Austria": "AUT", "Brazil": "BRA", "China": "CHN",
    "Denmark": "DNK", "Finland": "FIN", "Ireland": "IRL", "Japan": "JPN",
    "Mexico": "MEX", "Norway": "NOR", "Poland": "POL", "Romania": "ROM",
    "Russia": "RUS", "Sweden": "SWE", "Switzerland": "SWZ", "USA": "USA"
}

COMMON_COLUMNS = ["Div", "Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR", 'B365H', 'B365A', 'B365D']

# Function to download data for DICT_COUNTRIES leagues
def download_data(league, year):
    s_year = int(str(year)[-2:])
    url = f"https://www.football-data.co.uk/mmz4281/{s_year}{(s_year + 1):02d}/{DICT_COUNTRIES[league]}.csv"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Use flexible reading to handle errors
        matches = pd.read_csv(StringIO(response.text), on_bad_lines='skip')

        # Select only columns that exist in the downloaded data
        existing_columns = [col for col in TO_KEEP if col in matches.columns]
        return matches[existing_columns]

    except pd.errors.ParserError as e:
        logging.warning(f"Parser error for {league} - {year}: {e}")
    except Exception as e:
        logging.warning(f"Failed to download data for {league} - {year}: {e}")
    return None

# Load data for other leagues in DICT_OTHERS and rename columns as needed
def load_other_leagues():
    other_data = []
    for country, code in DICT_OTHERS.items():
        try:
            matches = pd.read_csv(f"https://www.football-data.co.uk/new/{code}.csv", on_bad_lines='skip')
            # Rename columns to match the standard format
            matches = matches.rename(columns={
                "League": "Div", "Home": "HomeTeam", "Away": "AwayTeam",
                "HG": "FTHG", "AG": "FTAG", "Res": "FTR",
                "PSCH": "B365H", "PSCD": "B365D", "PSCA": "B365A"
            })
            other_data.append(matches[COMMON_COLUMNS])
        except Exception as e:
            logging.warning(f"Failed to load data for {country}: {e}")
    return pd.concat(other_data, ignore_index=True) if other_data else pd.DataFrame()

# Download and process data in parallel for DICT_COUNTRIES leagues
def download_all_data():
    data_frames = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(download_data, league, year) for league in DICT_COUNTRIES for year in SEASONS]
        with tqdm(total=len(futures), unit='file', desc='Downloading historical data') as pbar:
            for future in concurrent.futures.as_completed(futures):
                matches = future.result()
                if matches is not None:
                    data_frames.append(matches)
                pbar.update(1)
    return pd.concat(data_frames, ignore_index=True) if data_frames else pd.DataFrame()

if DOWNLOADING_OPTION:
    # Download main leagues' data
    matches_matches = download_all_data()
    matches_matches['Date'] = pd.to_datetime(matches_matches['Date'], dayfirst=True, errors='coerce')
    matches_matches.sort_values('Date', inplace=True)

    # Create a subsample if needed
    SUBSAMPLE_SIZE = -1
    subset_matches = matches_matches.sample(n=SUBSAMPLE_SIZE, random_state=42) if SUBSAMPLE_SIZE > 0 else matches_matches.copy()
    subset_matches['Date'] = subset_matches['Date'].dt.strftime('%Y-%m-%d')

    # Load and process "other leagues" data
    other_leagues = load_other_leagues()
    other_leagues['Date'] = pd.to_datetime(other_leagues['Date'], errors='coerce').dt.strftime('%Y-%m-%d')

    # Combine all data into final DataFrame
    final_matches = pd.concat([subset_matches, other_leagues], ignore_index=True).reset_index(drop=True)
    print(final_matches.head())


Downloading historical data: 100%|█████████████████████████████████████████████████| 550/550 [01:51<00:00,  4.95file/s]


  Div        Date   HomeTeam    AwayTeam  FTHG  FTAG FTR  HTHG  HTAG HTR  ...  \
0  F1  2000-07-28   Paris SG  Strasbourg   3.0   1.0   H   1.0   1.0   D  ...   
1  F1  2000-07-28  Marseille      Troyes   3.0   1.0   H   2.0   1.0   H  ...   
2  F2  2000-07-28  Wasquehal       Nancy   0.0   1.0   A   0.0   1.0   A  ...   
3  F1  2000-07-29       Lyon      Rennes   2.0   2.0   D   0.0   2.0   A  ...   
4  F1  2000-07-29      Lille      Monaco   1.0   1.0   D   0.0   1.0   A  ...   

   HC  AC  HF  AF  HST  AST  HY  AY  HR  AR  
0 NaN NaN NaN NaN  NaN  NaN NaN NaN NaN NaN  
1 NaN NaN NaN NaN  NaN  NaN NaN NaN NaN NaN  
2 NaN NaN NaN NaN  NaN  NaN NaN NaN NaN NaN  
3 NaN NaN NaN NaN  NaN  NaN NaN NaN NaN NaN  
4 NaN NaN NaN NaN  NaN  NaN NaN NaN NaN NaN  

[5 rows x 27 columns]


In [3]:
import os
import joblib
from sklearn.preprocessing import LabelEncoder

def preprocess_football_data_live(df, year):
    # Step 1: Remove rows with missing values in key columns
    required_columns = ['B365H', 'B365D', 'B365A', 'B365<2.5', 'B365>2.5']
    df_filtered = df.dropna(subset=required_columns).copy()

    # Step 2: Select relevant columns
    col = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'B365H', 'B365D', 'B365A', 'B365<2.5', 'B365>2.5']
    df_filtered = df_filtered[col]

    # Step 3: Encode categorical columns
    team_encoder = LabelEncoder()
    outcome_encoder = LabelEncoder()
    df_filtered['HomeTeam'] = team_encoder.fit_transform(df_filtered['HomeTeam'])
    df_filtered['AwayTeam'] = team_encoder.transform(df_filtered['AwayTeam'])

    # Step 4: Feature engineering
    df_filtered['Implied_Prob_H'] = 1 / df_filtered['B365H'].replace({0: None})
    df_filtered['Implied_Prob_D'] = 1 / df_filtered['B365D'].replace({0: None})
    df_filtered['Implied_Prob_A'] = 1 / df_filtered['B365A'].replace({0: None})

    df_filtered['Odds_Ratio_HA'] = df_filtered['B365H'] / df_filtered['B365A']
    total_prob = df_filtered[['Implied_Prob_H', 'Implied_Prob_D', 'Implied_Prob_A']].sum(axis=1)
    df_filtered['Norm_Prob_H'] = df_filtered['Implied_Prob_H'] / total_prob
    df_filtered['Norm_Prob_D'] = df_filtered['Implied_Prob_D'] / total_prob
    df_filtered['Norm_Prob_A'] = df_filtered['Implied_Prob_A'] / total_prob
    df_filtered[['Norm_Prob_H', 'Norm_Prob_D', 'Norm_Prob_A']] = df_filtered[
        ['Norm_Prob_H', 'Norm_Prob_D', 'Norm_Prob_A']
    ].fillna(0)

    # Step 5: Handle date and filter by year
    df_filtered['Date'] = pd.to_datetime(df_filtered['Date'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Date'])
    df_filtered = df_filtered[df_filtered['Date'].dt.year >= year].reset_index(drop=True)

    # Rolling averages and streaks using `.shift(1)` to exclude the current match
    df_filtered['HomeOdds_5'] = (
        df_filtered.groupby('HomeTeam')['B365H']
        .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
        .fillna(0)
    )
    df_filtered['AwayOdds_5'] = (
        df_filtered.groupby('AwayTeam')['B365A']
        .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
        .fillna(0)
    )

    df_filtered['HomeGoalsScored_5'] = (
        df_filtered.groupby('HomeTeam')['FTHG']
        .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
        .fillna(0)
    )
    df_filtered['AwayGoalsScored_5'] = (
        df_filtered.groupby('AwayTeam')['FTAG']
        .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
        .fillna(0)
    )

    df_filtered['HomeGoalsConceded_5'] = (
        df_filtered.groupby('HomeTeam')['FTAG']
        .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
        .fillna(0)
    )
    df_filtered['AwayGoalsConceded_5'] = (
        df_filtered.groupby('AwayTeam')['FTHG']
        .transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())
        .fillna(0)
    )

    df_filtered['Home_Win_Streak'] = (
        df_filtered.groupby('HomeTeam')['FTR']
        .transform(lambda x: (x.shift(1) == 'H').rolling(5).sum().fillna(0))
    )
    df_filtered['Away_Win_Streak'] = (
        df_filtered.groupby('AwayTeam')['FTR']
        .transform(lambda x: (x.shift(1) == 'A').rolling(5).sum().fillna(0))
    )

    # Step 6: Save encoders
    os.makedirs('encoders', exist_ok=True)
    joblib.dump(team_encoder, 'encoders/home_team_encoder.pkl')
    joblib.dump(team_encoder, 'encoders/away_team_encoder.pkl')
    joblib.dump(outcome_encoder, 'encoders/outcome_encoder.pkl')

    print("Encoders saved successfully!")

    return df_filtered

# Apply the preprocessing function to include rolling features and save encoders
preprocessed_df = preprocess_football_data_live(final_matches, 2023)

# Display the preprocessed DataFrame with new features
preprocessed_df.head()


Encoders saved successfully!


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,B365H,B365D,B365A,B365<2.5,...,Norm_Prob_D,Norm_Prob_A,HomeOdds_5,AwayOdds_5,HomeGoalsScored_5,AwayGoalsScored_5,HomeGoalsConceded_5,AwayGoalsConceded_5,Home_Win_Streak,Away_Win_Streak
0,2023-01-01,154,144,4.0,0.0,H,1.95,3.75,3.1,2.1,...,0.241969,0.292705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-01-01,66,451,1.0,1.0,D,3.0,3.25,2.45,1.88,...,0.293267,0.389027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-01-01,511,460,2.0,0.0,H,2.15,3.3,3.0,1.8,...,0.275112,0.302623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-01-01,503,15,4.0,1.0,H,1.62,3.8,4.75,1.98,...,0.241215,0.192972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-01-01,339,335,2.0,2.0,D,6.5,4.0,1.44,2.3,...,0.227626,0.632296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Preprocess the dataset and include form features
finaldf = preprocessed_df.copy()#(final_matches, 2023)
finaldf.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,B365H,B365D,B365A,B365<2.5,...,Norm_Prob_D,Norm_Prob_A,HomeOdds_5,AwayOdds_5,HomeGoalsScored_5,AwayGoalsScored_5,HomeGoalsConceded_5,AwayGoalsConceded_5,Home_Win_Streak,Away_Win_Streak
0,2023-01-01,154,144,4.0,0.0,H,1.95,3.75,3.1,2.1,...,0.241969,0.292705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-01-01,66,451,1.0,1.0,D,3.0,3.25,2.45,1.88,...,0.293267,0.389027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-01-01,511,460,2.0,0.0,H,2.15,3.3,3.0,1.8,...,0.275112,0.302623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-01-01,503,15,4.0,1.0,H,1.62,3.8,4.75,1.98,...,0.241215,0.192972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-01-01,339,335,2.0,2.0,D,6.5,4.0,1.44,2.3,...,0.227626,0.632296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
fix = pd.read_csv('https://www.football-data.co.uk/fixtures.csv')[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam',
       'B365H', 'B365D', 'B365A', 'B365<2.5', 'B365>2.5']]
fix.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,B365H,B365D,B365A,B365<2.5,B365>2.5
0,B1,31/01/2025,19:45,Oud-Heverlee Leuven,Mechelen,2.35,3.25,3.0,1.83,2.03
1,B1,01/02/2025,15:00,Charleroi,Dender,1.73,3.8,4.5,1.85,2.0
2,B1,01/02/2025,17:15,Cercle Brugge,Standard,1.83,3.4,4.33,1.7,2.1
3,B1,01/02/2025,19:45,Genk,Beerschot VA,1.25,5.5,10.0,2.7,1.44
4,B1,02/02/2025,12:30,Antwerp,Club Brugge,4.75,3.6,1.73,2.1,1.7


In [6]:
df_filtered = finaldf.copy()

# Step 7: Add calculated features
df_filtered['is_draw'] = (df_filtered['FTHG'] == df_filtered['FTAG']).astype(int)
df_filtered['hw_draw'] = (df_filtered['FTHG'] >= df_filtered['FTAG']).astype(int)
df_filtered['aw_draw'] = (df_filtered['FTAG'] >= df_filtered['FTHG']).astype(int)
df_filtered['ov_un_35'] = ((df_filtered['FTHG'] + df_filtered['FTAG']) >= 4).astype(int)
df_filtered['ov_un_25'] = ((df_filtered['FTHG'] + df_filtered['FTAG']) >= 3).astype(int)
df_filtered['ov_un_15'] = ((df_filtered['FTHG'] + df_filtered['FTAG']) >= 2).astype(int)
df_filtered['is_ftr'] = df_filtered['FTR'].map({'H': 0, 'D': 1, 'A': 2})

In [7]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
import numpy as np

# Function to preprocess the data before training
def preprocess_and_split(finaldf):
    os.makedirs('saved_models', exist_ok=True)

    # Features and target preparation
    X = finaldf.drop(columns=['Date', 'hw_draw', 'aw_draw', 'ov_un_15', 'ov_un_25', 'ov_un_35',
                              'is_draw', 'FTHG', 'FTAG', 'FTR', 'is_ftr'])

    # Convert feature names to strings and remove special characters
    X.columns = X.columns.astype(str).str.replace(r'[^a-zA-Z0-9_]', '', regex=True)

    # Fill missing values with column means
    X = X.fillna(X.mean()) 

    y_targets = finaldf[['is_draw', 'hw_draw', 'aw_draw', 'ov_un_15', 'ov_un_25', 'ov_un_35', 'is_ftr']]  

    # Save feature names
    feature_names = X.columns.tolist()
    with open('saved_models/feature_names.pkl', 'wb') as f:
        joblib.dump(feature_names, f)

    # Apply scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Save the scaler
    joblib.dump(scaler, 'saved_models/scaler.pkl')

    return X_scaled, y_targets

# Preprocess data
X_scaled, y_targets = preprocess_and_split(df_filtered)

# SMOTE for handling class imbalance
smote = SMOTE(random_state=42)

# Hyperparameter tuning grid for Random Forest
grid_params = {
    'n_estimators': [100, 400],
    'max_depth': [20, 45],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 3],
    'bootstrap': [True, False]
}

for target in y_targets.columns:
    print(f"\nTraining for target: {target}")
    y = y_targets[target]

    # Handle class imbalance with SMOTE
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
    )

    # Perform GridSearch for hyperparameter tuning
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=grid_params, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Get the best model after tuning
    best_model = grid_search.best_estimator_

    # Print best hyperparameters
    print(f"\nBest parameters for {target}: {grid_search.best_params_}")

    # Save the best model
    model_path = f'saved_models/{target}_rf_model.pkl'
    joblib.dump(best_model, model_path)

    # Evaluate the model
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Best Model Accuracy for {target}: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))






Training for target: is_draw
Fitting 3 folds for each of 32 candidates, totalling 96 fits

Best parameters for is_draw: {'bootstrap': False, 'max_depth': 45, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Best Model Accuracy for is_draw: 0.84
              precision    recall  f1-score   support

           0       0.78      0.95      0.86      2445
           1       0.93      0.74      0.82      2444

    accuracy                           0.84      4889
   macro avg       0.86      0.84      0.84      4889
weighted avg       0.86      0.84      0.84      4889


Training for target: hw_draw
Fitting 3 folds for each of 32 candidates, totalling 96 fits

Best parameters for hw_draw: {'bootstrap': False, 'max_depth': 45, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Best Model Accuracy for hw_draw: 0.82
              precision    recall  f1-score   support

           0       0.84      0.77      0.81      2322
           1       0.79      0.86 

In [20]:
import joblib
from sklearn.metrics import accuracy_score

# Load the saved feature names and scaler
with open('saved_models/feature_names.pkl', 'rb') as f:
    feature_names = joblib.load(f)

scaler = joblib.load('saved_models/scaler.pkl')

# Re-scale the training data
X_train_scaled = scaler.transform(X_train)

# Compute training accuracy for each target
for target in y_targets.columns:
    print(f"\nCalculating Training Accuracy for Target: {target}")

    # Load trained model
    model_path = f'saved_models/{target}_rf_model.pkl'
    best_model = joblib.load(model_path)

    # Predict on training data
    y_train_pred = best_model.predict(X_train_scaled)

    # Calculate training accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"Training Accuracy for {target}: {train_accuracy:.2f}")




Calculating Training Accuracy for Target: is_draw
Training Accuracy for is_draw: 0.35

Calculating Training Accuracy for Target: hw_draw
Training Accuracy for hw_draw: 0.32

Calculating Training Accuracy for Target: aw_draw
Training Accuracy for aw_draw: 0.37

Calculating Training Accuracy for Target: ov_un_15
Training Accuracy for ov_un_15: 0.34

Calculating Training Accuracy for Target: ov_un_25
Training Accuracy for ov_un_25: 0.31

Calculating Training Accuracy for Target: ov_un_35
Training Accuracy for ov_un_35: 0.30

Calculating Training Accuracy for Target: is_ftr
Training Accuracy for is_ftr: 0.36


In [21]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Function to preprocess the data before training
def preprocess_and_split(finaldf):
    os.makedirs('saved_models', exist_ok=True)

    # Features and target preparation
    X = finaldf.drop(columns=['Date', 'hw_draw', 'aw_draw', 'ov_un_15', 'ov_un_25', 'ov_un_35',
                              'is_draw', 'FTHG', 'FTAG', 'FTR', 'is_ftr'])

    # Convert feature names to strings and remove special characters
    X.columns = X.columns.astype(str).str.replace(r'[^a-zA-Z0-9_]', '', regex=True)

    # Fill missing values with column means
    X = X.fillna(X.mean()) 

    y_targets = finaldf[['is_draw', 'hw_draw', 'aw_draw', 'ov_un_15', 'ov_un_25', 'ov_un_35', 'is_ftr']]  

    # Save feature names
    feature_names = X.columns.tolist()
    with open('saved_models/feature_names.pkl', 'wb') as f:
        joblib.dump(feature_names, f)

    return X, y_targets

# Preprocess data
X, y_targets = preprocess_and_split(df_filtered)

# SMOTE for handling class imbalance
smote = SMOTE(random_state=42)

# Hyperparameter tuning grid for Random Forest
grid_params = {
    'n_estimators': [100, 400],
    'max_depth': [10, 30],  # Reduced depth to prevent overfitting
    'min_samples_split': [5, 15],  # Increased min samples split
    'min_samples_leaf': [2, 5],  # Prevent overfitting
    'bootstrap': [True]
}

# Dictionary to store training accuracy for each target
train_accuracies = {}

for target in y_targets.columns:
    print(f"\nTraining for target: {target}")
    y = y_targets[target]

    # Train-test split before SMOTE
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Apply SMOTE **only on the training set**
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Scaling should be done separately for train and test
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)  # Fit only on training set
    X_test_scaled = scaler.transform(X_test)  # Transform test set separately

    # Save the scaler
    joblib.dump(scaler, 'saved_models/scaler.pkl')

    # Perform GridSearch for hyperparameter tuning
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=grid_params, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train_scaled, y_train_resampled)  # Only use training set!

    # Get the best model after tuning
    best_model = grid_search.best_estimator_

    # Print best hyperparameters
    print(f"\nBest parameters for {target}: {grid_search.best_params_}")

    # Save the best model
    model_path = f'saved_models/{target}_rf_model.pkl'
    joblib.dump(best_model, model_path)

    # Evaluate the model on test data
    y_pred = best_model.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Best Model Accuracy for {target} on Test Data: {test_accuracy:.2f}")
    print(classification_report(y_test, y_pred))

    # --- 🔹 Training Accuracy Calculation ---
    y_train_pred = best_model.predict(X_train_scaled)  # Predict on resampled & scaled training data
    train_accuracy = accuracy_score(y_train_resampled, y_train_pred)  # Compare with resampled labels
    print(f"Training Accuracy for {target}: {train_accuracy:.2f}")

    # Store the training accuracy
    train_accuracies[target] = train_accuracy

# Print all training accuracies
print("\nTraining Accuracy Summary:")
for target, acc in train_accuracies.items():
    print(f"{target}: {acc:.2f}")



Training for target: is_draw
Fitting 3 folds for each of 16 candidates, totalling 48 fits

Best parameters for is_draw: {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 15, 'n_estimators': 400}
Best Model Accuracy for is_draw on Test Data: 0.73
              precision    recall  f1-score   support

           0       0.74      0.98      0.84      2445
           1       0.16      0.01      0.02       872

    accuracy                           0.73      3317
   macro avg       0.45      0.50      0.43      3317
weighted avg       0.58      0.73      0.62      3317

Training Accuracy for is_draw: 0.96

Training for target: hw_draw
Fitting 3 folds for each of 16 candidates, totalling 48 fits

Best parameters for hw_draw: {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 400}
Best Model Accuracy for hw_draw on Test Data: 0.70
              precision    recall  f1-score   support

           0       0.51    

In [8]:
fix = pd.read_csv('https://www.football-data.co.uk/fixtures.csv')[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam',
       'B365H', 'B365D', 'B365A', 'B365<2.5', 'B365>2.5']]
fix.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,B365H,B365D,B365A,B365<2.5,B365>2.5
0,B1,31/01/2025,19:45,Oud-Heverlee Leuven,Mechelen,2.35,3.25,3.0,1.83,2.03
1,B1,01/02/2025,15:00,Charleroi,Dender,1.73,3.8,4.5,1.85,2.0
2,B1,01/02/2025,17:15,Cercle Brugge,Standard,1.83,3.4,4.33,1.7,2.1
3,B1,01/02/2025,19:45,Genk,Beerschot VA,1.25,5.5,10.0,2.7,1.44
4,B1,02/02/2025,12:30,Antwerp,Club Brugge,4.75,3.6,1.73,2.1,1.7


In [10]:
import pandas as pd
import joblib
import numpy as np

# Function to preprocess upcoming matches
def preprocess_upcoming_matches(fix, encoders):
    df_upcoming = fix.copy()

    # Ensure 'Date' column is in datetime format
    df_upcoming['Date'] = pd.to_datetime(df_upcoming['Date'], format='%Y-%m-%d', errors='coerce')

    # Function to safely encode team names
    def safe_encode(column, encoder):
        known_teams = set(encoder.classes_)  # Get all known team names

        # Replace unseen teams with a default value (-1)
        df_upcoming[column] = df_upcoming[column].apply(
            lambda team: encoder.transform([team])[0] if team in known_teams else -1
        )

    # Encode categorical features using saved encoders, handling unknown teams
    safe_encode('HomeTeam', encoders['home_team'])
    safe_encode('AwayTeam', encoders['away_team'])

    # Calculate Implied Probabilities
    df_upcoming['Implied_Prob_H'] = 1 / df_upcoming['B365H'].replace({0: np.nan})
    df_upcoming['Implied_Prob_D'] = 1 / df_upcoming['B365D'].replace({0: np.nan})
    df_upcoming['Implied_Prob_A'] = 1 / df_upcoming['B365A'].replace({0: np.nan})

    # Calculate Odds Ratio for Home and Away
    df_upcoming['Odds_Ratio_HA'] = df_upcoming['B365H'] / df_upcoming['B365A']

    # Normalize probabilities to sum to 1
    total_prob = df_upcoming[['Implied_Prob_H', 'Implied_Prob_D', 'Implied_Prob_A']].sum(axis=1)
    df_upcoming['Norm_Prob_H'] = df_upcoming['Implied_Prob_H'] / total_prob
    df_upcoming['Norm_Prob_D'] = df_upcoming['Implied_Prob_D'] / total_prob
    df_upcoming['Norm_Prob_A'] = df_upcoming['Implied_Prob_A'] / total_prob

    # Replace NaN values resulting from division by zero
    df_upcoming[['Norm_Prob_H', 'Norm_Prob_D', 'Norm_Prob_A']] = df_upcoming[
        ['Norm_Prob_H', 'Norm_Prob_D', 'Norm_Prob_A']
    ].fillna(0)

    # Add placeholder columns for rolling averages and win streaks
    for col in ['HomeOdds_5', 'AwayOdds_5', 'HomeGoalsScored_5', 'AwayGoalsScored_5',
                'HomeGoalsConceded_5', 'AwayGoalsConceded_5', 'Home_Win_Streak', 'Away_Win_Streak']:
        df_upcoming[col] = 0  # Placeholder if no history is available

    return df_upcoming

# Prediction function using trained Random Forest models
def predict_upcoming_matches(fix, models, encoders, scaler):
    # Preprocess the upcoming matches
    df_upcoming = preprocess_upcoming_matches(fix, encoders)

    # Load and align feature names
    with open('saved_models/feature_names.pkl', 'rb') as f:
        feature_names = joblib.load(f)
    df_upcoming = df_upcoming.reindex(columns=feature_names, fill_value=0)

    # Apply scaling
    df_upcoming_scaled = scaler.transform(df_upcoming)

    # Predict for each target using best-trained Random Forest models
    predictions_results = {}
    for target, model in models.items():
        predictions_results[f'Predicted_{target}'] = model.predict(df_upcoming_scaled)

    # Combine predictions with original data
    predictions_df = pd.DataFrame(predictions_results)
    final_results = pd.concat([fix.reset_index(drop=True), predictions_df], axis=1)

    return final_results

# Load encoders, scaler, and GridSearch-optimized models
encoders = {
    'home_team': joblib.load('encoders/home_team_encoder.pkl'),
    'away_team': joblib.load('encoders/away_team_encoder.pkl'),
}

scaler = joblib.load('saved_models/scaler.pkl')

# Load the trained Random Forest models
models = {}
targets = ['is_draw', 'hw_draw', 'aw_draw', 'ov_un_15', 'ov_un_25', 'ov_un_35', 'is_ftr']
for target in targets:
    models[target] = joblib.load(f'saved_models/{target}_rf_model.pkl')

# Predict upcoming matches
final_results = predict_upcoming_matches(fix, models, encoders, scaler)
print(final_results)


    Div        Date   Time             HomeTeam      AwayTeam  B365H  B365D  \
0    B1  31/01/2025  19:45  Oud-Heverlee Leuven      Mechelen   2.35   3.25   
1    B1  01/02/2025  15:00            Charleroi        Dender   1.73   3.80   
2    B1  01/02/2025  17:15        Cercle Brugge      Standard   1.83   3.40   
3    B1  01/02/2025  19:45                 Genk  Beerschot VA   1.25   5.50   
4    B1  02/02/2025  12:30              Antwerp   Club Brugge   4.75   3.60   
..   ..         ...    ...                  ...           ...    ...    ...   
187  T1  01/02/2025  16:00           Buyuksehyr    Samsunspor   2.15   3.30   
188  T1  02/02/2025  10:30        Ad. Demirspor     Kasimpasa   3.90   3.80   
189  T1  02/02/2025  13:00               Goztep    Alanyaspor   1.70   3.50   
190  T1  02/02/2025  16:00           Fenerbahce      Rizespor   1.36   4.75   
191  T1  03/02/2025  17:00            Gaziantep   Galatasaray   6.50   4.50   

     B365A  B365<2.5  B365>2.5  Predicted_is_draw  

In [11]:
def predict_upcoming_matches_with_decoded_probs(fix, models, encoders, scaler):
    # Define decoding mappings
    decode_is_ftr = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
    decode_binary = {0: 'No', 1: 'Yes'}

    # Preprocess the upcoming matches
    df_upcoming = preprocess_upcoming_matches(fix, encoders)

    # Load and align feature names
    with open('saved_models/feature_names.pkl', 'rb') as f:
        feature_names = joblib.load(f)
    df_upcoming = df_upcoming.reindex(columns=feature_names, fill_value=0)

    # Apply scaling
    df_upcoming_scaled = scaler.transform(df_upcoming)

    # Predict for each target and collect decoded predictions with probabilities
    results = []
    for idx, row in fix.iterrows():
        match_result = {
            "Date": row["Date"],
            "Time": row["Time"],
            "HomeTeam": row["HomeTeam"],
            "AwayTeam": row["AwayTeam"],
        }
        for target, model in models.items():
            # Prediction
            prediction = model.predict([df_upcoming_scaled[idx]])[0]

            # Decode Prediction
            if target == 'is_ftr':
                decoded_prediction = decode_is_ftr[prediction]
            else:
                decoded_prediction = decode_binary[prediction]
            match_result[f"Decoded_{target}"] = decoded_prediction

            # Respective Probability
            if hasattr(model, "predict_proba"):
                proba = model.predict_proba([df_upcoming_scaled[idx]])[0]
                match_result[f"Prob_{target}"] = proba[prediction]
        results.append(match_result)

    # Convert results into a DataFrame
    final_results = pd.DataFrame(results)
    return final_results

# Predict upcoming matches
final_decoded_results = predict_upcoming_matches_with_decoded_probs(fix, models, encoders, scaler)

In [12]:
from datetime import datetime

# Convert Date column to datetime
final_decoded_results["Date"] = pd.to_datetime(final_decoded_results["Date"], format="%d/%m/%Y")

# Refiltering with proper date format
today = datetime.today().strftime("%d/%m/%Y")  # Get today's date in the same format
filtered_df = final_decoded_results[final_decoded_results["Date"].dt.strftime("%d/%m/%Y") == today]

In [13]:
def display_top_n_predictions(df, probability_columns, n=3):
    for col in probability_columns:
        if "Prob" in col:
            # Get the decoded column name and top N values
            decoded_col = col.replace("Prob", "Decoded")
            top_n_df = df.nlargest(n, col)[["Date", "Time", "HomeTeam", "AwayTeam", col]].copy()
            top_n_df["Predicted_Class"] = df.loc[top_n_df.index, decoded_col]

            print(f"\nTop {n} {col} Predictions:")
            display(top_n_df)

# List of probability columns
probability_columns = ["Prob_is_draw", "Prob_hw_draw", "Prob_aw_draw", 'Prob_ov_un_15', "Prob_ov_un_25", 'Prob_ov_un_35', "Prob_is_ftr"]

# Display top N for each category
display_top_n_predictions(filtered_df, probability_columns, n=15)



Top 15 Prob_is_draw Predictions:


Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Prob_is_draw,Predicted_Class



Top 15 Prob_hw_draw Predictions:


Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Prob_hw_draw,Predicted_Class



Top 15 Prob_aw_draw Predictions:


Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Prob_aw_draw,Predicted_Class



Top 15 Prob_ov_un_15 Predictions:


Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Prob_ov_un_15,Predicted_Class



Top 15 Prob_ov_un_25 Predictions:


Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Prob_ov_un_25,Predicted_Class



Top 15 Prob_ov_un_35 Predictions:


Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Prob_ov_un_35,Predicted_Class



Top 15 Prob_is_ftr Predictions:


Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Prob_is_ftr,Predicted_Class


In [14]:
def best_predictions_per_match(final_results):
    best_predictions = []

    for _, row in final_results.iterrows():
        match = {
            "Date": row["Date"],
            "Time": row["Time"],
            "HomeTeam": row["HomeTeam"],
            "AwayTeam": row["AwayTeam"],
            "FTR": row["Decoded_is_ftr"],
            "Prob_ftr": row["Prob_is_ftr"],
        }

        # Collect all predictions and probabilities
        predictions = [
            ("is_draw", row["Decoded_is_draw"], row["Prob_is_draw"]),
            ("is_ftr", row["Decoded_is_ftr"], row["Prob_is_ftr"]),
            ("hw_draw", row["Decoded_hw_draw"], row["Prob_hw_draw"]),
            ("aw_draw", row["Decoded_aw_draw"], row["Prob_aw_draw"]),
            ("ov_un_25", row["Decoded_ov_un_25"], row["Prob_ov_un_25"]),
        ]

        # Find the prediction with the highest probability
        best_prediction = max(predictions, key=lambda x: x[2])  # Sort by probability
        match["Best_Target"] = best_prediction[0]
        match["Best_Prediction"] = best_prediction[1]
        match["Best_Probability"] = best_prediction[2]

        best_predictions.append(match)

    return pd.DataFrame(best_predictions)

# Generate the DataFrame for best predictions
best_predictions_df = best_predictions_per_match(final_decoded_results)

In [15]:
# Display the best predictions DataFrame
best_predictions_df

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTR,Prob_ftr,Best_Target,Best_Prediction,Best_Probability
0,2025-01-31,19:45,Oud-Heverlee Leuven,Mechelen,Home Win,0.3850,hw_draw,Yes,0.735000
1,2025-02-01,15:00,Charleroi,Dender,Home Win,0.5125,hw_draw,Yes,0.757500
2,2025-02-01,17:15,Cercle Brugge,Standard,Draw,0.4450,hw_draw,Yes,0.825000
3,2025-02-01,19:45,Genk,Beerschot VA,Home Win,0.6575,hw_draw,Yes,0.905000
4,2025-02-02,12:30,Antwerp,Club Brugge,Away Win,0.5075,aw_draw,Yes,0.693945
...,...,...,...,...,...,...,...,...,...
187,2025-02-01,16:00,Buyuksehyr,Samsunspor,Away Win,0.3950,hw_draw,Yes,0.605000
188,2025-02-02,10:30,Ad. Demirspor,Kasimpasa,Away Win,0.6175,aw_draw,Yes,0.789167
189,2025-02-02,13:00,Goztep,Alanyaspor,Home Win,0.4825,is_draw,No,0.690000
190,2025-02-02,16:00,Fenerbahce,Rizespor,Home Win,0.7450,hw_draw,Yes,0.887500


In [16]:
best_predictions_df[best_predictions_df.Date=='2025-01-06'].nlargest(30,'Best_Probability')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,FTR,Prob_ftr,Best_Target,Best_Prediction,Best_Probability


In [17]:
final_decoded_results[final_decoded_results.Date=='06/01/2025'][['Date', 'Time', 'HomeTeam', 'AwayTeam','Decoded_ov_un_25', 'Prob_ov_un_25']].nlargest(15,'Prob_ov_un_25')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Decoded_ov_un_25,Prob_ov_un_25


In [18]:
final_decoded_results[final_decoded_results.Date=='06/01/2025'].nlargest(5,'Prob_ov_un_25')

Unnamed: 0,Date,Time,HomeTeam,AwayTeam,Decoded_is_draw,Prob_is_draw,Decoded_hw_draw,Prob_hw_draw,Decoded_aw_draw,Prob_aw_draw,Decoded_ov_un_15,Prob_ov_un_15,Decoded_ov_un_25,Prob_ov_un_25,Decoded_ov_un_35,Prob_ov_un_35,Decoded_is_ftr,Prob_is_ftr
