In [1]:
import joblib
from sklearn.ensemble import RandomForestClassifier  # Example model type
import os
import re
import pandas as pd
import ast
import os
import random as rd
from datetime import datetime
from functools import reduce
from itertools import product
from operator import mul
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Define the directory to search for text files
folder_path = r"C:\Users\leere\PycharmProjects\Football_ML3\Leagues_top_5"

# Define a regex pattern to extract the relevant data
pattern = re.compile(
    r"League: \('([^']*)', '([^']*)'\), p-value: ([\d.]+), Threshold: ([\d.]+), "
    r"Staked: (\d+), Profit: [^\d-]*([-\d.]+), ROI: ([\d.]+)%, "
    r"Params: ({.*})"
)

# List to store extracted data
data = []

# Walk through the directory and subdirectories
for root, _, files in os.walk(folder_path):
    for file in files:
        if file.endswith(".txt"):  # Only process text files
            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="latin1") as f:
                for line in f:
                    match = pattern.search(line)
                    if match:
                        country, league, p_value, threshold, staked, profit, roi, params = match.groups()
                        data.append({
                            "Country": country,
                            "League": league,
                            "p-value": float(p_value),
                            "Threshold": float(threshold),
                            "Staked": int(staked),
                            "Profit": float(profit),
                            "ROI": float(roi),
                            "Params": params
                        })

# Create a DataFrame from the data
df = pd.DataFrame(data)

In [3]:
#Group the main DataFrame by 'League' and 'Sub-League'
grouped = df.groupby(['Country', 'League'])

# Create a list of DataFrames, one for each group
dataframes_list = [group for _, group in grouped]

# Optionally, name the DataFrames for reference
dataframe_names = [f"{name[0]}_{name[1]}" for name in grouped.groups.keys()]

# Optionally return the list of DataFrames and their names
dataframes_named = dict(zip(dataframe_names, dataframes_list))

In [4]:
# List to store rows with maximum profit from each group
max_profit_rows = []

# Iterate through each group
for name, group in df.groupby(['Country', 'League']):
    # Find the row with the maximum profit in the group
    max_row = group.loc[group['Profit'].idxmax()]
    max_profit_rows.append(max_row)

# Convert the result to a DataFrame
max_profit_df = pd.DataFrame(max_profit_rows)

# Display the resulting DataFrame
max_profit_df


Unnamed: 0,Country,League,p-value,Threshold,Staked,Profit,ROI,Params
994,England,Premier L,0.0055,0.51,343,42.98,12.5,"{'model__n_estimators': 29, 'model__max_depth'..."
4800,France,Ligue 1,0.0084,0.7,250,38.77,15.5,"{'model__hidden_layer_sizes': (443, 114), 'mod..."
6900,Germany,Bundesliga,0.0005,0.61,176,33.15,18.8,"{'model__n_estimators': 93, 'model__max_depth'..."
10900,Italy,Serie A,0.1144,0.72,285,26.8,9.4,"{'model__hidden_layer_sizes': (47, 74), 'model..."
12500,Spain,Primera,0.0315,0.72,234,36.34,15.5,"{'model__hidden_layer_sizes': (50, 50), 'model..."


In [5]:
# # Directory to save the models
# models_dir = "TOP_5_MODELS"  # Change this to your desired directory
# os.makedirs(models_dir, exist_ok=True)
#
# # Iterate through each row in the DataFrame
# for index, row in max_profit_df.iterrows():
#     # Parse the parameters from the Params column
#     params = ast.literal_eval(row['Params'])  # Safely convert string to dictionary
#
#     # Remove the 'model__' prefix from each parameter key
#     cleaned_params = {key.replace('model__', ''): value for key, value in params.items()}
#
#     # Instantiate the model with the cleaned parameters
#     model = RandomForestClassifier(**cleaned_params)
#
#     # Save the model with a unique filename based on League and Sub-League
#     model_filename = f"{models_dir}/{row['Country']}_{row['League']}_model.pkl"
#     joblib.dump(model, model_filename)
#     print(f"Saved model for {row['Country']} - {row['League']} to {model_filename}")

import os
import ast
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Directory to save the models
models_dir = "TOP_5_MODELS_v2"  # Change this to your desired directory
os.makedirs(models_dir, exist_ok=True)

# Iterate through each row in the DataFrame
for index, row in max_profit_df.iterrows():
    # Parse the parameters from the Params column
    params = ast.literal_eval(row['Params'])  # Safely convert string to dictionary

    # Determine the model type based on parameters
    if 'hidden_layer_sizes' in str(params):
        model_type = 'MLP'
    else:
        model_type = 'RandomForest'

    # Remove the 'model__' prefix from each parameter key
    cleaned_params = {key.replace('model__', ''): value for key, value in params.items()}

    # Instantiate the appropriate model
    if model_type == 'MLP':
        model = MLPClassifier(**cleaned_params, random_state=42)
    else:
        model = RandomForestClassifier(**cleaned_params, random_state=42)

    # Save the model with a unique filename based on League and Sub-League
    model_filename = f"{models_dir}/{row['Country']}_{row['League']}_model.pkl"
    joblib.dump(model, model_filename)
    print(f"Saved {model_type} model for {row['Country']} - {row['League']} to {model_filename}")




Saved RandomForest model for England - Premier L to TOP_5_MODELS/England_Premier L_model.pkl
Saved MLP model for France - Ligue 1 to TOP_5_MODELS/France_Ligue 1_model.pkl
Saved RandomForest model for Germany - Bundesliga to TOP_5_MODELS/Germany_Bundesliga_model.pkl
Saved MLP model for Italy - Serie A to TOP_5_MODELS/Italy_Serie A_model.pkl
Saved MLP model for Spain - Primera to TOP_5_MODELS/Spain_Primera_model.pkl


In [6]:
# List of common encodings to try
encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

data = None
for encoding in encodings:
    try:
        data = pd.read_csv("GOAL_DATA_TOP_5.csv", encoding=encoding)
        print(f"Successfully read the file with encoding: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"Failed to decode with encoding: {encoding}")

col_dict = {
    "country": "Country",
    "league": "League",
    "datameci": "Date",
    "etapa": "Round",
    "txtechipa1": "home_team",
    "txtechipa2": "away_team",
    "place1t": "Home_team_place_total",
    "place1a": "Home_team_place_home",
    "place2t": "Away_team_place_total",
    "place2d": "Away_team_place_away",
    "customh": "ELO_home",
    "customa": "ELO_away",
    "custom3": "FORM_home",
    "custom4": "FORM_away",
    "home_val": "home_win",
    "home_val_2": "home_win_15",
    "home_val_3": "home_o25",
    "home_val_4": "home_o35",
    "home_val_5": "home_scored",
    "away_val": "away_win",
    "away_val_2": "away_win_15",
    "away_val_3": "away_o25",
    "away_val_4": "away_o35",
    "away_val_5": "away_scored",
    "scor1": "home_goals",
    "scor2": "away_goals",
    "cotao": "o2.5_odds",
}

data = data.rename(columns=col_dict).filter(items=col_dict.values())
# Convert Date column to datetime type
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')

# Order by date
data = data.sort_values(by='Date')

data = data[data["Round"] >= 8]
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

data['total_goals'] = data['home_goals'] + data['away_goals']
data['over_2.5_goals'] = data['total_goals'].apply(lambda x: 1 if x > 2.5 else 0)

# Create unique tuples across 'Name' and 'City'
leagues = max_profit_df[['Country', 'League']].drop_duplicates().apply(tuple, axis=1).tolist()

data_ready = data.drop(columns=['home_team', 'away_team', 'home_goals', 'away_goals', 'total_goals', 'o2.5_odds', ])




Successfully read the file with encoding: utf-8


In [1]:
import ast
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Directory where models will be saved
models_dir = "TOP_5_MODELS_v2"
os.makedirs(models_dir, exist_ok=True)

# Iterate through each league
for index, (country_name, league_name) in enumerate(leagues):
    # Filter data for the current league
    data_filtered = data_ready[(data_ready["Country"] == country_name) & (data_ready["League"] == league_name)]
    if data_filtered.empty:
        continue

    # Split the data into training and testing sets by date
    cut_off_date = data_filtered['Date'].quantile(0.8)
    train_data = data_filtered[data_filtered['Date'] <= cut_off_date]
    y_train = train_data["over_2.5_goals"]
    train_data = train_data.drop(columns=['Date', 'over_2.5_goals', 'League', 'Country'])
    test_data = data_filtered[data_filtered['Date'] > cut_off_date]
    y_test = test_data["over_2.5_goals"]
    test_data = test_data.drop(columns=['Date', 'over_2.5_goals', 'League', 'Country'])

    # Initialize the scaler
    scaler = StandardScaler()
    scaler.fit(train_data)
    train_data_scaled = scaler.transform(train_data)
    test_data_scaled = scaler.transform(test_data)

    # Extract parameters and threshold from max_profit_df
    params_str = max_profit_df['Params'].iloc[index]
    threshold = max_profit_df['Threshold'].iloc[index]

    # Convert the params from string to dictionary and clean the keys
    params = ast.literal_eval(params_str)
    cleaned_params = {key.replace('model__', ''): value for key, value in params.items()}

    # Determine the model type based on parameters
    if 'hidden_layer_sizes' in str(cleaned_params):
        model_type = 'MLP'
    else:
        model_type = 'RandomForest'

    # Remove the 'model__' prefix from each parameter key
    cleaned_params = {key.replace('model__', ''): value for key, value in params.items()}

    # Instantiate the appropriate model
    if model_type == 'MLP':
        model = MLPClassifier(**cleaned_params, random_state=42)
    else:
        model = RandomForestClassifier(**cleaned_params, random_state=42)

    print(f"Training model for {country_name} - {league_name}...")
    print(f"Params: {cleaned_params}, Threshold: {threshold}")

    model.fit(train_data_scaled, y_train)

    # Save the trained model along with its scaler and threshold
    model_filename = f"{models_dir}/{country_name}_{league_name}_model.pkl"
    joblib.dump({'model': model, 'scaler': scaler, 'threshold': threshold}, model_filename)
    print(f"Trained and saved model for {country_name} - {league_name} at {model_filename}")

    # Evaluate the model on the test data
    y_proba = model.predict_proba(test_data_scaled)[:, 1]  # Probabilities for the positive class
    y_pred = (y_proba >= threshold).astype(int)  # Apply the saved threshold

    # Simulate betting results
    profit_list = []
    profit = 0
    for idx, pred in enumerate(y_pred):
        if pred == 1:  # Bet placed
            odds = data.iloc[test_data.index[idx]]['o2.5_odds']
            profit_value = (odds - 1) if y_test.iloc[idx] == 1 else -1
            profit_list.append(profit_value)
            profit += profit_value

    # Calculate betting performance metrics
    total_stake = len(profit_list)
    roi = (profit / total_stake) * 100 if total_stake > 0 else 0

    # Print results
    print(f"{league_name}")
    print(f"  Staked: {total_stake}")
    print(f"  Profit: £{profit:.2f}")
    print(f"  ROI: {roi:.1f}%")


NameError: name 'os' is not defined