### Data Preprocessing

### Overview

This notebook is used to make the over 2.5 predictions for all the matches saved in the `data/netx_matches.json` file. 

### Pre-requisites 

1. A conda environment is needed.

For example:
```
cd path/to/conda/dir
conda env create -f aifootball_predictions.yaml
conda activate aifootball_predictions
python -m ipykernel install --user --name aifootball_predictions --display-name "aifootball_predictions"
```

2. All the trained models in the `models` directory

3. All the preprocessed dataset `data/preprocessed` which will be used to make the predictions

### Authors

- mauo.andretta222@gmail.com

In [86]:
# install the necessary packages
import pandas as pd
import os
import json
import pickle
import numpy as np

Specify useful features for the home and away teams, togheter with the netural features (related to no team)

In [20]:
home_team_features = [
    'HomeTeam',
    'FTHG', 'HG',  # Full Time Home Team Goals
    'HTHG',        # Half Time Home Team Goals
    'HS',          # Home Team Shots
    'HST',         # Home Team Shots on Target
    'HHW',         # Home Team Hit Woodwork
    'HC',          # Home Team Corners
    'HF',          # Home Team Fouls Committed
    'HFKC',        # Home Team Free Kicks Conceded
    'HO',          # Home Team Offsides
    'HY',          # Home Team Yellow Cards
    'HR',          # Home Team Red Cards
    'HBP',         # Home Team Bookings Points
    'B365H', 'BFH', 'BSH', 'BWH', 'GBH', 'IWH', 'LBH', 'PSH', 'SOH', 'SBH', 'SJH', 'SYH', 'VCH', 'WHH',  # Home win odds
    'BbMxH', 'BbAvH', 'MaxH', 'AvgH',  # Home win odds statistics
    'BFEH',       # Betfair Exchange home win odds
    'BbMxAHH', 'BbAvAHH', 'GBAHH', 'LBAHH', 'B365AHH', 'PAHH', 'MaxAHH', 'AvgAHH',  # Asian handicap home team odds
    'BbAHh', 'AHh', 'GBAH', 'LBAH', 'B365AH',  # Size of handicap (home team)
    'AvgHomeGoalsScored', 'AvgHomeGoalsConceded',
    'HomeOver2.5Perc',
    'AvgLast5HomeGoalsScored', 'AvgLast5HomeGoalsConceded',
    'Last5HomeOver2.5Count', 'Last5HomeOver2.5Perc'
]


In [21]:
away_team_features = [
    'AwayTeam',
    'FTAG', 'AG',  # Full Time Away Team Goals
    'HTAG',        # Half Time Away Team Goals
    'AS',          # Away Team Shots
    'AST',         # Away Team Shots on Target
    'AHW',         # Away Team Hit Woodwork
    'AC',          # Away Team Corners
    'AF',          # Away Team Fouls Committed
    'AFKC',        # Away Team Free Kicks Conceded
    'AO',          # Away Team Offsides
    'AY',          # Away Team Yellow Cards
    'AR',          # Away Team Red Cards
    'ABP',         # Away Team Bookings Points
    'B365A', 'BFA', 'BSA', 'BWA', 'GBA', 'IWA', 'LBA', 'PSA', 'SOA', 'SBA', 'SJA', 'SYA', 'VCA', 'WHA',  # Away win odds
    'BbMxA', 'BbAvA', 'MaxA', 'AvgA',  # Away win odds statistics
    'BFEA',       # Betfair Exchange away win odds
    'BbMxAHA', 'BbAvAHA', 'GBAHA', 'LBAHA', 'B365AHA', 'PAHA', 'MaxAHA', 'AvgAHA',  # Asian handicap away team odds
    'AvgAwayGoalsScored', 'AvgAwayGoalsConceded',
    'AwayOver2.5Perc',
    'AvgLast5AwayGoalsScored', 'AvgLast5AwayGoalsConceded',
    'Last5AwayOver2.5Count', 'Last5AwayOver2.5Perc'
]

In [72]:
general_features = [
    'Div',       # League Division
    'Date',      # Match Date
    'Time',      # Match Time
    'FTR', 'Res',  # Full Time Result
    'HTR',        # Half Time Result
    'Attendance', # Crowd Attendance
    'Referee',    # Match Referee
    'Bb1X2',      # Number of BetBrain bookmakers used to calculate match odds averages and maximums
    'BbMxD', 'BbAvD', 'MaxD', 'AvgD',  # Draw odds statistics
    'B365D', 'BFD', 'BSD', 'BWD', 'GBD', 'IWD', 'LBD', 'PSD', 'SOD', 'SBD', 'SJD', 'SYD', 'VCD', 'WHD',  # Draw odds
    'BbOU',       # Number of BetBrain bookmakers used to calculate over/under 2.5 goals
    'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5',  # Over/Under 2.5 goals odds statistics
    'GB>2.5', 'GB<2.5', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5',
    'MaxC>2.5', 'B365C<2.5',  # Over/Under 2.5 goals odds
    'BbAH',       # Number of BetBrain bookmakers used to Asian handicap averages and maximums
    'Over2.5'     # Binary indicator if the match ended with more than 2.5 total goals
]

In [4]:
# Open the JSON file and load its data into a dictionary
with open("../data/next_matches.json", 'r', encoding='utf-16') as json_file:
    data_dict = json.load(json_file)

In [5]:
data_dict

{'E0': [{'date': '2024-08-31 11:30:00',
   'home_team': 'Arsenal',
   'away_team': 'Brighton'},
  {'date': '2024-08-31 14:00:00',
   'home_team': 'Brentford',
   'away_team': 'Southampton'},
  {'date': '2024-08-31 14:00:00',
   'home_team': 'Everton',
   'away_team': 'Bournemouth'},
  {'date': '2024-08-31 14:00:00',
   'home_team': 'Ipswich',
   'away_team': 'Fulham'},
  {'date': '2024-08-31 14:00:00',
   'home_team': 'Leicester',
   'away_team': 'Aston Villa'},
  {'date': '2024-08-31 14:00:00',
   'home_team': "Nott'm Forest",
   'away_team': 'Wolves'},
  {'date': '2024-08-31 16:30:00',
   'home_team': 'West Ham',
   'away_team': 'Man City'},
  {'date': '2024-09-01 12:30:00',
   'home_team': 'Chelsea',
   'away_team': 'Crystal Palace'},
  {'date': '2024-09-01 12:30:00',
   'home_team': 'Newcastle',
   'away_team': 'Tottenham'},
  {'date': '2024-09-01 15:00:00',
   'home_team': 'Man United',
   'away_team': 'Liverpool'}],
 'SP1': [{'date': '2024-08-26 19:30:00',
   'home_team': 'Villar

In [73]:
# read all the csv files from the data/raw folder
# walk through the data/raw folder and read all the csv files

directory_path = '../data/processed'  # Replace with your directory path
dataframes = {}

# Walk through the directory and find CSV files
for root, dirs, files in os.walk(directory_path):
    for file in files:
        if file.endswith('.csv'):
            file_path = os.path.join(root, file)
            
            # Determine the key and load the dataframe based on the league identifier
            if 'E0' in file:
                dataframes['uk_data'] = pd.read_csv(file_path)
                print(f"Created dataframe: uk_data from {file}")
            elif 'I1' in file:
                dataframes['it_data'] = pd.read_csv(file_path)
                print(f"Created dataframe: it_data from {file}")
            elif 'SP1' in file:
                dataframes['es_data'] = pd.read_csv(file_path)
                print(f"Created dataframe: es_data from {file}")
            elif 'D1' in file:
                dataframes['de_data'] = pd.read_csv(file_path)
                print(f"Created dataframe: de_data from {file}")
            elif 'F1' in file:
                dataframes['fr_data'] = pd.read_csv(file_path)
                print(f"Created dataframe: fr_data from {file}")
            else:
                print(f"File {file} does not match any known league identifier.")

# Access the dataframes
uk_data = dataframes.get('uk_data')
it_data = dataframes.get('it_data')
es_data = dataframes.get('es_data')
de_data = dataframes.get('de_data')
fr_data = dataframes.get('fr_data')          

Created dataframe: de_data from D1_merged_preprocessed.csv
Created dataframe: uk_data from E0_merged_preprocessed.csv
Created dataframe: fr_data from F1_merged_preprocessed.csv
Created dataframe: it_data from I1_merged_preprocessed.csv
Created dataframe: es_data from SP1_merged_preprocessed.csv


In [74]:
# Directory containing the .pkl files
models_dir = '../models'
models = {}
# Iterate through the directory to find all .pkl files
for filename in os.listdir(models_dir):
    if filename.endswith('.pkl'):
        # Full path to the .pkl file
        filepath = os.path.join(models_dir, filename)
        print(f"Loading model from {filepath}")

        # Determine the key and load the dataframe based on the league identifier
        if 'E0' in filepath:
            print(f"Loading model: uk from {filepath}")
            # Load the model using pickle
            with open(filepath, 'rb') as file:
                model = pickle.load(file)
                
            models['uk_model'] = model
            print(f"Instantiated model: uk_data from {filepath}")


Loading model from ../models\D1_voting_classifier.pkl
Loading model from ../models\E0_voting_classifier.pkl
Loading model: uk from ../models\E0_voting_classifier.pkl
Instantiated model: uk_data from ../models\E0_voting_classifier.pkl
Loading model from ../models\F1_voting_classifier.pkl
Loading model from ../models\I1_voting_classifier.pkl
Loading model from ../models\SP1_voting_classifier.pkl


In [102]:
# Make the predictions
for league, matches in data_dict.items():
    print(f"Making predictions for {league}")
    if "E0" in league:
        for match in matches:            
            home_team = match['home_team']
            away_team = match['away_team']
            print(f"Home team: {home_team}, Away team: {away_team}")

            # row to predict
            numeric_columns = uk_data.select_dtypes(include=['number']).columns  # Get the column names for all the numeric columns
            # If the column Over2.5 is present, drop it
            if 'Over2.5' in numeric_columns:
                numeric_columns = numeric_columns.drop('Over2.5')

            row = pd.DataFrame(columns=numeric_columns)  # Create an empty DataFrame with the numeric columns
            row.loc[len(row)] = [None] * len(row.columns)  # Initialize a new row with NaN

            # Check if the home team is in the DataFrame
            if home_team not in uk_data['HomeTeam'].values:
                print(f"Home team {home_team} not found in the DataFrame, skipping to the next match")
                continue

            # Check if the away team is in the DataFrame
            if away_team not in uk_data['AwayTeam'].values:
                print(f"Away team {away_team} not found in the DataFrame, skipping to the next match")
                continue

            # get the data for the home team
            home_team_df = uk_data[uk_data['HomeTeam'] == home_team]
            # Sort the filtered DataFrame by Date in descending order
            home_sorted_df = home_team_df.sort_values(by='Date', ascending=False)
            # Select the first 5 rows
            home_team_final_df = home_sorted_df.head(5)[numeric_columns]

            # get the data for the home team
            away_team_df = uk_data[uk_data['AwayTeam'] == away_team]
            # Sort the filtered DataFrame by Date in descending order
            away_sorted_df = away_team_df.sort_values(by='Date', ascending=False)
            # Select the first 5 rows
            away_team_final_df = away_sorted_df.head(5)[numeric_columns]  

            for column in home_team_features:
                if column in home_team_final_df.columns:
                    #print(f"Column in the home_team_df: {column}")
                    row.loc[len(row)-1,column] = home_team_final_df[column].mean()
            
            for column in away_team_features:
                if column in away_team_final_df.columns:
                    #print(f"Column in the away_team_df: {column}")
                    row.loc[len(row)-1, column] = away_team_final_df[column].mean()

            for column in general_features:
                if column != "Over2.5":
                    if ((column in away_team_final_df.columns) and (column in home_team_final_df.columns)):
                        #print(f"Column in the away_team_df and in the home_team_final_df: {column}")
                        row.loc[len(row)-1,column] = away_team_final_df[column].mean() + home_team_final_df[column].mean() / 2

            # Make the prediction
            X_test = row.values
            prediction = models['uk_model'].predict(X_test)
            predicted_probability = models['uk_model'].predict_proba(X_test)[0]

            if prediction == 1:
                print(f"The match between {home_team} and {away_team} will end with more than 2.5 goals. With a probability of {[round (prediction,2) for prediction in predicted_probability]}")
            else:
                print(f"The match between {home_team} and {away_team} will end with less than 2.5 goals. With a probability of {[round (prediction,2) for prediction in predicted_probability]}")


Making predictions for E0
Home team: Arsenal, Away team: Brighton
The match between Arsenal and Brighton will end with more than 2.5 goals. With a probability of [0.36, 0.64]
Home team: Brentford, Away team: Southampton
The match between Brentford and Southampton will end with less than 2.5 goals. With a probability of [0.79, 0.21]
Home team: Everton, Away team: Bournemouth
The match between Everton and Bournemouth will end with less than 2.5 goals. With a probability of [0.74, 0.26]
Home team: Ipswich, Away team: Fulham
The match between Ipswich and Fulham will end with less than 2.5 goals. With a probability of [0.9, 0.1]
Home team: Leicester, Away team: Aston Villa
The match between Leicester and Aston Villa will end with less than 2.5 goals. With a probability of [0.56, 0.44]
Home team: Nott'm Forest, Away team: Wolves
The match between Nott'm Forest and Wolves will end with less than 2.5 goals. With a probability of [0.79, 0.21]
Home team: West Ham, Away team: Man City
The match b

In [79]:
home_team_final_df.head()

Unnamed: 0,Last5HomeOver2.5Perc,Last5AwayOver2.5Perc,HST,AST,HomeOver2.5Perc,AvgLast5AwayGoalsConceded,AwayOver2.5Perc,AvgLast5HomeGoalsScored,AvgLast5HomeGoalsConceded,AvgLast5AwayGoalsScored,MaxC>2.5,B365C<2.5,AvgHomeGoalsScored,HR
365,0.0,0.0,5,2,0.0,1.0,0.0,1.0,0.0,0.0,1.66,2.3,1.0,0
575,60.0,60.0,8,8,68.42,1.8,73.68,2.0,1.6,1.8,1.4,3.2,1.63,0
37,40.0,40.0,2,5,68.42,0.4,52.63,1.8,1.2,1.8,1.4,3.0,1.63,0
210,60.0,60.0,10,7,68.42,1.4,63.16,2.0,1.4,1.8,1.41,3.75,1.63,0
631,80.0,60.0,13,4,68.42,2.4,57.89,2.4,1.2,1.0,1.37,3.4,1.63,0


In [80]:
away_team_final_df.head()

Unnamed: 0,Last5HomeOver2.5Perc,Last5AwayOver2.5Perc,HST,AST,HomeOver2.5Perc,AvgLast5AwayGoalsConceded,AwayOver2.5Perc,AvgLast5HomeGoalsScored,AvgLast5HomeGoalsConceded,AvgLast5AwayGoalsScored,MaxC>2.5,B365C<2.5,AvgHomeGoalsScored,HR
442,0.0,0.0,2,5,0.0,0.0,0.0,0.0,2.0,2.0,1.4,3.2,0.0,0
441,80.0,80.0,5,7,78.95,2.0,63.16,2.6,1.8,2.0,1.39,3.2,2.53,0
440,40.0,60.0,8,8,52.63,1.4,63.16,1.2,1.6,1.6,1.42,3.2,1.63,0
439,20.0,60.0,6,7,36.84,1.2,63.16,1.4,0.8,2.0,1.53,2.63,1.16,0
438,80.0,80.0,5,7,63.16,1.4,63.16,1.6,1.2,2.2,1.48,2.75,1.63,0


In [81]:
row.head()

Unnamed: 0,Last5HomeOver2.5Perc,Last5AwayOver2.5Perc,HST,AST,HomeOver2.5Perc,AvgLast5AwayGoalsConceded,AwayOver2.5Perc,AvgLast5HomeGoalsScored,AvgLast5HomeGoalsConceded,AvgLast5AwayGoalsScored,MaxC>2.5,B365C<2.5,AvgHomeGoalsScored,HR
0,48.0,56.0,7.6,6.8,54.736,1.2,50.528,1.84,1.08,1.96,2.168,4.561,1.504,0.0


In [93]:
X_test = row.values
X_test

array([[48.0, 56.0, 7.6, 6.8, 54.736000000000004, 1.2, 50.528,
        1.8399999999999999, 1.0799999999999998, 1.9600000000000002,
        2.168, 4.561, 1.504, 0.0]], dtype=object)

In [95]:
prediction = models['uk_model'].predict(X_test)
print(f"Prediction: {prediction}")

Prediction: [1]


In [98]:
# get the probabilities
prediction = models['uk_model'].predict_proba(X_test)[0]
print(f"Prediction probabilities of Over2.5: {[round (prediction,2) for prediction in prediction]}")

Prediction probabilities of Over2.5: [0.45, 0.55]
