### Data Preprocessing

### Overview

This notebook is used to make the over 2.5 predictions for all the matches saved in the `data/netx_matches.json` file. 

### Pre-requisites 

1. A conda environment is needed.

For example:
```
cd path/to/conda/dir
conda env create -f aifootball_predictions.yaml
conda activate aifootball_predictions
python -m ipykernel install --user --name aifootball_predictions --display-name "aifootball_predictions"
```

2. All the trained models in the `models` directory

3. All the preprocessed dataset `data/preprocessed` which will be used to make the predictions

### Authors

- mauo.andretta222@gmail.com

In [20]:
# install the necessary packages
import pandas as pd
import os
import json
import pickle
import numpy as np
from datetime import datetime
from sklearn.impute import KNNImputer

Specify useful features for the home and away teams, togheter with the netural features (related to no team)

In [2]:
home_team_features = [
    'HomeTeam',
    'FTHG', 'HG',  # Full Time Home Team Goals
    'HTHG',        # Half Time Home Team Goals
    'HS',          # Home Team Shots
    'HST',         # Home Team Shots on Target
    'HHW',         # Home Team Hit Woodwork
    'HC',          # Home Team Corners
    'HF',          # Home Team Fouls Committed
    'HFKC',        # Home Team Free Kicks Conceded
    'HO',          # Home Team Offsides
    'HY',          # Home Team Yellow Cards
    'HR',          # Home Team Red Cards
    'HBP',         # Home Team Bookings Points
    'B365H', 'BFH', 'BSH', 'BWH', 'GBH', 'IWH', 'LBH', 'PSH', 'SOH', 'SBH', 'SJH', 'SYH', 'VCH', 'WHH',  # Home win odds
    'BbMxH', 'BbAvH', 'MaxH', 'AvgH',  # Home win odds statistics
    'BFEH',       # Betfair Exchange home win odds
    'BbMxAHH', 'BbAvAHH', 'GBAHH', 'LBAHH', 'B365AHH', 'PAHH', 'MaxAHH', 'AvgAHH',  # Asian handicap home team odds
    'BbAHh', 'AHh', 'GBAH', 'LBAH', 'B365AH',  # Size of handicap (home team)
    'AvgHomeGoalsScored', 'AvgHomeGoalsConceded',
    'HomeOver2.5Perc',
    'AvgLast5HomeGoalsScored', 'AvgLast5HomeGoalsConceded',
    'Last5HomeOver2.5Count', 'Last5HomeOver2.5Perc'
]


In [3]:
away_team_features = [
    'AwayTeam',
    'FTAG', 'AG',  # Full Time Away Team Goals
    'HTAG',        # Half Time Away Team Goals
    'AS',          # Away Team Shots
    'AST',         # Away Team Shots on Target
    'AHW',         # Away Team Hit Woodwork
    'AC',          # Away Team Corners
    'AF',          # Away Team Fouls Committed
    'AFKC',        # Away Team Free Kicks Conceded
    'AO',          # Away Team Offsides
    'AY',          # Away Team Yellow Cards
    'AR',          # Away Team Red Cards
    'ABP',         # Away Team Bookings Points
    'B365A', 'BFA', 'BSA', 'BWA', 'GBA', 'IWA', 'LBA', 'PSA', 'SOA', 'SBA', 'SJA', 'SYA', 'VCA', 'WHA',  # Away win odds
    'BbMxA', 'BbAvA', 'MaxA', 'AvgA',  # Away win odds statistics
    'BFEA',       # Betfair Exchange away win odds
    'BbMxAHA', 'BbAvAHA', 'GBAHA', 'LBAHA', 'B365AHA', 'PAHA', 'MaxAHA', 'AvgAHA',  # Asian handicap away team odds
    'AvgAwayGoalsScored', 'AvgAwayGoalsConceded',
    'AwayOver2.5Perc',
    'AvgLast5AwayGoalsScored', 'AvgLast5AwayGoalsConceded',
    'Last5AwayOver2.5Count', 'Last5AwayOver2.5Perc'
]

In [13]:
general_features = [
    'Div',       # League Division
    'Date',      # Match Date
    'Time',      # Match Time
    'FTR', 'Res',  # Full Time Result
    'HTR',        # Half Time Result
    'Attendance', # Crowd Attendance
    'Referee',    # Match Referee
    'Bb1X2',      # Number of BetBrain bookmakers used to calculate match odds averages and maximums
    'BbMxD', 'BbAvD', 'MaxD', 'AvgD',  # Draw odds statistics
    'B365D', 'BFD', 'BSD', 'BWD', 'GBD', 'IWD', 'LBD', 'PSD', 'SOD', 'SBD', 'SJD', 'SYD', 'VCD', 'WHD',  # Draw odds
    'BbOU',       # Number of BetBrain bookmakers used to calculate over/under 2.5 goals
    'BbMx>2.5', 'BbAv>2.5', 'BbMx<2.5', 'BbAv<2.5',  # Over/Under 2.5 goals odds statistics
    'GB>2.5', 'GB<2.5', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'AvgC>2.5', 'Avg<2.5', 'AvgC<2.5', 'MaxCAHA', 'B365CD', 'PC<2.5',
    'MaxC>2.5', 'B365C<2.5',  'MaxCA', 'B365CAHH',# Over/Under 2.5 goals odds
    'BbAH',       # Number of BetBrain bookmakers used to Asian handicap averages and maximums
    'Over2.5'     # Binary indicator if the match ended with more than 2.5 total goals
]

In [5]:
# Open the JSON file and load its data into a dictionary
with open("../data/next_matches.json", 'r', encoding='utf-16') as json_file:
    data_dict = json.load(json_file)

In [6]:
data_dict

{'E0': {'id': 2021,
  'crest': 'https://crests.football-data.org/PL.png',
  'name': 'Premier League',
  'next_matches': [{'date': '2024-09-14 11:30:00',
    'home_team': 'Southampton',
    'away_team': 'Man United',
    'home_team_crest': 'https://crests.football-data.org/340.png',
    'away_team_crest': 'https://crests.football-data.org/66.png'},
   {'date': '2024-09-14 14:00:00',
    'home_team': 'Brighton',
    'away_team': 'Ipswich',
    'home_team_crest': 'https://crests.football-data.org/397.png',
    'away_team_crest': 'https://crests.football-data.org/349.png'},
   {'date': '2024-09-14 14:00:00',
    'home_team': 'Crystal Palace',
    'away_team': 'Leicester',
    'home_team_crest': 'https://crests.football-data.org/354.png',
    'away_team_crest': 'https://crests.football-data.org/338.png'},
   {'date': '2024-09-14 14:00:00',
    'home_team': 'Fulham',
    'away_team': 'West Ham',
    'home_team_crest': 'https://crests.football-data.org/63.png',
    'away_team_crest': 'https:/

In [21]:
# Define the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

# Function to apply KNN imputer to league data and row_to_predict
def impute_with_knn(league_data: pd.DataFrame, row_to_predict: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values in row_to_predict using KNN based on the entire league data.

    Args:
        league_data (pd.DataFrame): The entire dataset for the current league.
        row_to_predict (pd.DataFrame): The row that needs to be predicted and imputed.

    Returns:
        pd.DataFrame: The imputed row.
    """
    # Select only numeric columns from the league data
    numeric_columns = league_data.select_dtypes(include=['number']).columns
    
    # Combine the row_to_predict with league data
    combined_data = pd.concat([league_data[numeric_columns], row_to_predict], ignore_index=True)
    
    # Apply KNN imputation on the combined data
    imputed_data = knn_imputer.fit_transform(combined_data)
    
    # Extract the last row (the imputed row_to_predict)
    imputed_row = pd.DataFrame([imputed_data[-1]], columns=numeric_columns)
    
    return imputed_row


In [23]:
# Open the JSON file and load its data into a dictionary
json_competitions = '../data/next_matches.json'
with open(json_competitions, 'r', encoding='utf-16') as json_file:
    competitions = json.load(json_file)

# Define the models directory
models_dir = '../models'
# Define the data directory to make predictions
data_dir = '../data/processed'

VALID_LEAGUES = ["E0","I1", "D1", "SP1", "F1"]

# Initialize a variable to hold the Telegram message
prediction_message = f"🎯 **AI Football Predictions: Will There Be Over 2.5 Goals?** 🎯\n\nCheck out the latest predictions for the upcoming football matches! We've analyzed the data and here are our thoughts:\n PREDICTIONS DONE: {datetime.now().strftime('%Y-%m-%d')} \n\n"

for league in VALID_LEAGUES:
    # Iterate through the directory to find all .pkl files
    for filename in os.listdir(models_dir):
        # Check if the name of the file contains the league identifier
        if league in filename:
            # Checl if the file is a .pkl file
            if filename.endswith('.pkl'):
                filepath = os.path.join(models_dir, filename)

                # Load the model using pickle for the current league
                with open(filepath, 'rb') as file:
                    league_model = pickle.load(file)
    
    # Iterate through the directory to find all .csv files
    for filename in os.listdir(data_dir):
        # Check if the name of the file contains the league identifier
        if league in filename:
            # Check if the file is a .csv file
            if filename.endswith('.csv'):
                filepath = os.path.join(data_dir, filename)
                # Load the data using pandas for the current league
                league_data = pd.read_csv(filepath)

    # If the input_league_data and league_model is not empty, make the predictions
    if league_data is None and league_model is None:
        print(f"Could not find the data or model for the league {league}")
        continue
    # else make the predictions
    else:    
        # Make the predictions
        for competition_league, competitions_info in competitions.items():
            if competition_league == league:
                # Define the nationality flag to use in the message
                print(f"\n Making predictions for {league} \n")
                # Prepare the section for the current league
                league_section = f"🔵 **{competitions_info['name']}**:\n"
                for match in competitions_info["next_matches"]:            
                    home_team = match['home_team']
                    away_team = match['away_team']
                    print(f"Home team: {home_team}, Away team: {away_team}")

                    # row to predict
                    numeric_columns = league_data.select_dtypes(include=['number']).columns  # Get the column names for all the numeric columns

                    # If the column Over2.5 is in the numeric columns, remove it because it is the target column
                    if 'Over2.5' in numeric_columns:
                        numeric_columns = numeric_columns.drop('Over2.5')

                    # Define the row to predict 
                    row_to_predict = pd.DataFrame(columns=numeric_columns)  # Create an empty DataFrame with the numeric columns
                    row_to_predict.loc[len(row_to_predict)] = [None] * len(row_to_predict.columns)  # Initialize a new row with NaN

                    # Check if the home team is in the DataFrame
                    if home_team not in league_data['HomeTeam'].values:
                        print(f"Home team {home_team} not found in the {league} data, skipping to the next match")
                        continue

                    # Check if the away team is in the DataFrame
                    if away_team not in league_data['AwayTeam'].values:
                        print(f"Away team {away_team} not found in the {league} data, skipping to the next match")
                        continue

                    # get the data for the home team
                    home_team_df = league_data[league_data['HomeTeam'] == home_team]
                    # Sort the filtered DataFrame by Date in descending order
                    home_sorted_df = home_team_df.sort_values(by='Date', ascending=False)
                    # Select the first 5 rows
                    home_team_final_df = home_sorted_df.head(5)[numeric_columns]

                    # get the data for the home team
                    away_team_df = league_data[league_data['AwayTeam'] == away_team]
                    # Sort the filtered DataFrame by Date in descending order
                    away_sorted_df = away_team_df.sort_values(by='Date', ascending=False)
                    # Select the first 5 rows
                    away_team_final_df = away_sorted_df.head(5)[numeric_columns]

                    for column in row_to_predict.columns:
                        if column in home_team_features:
                            #print(f"Column in the home_team_df: {column}")
                            row_to_predict.loc[len(row_to_predict)-1,column] = home_team_final_df[column].mean()
                        elif column in away_team_features:
                            #print(f"Column in the away_team_df: {column}")
                            row_to_predict.loc[len(row_to_predict)-1, column] = away_team_final_df[column].mean()
                        else :
                            #print(f"Column in the general_features: {column}")
                            row_to_predict.loc[len(row_to_predict)-1,column] = away_team_final_df[column].mean() + home_team_final_df[column].mean() / 2

                    try:
                        # Make the prediction
                        X_test = row_to_predict.values
                        prediction = league_model.predict(X_test)
                        predicted_probability = league_model.predict_proba(X_test)[0]

                        if prediction == 1:
                            print(f"The match between {home_team} and {away_team} will end with more than 2.5 goals. With a probability of {[round (prediction,2) for prediction in predicted_probability]}")
                            result = f"Over 2.5 Goals! 🔥 ({round(predicted_probability[1] * 100, 2)}% chance)"
                        else:
                            print(f"The match between {home_team} and {away_team} will end with less than 2.5 goals. With a probability of {[round (prediction,2) for prediction in predicted_probability]}")
                            result = f"Under 2.5 Goals ({round(predicted_probability[0] * 100, 2)}% chance)"

                        # Add the match result to the league section
                        league_section += f"- ⚽ **{home_team}** 🆚 **{away_team}**: {result}\n"
                    except Exception as e:
                        print(f"An error occurred while making the prediction for the match between {home_team} and {away_team}")
                        print(f"Error: {e}")
                        row_to_predict.head()
                        continue

        # Add the league section to the Telegram message
        prediction_message += league_section + "\n"


 Making predictions for E0 

Home team: Southampton, Away team: Man United


TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid

In [19]:
print(prediction_message)

🎯 **AI Football Predictions: Will There Be Over 2.5 Goals?** 🎯

Check out the latest predictions for the upcoming football matches! We've analyzed the data and here are our thoughts:
 PREDICTIONS DONE: 2024-09-09 

🔵 **Premier League**:
- ⚽ **Southampton** 🆚 **Man United**: Under 2.5 Goals (61.63% chance)
- ⚽ **Brighton** 🆚 **Ipswich**: Over 2.5 Goals! 🔥 (85.26% chance)
- ⚽ **Crystal Palace** 🆚 **Leicester**: Over 2.5 Goals! 🔥 (71.3% chance)
- ⚽ **Fulham** 🆚 **West Ham**: Over 2.5 Goals! 🔥 (75.39% chance)
- ⚽ **Liverpool** 🆚 **Nott'm Forest**: Over 2.5 Goals! 🔥 (56.05% chance)
- ⚽ **Man City** 🆚 **Brentford**: Over 2.5 Goals! 🔥 (86.87% chance)
- ⚽ **Aston Villa** 🆚 **Everton**: Under 2.5 Goals (59.1% chance)
- ⚽ **Bournemouth** 🆚 **Chelsea**: Over 2.5 Goals! 🔥 (89.64% chance)
- ⚽ **Tottenham** 🆚 **Arsenal**: Over 2.5 Goals! 🔥 (84.23% chance)
- ⚽ **Wolves** 🆚 **Newcastle**: Over 2.5 Goals! 🔥 (55.86% chance)

🔵 **Serie A**:
- ⚽ **Empoli** 🆚 **Juventus**: Under 2.5 Goals (74.29% chance)
-

In [151]:
# File path and name
file_path = "final_predictions.txt"

# Saving the string to a file
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(prediction_message)

In [79]:
home_team_final_df.head()

Unnamed: 0,Last5HomeOver2.5Perc,Last5AwayOver2.5Perc,HST,AST,HomeOver2.5Perc,AvgLast5AwayGoalsConceded,AwayOver2.5Perc,AvgLast5HomeGoalsScored,AvgLast5HomeGoalsConceded,AvgLast5AwayGoalsScored,MaxC>2.5,B365C<2.5,AvgHomeGoalsScored,HR
365,0.0,0.0,5,2,0.0,1.0,0.0,1.0,0.0,0.0,1.66,2.3,1.0,0
575,60.0,60.0,8,8,68.42,1.8,73.68,2.0,1.6,1.8,1.4,3.2,1.63,0
37,40.0,40.0,2,5,68.42,0.4,52.63,1.8,1.2,1.8,1.4,3.0,1.63,0
210,60.0,60.0,10,7,68.42,1.4,63.16,2.0,1.4,1.8,1.41,3.75,1.63,0
631,80.0,60.0,13,4,68.42,2.4,57.89,2.4,1.2,1.0,1.37,3.4,1.63,0


In [80]:
away_team_final_df.head()

Unnamed: 0,Last5HomeOver2.5Perc,Last5AwayOver2.5Perc,HST,AST,HomeOver2.5Perc,AvgLast5AwayGoalsConceded,AwayOver2.5Perc,AvgLast5HomeGoalsScored,AvgLast5HomeGoalsConceded,AvgLast5AwayGoalsScored,MaxC>2.5,B365C<2.5,AvgHomeGoalsScored,HR
442,0.0,0.0,2,5,0.0,0.0,0.0,0.0,2.0,2.0,1.4,3.2,0.0,0
441,80.0,80.0,5,7,78.95,2.0,63.16,2.6,1.8,2.0,1.39,3.2,2.53,0
440,40.0,60.0,8,8,52.63,1.4,63.16,1.2,1.6,1.6,1.42,3.2,1.63,0
439,20.0,60.0,6,7,36.84,1.2,63.16,1.4,0.8,2.0,1.53,2.63,1.16,0
438,80.0,80.0,5,7,63.16,1.4,63.16,1.6,1.2,2.2,1.48,2.75,1.63,0


In [81]:
row_to_predict.head()

Unnamed: 0,Last5HomeOver2.5Perc,Last5AwayOver2.5Perc,HST,AST,HomeOver2.5Perc,AvgLast5AwayGoalsConceded,AwayOver2.5Perc,AvgLast5HomeGoalsScored,AvgLast5HomeGoalsConceded,AvgLast5AwayGoalsScored,MaxC>2.5,B365C<2.5,AvgHomeGoalsScored,HR
0,48.0,56.0,7.6,6.8,54.736,1.2,50.528,1.84,1.08,1.96,2.168,4.561,1.504,0.0


In [93]:
X_test = row_to_predict.values
X_test

array([[48.0, 56.0, 7.6, 6.8, 54.736000000000004, 1.2, 50.528,
        1.8399999999999999, 1.0799999999999998, 1.9600000000000002,
        2.168, 4.561, 1.504, 0.0]], dtype=object)

In [95]:
prediction = models['uk_model'].predict(X_test)
print(f"Prediction: {prediction}")

Prediction: [1]


In [98]:
# get the probabilities
prediction = models['uk_model'].predict_proba(X_test)[0]
print(f"Prediction probabilities of Over2.5: {[round (prediction,2) for prediction in prediction]}")

Prediction probabilities of Over2.5: [0.45, 0.55]
