PART TWO
PREDICTION USING TEAM PERFORMANCE

In [1]:
# Load the libraries to be used

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, label_binarize
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from imblearn.over_sampling import SMOTE

# Visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns




In [5]:
# PREDICTION USING TEAM PERFORMANCE
# Read the raw match data (it contains several leagues) from its location on disk
Data_path = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\Dataset 2\football_matches_dataset.csv"
Data = pd.read_csv(filepath_or_buffer=Data_path)

# Preview the top of the frame as a quick sanity check on the load
print(" The First few rows of the dataset:")
print(Data.head(n=5))

 The First few rows of the dataset:
   ID              home_team            away_team            date   league  \
0   0                  Reims  Paris Saint Germain  August 08 2014  Ligue 1   
1   1  Evian Thonon Gaillard                 Caen  August 09 2014  Ligue 1   
2   2               Guingamp        Saint-Etienne  August 09 2014  Ligue 1   
3   3                  Lille                 Metz  August 09 2014  Ligue 1   
4   4            Montpellier             Bordeaux  August 09 2014  Ligue 1   

      season  home_score  away_score  home_xg  away_xg  ...  \
0  2014/2015           2           2     1.37     2.66  ...   
1  2014/2015           0           3     0.81     1.24  ...   
2  2014/2015           0           2     0.63     1.50  ...   
3  2014/2015           0           0     1.54     0.06  ...   
4  2014/2015           0           1     1.02     0.75  ...   

  home_defense_rating_last_2_home_matches  \
0                                     NaN   
1                         

In [6]:
# DATA CLEANING
# Narrow the raw data to EPL fixtures, remove the columns that are not needed,
# discard incomplete rows, and save the result to disk.

# Keep only the rows whose league is the EPL (all other leagues are discarded)
is_epl_match = Data['league'] == 'EPL'
Epl_matches = Data[is_epl_match]

# Preview the filtered rows to verify that only EPL matches remain
print(Epl_matches.head())

# Columns that are not needed: the date fields, league/season, the row ID and
# every expected-goals (xg) variable
columns_to_drop = [
    'date',
    'league',
    'season',
    'home_xg',
    'away_xg',
    'datetime',
    'home_mean_xg_last_4_matches',
    'away_mean_xg_last_4_matches',
    'home_mean_xg_last_2_home_matches',
    'away_mean_xg_last_2_away_matches',
    'home_mean_xg_against_last_4_matches',
    'away_mean_xg_against_last_4_matches',
    'home_mean_xg_against_last_2_home_matches',
    'away_mean_xg_against_last_2_away_matches',
    'ID',
]

Epl_matches_dropped = Epl_matches.drop(labels=columns_to_drop, axis=1)

# Preview the reduced frame to confirm that the columns are gone
print(Epl_matches_dropped.head())

# Inspect data types, duplicate rows and missing values before cleaning

# Data type of every remaining column
print(Epl_matches_dropped.dtypes)

# Number of fully duplicated rows
print(Epl_matches_dropped.duplicated().sum())

# Missing-value count per column
print(Epl_matches_dropped.isna().sum())

# Discard every row that has at least one missing value
# (dropna defaults: operate on rows, drop when any value is missing)
Epl_matches_dropped = Epl_matches_dropped.dropna()

# Re-count the missing values to confirm that none are left
print(Epl_matches_dropped.isna().sum())

# Show the first rows of the cleaned dataset
print("The first few rows of the dataset:")
print(Epl_matches_dropped.head())

# Report the length of each column so they can be compared by eye
for column in Epl_matches_dropped.columns:
    print(f"The length of the column '{column}' is: {len(Epl_matches_dropped[column])}")

# Persist the cleaned data as a CSV file (without the index column)
Epl_matches_dropped.to_csv(
    r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\Dataset 2\Cleaned_dataset.csv",
    index=False,
)


    ID            home_team       away_team            date league     season  \
11  11              Arsenal  Crystal Palace  August 16 2014    EPL  2014/2015   
12  12            Leicester         Everton  August 16 2014    EPL  2014/2015   
15  15    Manchester United         Swansea  August 16 2014    EPL  2014/2015   
18  18  Queens Park Rangers            Hull  August 16 2014    EPL  2014/2015   
20  20                Stoke     Aston Villa  August 16 2014    EPL  2014/2015   

    home_score  away_score  home_xg  away_xg  ...  \
11           2           1     1.55     0.16  ...   
12           2           2     1.28     0.61  ...   
15           1           2     1.17     0.28  ...   
18           0           1     1.90     1.12  ...   
20           0           1     0.42     0.91  ...   

   home_defense_rating_last_2_home_matches  \
11                                     NaN   
12                                     NaN   
15                                     NaN   
18        

In [7]:
# FEATURE ENGINEERING - new features are created in this section to give the
# models better input data

# Read the cleaned dataset back from its CSV file
Data_path = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\Dataset 2\Cleaned_dataset.csv"
df = pd.read_csv(filepath_or_buffer=Data_path)

# Full-time result label for a single match (home win, away win or draw)
def determine_match_result(row):
    """Return the full-time result code for one match row.

    Reads ``row['home_score']`` and ``row['away_score']`` and returns
    'H' when the home score is higher, 'A' when the away score is higher,
    and 'D' in every other case.
    """
    home_goals = row['home_score']
    away_goals = row['away_score']
    if home_goals > away_goals:
        return 'H'
    return 'A' if home_goals < away_goals else 'D'

# Label every match with its full-time result: 'H' when the home score is
# higher, 'A' when the away score is higher, 'D' otherwise. This is the same
# rule as determine_match_result, evaluated on whole columns at once instead of
# calling a Python function once per row.
df['Full_Time_Result'] = np.select(
    [df['home_score'] > df['away_score'], df['home_score'] < df['away_score']],
    ['H', 'A'],
    default='D',
)

# create a new feature for the standing difference between the home and away team
df['Standings_Difference'] = df['home_standings'] - df['away_standings']



# create new feature for team form using their last four matches
def calculate_form(points_last_4_matches, good_threshold=10, average_threshold=5):
    """Bucket a points tally into a form label.

    Parameters
    ----------
    points_last_4_matches : number
        Points value to classify.
    good_threshold : number, default 10
        Minimum points required for the 'Good' label.
    average_threshold : number, default 5
        Minimum points required for the 'Average' label.

    Returns
    -------
    str
        'Good' when the value is at least ``good_threshold``, 'Average' when
        it is at least ``average_threshold`` (but below ``good_threshold``),
        and 'Poor' otherwise. A value for which both comparisons are False
        (for example NaN) is therefore labelled 'Poor'.
    """
    if points_last_4_matches >= good_threshold:
        return 'Good'
    if points_last_4_matches >= average_threshold:
        return 'Average'
    return 'Poor'

# Team form for each side, derived from its points over the last four matches
# (home first, then away, so the new columns are added in the same order)
for form_column, points_column in (
    ('Home_Team_Form', 'home_points_last_4_matches'),
    ('Away_Team_Form', 'away_points_last_4_matches'),
):
    df[form_column] = df[points_column].apply(calculate_form)

# Preview the frame to check that the new feature columns are present
print(df.head())

# Save the updated dataset, including the new features, to its CSV path
df.to_csv(
    r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\Dataset 2\New_featured_dataset.csv",
    index=False,
)

print('The Csv file has been saved in the path')


             home_team    away_team  home_score  away_score  home_standings  \
0          Aston Villa      Arsenal           0           3               2   
1              Burnley   Sunderland           0           0              17   
2     Newcastle United         Hull           2           2              20   
3  Queens Park Rangers        Stoke           2           2              18   
4              Swansea  Southampton           0           1               4   

   away_standings  home_points  away_points  home_points_last_4_matches  \
0               7           10            6                        10.0   
1              15            2            3                         2.0   
2              10            2            5                         2.0   
3              13            3            4                         3.0   
4               5            9            7                         9.0   

   away_points_last_4_matches  ...  home_midfield_rating_last_2_home_match