In [70]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [71]:
df = pd.read_csv('prediction.csv')

In [72]:
df.head()

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee
0,1.0,Fri,2018-08-17,20:15,Girona,0.7,0–0,0.0,Valladolid,10368.0,Estadi Municipal de Montilivi,Guillermo Cuadra
1,1.0,Fri,2018-08-17,22:15,Betis,0.9,0–3,2.5,Levante,46225.0,Estadio Benito Villamarín,Ignacio Iglesias
2,1.0,Sat,2018-08-18,18:15,Celta Vigo,0.6,1–1,0.6,Espanyol,16215.0,Estadio de Balaídos,Santiago Jaime
3,1.0,Sat,2018-08-18,20:15,Villarreal,1.6,1–2,0.7,Real Sociedad,16250.0,Estadio de la Cerámica,Mario Melero
4,1.0,Sat,2018-08-18,22:15,Barcelona,3.2,3–0,0.3,Alavés,52356.0,Camp Nou,José Sánchez


In [73]:
#displaying all rows that contain at least one missing value. This is useful for identifying rows that may need further cleaning or imputation.
df[df.isnull().any(axis=1)]

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee
1029,24.0,Tue,2020-03-10,20:00,Eibar,2.0,1–2,1.2,Real Sociedad,,Estadio Municipal de Ipurúa,David Medié
1030,28.0,Thu,2020-06-11,22:00,Sevilla,1.7,2–0,0.5,Betis,,Estadio Ramón Sánchez Pizjuán,Antonio Matéu Lahoz
1031,28.0,Fri,2020-06-12,19:30,Granada,2.3,2–1,0.6,Getafe,,Estadio Nuevo Los Cármenes,David Medié
1032,28.0,Fri,2020-06-12,22:00,Valencia,1.0,1–1,1.4,Levante,,Estadio de Mestalla,Hsu Jason
1033,28.0,Sat,2020-06-13,14:00,Espanyol,2.5,2–0,0.1,Alavés,,RCDE Stadium,Pablo González
...,...,...,...,...,...,...,...,...,...,...,...,...
2256,36.0,Sun,2021-05-16,12:00,Crystal Palace,1.9,3–2,1.4,Aston Villa,,Selhurst Park,David Coote
2257,36.0,Sun,2021-05-16,14:05,Tottenham,3.2,2–0,0.9,Wolves,,Tottenham Hotspur Stadium,Martin Atkinson
2258,36.0,Sun,2021-05-16,16:30,West Brom,1.0,1–2,1.9,Liverpool,,The Hawthorns,Mike Dean
2259,36.0,Sun,2021-05-16,19:00,Everton,1.4,0–1,1.2,Sheffield Utd,,Goodison Park,Jonathan Moss


In [74]:
#filters the DataFrame df to show only the rows where the value in the Score column is NaN (i.e., missing or null).
df[df['Score'].isnull()]

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee


In [75]:
# Drop rows where there are missing values in 'Score', 'xG', 'xG.1', and 'Referee'
df = df.dropna(subset=['Score', 'xG', 'xG.1', 'Referee'])

# We'll also choose to drop the 'Attendance' column if not used for prediction
# Drop 'Attendance' and 'Time' columns as they may not be essential for the initial model
df = df.drop(columns=['Attendance', 'Time'])

In [76]:
df.head()

Unnamed: 0,Wk,Day,Date,Home,xG,Score,xG.1,Away,Venue,Referee
0,1.0,Fri,2018-08-17,Girona,0.7,0–0,0.0,Valladolid,Estadi Municipal de Montilivi,Guillermo Cuadra
1,1.0,Fri,2018-08-17,Betis,0.9,0–3,2.5,Levante,Estadio Benito Villamarín,Ignacio Iglesias
2,1.0,Sat,2018-08-18,Celta Vigo,0.6,1–1,0.6,Espanyol,Estadio de Balaídos,Santiago Jaime
3,1.0,Sat,2018-08-18,Villarreal,1.6,1–2,0.7,Real Sociedad,Estadio de la Cerámica,Mario Melero
4,1.0,Sat,2018-08-18,Barcelona,3.2,3–0,0.3,Alavés,Camp Nou,José Sánchez


In [77]:
# Split the 'Score' column into 'HomeGoals' and 'AwayGoals' columns
df[['home_goals', 'away_goals']] = df['Score'].str.split('–', expand=True).astype(float)

In [78]:
# Create a season_start column 
df['Date'] = pd.to_datetime(df['Date'])
#reates the new season_start column by applying a lambda function to each date in the Date column.
df['season_start'] = df['Date'].apply(lambda x: x.year - 1 if x.month < 8 else x.year)

In [79]:
# Create a target variable for the match result
def determine_result(row):
    if row['home_goals'] > row['away_goals']:
        return 'Home win'
    elif row['home_goals'] < row['away_goals']:
        return 'Away win'
    else:
        return 'Draw'

df['result'] = df.apply(determine_result, axis=1)

In [80]:
#encodes the 'Day' column as a categorical feature by creating dummy variables, and it uses pd.get_dummies to do so.
df['Day'] = df['Date'].dt.day_name()
df = pd.get_dummies(df, columns=['Day'])

# Display the updated dataset with new features
df.head()

Unnamed: 0,Wk,Date,Home,xG,Score,xG.1,Away,Venue,Referee,home_goals,away_goals,season_start,result,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
0,1.0,2018-08-17,Girona,0.7,0–0,0.0,Valladolid,Estadi Municipal de Montilivi,Guillermo Cuadra,0.0,0.0,2018,Draw,True,False,False,False,False,False,False
1,1.0,2018-08-17,Betis,0.9,0–3,2.5,Levante,Estadio Benito Villamarín,Ignacio Iglesias,0.0,3.0,2018,Away win,True,False,False,False,False,False,False
2,1.0,2018-08-18,Celta Vigo,0.6,1–1,0.6,Espanyol,Estadio de Balaídos,Santiago Jaime,1.0,1.0,2018,Draw,False,False,True,False,False,False,False
3,1.0,2018-08-18,Villarreal,1.6,1–2,0.7,Real Sociedad,Estadio de la Cerámica,Mario Melero,1.0,2.0,2018,Away win,False,False,True,False,False,False,False
4,1.0,2018-08-18,Barcelona,3.2,3–0,0.3,Alavés,Camp Nou,José Sánchez,3.0,0.0,2018,Home win,False,False,True,False,False,False,False


In [81]:
# resets the index of the DataFrame, removing the existing index and replacing it with a new default one (starting from 0).
df.reset_index(drop=True, inplace=True)
#sorts the data by date
df.sort_values(['Date'], inplace=True)

In [82]:
#iterates over the unique values in the 'Home' column (i.e., all the unique home teams).
for x in df.Home.unique():
    temp_df = df[(df['Home'] == x) | (df['Away'] == x)]
    break

In [83]:
temp_df

Unnamed: 0,Wk,Date,Home,xG,Score,xG.1,Away,Venue,Referee,home_goals,away_goals,season_start,result,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
380,1.0,2018-08-10,Manchester Utd,1.5,2–1,1.8,Leicester City,Old Trafford,Andre Marriner,2.0,1.0,2018,Home win,True,False,False,False,False,False,False
398,2.0,2018-08-19,Brighton,1.7,3–2,1.4,Manchester Utd,The American Express Community Stadium,Kevin Friend,3.0,2.0,2018,Home win,False,False,False,True,False,False,False
409,3.0,2018-08-27,Manchester Utd,1.5,0–3,1.2,Tottenham,Old Trafford,Craig Pawson,0.0,3.0,2018,Away win,False,True,False,False,False,False,False
419,4.0,2018-09-02,Burnley,0.8,0–2,2.5,Manchester Utd,Turf Moor,Jonathan Moss,0.0,2.0,2018,Away win,False,False,False,True,False,False,False
426,5.0,2018-09-15,Watford,1.3,1–2,1.9,Manchester Utd,Vicarage Road Stadium,Mike Dean,1.0,2.0,2018,Away win,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4519,35.0,2024-04-27,Manchester Utd,2.7,1–1,2.1,Burnley,Old Trafford,John Brooks,1.0,1.0,2023,Draw,False,False,True,False,False,False,False
4536,36.0,2024-05-06,Crystal Palace,1.7,4–0,0.4,Manchester Utd,Selhurst Park,Jarred Gillett,4.0,0.0,2023,Home win,False,True,False,False,False,False,False
4545,37.0,2024-05-12,Manchester Utd,0.5,0–1,1.5,Arsenal,Old Trafford,Paul Tierney,0.0,1.0,2023,Away win,False,False,False,True,False,False,False
4549,34.0,2024-05-15,Manchester Utd,1.5,3–2,2.2,Newcastle Utd,Old Trafford,Robert Jones,3.0,2.0,2023,Home win,False,False,False,False,False,False,True


In [84]:
# Update the rolling average goals for the home team in the main dataset
#Iterating through each unique team in the 'Home' column:
for x in df.Home.unique():
    #filtering the data for each team
    temp_df = df[(df['Home'] == x) | (df['Away'] == x)]
    #Sorting the Matches Chronologically:
    temp_df = temp_df.sort_values(['Date'])
    #Assigning a Goal Value for Calculation:
    temp_df['goal_value_to_calculate'] = temp_df.apply(lambda y: y['home_goals'] if y['Home'] == x else y['away_goals'], axis=1)
    #Calculating the Rolling Average
    temp_df['rolling_avg_goals'] = temp_df['goal_value_to_calculate'].rolling(window=5, closed="left", min_periods=1).mean()
    #Updating the Main DataFrame
    for index, row in temp_df.iterrows():
        if row['Home'] == x:
            df.at[index, 'home_rolling_avg_goals'] = row['rolling_avg_goals']
        else:
            df.at[index, 'away_rolling_avg_goals'] = row['rolling_avg_goals']

In [85]:
#filtering the DataFrame df to retrieve information about matches involving the team "Brighton".
df[(df['Home'] == 'Brighton') | (df['Away'] == 'Brighton')][['Wk', 'Date', 'Home', 'Away', 'home_goals', 'away_goals','home_rolling_avg_goals', 'away_rolling_avg_goals']]

Unnamed: 0,Wk,Date,Home,Away,home_goals,away_goals,home_rolling_avg_goals,away_rolling_avg_goals
384,1.0,2018-08-11,Watford,Brighton,2.0,0.0,,
398,2.0,2018-08-19,Brighton,Manchester Utd,3.0,2.0,0.0,2.000000
405,3.0,2018-08-25,Liverpool,Brighton,1.0,0.0,3.0,1.500000
412,4.0,2018-09-01,Brighton,Fulham,2.0,2.0,1.0,1.666667
429,5.0,2018-09-17,Southampton,Brighton,2.0,2.0,1.0,1.250000
...,...,...,...,...,...,...,...,...
4524,35.0,2024-04-28,Bournemouth,Brighton,3.0,0.0,1.2,0.400000
4533,36.0,2024-05-05,Brighton,Aston Villa,1.0,0.0,0.2,2.200000
4542,37.0,2024-05-11,Newcastle Utd,Brighton,1.0,1.0,2.8,0.400000
4548,34.0,2024-05-15,Brighton,Chelsea,1.0,2.0,0.6,2.400000


In [86]:
# Calculate the rolling average of expected goals (xG) for each team in the DataFrame df.
#Loop through Unique Home Teams:
for x in df.Home.unique():
    #Create Temporary DataFrame for Matches and sorting with date
    temp_df = df[(df['Home'] == x) | (df['Away'] == x)]
    temp_df = temp_df.sort_values(['Date'])
    #Calculate xG Value
    temp_df['xG_value_to_calculate'] = temp_df.apply(lambda y: y['xG'] if y['Home'] == x else y['xG.1'], axis=1)
    #Calculate Rolling Average of xG
    temp_df['rolling_avg_xG'] = temp_df['xG_value_to_calculate'].rolling(window=5, closed="left", min_periods=1).mean()
    
    #Update Main DataFrame with Rolling Averages
    for index, row in temp_df.iterrows():
        if row['Home'] == x:
            df.at[index, 'home_rolling_avg_xG'] = row['rolling_avg_xG']
        else:
            df.at[index, 'away_rolling_avg_xG'] = row['rolling_avg_xG']

In [87]:
# One last bit of clean up is to drop the rows where the rolling averages are null
df = df.dropna(subset=['home_rolling_avg_goals', 'away_rolling_avg_goals', 'home_rolling_avg_xG', 'away_rolling_avg_xG'])

In [88]:
#Filtering for Brighton Matches
df[(df['Home'] == 'Brighton') | (df['Away'] == 'Brighton')][['Date', 'Home', 'Away', 'xG', 'xG.1','home_rolling_avg_xG', 'away_rolling_avg_xG']]

Unnamed: 0,Date,Home,Away,xG,xG.1,home_rolling_avg_xG,away_rolling_avg_xG
398,2018-08-19,Brighton,Manchester Utd,1.7,1.4,0.300000,1.50
405,2018-08-25,Liverpool,Brighton,1.6,0.6,3.300000,1.00
412,2018-09-01,Brighton,Fulham,2.8,1.6,0.866667,1.20
429,2018-09-17,Southampton,Brighton,2.0,1.4,1.500000,1.35
437,2018-09-22,Brighton,Tottenham,0.8,1.9,1.360000,1.54
...,...,...,...,...,...,...,...
4524,2024-04-28,Bournemouth,Brighton,1.9,0.9,1.360000,0.72
4533,2024-05-05,Brighton,Aston Villa,2.5,0.1,0.800000,1.34
4542,2024-05-11,Newcastle Utd,Brighton,2.2,1.4,2.140000,1.08
4548,2024-05-15,Brighton,Chelsea,1.3,1.5,1.260000,2.22


In [89]:
# Let's train a model to predict the match result
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [90]:
print(df.columns)
df.drop(columns=['Date', 'xG', 'xG.1', 'Home', 'Away', 'Referee', 'Venue', 'Score', 'result', 'home_goals', 'away_goals', 'season_start']).columns


Index(['Wk', 'Date', 'Home', 'xG', 'Score', 'xG.1', 'Away', 'Venue', 'Referee',
       'home_goals', 'away_goals', 'season_start', 'result', 'Day_Friday',
       'Day_Monday', 'Day_Saturday', 'Day_Sunday', 'Day_Thursday',
       'Day_Tuesday', 'Day_Wednesday', 'home_rolling_avg_goals',
       'away_rolling_avg_goals', 'home_rolling_avg_xG', 'away_rolling_avg_xG'],
      dtype='object')


Index(['Wk', 'Day_Friday', 'Day_Monday', 'Day_Saturday', 'Day_Sunday',
       'Day_Thursday', 'Day_Tuesday', 'Day_Wednesday',
       'home_rolling_avg_goals', 'away_rolling_avg_goals',
       'home_rolling_avg_xG', 'away_rolling_avg_xG'],
      dtype='object')

In [91]:
df

Unnamed: 0,Wk,Date,Home,xG,Score,xG.1,Away,Venue,Referee,home_goals,...,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG
390,2.0,2018-08-18,Cardiff City,0.9,0–0,1.6,Newcastle Utd,Cardiff City Stadium,Craig Pawson,0.0,...,False,True,False,False,False,False,0.0,1.0,1.40,1.00
392,2.0,2018-08-18,Everton,1.5,2–1,1.7,Southampton,Goodison Park,Lee Mason,2.0,...,False,True,False,False,False,False,2.0,0.0,1.00,1.10
393,2.0,2018-08-18,Leicester City,0.2,2–0,1.1,Wolves,King Power Stadium,Mike Dean,2.0,...,False,True,False,False,False,False,1.0,2.0,1.80,1.00
391,2.0,2018-08-18,West Ham,1.2,1–2,1.3,Bournemouth,London Stadium,Stuart Attwell,1.0,...,False,True,False,False,False,False,0.0,2.0,0.70,2.20
395,2.0,2018-08-18,Chelsea,1.6,3–2,2.4,Arsenal,Stamford Bridge,Martin Atkinson,3.0,...,False,True,False,False,False,False,3.0,0.0,1.90,0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4174,38.0,2024-05-25,Rayo Vallecano,1.1,0–1,0.9,Athletic Club,Estadio del Rayo Vallecano,José Luis Munuera,0.0,...,False,True,False,False,False,False,0.4,1.6,1.36,0.96
4176,38.0,2024-05-26,Getafe,0.9,1–2,1.4,Mallorca,Coliseum Alfonso Pérez,Víctor García,1.0,...,False,False,True,False,False,False,0.6,1.0,1.52,1.08
4177,38.0,2024-05-26,Las Palmas,1.0,1–1,2.5,Alavés,Estadio de Gran Canaria,Francisco Hernández,1.0,...,False,False,True,False,False,False,0.4,1.4,1.08,1.46
4179,38.0,2024-05-26,Sevilla,1.5,1–2,1.3,Barcelona,Estadio Ramón Sánchez Pizjuán,Javier Villanueva,1.0,...,False,False,True,False,False,False,1.2,2.6,1.40,1.96


In [92]:
# Define the features and target variable
features = [column for column in df.drop(columns=['Date', 'xG', 'xG.1', 'Home', 'Away', 'Referee', 'Venue', 'Score', 'result', 'home_goals', 'away_goals', 'season_start']).columns]

# Split the data into train and test sets, The dataset is split into training data (for seasons up to 2022) and test data (for the 2023 season).

train_data = df[df['season_start'] <= 2022]
test_data = df[df['season_start'] == 2023]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']


In [93]:
# Check the data types of X_train
print(X_train.dtypes)

# Display the first few rows of X_train to identify any non-numeric values
print(X_train.head())



Wk                        float64
Day_Friday                   bool
Day_Monday                   bool
Day_Saturday                 bool
Day_Sunday                   bool
Day_Thursday                 bool
Day_Tuesday                  bool
Day_Wednesday                bool
home_rolling_avg_goals    float64
away_rolling_avg_goals    float64
home_rolling_avg_xG       float64
away_rolling_avg_xG       float64
dtype: object
      Wk  Day_Friday  Day_Monday  Day_Saturday  Day_Sunday  Day_Thursday  \
390  2.0       False       False          True       False         False   
392  2.0       False       False          True       False         False   
393  2.0       False       False          True       False         False   
391  2.0       False       False          True       False         False   
395  2.0       False       False          True       False         False   

     Day_Tuesday  Day_Wednesday  home_rolling_avg_goals  \
390        False          False                     0.0   
392

In [94]:
# Train a Random Forest model
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)

In [95]:
# Make predictions
predictions = clf.predict(X_test)

In [97]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

0.44591029023746703

In [98]:
# confusion matrix
confusion_matrix(y_test, predictions)

array([[ 90,  27, 112],
       [ 64,  17, 107],
       [ 83,  27, 231]], dtype=int64)

In [99]:
# Our baseline model has an accuracy of
df['result'].value_counts(normalize=True)

result
Home win    0.446188
Away win    0.304972
Draw        0.248840
Name: proportion, dtype: float64

In [100]:
# using GridSearchCV to tune the hyperparameters of your Random Forest model. This approach will help us to find the best combination of parameters to improve the model's performance.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200], # The number of trees in the forest.
    'max_depth': [5, 10, 15, 20] #The maximum depth of the trees.
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [101]:
grid_search.best_params_

{'max_depth': 5, 'n_estimators': 200}

In [102]:
# Train a Random Forest model with the best hyperparameters
clf = RandomForestClassifier(random_state=1, n_estimators=200, max_depth=5)
clf.fit(X_train, y_train)

# Make predictions
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

0.5026385224274407

The accuracy of the model has increased after tuning the hyperparameters!


In [103]:
#converts categorical variables (Home, Away, Referee, Venue) into dummy/indicator variables. This is necessary because most machine learning algorithms, including Random Forests, require numerical input.
df = pd.get_dummies(df, columns=['Home', 'Away', 'Referee', 'Venue'])
df.head()

Unnamed: 0,Wk,Date,xG,Score,xG.1,home_goals,away_goals,season_start,result,Day_Friday,...,Venue_The American Express Community Stadium,Venue_The City Ground,Venue_The Hawthorns,Venue_The John Smith's Stadium,Venue_Tottenham Hotspur Stadium,Venue_Turf Moor,Venue_Vicarage Road Stadium,Venue_Villa Park,Venue_Vitality Stadium,Venue_Wembley Stadium
390,2.0,2018-08-18,0.9,0–0,1.6,0.0,0.0,2018,Draw,False,...,False,False,False,False,False,False,False,False,False,False
392,2.0,2018-08-18,1.5,2–1,1.7,2.0,1.0,2018,Home win,False,...,False,False,False,False,False,False,False,False,False,False
393,2.0,2018-08-18,0.2,2–0,1.1,2.0,0.0,2018,Home win,False,...,False,False,False,False,False,False,False,False,False,False
391,2.0,2018-08-18,1.2,1–2,1.3,1.0,2.0,2018,Away win,False,...,False,False,False,False,False,False,False,False,False,False
395,2.0,2018-08-18,1.6,3–2,2.4,3.0,2.0,2018,Home win,False,...,False,False,False,False,False,False,False,False,False,False


In [104]:
# Define the features and target variable
features = [column for column in df.drop(columns=['Date', 'xG', 'xG.1', 'Score', 'result', 'home_goals', 'away_goals', 'season_start']).columns]

# Split the data into train and test sets
train_data = df[df['season_start'] <= 2022]
test_data = df[df['season_start'] == 2023]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']

In [105]:
# Find the best hyperparameters for the Random Forest model
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

grid_search.best_params_

{'max_depth': 15, 'n_estimators': 50}

In [106]:
# Train a Random Forest model with the new features
clf = RandomForestClassifier(random_state=1, n_estimators=50, max_depth=15)
clf.fit(X_train, y_train)

# Make predictions
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

0.5184696569920845

The accuracy of the model has increased after adding more features


In [107]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

models = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=1),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15, 20]
        },
        'accuracy': ''
    },
    'Gaussian Naive Bayes': {
        'model': GaussianNB(),
        'params': {},
        'accuracy': ''
    },
    'Logistic Regression': {
        'model': LogisticRegression(random_state=1),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2']
        },
        'accuracy': ''
    }
}

In [108]:
for model_name, model in models.items():
    grid_search = GridSearchCV(estimator=model['model'], param_grid=model['params'], cv=5)
    grid_search.fit(X_train, y_train)
    
    model['accuracy'] = grid_search.best_score_
    model['best_params'] = grid_search.best_params_

In [109]:
models

{'Random Forest': {'model': RandomForestClassifier(random_state=1),
  'params': {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15, 20]},
  'accuracy': 0.50199203187251,
  'best_params': {'max_depth': 15, 'n_estimators': 50}},
 'Gaussian Naive Bayes': {'model': GaussianNB(),
  'params': {},
  'accuracy': 0.45473208844551066,
  'best_params': {}},
 'Logistic Regression': {'model': LogisticRegression(random_state=1),
  'params': {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
  'accuracy': 0.5011969804248964,
  'best_params': {'C': 0.1, 'penalty': 'l2'}}}

In [None]:
# Now let's see how we can use to model to predict the outcome of a match
# We'll use the model to predict the outcome of a hypothetical match

In [110]:
home_team = 'Chelsea'
away_team = 'Manchester Utd'
referee = 'Anthony Taylor'

In [111]:
# Essentially, we need to create a row of data that represents the match we want to predict
# We'll use the features we used to train the model



# Create a row of data for the hypothetical match
data = {
    'Wk': [25],
    'home_rolling_avg_goals': [1.9],
    'away_rolling_avg_goals': [1.2],
    'home_rolling_avg_xG': [2.1],
    'away_rolling_avg_xG': [1.3],
    'Day_Saturday': [1],
    'Home_Chelsea': [1],
    'Away_Manchester Utd': [1],
    'Referee_Anthony Taylor': [1],
    'Venue_Stamford Bridge': [1]
}

match = pd.DataFrame(columns=X_train.columns, data=data)

match.fillna(0, inplace=True)



In [112]:
# Use the Random Forest model to predict the outcome of the match
prediction = clf.predict(match)
prediction

array(['Home win'], dtype=object)