In [24]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [25]:
data_paths = [
    {'csv':'champs.csv', 'competition': 'cup', 'country':''},
    {'csv':'dfbpokal.csv', 'competition': 'cup', 'country':'GER'},
    {'csv':'englandplayoffs.csv', 'competition': 'cup', 'country':'ENG'},
    {'csv':'facup.csv', 'competition': 'cup', 'country':'ENG'},
    {'csv':'leaguecup.csv', 'competition': 'cup', 'country':'ENG'},
    {'csv':'belgium.csv', 'competition': 'league', 'country':'BEL'},
    {'csv':'england.csv', 'competition': 'league', 'country':'ENG'},
    {'csv':'france.csv', 'competition': 'league', 'country':'FRA'},
    {'csv':'germany.csv', 'competition': 'league', 'country':'GER'},
    {'csv':'germany2.csv', 'competition': 'league', 'country':'GER'},
    {'csv':'greece.csv', 'competition': 'league', 'country':'GRE'},
    {'csv':'holland.csv', 'competition': 'league', 'country':'NED'},
    {'csv':'italy.csv', 'competition': 'league', 'country':'ITA'},
    {'csv':'mls.csv', 'competition': 'league', 'country':'USA'},
    {'csv':'portugal.csv', 'competition': 'league', 'country':'POR'},
    {'csv':'safrica.csv', 'competition': 'league', 'country':'SAF'},
    {'csv':'scotland.csv', 'competition': 'league', 'country':'SCO'},
    {'csv':'spain.csv', 'competition': 'league', 'country':'ESP'},
    {'csv':'turkey.csv', 'competition': 'league', 'country':'TUR'} 
]

desiredColumns = ['home', 'visitor', 'hgoal', 'vgoal', 'hcountry', 'vcountry', 'Date', 'competition']

combined_df = pd.DataFrame(columns=desiredColumns)

for x in data_paths:
    data_path = os.path.join('Data', 'engsoccerdata-master', 'engsoccerdata-master', 'data-raw', x['csv'])
    temp_df = pd.read_csv(data_path)
    temp_df['competition'] = x['competition']
    if x['country'] != '':
        temp_df['hcountry'] = x['country']
        temp_df['vcountry'] = x['country']
    
    # temp_df = temp_df[desiredColumns]
    
    combined_df = pd.concat([combined_df, temp_df[desiredColumns]], ignore_index=True)

combined_df.replace("NA", np.nan, inplace=True)
combined_df = combined_df.dropna()
    
print(combined_df.head)

  temp_df = pd.read_csv(data_path)


<bound method NDFrame.head of                           home            visitor hgoal vgoal hcountry  \
0                  Sporting CP  Partizan Belgrade     3     3      POR   
1       Budapesti Voros Lobogo     RSC Anderlecht     6     3      HUN   
2              Servette Geneve        Real Madrid     0     2      SUI   
3              Rot-Weiss Essen       Hibernian FC     0     4      GER   
4               Djurgardens IF   Gwardia Warszawa     0     0      SWE   
...                        ...                ...   ...   ...      ...   
453030        Yeni Malatyaspor         Fenerbahce     0     5      TUR   
453031              Buyuksehyr        Trabzonspor     3     1      TUR   
453032           Karagumruk SK         Alanyaspor     0     1      TUR   
453033           Ad. Demirspor             Goztep     7     0      TUR   
453034               Hatayspor        Giresunspor     4     1      TUR   

       vcountry        Date competition  
0           SRB  1955-09-04         cup

In [26]:
def is_valid_date(date_str, date_format="%Y-%m-%d"):
    try:
        datetime.strptime(date_str, date_format)
        return True
    except ValueError:
        return False

In [27]:
combined_df['Date'] = combined_df['Date'].astype(str)

# Filter out invalid dates
combined_df['Valid_Date'] = combined_df['Date'].apply(is_valid_date)
invalid_dates = combined_df[~combined_df['Valid_Date']]
combined_df = combined_df[combined_df['Valid_Date']].drop(columns=['Valid_Date'])

# Split up the dates to use as separate columns
combined_df['Date'] = pd.to_datetime(combined_df['Date'])

combined_df['year'] = combined_df['Date'].dt.year
combined_df['month'] = combined_df['Date'].dt.month
combined_df['day'] = combined_df['Date'].dt.day

combined_df = combined_df.drop(columns=['Date'])

print(combined_df.head)

<bound method NDFrame.head of                           home            visitor hgoal vgoal hcountry  \
0                  Sporting CP  Partizan Belgrade     3     3      POR   
1       Budapesti Voros Lobogo     RSC Anderlecht     6     3      HUN   
2              Servette Geneve        Real Madrid     0     2      SUI   
3              Rot-Weiss Essen       Hibernian FC     0     4      GER   
4               Djurgardens IF   Gwardia Warszawa     0     0      SWE   
...                        ...                ...   ...   ...      ...   
453030        Yeni Malatyaspor         Fenerbahce     0     5      TUR   
453031              Buyuksehyr        Trabzonspor     3     1      TUR   
453032           Karagumruk SK         Alanyaspor     0     1      TUR   
453033           Ad. Demirspor             Goztep     7     0      TUR   
453034               Hatayspor        Giresunspor     4     1      TUR   

       vcountry competition  year  month  day  
0           SRB         cup  1955

In [28]:
encode = ['home', 'visitor',  'hcountry', 'vcountry', 'competition']

label_encoders = {}

for feature in encode:
    label_encoder = LabelEncoder()
    combined_df[feature] = label_encoder.fit_transform(combined_df[feature])
    label_encoders[feature] = label_encoder 

In [29]:
y_hgoal = combined_df['hgoal']
y_vgoal = combined_df['vgoal']
X_hgoal = combined_df.drop(columns=['hgoal'])
X2_vgoal = combined_df.drop(columns=['vgoal'])

X_train_hgoal, X_test_hgoal, y_train_hgoal, y_test_hgoal = train_test_split(X_hgoal, y_hgoal, test_size=0.2, random_state=42)
X_train_vgoal, X_test_vgoal, y_train_vgoal, y_test_vgoal = train_test_split(X2_vgoal, y_vgoal, test_size=0.2, random_state=42)

In [30]:
gb_regressor_hgoal = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_regressor_vgoal = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

In [31]:
# Hyperparameter tuning - Takes 4 hours and 8 mins

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

grid_search_hgoal = GridSearchCV(gb_regressor_hgoal, param_grid=param_grid, cv=5)
grid_search_hgoal.fit(X_train_hgoal, y_train_hgoal)
best_params_hgoal = grid_search_hgoal.best_params_

grid_search_vgoal = GridSearchCV(gb_regressor_vgoal, param_grid=param_grid, cv=5)
grid_search_vgoal.fit(X_train_vgoal, y_train_vgoal)
best_params_vgoal = grid_search_vgoal.best_params_



In [32]:
tuned_gb_regressor_hgoal = GradientBoostingRegressor(**best_params_hgoal, random_state=42)
tuned_gb_regressor_vgoal = GradientBoostingRegressor(**best_params_vgoal, random_state=42)

# Train the models for 'hgoal' and 'vgoal' - 5 mins total
tuned_gb_regressor_hgoal.fit(X_train_hgoal, y_train_hgoal)
tuned_gb_regressor_vgoal.fit(X_train_vgoal, y_train_vgoal)

In [33]:
# Make predictions
y_pred_hgoal = tuned_gb_regressor_hgoal.predict(X_test_hgoal)

# Calculate Mean Squared Error
mse_hgoal = mean_squared_error(y_test_hgoal, y_pred_hgoal)
print("Mean Squared Error (Home Game):", mse_hgoal)

Mean Squared Error (Home Game): 1.858661559395164


In [34]:
# Make predictions
y_pred_vgoal = tuned_gb_regressor_vgoal.predict(X_test_vgoal)

# Calculate Mean Squared Error
mse_vgoal = mean_squared_error(y_test_vgoal, y_pred_vgoal)
print("Mean Squared Error (Away Game):", mse_vgoal)

Mean Squared Error (Away Game): 1.2156300563999123


In [41]:
import joblib

# Assume tuned_gb_regressor_vgoal is your trained GradientBoostingRegressor model
joblib.dump(tuned_gb_regressor_vgoal, 'Trained/Hyperparameter_Tuned/AwayModel.pkl')
joblib.dump(tuned_gb_regressor_hgoal, 'Trained/Hyperparameter_Tuned/HomeModel.pkl')

['Trained/Hyperparameter_Tuned/HomeModel.pkl']