In [5]:
import pandas as pd

# Load the CSV file into a DataFrame
df_bundesliga_recencyfeatures = pd.read_csv('df_bundesliga_recencyfeatures.csv')



In [7]:
import numpy as np

# Initialize columns for the ratings
df_bundesliga_recencyfeatures['home_attack_strength'] = np.nan
df_bundesliga_recencyfeatures['home_defense_weakness'] = np.nan
df_bundesliga_recencyfeatures['away_attack_strength'] = np.nan
df_bundesliga_recencyfeatures['away_defense_weakness'] = np.nan


team_stats = {}

# Loop through each season
for season in df_bundesliga_recencyfeatures['season_start_year'].unique():
    # Filter matches for the current season
    season_matches = df_bundesliga_recencyfeatures[df_bundesliga_recencyfeatures['season_start_year'] == season]

    # Loop through each match in the season
    for index, row in season_matches.iterrows():
        home_team = row['home_team']
        away_team = row['away_team']

        # Initialize team stats if not already done
        if home_team not in team_stats:
            team_stats[home_team] = {'home_goals_scored': 0, 'home_goals_conceded': 0, 'home_matches': 0,
                                     'away_goals_scored': 0, 'away_goals_conceded': 0, 'away_matches': 0}
        if away_team not in team_stats:
            team_stats[away_team] = {'home_goals_scored': 0, 'home_goals_conceded': 0, 'home_matches': 0,
                                     'away_goals_scored': 0, 'away_goals_conceded': 0, 'away_matches': 0}

        # Calculate current ratings
        home_attack_strength = team_stats[home_team]['home_goals_scored'] / (team_stats[home_team]['home_matches'] + 1)
        home_defense_weakness = team_stats[home_team]['home_goals_conceded'] / (team_stats[home_team]['home_matches'] + 1)
        away_attack_strength = team_stats[away_team]['away_goals_scored'] / (team_stats[away_team]['away_matches'] + 1)
        away_defense_weakness = team_stats[away_team]['away_goals_conceded'] / (team_stats[away_team]['away_matches'] + 1)

        # Assign ratings to the dataframe
        df_bundesliga_recencyfeatures.loc[index, 'home_attack_strength'] = home_attack_strength
        df_bundesliga_recencyfeatures.loc[index, 'home_defense_weakness'] = home_defense_weakness
        df_bundesliga_recencyfeatures.loc[index, 'away_attack_strength'] = away_attack_strength
        df_bundesliga_recencyfeatures.loc[index, 'away_defense_weakness'] = away_defense_weakness

        # Update team stats with the current match
        team_stats[home_team]['home_goals_scored'] += row['home_goals']
        team_stats[home_team]['home_goals_conceded'] += row['away_goals']
        team_stats[home_team]['home_matches'] += 1

        team_stats[away_team]['away_goals_scored'] += row['away_goals']
        team_stats[away_team]['away_goals_conceded'] += row['home_goals']
        team_stats[away_team]['away_matches'] += 1


In [10]:
#Predict Number of Goals scored by the home and away teams using logistic function
# Find optimal parameters for the logistic model that minimized the difference between the predicted and actual goals scored 

import numpy as np
from scipy.optimize import minimize

# Define the logistic function for goal prediction
def logistic_goal_model(params, X):
    alpha_h, beta_h, gamma_h, alpha_a, beta_a, gamma_a = params
    home_goals_pred = []
    away_goals_pred = []

    for row in X:
        home_attack_strength, away_defense_weakness, away_attack_strength, home_defense_weakness = row
        
        # Calculate predicted goals using the logistic function
        pred_home = alpha_h / (1 + np.exp(-beta_h * (home_attack_strength + away_defense_weakness) - gamma_h))
        pred_away = alpha_a / (1 + np.exp(-beta_a * (away_attack_strength + home_defense_weakness) - gamma_a))
        
        home_goals_pred.append(pred_home)
        away_goals_pred.append(pred_away)
    
    return np.array(home_goals_pred), np.array(away_goals_pred)

# Calculate the sum of squarred errors between the predicted and actual goals 
def objective_function(params, X, y):
    y_pred_home, y_pred_away = logistic_goal_model(params, X)
    y_true_home, y_true_away = y.T  # Transpose y to separate home and away goals
    
    # Return the sum of squared errors
    return np.sum((y_pred_home - y_true_home) ** 2 + (y_pred_away - y_true_away) ** 2)

# Prepare the data for optimization
X_data = df_bundesliga_recencyfeatures[['home_attack_strength', 'away_defense_weakness', 
                                         'away_attack_strength', 'home_defense_weakness']].values
y_data = df_bundesliga_recencyfeatures[['home_goals', 'away_goals']].values

# Initial parameters for optimization (can be tuned)
initial_params = [1, 1, 1, 1, 1, 1]

# Optimize parameters using scipy's minimize function
result = minimize(objective_function, initial_params, args=(X_data, y_data), method='L-BFGS-B')

# Extract optimized parameters
alpha_h, beta_h, gamma_h, alpha_a, beta_a, gamma_a = result.x

# Predict goals using the optimized parameters
df_bundesliga_recencyfeatures['predicted_home_goals'], df_bundesliga_recencyfeatures['predicted_away_goals'] = \
    logistic_goal_model(result.x, X_data)

# Print the optimized parameters
print(f"Optimized parameters: {result.x}")


Optimized parameters: [189.5595189    0.34141602  -5.94520181 155.78303938   0.52973958
  -6.11049824]


In [9]:
from sklearn.metrics import mean_squared_error

# Calculate mean squared error for the predictions
mse_home = mean_squared_error(df_bundesliga_recencyfeatures['home_goals'], df_bundesliga_recencyfeatures['predicted_home_goals'])
mse_away = mean_squared_error(df_bundesliga_recencyfeatures['away_goals'], df_bundesliga_recencyfeatures['predicted_away_goals'])

print(f"Mean Squared Error for Home Goals: {mse_home:.2f}")
print(f"Mean Squared Error for Away Goals: {mse_away:.2f}")

Mean Squared Error for Home Goals: 2.00
Mean Squared Error for Away Goals: 1.34


In [11]:
from pyswarm import pso

# Define the objective function for PSO
def pso_objective(params):
    return objective_function(params, X_data, y_data)

# Bounds for the parameters
lb = [0, 0, 0, 0, 0, 0]
ub = [10, 10, 10, 10, 10, 10]

# Run PSO
pso_params, pso_obj_value = pso(pso_objective, lb, ub, swarmsize=100, maxiter=100)

# Extract optimized parameters from PSO
alpha_h, beta_h, gamma_h, alpha_a, beta_a, gamma_a = pso_params

# Predict goals using the PSO optimized parameters
df_bundesliga_recencyfeatures['predicted_home_goals_pso'], df_bundesliga_recencyfeatures['predicted_away_goals_pso'] = \
    logistic_goal_model(pso_params, X_data)

# Calculate mean squared error for the PSO predictions
mse_home_pso = mean_squared_error(df_bundesliga_recencyfeatures['home_goals'], df_bundesliga_recencyfeatures['predicted_home_goals_pso'])
mse_away_pso = mean_squared_error(df_bundesliga_recencyfeatures['away_goals'], df_bundesliga_recencyfeatures['predicted_away_goals_pso'])

print(f"Mean Squared Error for Home Goals (PSO): {mse_home_pso:.2f}")
print(f"Mean Squared Error for Away Goals (PSO): {mse_away_pso:.2f}")


Stopping search: maximum iterations reached --> 100
Mean Squared Error for Home Goals (PSO): 2.10
Mean Squared Error for Away Goals (PSO): 1.38


In [16]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Define the target variable for classification (adjusted to be 0, 1, 2)
df_bundesliga_recencyfeatures['target'] = np.where(df_bundesliga_recencyfeatures['home_goals'] > df_bundesliga_recencyfeatures['away_goals'], 2, 
                        np.where(df_bundesliga_recencyfeatures['home_goals'] < df_bundesliga_recencyfeatures['away_goals'], 0, 1))

# Features for XGBoost (using PSO optimized ratings)
X = df_bundesliga_recencyfeatures[['home_attack_strength', 'home_defense_weakness', 'away_attack_strength', 'away_defense_weakness']]
y = df_bundesliga_recencyfeatures['target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model and parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

param_grid = {
    'n_estimators': [84],
    'learning_rate': [0.06],
    'max_depth': [5],
    'subsample': [0.9],
    'colsample_bytree': [1.0]
}

# Perform grid search
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

best_xgb_model = grid_search.best_estimator_

# Predictions and evaluation
y_pred = best_xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(f"Classification Report:\n", classification_report(y_test, y_pred))

# Cross-validated accuracy
cv_scores = cross_val_score(best_xgb_model, X_train, y_train, cv=5)
print(f"Cross-validated accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")



Fitting 5 folds for each of 1 candidates, totalling 5 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.06, 'max_depth': 5, 'n_estimators': 84, 'subsample': 0.9}
Accuracy: 0.51
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.15      0.22       889
           1       0.30      0.02      0.03       950
           2       0.53      0.93      0.67      1883

    accuracy                           0.51      3722
   macro avg       0.40      0.37      0.31      3722
weighted avg       0.43      0.51      0.40      3722



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validated accuracy: 0.52 ± 0.00


In [17]:
import numpy as np
from pyswarm import pso
from scipy.optimize import minimize
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

# Initialize columns for the ratings
df_bundesliga_recencyfeatures['home_attack_strength'] = np.nan
df_bundesliga_recencyfeatures['home_defense_weakness'] = np.nan
df_bundesliga_recencyfeatures['away_attack_strength'] = np.nan
df_bundesliga_recencyfeatures['away_defense_weakness'] = np.nan

team_stats = {}

# Loop through each season
for season in df_bundesliga_recencyfeatures['season_start_year'].unique():
    # Filter matches for the current season
    season_matches = df_bundesliga_recencyfeatures[df_bundesliga_recencyfeatures['season_start_year'] == season]

    # Loop through each match in the season
    for index, row in season_matches.iterrows():
        home_team = row['home_team']
        away_team = row['away_team']

        # Initialize team stats if not already done
        if home_team not in team_stats:
            team_stats[home_team] = {'home_goals_scored': 0, 'home_goals_conceded': 0, 'home_matches': 0,
                                     'away_goals_scored': 0, 'away_goals_conceded': 0, 'away_matches': 0}
        if away_team not in team_stats:
            team_stats[away_team] = {'home_goals_scored': 0, 'home_goals_conceded': 0, 'home_matches': 0,
                                     'away_goals_scored': 0, 'away_goals_conceded': 0, 'away_matches': 0}

        # Calculate current ratings
        home_attack_strength = team_stats[home_team]['home_goals_scored'] / (team_stats[home_team]['home_matches'] + 1)
        home_defense_weakness = team_stats[home_team]['home_goals_conceded'] / (team_stats[home_team]['home_matches'] + 1)
        away_attack_strength = team_stats[away_team]['away_goals_scored'] / (team_stats[away_team]['away_matches'] + 1)
        away_defense_weakness = team_stats[away_team]['away_goals_conceded'] / (team_stats[away_team]['away_matches'] + 1)

        # Assign ratings to the dataframe
        df_bundesliga_recencyfeatures.loc[index, 'home_attack_strength'] = home_attack_strength
        df_bundesliga_recencyfeatures.loc[index, 'home_defense_weakness'] = home_defense_weakness
        df_bundesliga_recencyfeatures.loc[index, 'away_attack_strength'] = away_attack_strength
        df_bundesliga_recencyfeatures.loc[index, 'away_defense_weakness'] = away_defense_weakness

        # Update team stats with the current match
        team_stats[home_team]['home_goals_scored'] += row['home_goals']
        team_stats[home_team]['home_goals_conceded'] += row['away_goals']
        team_stats[home_team]['home_matches'] += 1

        team_stats[away_team]['away_goals_scored'] += row['away_goals']
        team_stats[away_team]['away_goals_conceded'] += row['home_goals']
        team_stats[away_team]['away_matches'] += 1

# Define the logistic function for goal prediction
def logistic_goal_model(params, X):
    alpha_h, beta_h, gamma_h, alpha_a, beta_a, gamma_a = params
    home_goals_pred = []
    away_goals_pred = []

    for row in X:
        home_attack_strength, away_defense_weakness, away_attack_strength, home_defense_weakness = row
        
        # Calculate predicted goals using the logistic function
        pred_home = alpha_h / (1 + np.exp(-beta_h * (home_attack_strength + away_defense_weakness) - gamma_h))
        pred_away = alpha_a / (1 + np.exp(-beta_a * (away_attack_strength + home_defense_weakness) - gamma_a))
        
        home_goals_pred.append(pred_home)
        away_goals_pred.append(pred_away)
    
    return np.array(home_goals_pred), np.array(away_goals_pred)

# Define the objective function for optimization
def objective_function(params, X, y):
    y_pred_home, y_pred_away = logistic_goal_model(params, X)
    y_true_home, y_true_away = y.T  # Transpose y to separate home and away goals
    
    # Return the sum of squared errors
    return np.sum((y_pred_home - y_true_home) ** 2 + (y_pred_away - y_true_away) ** 2)

# Prepare the data for optimization
X_data = df_bundesliga_recencyfeatures[['home_attack_strength', 'away_defense_weakness', 
                                         'away_attack_strength', 'home_defense_weakness']].values
y_data = df_bundesliga_recencyfeatures[['home_goals', 'away_goals']].values

# Initial parameters for optimization (can be tuned)
initial_params = [1, 1, 1, 1, 1, 1]

# Optimize parameters using PSO
lb = [0, 0, 0, 0, 0, 0]
ub = [10, 10, 10, 10, 10, 10]
pso_params, pso_obj_value = pso(pso_objective, lb, ub, swarmsize=100, maxiter=100)

# Predict goals using the PSO optimized parameters
df_bundesliga_recencyfeatures['predicted_home_goals_pso'], df_bundesliga_recencyfeatures['predicted_away_goals_pso'] = \
    logistic_goal_model(pso_params, X_data)

# Calculate mean squared error for the PSO predictions
mse_home_pso = mean_squared_error(df_bundesliga_recencyfeatures['home_goals'], df_bundesliga_recencyfeatures['predicted_home_goals_pso'])
mse_away_pso = mean_squared_error(df_bundesliga_recencyfeatures['away_goals'], df_bundesliga_recencyfeatures['predicted_away_goals_pso'])

print(f"Mean Squared Error for Home Goals (PSO): {mse_home_pso:.2f}")
print(f"Mean Squared Error for Away Goals (PSO): {mse_away_pso:.2f}")

# Now use the predicted goals as features for XGBoost
df_bundesliga_recencyfeatures['target'] = np.where(df_bundesliga_recencyfeatures['home_goals'] > df_bundesliga_recencyfeatures['away_goals'], 2, 
                        np.where(df_bundesliga_recencyfeatures['home_goals'] < df_bundesliga_recencyfeatures['away_goals'], 0, 1))

# Features for XGBoost (including PSO predicted goals)
X = df_bundesliga_recencyfeatures[['home_attack_strength', 'home_defense_weakness', 
                                   'away_attack_strength', 'away_defense_weakness', 
                                   'predicted_home_goals_pso', 'predicted_away_goals_pso']]
y = df_bundesliga_recencyfeatures['target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model and parameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

param_grid = {
    'n_estimators': [84],
    'learning_rate': [0.06],
    'max_depth': [5],
    'subsample': [0.9],
    'colsample_bytree': [1.0]
}

# Perform grid search
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

best_xgb_model = grid_search.best_estimator_

# Predictions and evaluation
y_pred = best_xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(f"Classification Report:\n", classification_report(y_test, y_pred))

# Cross-validated accuracy
cv_scores = cross_val_score(best_xgb_model, X_train, y_train, cv=5)
print(f"Cross-validated accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")


Stopping search: maximum iterations reached --> 100
Mean Squared Error for Home Goals (PSO): 2.10
Mean Squared Error for Away Goals (PSO): 1.38
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.06, 'max_depth': 5, 'n_estimators': 84, 'subsample': 0.9}
Accuracy: 0.51
Classification Report:
               precision    recall  f1-score   support

           0       0.37      0.15      0.21       889
           1       0.35      0.01      0.03       950
           2       0.53      0.94      0.68      1883

    accuracy                           0.51      3722
   macro avg       0.42      0.37      0.31      3722
weighted avg       0.45      0.51      0.40      3722



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validated accuracy: 0.51 ± 0.00
