# Requirement: Calculate the probability of score, for example, what's the probability of score 3-1

# Idea:
- Train 2 models to predict goals that home team can score, goals that away team can score
- Calculate probability of score 3 from home team (prob_home) and score 1 from away team (prob_away)
- Calculate the probability of score 3-1 by: final_prob = prob_home * prob_away

# Set up

In [11]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from scipy.stats import poisson

# Sourcing

In [2]:
df = pd.read_feather(path="../data/processed/regression/processed_train_data.feather")

In [3]:
df.head()

Unnamed: 0,id,is_cup,home_team_days_from_last_match,home_team_total_matches_last_30_days,away_team_total_matches_last_30_days,home_team_matches_play_home_last_30_days,away_team_matches_play_home_last_30_days,home_team_matches_cup_comp_last_10_matches,away_team_matches_cup_comp_last_10_matches,home_team_is_last_match_cup,...,home_team_same_league,away_team_same_league,home_team_leagues_last_30_days,away_team_leagues_last_30_days,dow_match,month_match,year_match,week_match,home_score,away_score
0,11906497,0,5.024305,3,4,1.0,2.0,0.0,5.0,0,...,1,0,1,3,6,12,2019,48,2,3
1,11984383,0,3.166667,4,4,2.0,1.0,0.0,0.0,0,...,1,1,1,1,6,12,2019,48,1,0
2,11983301,0,2.989583,3,2,1.0,0.0,0.0,0.0,0,...,1,1,1,1,6,12,2019,48,2,2
3,11983471,0,3.0,5,5,2.0,3.0,0.0,1.0,0,...,1,1,1,1,6,12,2019,48,1,2
4,11883005,0,3.291667,4,4,2.0,3.0,0.0,0.0,0,...,1,1,1,1,6,12,2019,48,1,0


# Train model to predict goals scored by home team

In [41]:
#Split the train and test data
X_home = df.drop(columns=['home_score','away_score'], axis=1)
y_home = df['home_score']

In [42]:
X_home_train, X_home_test, y_home_train, y_home_test = \
    train_test_split(X_home, y_home, test_size=0.2, random_state=42)

In [44]:
#Train the model
model = xgb.XGBRegressor()
model.fit(X_home_train.drop(columns='id', axis=1), y_home_train)

In [45]:
y_home_pred = model.predict(X_home_test.drop(columns='id', axis=1))

In [46]:
mse = mean_squared_error(y_home_test, y_home_pred)
rmse = mean_squared_error(y_home_test, y_home_pred, squared=False)
mae = mean_absolute_error(y_home_test, y_home_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)

Mean Squared Error: 1.7605455240644285
Root Mean Squared Error: 1.326855502330389
Mean Absolute Error: 1.0252992919428976


In [47]:
#Calculate probability of home team to score 3 goals each match
lambda_param = np.mean(y_home_train)
prob_3_list_home = []
for i, y_pred_i in enumerate(y_home_pred):
    prob_3 = poisson.pmf(3, mu=y_pred_i)
    prob_3_list_home.append(prob_3)

# Train model to predict goals scored by away team

In [48]:
#Split the train and test data
X_away = df.drop(columns=['home_score','away_score'], axis=1)
y_away = df['away_score']

In [49]:
X_away_train, X_away_test, y_away_train, y_away_test = \
    train_test_split(X_away, y_away, test_size=0.2, random_state=42)

In [50]:
#Train the model
model = xgb.XGBRegressor()
model.fit(X_away_train.drop(columns='id', axis=1), y_away_train)

In [51]:
y_away_pred = model.predict(X_away_test.drop(columns='id', axis=1))

In [52]:
mse = mean_squared_error(y_away_test, y_home_pred)
rmse = mean_squared_error(y_away_test, y_home_pred, squared=False)
mae = mean_absolute_error(y_away_test, y_home_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)

Mean Squared Error: 2.125031077089003
Root Mean Squared Error: 1.4577486330259422
Mean Absolute Error: 1.1431191292987182


In [53]:
lambda_param = np.mean(y_away_train)
prob_1_list_away = []
for i, y_pred_i in enumerate(y_away_pred):
    prob_1 = poisson.pmf(1, mu=y_pred_i)
    prob_1_list_away.append(prob_1)

# Calculate the probability of score 3-1

In [59]:
X_home_test['prob_3_home'] = prob_3_list_home

In [60]:
X_away_test['prob_1_away'] = prob_1_list_away

In [65]:
final_df = X_home_test.merge(right=X_away_test,
                             on='id',
                             how='inner',
                             )

In [67]:
final_df['prob_3_1_score'] = final_df['prob_3_home'] * final_df['prob_1_away']

# Final prediction for the probability for score 3-1

In [70]:
final_df[['id','prob_3_home','prob_1_away','prob_3_1_score']].head()

Unnamed: 0,id,prob_3_home,prob_1_away,prob_3_1_score
0,11898753,0.195613,0.326735,0.063913
1,17778474,0.095673,0.366476,0.035062
2,17724594,0.085427,0.366985,0.03135
3,17710758,0.063056,0.352655,0.022237
4,16935301,0.117585,0.322681,0.037942


In [74]:
final_df.to_csv("../data/predicted/question_3_result.csv")