In [1]:
import pandas as pd
import numpy as np
import joblib
import json
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss, brier_score_loss

In [2]:
# Read the matches table from CSV into a DataFrame.
matches = pd.read_csv(
    "../../preparation_before_models/data/matches.csv",
)

In [3]:
# Parse Date into datetime values; the year filter that builds the test set
# relies on the .dt accessor, which needs a datetime-typed column.
matches = matches.assign(Date=pd.to_datetime(matches["Date"]))

In [4]:
# Remove the two bet-odds columns and the w_/l_ (plain and CO_) average columns
# for ace, df and 2ndIn. Later cells pick features by column position, so the
# set of columns removed here determines what those positions refer to.
matches = matches.drop(
    columns=[
        "player1_bet_odds",
        "player2_bet_odds",
        "w_ace_avg",
        "l_ace_avg",
        "w_CO_ace_avg",
        "l_CO_ace_avg",
        "w_df_avg",
        "l_df_avg",
        "w_CO_df_avg",
        "l_CO_df_avg",
        "w_2ndIn_avg",
        "l_2ndIn_avg",
        "w_CO_2ndIn_avg",
        "l_CO_2ndIn_avg",
    ]
)

In [5]:
# Test set: the rows of `matches` whose Date falls in calendar year 2023.
test_data = matches.loc[matches["Date"].dt.year.eq(2023)]

In [6]:
# Load the three artefacts saved for the "log_nested" model: a JSON parameter
# dict, a NumPy array used to select feature columns, and the model object.
# The JSON is opened with an explicit UTF-8 encoding; without it open() uses
# the platform's locale encoding, which differs between machines.
with open("../XGBoost/best_models/best_params_log_nested.json", "r", encoding="utf-8") as f:
    best_params_log_nested = json.load(f)
best_features_log_nested = np.load("../XGBoost/best_models/best_features_log_nested.npy")
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
best_model_log_nest = joblib.load("../XGBoost/best_models/best_model_log_nested.pkl")

In [7]:
# Separate the label column from the model inputs. Date and match_id are
# dropped from the inputs together with the label itself.
y = test_data["target"]
X = test_data.drop(columns=["target", "Date", "match_id"])

In [8]:
# Keep only the columns picked out by the saved log_nested selector. The
# selection is positional (.iloc), so it depends on X's column order.
# NOTE(review): presumably X must have the same column layout as the frame the
# selector was computed on — confirm against the training notebook.
X_selected = X.iloc[:, best_features_log_nested]

In [24]:
# Column 1 of predict_proba's output for every test row; these probabilities
# are what the log-loss and Brier-score calls below consume.
# NOTE(review): presumably column 1 is P(target == 1) — confirm via the model's classes_.
y_pred_nest = best_model_log_nest.predict_proba(X_selected)[:,1]

In [25]:
# log_nested model on the full test set: accuracy is computed from the hard
# predictions, log loss and Brier score from the probabilities in y_pred_nest.
y_label_nest = best_model_log_nest.predict(X_selected)
print("Accuracy: ", accuracy_score(y, y_label_nest))
print("Log loss: ", log_loss(y, y_pred_nest))
print("Brier score: ", brier_score_loss(y, y_pred_nest))

Accuracy:  0.6484375
Log loss:  0.6213728333593187
Brier score:  0.2165740698104031


In [26]:
# Subset of the test set where both rank columns are at most 50, together with
# its labels, its inputs, and the log_nested probabilities for it.
both_within_50 = test_data["player1_rank"].le(50) & test_data["player2_rank"].le(50)
test_data_rank_50 = test_data[both_within_50]
y_rank_50 = test_data_rank_50["target"]
X_rank_50 = test_data_rank_50.drop(columns=["target", "Date", "match_id"])
X_selected_rank_50 = X_rank_50.iloc[:, best_features_log_nested]
y_pred_rank_50 = best_model_log_nest.predict_proba(X_selected_rank_50)[:, 1]

In [27]:
# log_nested model on the rank-50 subset: accuracy from hard predictions,
# log loss and Brier score from the probabilities in y_pred_rank_50.
y_label_rank_50 = best_model_log_nest.predict(X_selected_rank_50)
print("Accuracy rank 50: ", accuracy_score(y_rank_50, y_label_rank_50))
print("Log loss rank 50: ", log_loss(y_rank_50, y_pred_rank_50))
print("Brier score rank 50: ", brier_score_loss(y_rank_50, y_pred_rank_50))

Accuracy rank 50:  0.6939799331103679
Log loss rank 50:  0.5881639165342
Brier score rank 50:  0.2012278250883115


In [29]:
# "Certain" subset, built in two steps:
#   1. keep the half of the test rows with the smallest CO_uncertainty values;
#   2. of those, keep rows where both rank columns are at most 50.
# Labels, inputs and log_nested probabilities are then derived from the result.
n_lowest_uncertainty = int(len(test_data) * 0.5)
lowest_uncertainty_rows = test_data.nsmallest(n_lowest_uncertainty, "CO_uncertainty")
certain_within_50 = (
    lowest_uncertainty_rows["player1_rank"].le(50)
    & lowest_uncertainty_rows["player2_rank"].le(50)
)
test_certain_data = lowest_uncertainty_rows[certain_within_50]
y_certain = test_certain_data["target"]
X_certain = test_certain_data.drop(columns=["target", "Date", "match_id"])
X_selected_certain = X_certain.iloc[:, best_features_log_nested]
y_pred_certain = best_model_log_nest.predict_proba(X_selected_certain)[:, 1]


In [30]:
# log_nested model on the "certain" subset: accuracy from hard predictions,
# log loss and Brier score from the probabilities in y_pred_certain.
y_label_certain = best_model_log_nest.predict(X_selected_certain)
print("Accuracy certain: ", accuracy_score(y_certain, y_label_certain))
print("Log loss certain: ", log_loss(y_certain, y_pred_certain))
print("Brier score certain: ", brier_score_loss(y_certain, y_pred_certain))

Accuracy certain:  0.7021660649819494
Log loss certain:  0.586703443843803
Brier score certain:  0.2004924447565322


## Evaluation of the second saved model (log_cros) on the same test subsets

In [9]:
# Load the three artefacts saved for the "log_cros" model: a JSON parameter
# dict, a NumPy array used to select feature columns, and the model object.
# The JSON is opened with an explicit UTF-8 encoding; without it open() uses
# the platform's locale encoding, which differs between machines.
with open("../XGBoost/best_models/best_params_log_cros.json", "r", encoding="utf-8") as f:
    best_params_log_cros = json.load(f)
best_features_log_cros = np.load("../XGBoost/best_models/best_features_log_cros.npy")
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
best_model_log_cros = joblib.load("../XGBoost/best_models/best_model_log_cros.pkl")

In [32]:
# log_cros model on the full test set: select its feature columns by position,
# predict hard labels and column-1 probabilities, then print accuracy (labels)
# and log loss / Brier score (probabilities).
X_selected_cros = X.iloc[:, best_features_log_cros]
y_pred_cros = best_model_log_cros.predict(X_selected_cros)
y_pred_cros_prob = best_model_log_cros.predict_proba(X_selected_cros)[:, 1]
print("Accuracy cros: ", accuracy_score(y, y_pred_cros))
print("Log loss cros: ", log_loss(y, y_pred_cros_prob))
print("Brier score cros: ", brier_score_loss(y, y_pred_cros_prob))

Accuracy cros:  0.6372767857142857
Log loss cros:  0.6252536073981937
Brier score cros:  0.21834964874050367


In [33]:
# log_cros model on the rank-50 subset (X_rank_50 / y_rank_50): same three
# metrics as for the full test set.
X_selected_rank_50_cros = X_rank_50.iloc[:, best_features_log_cros]
y_pred_rank_50_cros = best_model_log_cros.predict(X_selected_rank_50_cros)
y_pred_rank_50_cros_prob = best_model_log_cros.predict_proba(X_selected_rank_50_cros)[:, 1]
print("Accuracy rank 50 cros: ", accuracy_score(y_rank_50, y_pred_rank_50_cros))
print("Log loss rank 50 cros: ", log_loss(y_rank_50, y_pred_rank_50_cros_prob))
print("Brier score rank 50 cros: ", brier_score_loss(y_rank_50, y_pred_rank_50_cros_prob))

Accuracy rank 50 cros:  0.6872909698996655
Log loss rank 50 cros:  0.590921719254777
Brier score rank 50 cros:  0.2028729814714609


In [34]:
# log_cros model on the "certain" subset (X_certain / y_certain): same three
# metrics as for the full test set.
X_selected_certain_cros = X_certain.iloc[:, best_features_log_cros]
y_pred_certain_cros = best_model_log_cros.predict(X_selected_certain_cros)
y_pred_certain_cros_prob = best_model_log_cros.predict_proba(X_selected_certain_cros)[:, 1]
print("Accuracy certain cros: ", accuracy_score(y_certain, y_pred_certain_cros))
print("Log loss certain cros: ", log_loss(y_certain, y_pred_certain_cros_prob))
print("Brier score certain cros: ", brier_score_loss(y_certain, y_pred_certain_cros_prob))

Accuracy certain cros:  0.6931407942238267
Log loss certain cros:  0.5879786285398592
Brier score certain cros:  0.2015135854651636


In [10]:
# Positional selection of the log_cros feature columns from X.
# NOTE(review): this is the same assignment that opens the full-test-set
# log_cros evaluation cell earlier in the notebook, so it is redundant when the
# cells run top to bottom.
X_selected_cros = X.iloc[:, best_features_log_cros]

In [12]:
# Column labels that are in the log_cros feature set but not in the
# log_nested one (Index.difference is one-sided).
only_in_cros = X_selected_cros.columns.difference(X_selected.columns)
print(only_in_cros)


Index(['Surface_Clay', 'Surface_Grass', 'outdoor', 'player1_h2h_surface_wins',
       'player1_injury_score', 'player1_is_seeded', 'player1_right_handed',
       'player2_h2h_surface_wins', 'player2_h2h_wins', 'player2_injury_score',
       'player2_right_handed'],
      dtype='object')
