In [None]:
import pandas as pd 
import numpy as np
import random
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

# input CSV files (relative paths, resolved against the working directory)
fileClearedData = "dataCleared (1).csv"
filePlayers = "players (2).csv"

# seed for the random number generators, so that the shuffled train/test
# split (and therefore every reported metric) is reproducible on re-run
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# proportion of the dataset dedicated to the training
trainSize = 0.95

# minimum predicted probability of a player winning over another to bet
minproba = 0.55
# minimum number of games each player must have played to bet
minnbgames = 5
# the earnings are used to link the probability of a player winning and the odds
# (the minimum acceptable odds are computed as earnings / probability)
earnings = 1

In [None]:
# Load the match history and the per-player statistics
clearedData = pd.read_csv(fileClearedData)
players = pd.read_csv(filePlayers)

# The 'Surface' column is not used as a feature: drop it without mutating in place
clearedData = clearedData.drop(columns=['Surface'])
clearedData.head()

Unnamed: 0,Winner,Loser,WRank,LRank,Winrate winner,Winrate loser,Nb Game winner,Nb Game loser,Result win,Result loose
0,Dosedel S.,Ljubicic I.,0.015873,0.012987,0.37931,0.594761,58.0,649.0,1,0
1,Enqvist T.,Clement A.,0.2,0.017857,0.543651,0.5,252.0,508.0,1,0
2,Escude N.,Baccanello P.,0.025,0.001527,0.582888,0.222222,187.0,9.0,1,0
3,Federer R.,Knippschild J.,0.015385,0.011494,0.827393,0.333333,1431.0,42.0,1,0
4,Fromberg R.,Woodbridge T.,0.012346,0.005051,0.387755,0.4,49.0,20.0,1,0


In [None]:
# Raw matches as a NumPy array; doubleGame mirrors every game
# (winner <-> loser) to double the size of the dataset
dataM = clearedData.to_numpy()
def doubleGame(M, trainSize):
    """Double the dataset by mirroring every match, then split it into train/test.

    Each row of ``M`` must hold ten values laid out in pairs:
    ``[player A, player B, rank A, rank B, winrate A, winrate B,
    nb games A, nb games B, result A, result B]``.
    For every match a mirrored row is added in which the two members of each
    pair are swapped, so the model sees every game from both sides.

    Parameters
    ----------
    M : array-like of shape (n_matches, 10)
        The matches. It is NOT modified: shuffling is done on a copy.
    trainSize : float
        Fraction of the *matches* assigned to the training set.

    Returns
    -------
    TrainX, TrainY, TestX, TestY, alldata
        ``*X`` are float arrays of shape (n, 6) built from columns 2..7,
        ``*Y`` are int arrays of shape (n, 2) built from columns 8..9, and
        ``alldata`` contains every doubled row (all ten columns), each match
        immediately followed by its mirror.

    Raises
    ------
    ValueError
        If ``M`` is not a two-dimensional array with exactly ten columns.
    """
    n_columns = 10
    matches = np.array(M, copy=True)
    if matches.size == 0:
        matches = matches.reshape(0, n_columns)
    if matches.ndim != 2 or matches.shape[1] != n_columns:
        raise ValueError(
            "doubleGame expects rows of %d values, got an array of shape %s"
            % (n_columns, matches.shape)
        )

    # Shuffle the matches through a permutation so the caller's array is left untouched
    matches = matches[np.random.permutation(len(matches))]

    # Swap the two members of every (A, B) pair of columns
    mirrored = matches[:, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8]]

    # Interleave: row 2k is the match, row 2k+1 its mirror
    alldata = np.empty((2 * len(matches), n_columns), dtype=matches.dtype)
    alldata[0::2] = matches
    alldata[1::2] = mirrored

    # Split on a match boundary (even row index) so a match and its mirror
    # never end up on different sides of the train/test split
    split = 2 * int(len(matches) * trainSize)
    Train = alldata[:split][np.random.permutation(split)]
    Test = alldata[split:][np.random.permutation(len(alldata) - split)]

    # TrainX = inputs, TrainY = outputs; cast to numeric dtypes because the
    # rows come from a mixed (names + numbers) array
    TrainX = Train[:, 2:8].astype(float)
    TrainY = Train[:, 8:].astype(int)

    TestX = Test[:, 2:8].astype(float)
    TestY = Test[:, 8:].astype(int)
    return TrainX, TrainY, TestX, TestY, alldata

# Build the doubled dataset and split it into train / test arrays
X_train, y_train, X_test, y_test, alldata = doubleGame(dataM, trainSize)
print(*(part.shape for part in (X_train, X_test)))

(116409, 6) (6127, 6)


In [None]:
# index of the label column (inside y_train / y_test) that the model learns
pred_idx: int = 1

# gradient-boosted trees classifier, trained on the six match features
model1 = XGBClassifier(
    learning_rate=0.11,
    max_depth=4,
    n_estimators=150,
)
model1.fit(X_train, y_train[:, pred_idx])

XGBClassifier(learning_rate=0.11, max_depth=4, n_estimators=150)

In [None]:
# Evaluate on the held-out set.
# scikit-learn metrics take (y_true, y_pred) in that order; balanced accuracy
# is not symmetric, so swapping them reports a different quantity.
preds1 = model1.predict(X_test)
print(balanced_accuracy_score(y_test[:, pred_idx].astype(int), preds1))

0.6920205260725106


In [None]:
# check prob and skip rate: only predictions whose top-class probability
# exceeds `confidence` are scored, the others are counted as skipped
confidence = 0.8

preds_prob = model1.predict_proba(X_test)
actual_labels = y_test[:, pred_idx].astype(int)

# highest probability prediction for every row, and the probability given to it
# (the column index is compared to the label, as the classes are 0 and 1)
top_class = preds_prob.argmax(axis=1)
top_prob = preds_prob.max(axis=1)

is_confident = top_prob > confidence
is_right = top_class == actual_labels

correct = int(np.sum(is_confident & is_right))
incorrect = int(np.sum(is_confident & ~is_right))
skipped = int(np.sum(~is_confident))

# accuracy among the confident predictions; NaN when nothing passed the threshold
decided = correct + incorrect
print(correct / decided if decided else float("nan"))
print(skipped / (len(X_test)))

0.8741554054054054
0.8067569773135302


In [None]:
class NotEnoughGame(Exception):
    """At least one of the players has not played enough games for the data to be useful."""

def _player_stats(table, name):
    """Return ``(rank, winrate, nb_games)`` as floats for the single row of ``table`` named ``name``."""
    rows = table.loc[table['Name'] == name, ["Rank", "Winrate", "NbGames"]]
    if rows.empty:
        raise KeyError("Unknown player: %r" % (name,))
    if len(rows) > 1:
        raise ValueError("Player %r appears %d times in the players table" % (name, len(rows)))
    # .iloc[0] extracts the row explicitly; calling float() on a one-element
    # Series is deprecated in pandas
    rank, winrate, nb_games = (float(value) for value in rows.iloc[0])
    return rank, winrate, nb_games


def prediction(model, p1, p2, surface, minproba, minnbgames, earnings, players_df=None):
    """Predict the outcome of ``p1`` vs ``p2`` and print a betting recommendation.

    Parameters
    ----------
    model : object with a ``predict_proba`` method
        Called on one row of six features, in this order:
        ``[1/rankP1, 1/rankP2, winrateP1, winrateP2, nbGameP1, nbGameP2]``.
    p1, p2 : str
        Player names, matched exactly against the ``Name`` column.
    surface
        Currently unused; kept so that existing calls keep working.
    minproba : float
        Minimum predicted probability required to recommend a bet.
    minnbgames : float
        Minimum number of games each player must have played.
    earnings : float
        Numerator of the minimum acceptable odds (``earnings / probability``).
    players_df : pandas.DataFrame, optional
        Table with ``Name``, ``Rank``, ``Winrate`` and ``NbGames`` columns.
        Defaults to the notebook-level ``players`` frame.

    Returns
    -------
    The first row returned by ``model.predict_proba``; index 0 is treated as
    the probability that ``p1`` wins and index 1 as the probability that
    ``p2`` wins.

    Raises
    ------
    KeyError
        If a player name is not found in the table.
    ValueError
        If a player name matches several rows.
    NotEnoughGame
        If either player has played fewer than ``minnbgames`` games.
    """
    table = players if players_df is None else players_df

    rankP1, winrateP1, nbGameP1 = _player_stats(table, p1)
    rankP2, winrateP2, nbGameP2 = _player_stats(table, p2)

    if (nbGameP1 < minnbgames) or (nbGameP2 < minnbgames):
        raise NotEnoughGame

    # Ranks are fed to the model as their inverse
    dataPredict = np.array([1/rankP1, 1/rankP2, winrateP1, winrateP2, nbGameP1, nbGameP2])
    res = model.predict_proba(np.array([dataPredict]))[0]

    print(p1, "- rank", rankP1, "- winrate", winrateP1, "- nbGames", nbGameP1)
    print(p2, "- rank", rankP2, "- winrate", winrateP2, "- nbGames", nbGameP2)
    print("Prediction :", res)
    if (res[0] >= minproba):
        print("Bet on", p1, "if odds >", earnings/res[0])
    elif (res[1] >= minproba):
        print("Bet on", p2, "if odds >", earnings/res[1])
    else:
        print("Do not bet")
    return res

In [None]:
# Ask the model which of p1 / p2 to bet on (or whether not to bet at all)
p1 = 'Djokovic N.'
p2 = 'Nadal R.'
surface: int = 1

prediction(
    model=model1,
    p1=p1,
    p2=p2,
    surface=surface,
    minproba=minproba,
    minnbgames=minnbgames,
    earnings=earnings,
)

Djokovic N. - rank 8.0 - winrate 0.8374679213002566 - nbGames 1169.0
Nadal R. - rank 2.0 - winrate 0.826645264847512 - nbGames 1246.0
Prediction : [0.45057148 0.5494285 ]
Do not bet


array([0.45057148, 0.5494285 ], dtype=float32)