In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
# Cut-off date, kept as a `YYYY-MM-DD` string; it is compared against the
# "date" column to separate training rows from test rows.
split_date = "2023-01-01"

In [3]:
# Frames to stack; currently only one prepared CSV is loaded. The first CSV
# column is used as the index and then discarded by `ignore_index=True`.
li = [pd.read_csv("../prepared_data/test.csv", index_col=0)]

matches = pd.concat(li, axis=0, ignore_index=True)

# Keep only the rows recorded from the home side's perspective.
is_home_row = matches["venue"] == "Home"
matches = matches.loc[is_home_row]
matches.shape

(3335, 130)

In [4]:
# Large ensemble of shallow (depth-3) trees. `random_state` pins the result
# across runs; `n_jobs=-1` lets scikit-learn use every available core.
rf = RandomForestClassifier(
    n_estimators=10000,
    max_depth=3,
    random_state=1,
    n_jobs=-1,
)

In [5]:
# Chronological split: everything strictly before `split_date` is used for
# training, everything on or after it for testing.
# NOTE(review): "date" is compared as a string, which assumes ISO `YYYY-MM-DD`
# values so that lexicographic order equals chronological order — confirm.
train = matches[matches["date"] < split_date]
# `>=` (not `>`): with two strict comparisons, rows dated exactly `split_date`
# belonged to neither set and were silently dropped.
test = matches[matches["date"] >= split_date]

In [6]:
# Feature columns fed to the classifier (order is preserved when selecting
# them from the frame).
predictors = [
    "gf_rolling",
    "ga_rolling",
    "sh_rolling",
    "sot%_rolling",
    "dist_rolling",
]

In [7]:
# Train on the pre-split rows only; the fitted estimator is the cell's output.
rf.fit(X=train[predictors], y=train["target"])

In [8]:
# Hard class predictions on the training rows (not used further in this cell).
predsWinsTrain = rf.predict(train[predictors])

# Keep only the second column of the per-class probability matrix.
# NOTE(review): presumably this is the probability of target == 1; the column
# order follows `rf.classes_` — confirm.
probabilities = rf.predict_proba(test[predictors])[:, 1]

# One row per test match, aligned on the test index: true label, model score
# and match date.
combinedProbabilities = pd.DataFrame(
    {"actual": test["target"], "probabilities": probabilities},
    index=test.index,
)
combinedProbabilities["date"] = test["date"]

#### Finding the right threshold

In [9]:
# Grid-search the decision threshold so that the number of predicted negatives
# is as close as possible to TARGET_NEG_PER_POS times the number of predicted
# positives.
# NOTE(review): the origin of 37.8 is not shown in this notebook — confirm what
# rate it represents.
TARGET_NEG_PER_POS = 100 / 37.8
THRESHOLD_STEPS = 1000  # candidates are 0.000, 0.001, ..., 0.999

n_scored = len(probabilities)

best_threshold = 0
min_diff = float('inf')

for step in range(THRESHOLD_STEPS):
    candidate = step / THRESHOLD_STEPS

    # The criterion only needs the predicted class counts (tp + fp and
    # tn + fn), so they are counted directly instead of building a confusion
    # matrix per candidate. This also avoids the unpacking error a non-2x2
    # matrix would raise.
    pos = int((probabilities >= candidate).sum())
    neg = n_scored - pos
    diff = abs(neg - TARGET_NEG_PER_POS * pos)

    # Strict `<` keeps the lowest threshold when several candidates tie.
    if diff < min_diff:
        min_diff = diff
        best_threshold = candidate

print(best_threshold)
predicted = (probabilities >= best_threshold).astype('int')
combinedProbabilities["predicted"] = predicted

0.471


In [10]:
def _pct_truncated(part, whole):
    """Return part / whole as a percentage truncated (not rounded) to 4 decimals."""
    return int(part / whole * 1_000_000) / 10_000


actual_pos = combinedProbabilities["actual"] == 1
actual_neg = combinedProbabilities["actual"] == 0
predicted_pos = combinedProbabilities["predicted"] == 1

n_true_pos = len(combinedProbabilities[actual_pos & predicted_pos])
n_false_pos = len(combinedProbabilities[actual_neg & predicted_pos])

# Share of actual positives that were predicted positive (recall).
print(_pct_truncated(n_true_pos, len(combinedProbabilities[actual_pos])), "The higher the better, but doesn't have to be high")
# Share of actual negatives that were predicted positive (false positive rate).
print(_pct_truncated(n_false_pos, len(combinedProbabilities[actual_neg])), "The smaller the better, has to be small")

# Share of predicted positives that were actually positive (precision).
print(_pct_truncated(n_true_pos, len(combinedProbabilities[predicted_pos])), "Overall measure of performance in %")

35.6294 The higher the better, but doesn't have to be high
20.1773 The smaller the better, has to be small
62.2406 Overall measure of performance in %


In [11]:
# Confusion matrix as a table: rows are the actual classes, columns the
# predicted classes (scikit-learn's convention).
pd.DataFrame(
    confusion_matrix(
        y_true=combinedProbabilities["actual"],
        y_pred=combinedProbabilities["predicted"],
    )
)

Unnamed: 0,0,1
0,360,91
1,271,150


In [12]:
# Show the 50 rows with the latest dates, oldest of them first.
(
    combinedProbabilities
    .sort_values(by="date")
    .tail(50)
)

Unnamed: 0,actual,probabilities,date,predicted
3082,1,0.495553,2024-01-03,1
3213,1,0.365436,2024-01-03,0
5037,1,0.525725,2024-01-03,1
5371,0,0.402965,2024-01-04,0
4775,1,0.411239,2024-01-04,0
3263,0,0.364524,2024-01-04,0
5372,0,0.362953,2024-01-12,0
1680,0,0.375093,2024-01-12,0
4482,0,0.385601,2024-01-13,0
3264,1,0.358964,2024-01-13,0
