In [1]:
import pandas as pd
import numpy as np

matches = pd.read_csv("Matches.csv") 

print("Shape of matches:", matches.shape)
matches.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Matches.csv'

In [2]:
# Basic cleaning and create a target column

# MatchDate is a proper datetime
matches["MatchDate"] = pd.to_datetime(matches["MatchDate"], errors="coerce")

# Year column 
matches["Year"] = matches["MatchDate"].dt.year

#Keep only rows with a clear full-time result (H, D, A)
valid_results = ["H", "D", "A"]
matches = matches[matches["FTResult"].isin(valid_results)].copy()

print("Rows after keeping only valid FTResult (H/D/A):", matches.shape[0])

# we will PREDICT: home win vs away win (no draws) so, remove draws (1 = home wins, 0 = away wins)
matches_no_draw = matches[matches["FTResult"] != "D"].copy()

matches_no_draw["home_win"] = (matches_no_draw["FTResult"] == "H").astype(int)

print("Rows after removing draws:", matches_no_draw.shape[0])
print("home_win value counts (1 = home, 0 = away):")
print(matches_no_draw["home_win"].value_counts())

#peek
matches_no_draw[["Division", "MatchDate", "HomeTeam", "AwayTeam",
                 "HomeElo", "AwayElo", "Form3Home", "Form3Away",
                 "OddHome", "OddDraw", "OddAway",
                 "FTResult", "home_win"]].head()

Rows after keeping only valid FTResult (H/D/A): 230554
Rows after removing draws: 169433
home_win value counts (1 = home, 0 = away):
home_win
1    102873
0     66560
Name: count, dtype: int64


Unnamed: 0,Division,MatchDate,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form3Away,OddHome,OddDraw,OddAway,FTResult,home_win
0,F1,2000-07-28,Marseille,Troyes,1686.34,1586.57,0.0,0.0,1.65,3.3,4.3,H,1
1,F1,2000-07-28,Paris SG,Strasbourg,1714.89,1642.51,0.0,0.0,1.6,3.4,4.6,H,1
2,F2,2000-07-28,Wasquehal,Nancy,1465.08,1633.8,0.0,0.0,,,,A,0
3,F1,2000-07-29,Auxerre,Sedan,1635.58,1624.22,0.0,0.0,1.7,3.3,3.9,A,0
8,F1,2000-07-29,Nantes,Lens,1664.36,1730.89,0.0,0.0,2.15,3.0,3.0,A,0


In [5]:
from sklearn.model_selection import train_test_split

# NEW feature set: team strength (Elo) + form + betting odds
feature_cols = [
    "HomeElo", "AwayElo",
    "Form3Home", "Form5Home",
    "Form3Away", "Form5Away",
    "OddHome", "OddDraw", "OddAway"
]

# Build X and y from matches_no_draw
X = matches_no_draw[feature_cols].copy()
y = matches_no_draw["home_win"].copy()

# Drop any rows with missing values
data = pd.concat([X, y], axis=1).dropna()
X = data[feature_cols]
y = data["home_win"]

# 80% train, 20% test (again, but now with new features)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Training size:", X_train.shape[0])
print("Test size:", X_test.shape[0])

X_train.head()

Training size: 75948
Test size: 18987


Unnamed: 0,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,Form5Away,OddHome,OddDraw,OddAway
98374,1351.16,1280.94,3.0,6.0,2.0,3.0,2.15,3.25,3.4
71546,1471.79,1360.43,5.0,9.0,2.0,5.0,1.57,3.75,6.0
41167,1856.29,1465.46,6.0,10.0,0.0,3.0,1.61,4.0,4.0
69658,1312.44,1609.6,0.0,0.0,5.0,8.0,7.0,4.0,1.5
42091,1643.06,1472.33,7.0,10.0,3.0,3.0,1.2,5.0,13.0


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Base Random Forest model (no pipeline needed since all features are numeric)
rf_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1   # use all CPU cores
)

# Hyperparameter grid to search
param_grid = {
    "n_estimators": [100, 200, 400],        # number of trees
    "max_depth": [None, 10, 20],            # how deep each tree can grow
    "min_samples_split": [2, 5, 10],        # min rows needed to split a node
    "min_samples_leaf": [1, 2, 4],          # min rows in each leaf
}

# GridSearchCV: train many RF models with different settings
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,                 # 3-fold cross-validation
    scoring="accuracy",   # compare by accuracy
    n_jobs=-1,            # parallel
    verbose=2
)

# Fit on training data
grid_search.fit(X_train, y_train)

print("Best accuracy from CV:", grid_search.best_score_)
print("Best hyperparameters:", grid_search.best_params_)

# Best model
best_rf_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_rf_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
print("\nTest accuracy of tuned Random Forest:", test_acc)

print("\nClassification report:")
print(classification_report(y_test, y_pred))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  10.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  40.1s


Exception ignored in: <function ResourceTracker.__del__ at 0x104a89e40>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  40.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   9.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=  37.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=  32.0s


Exception ignored in: <function ResourceTracker.__del__ at 0x10685de40>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  40.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  17.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   9.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  19.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   8.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  16.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=  28.9s


Exception ignored in: <function ResourceTracker.__del__ at 0x1051d9e40>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=  37.7s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   8.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  15.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   6.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  14.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=  23.9s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=  19.6s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   9.4s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  18.5s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_

Exception ignored in: <function ResourceTracker.__del__ at 0x10515de40>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  10.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  10.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  37.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=400; total time=  36.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   8.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=  29.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=  24.4s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  10.7s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   4.7s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_est

Exception ignored in: <function ResourceTracker.__del__ at 0x106639e40>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   6.0s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  11.4s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=  19.5s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   9.5s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  19.2s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=  18.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.7s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=  11.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  11.7s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=

Exception ignored in: <function ResourceTracker.__del__ at 0x1051f5e40>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  20.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  19.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=  38.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=  34.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   6.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   6.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  14.6s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   5.2s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   5.7s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_es

Exception ignored in: <function ResourceTracker.__del__ at 0x105bc9e40>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
