### 1) Pre-processing of the data.

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Load & Pre-process
df = pd.read_csv("JGM.csv")
df = df.drop(columns=["team_nr", "tijd"])
df["gehaald"] = df["gehaald"].map({1: 0, 2: 1})

X = df.drop("gehaald", axis=1)
y = df["gehaald"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### 2) Baseline model.

In [12]:
# Train
model = RandomForestClassifier(random_state=42)

pipe_base = Pipeline([("model", model)])
pipe_base.fit(X_train, y_train)
y_pred = pipe_base.predict(X_test)

In [13]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.7966101694915254
precision_score: 0.8048780487804879
recall_score: 0.8918918918918919
f1_score: 0.8461538461538461
---
[[14  8]
 [ 4 33]]
---
Cross-val scores:
[0.81578947 0.84615385 0.85714286 0.75324675 0.82051282]
0.8185691501480974 - 0.03613133354227177


### 3) MinMax-Scaler model.

In [14]:
from sklearn.preprocessing import MinMaxScaler

minMaxScaler = MinMaxScaler()

# Train
pipe_minmax = Pipeline([("scaler", minMaxScaler), ("model", model)])
pipe_minmax.fit(X_train, y_train)
y_pred = pipe_minmax.predict(X_test)

In [15]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_minmax, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.7966101694915254
precision_score: 0.8048780487804879
recall_score: 0.8918918918918919
f1_score: 0.8461538461538461
---
[[14  8]
 [ 4 33]]
---
Cross-val scores:
[0.81578947 0.84615385 0.85714286 0.75324675 0.82051282]
0.8185691501480974 - 0.03613133354227177


### 4) Standardized-Scaler model.

In [16]:
from sklearn.preprocessing import StandardScaler

stdScaler = StandardScaler()

# Train
pipe_standard = Pipeline([("scaler", stdScaler), ("model", model)])
pipe_standard.fit(X_train, y_train)
y_pred = pipe_standard.predict(X_test)

In [17]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_standard, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.7966101694915254
precision_score: 0.8048780487804879
recall_score: 0.8918918918918919
f1_score: 0.8461538461538461
---
[[14  8]
 [ 4 33]]
---
Cross-val scores:
[0.80519481 0.84615385 0.85714286 0.75324675 0.82051282]
0.8164502164502165 - 0.03654056217224949


### 5) Baseline model - Feature Selection

In [18]:
from sklearn.feature_selection import SelectKBest, chi2

# Preprocessing
X_feature = SelectKBest(chi2).fit_transform(X, y)
X_train_feature, X_test_feature, y_train_feature, y_test_feature = train_test_split(X_feature, y, test_size=0.2, random_state=42, stratify=y)

# Train
pipe_feature = Pipeline([("model", model)])
pipe_feature.fit(X_train_feature, y_train_feature)
y_pred = pipe_feature.predict(X_test_feature)

In [19]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test_feature, y_pred))
print("precision_score:", precision_score(y_test_feature, y_pred))
print("recall_score:", recall_score(y_test_feature, y_pred))
print("f1_score:", f1_score(y_test_feature, y_pred))
print("---")
print(confusion_matrix(y_test_feature, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_feature, X_feature, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.8813559322033898
precision_score: 0.8947368421052632
recall_score: 0.918918918918919
f1_score: 0.9066666666666666
---
[[18  4]
 [ 3 34]]
---
Cross-val scores:
[0.81578947 0.83116883 0.83116883 0.83783784 0.84057971]
0.8313089368009277 - 0.008596124495992586


### 6) Baseline, feature-selected, model - Hyperparameter optimalization

In [20]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Grid Search
param_grid = {
    "model__n_estimators": range(10, 250, 50),
    "model__max_depth": range(2, 50, 5),
    "model__min_samples_split": range(2, 10),
    "model__min_samples_leaf": range(1, 10)
}

gridsearch = GridSearchCV(pipe_feature, param_grid, n_jobs=-1, scoring="f1", verbose=5)
gridsearch.fit(X_train_feature, y_train_feature)

print(gridsearch.best_params_)
print(gridsearch.best_score_)

best_model = gridsearch.best_estimator_

y_pred = best_model.predict(X_test_feature)

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits
{'model__max_depth': 2, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 210}
0.8226294962958555


In [21]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test_feature, y_pred))
print("precision_score:", precision_score(y_test_feature, y_pred))
print("recall_score:", recall_score(y_test_feature, y_pred))
print("f1_score:", f1_score(y_test_feature, y_pred))
print("---")
print(confusion_matrix(y_test_feature, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(best_model, X_feature, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.8305084745762712
precision_score: 0.8292682926829268
recall_score: 0.918918918918919
f1_score: 0.8717948717948718
---
[[15  7]
 [ 3 34]]
---
Cross-val scores:
[0.81578947 0.83116883 0.85333333 0.78947368 0.83783784]
0.8255206320469478 - 0.021685202125245023
