### 1) Pre-processing of the data.

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

# Load & Pre-process
df = pd.read_csv("JGM.csv")
df = df.drop(columns=["team_nr", "tijd"])
df["gehaald"] = df["gehaald"].map({1: 0, 2: 1})

X = df.drop("gehaald", axis=1)
y = df["gehaald"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### 2) Baseline model.

In [13]:
# Train
model = MLPClassifier(random_state=42)

pipe_base = Pipeline([("model", model)])
pipe_base.fit(X_train, y_train)
y_pred = pipe_base.predict(X_test)



In [14]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_base, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.8135593220338984
precision_score: 0.8095238095238095
recall_score: 0.918918918918919
f1_score: 0.8607594936708861
---
[[14  8]
 [ 3 34]]
---
Cross-val scores:




[0.76315789 0.86842105 0.81012658 0.74358974 0.69565217]
0.7761894894299378 - 0.05895288390127686




### 3) MinMax-Scaler model.

In [15]:
from sklearn.preprocessing import MinMaxScaler

minMaxScaler = MinMaxScaler()

# Train
pipe_minmax = Pipeline([("scaler", minMaxScaler), ("model", model)])
pipe_minmax.fit(X_train, y_train)
y_pred = pipe_minmax.predict(X_test)



In [16]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_minmax, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.7627118644067796
precision_score: 0.7674418604651163
recall_score: 0.8918918918918919
f1_score: 0.825
---
[[12 10]
 [ 4 33]]
---
Cross-val scores:




[0.8        0.86842105 0.80519481 0.75324675 0.74285714]
0.7939439507860561 - 0.04466815417774877




### 4) Standardized-Scaler model.

In [17]:
from sklearn.preprocessing import StandardScaler

stdScaler = StandardScaler()

# Train
pipe_standard = Pipeline([("scaler", stdScaler), ("model", model)])
pipe_standard.fit(X_train, y_train)
y_pred = pipe_standard.predict(X_test)



In [18]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_standard, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.7627118644067796
precision_score: 0.7555555555555555
recall_score: 0.918918918918919
f1_score: 0.8292682926829268
---
[[11 11]
 [ 3 34]]
---
Cross-val scores:




[0.88       0.84615385 0.80519481 0.74666667 0.83544304]
0.8226916711980004 - 0.04490714239925179




### 5) Baseline model - Feature Selection

In [19]:
from sklearn.feature_selection import SelectKBest, chi2

# Preprocessing
X_feature = SelectKBest(chi2).fit_transform(X, y)
X_train_feature, X_test_feature, y_train_feature, y_test_feature = train_test_split(X_feature, y, test_size=0.2, random_state=42, stratify=y)

# Train
pipe_feature = Pipeline([("model", model)])
pipe_feature.fit(X_train_feature, y_train_feature)
y_pred = pipe_feature.predict(X_test_feature)



In [20]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test_feature, y_pred))
print("precision_score:", precision_score(y_test_feature, y_pred))
print("recall_score:", recall_score(y_test_feature, y_pred))
print("f1_score:", f1_score(y_test_feature, y_pred))
print("---")
print(confusion_matrix(y_test_feature, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_feature, X_feature, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.8135593220338984
precision_score: 0.7954545454545454
recall_score: 0.9459459459459459
f1_score: 0.8641975308641975
---
[[13  9]
 [ 2 35]]
---
Cross-val scores:




[0.79487179 0.83333333 0.83950617 0.73972603 0.76315789]
0.7941190446357472 - 0.03876986356676732


### 6) Baseline model - Hyperparameter optimalization

In [21]:
from sklearn.model_selection import GridSearchCV

# Grid Search

param_grid = {
    "model__hidden_layer_sizes": [(10,), (25,), (50,), (75,), (100,), (150,)],
    "model__activation": ["identity", "logistic", "tanh", "relu"],
    "model__solver": ["lbfgs", "sgd", "adam"],
    "model__max_iter": range(50, 500, 50)
}

gridsearch = GridSearchCV(pipe_feature, param_grid, n_jobs=-1, scoring="f1", verbose=5)
gridsearch.fit(X_train_feature, y_train_feature)

print(gridsearch.best_params_)
print(gridsearch.best_score_)

best_model = gridsearch.best_estimator_

y_pred = best_model.predict(X_test_feature)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
{'model__activation': 'identity', 'model__hidden_layer_sizes': (50,), 'model__max_iter': 100, 'model__solver': 'adam'}
0.8181536626793393




In [22]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test_feature, y_pred))
print("precision_score:", precision_score(y_test_feature, y_pred))
print("recall_score:", recall_score(y_test_feature, y_pred))
print("f1_score:", f1_score(y_test_feature, y_pred))
print("---")
print(confusion_matrix(y_test_feature, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(best_model, X_feature, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")



Test scores:
Accuracy: 0.7966101694915254
precision_score: 0.8048780487804879
recall_score: 0.8918918918918919
f1_score: 0.8461538461538461
---
[[14  8]
 [ 4 33]]
---
Cross-val scores:




[0.77333333 0.84210526 0.87671233 0.78378378 0.8       ]
0.8151869418084271 - 0.038674268816592285


