### 1) Pre-processing of the data.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Load & Pre-process
df = pd.read_csv("JGM.csv")
df = df.drop(columns=["team_nr", "tijd"])
df["gehaald"] = df["gehaald"].map({1: 0, 2: 1})

X = df.drop("gehaald", axis=1)
y = df["gehaald"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### 2) Baseline model.

In [3]:
# Train
model = SVC(random_state=42)

pipe_base = Pipeline([("model", model)])
pipe_base.fit(X_train, y_train)
y_pred = pipe_base.predict(X_test)

In [4]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_base, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.6271186440677966
precision_score: 0.6271186440677966
recall_score: 1.0
f1_score: 0.7708333333333334
---
[[ 0 22]
 [ 0 37]]
---
Cross-val scores:
[0.77083333 0.76595745 0.77894737 0.76595745 0.76086957]
0.7685130321177598 - 0.006094978106998167


### 3) MinMax-Scaler model.

In [5]:
from sklearn.preprocessing import MinMaxScaler

minMaxScaler = MinMaxScaler()

# Train
pipe_minmax = Pipeline([("scaler", minMaxScaler), ("model", model)])
pipe_minmax.fit(X_train, y_train)
y_pred = pipe_minmax.predict(X_test)

In [6]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_minmax, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.7796610169491526
precision_score: 0.7857142857142857
recall_score: 0.8918918918918919
f1_score: 0.8354430379746836
---
[[13  9]
 [ 4 33]]
---
Cross-val scores:
[0.79487179 0.86075949 0.80519481 0.75949367 0.79452055]
0.8029680625137535 - 0.03277725027414578


### 4) Standardized-Scaler model.

In [7]:
from sklearn.preprocessing import StandardScaler

stdScaler = StandardScaler()

# Train
pipe_standard = Pipeline([("scaler", stdScaler), ("model", model)])
pipe_standard.fit(X_train, y_train)
y_pred = pipe_standard.predict(X_test)

In [8]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_standard, X, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.7966101694915254
precision_score: 0.8048780487804879
recall_score: 0.8918918918918919
f1_score: 0.8461538461538461
---
[[14  8]
 [ 4 33]]
---
Cross-val scores:
[0.82051282 0.85       0.80519481 0.75949367 0.82666667]
0.8123735926520738 - 0.03011347419412958


### 5) Standardized model - Feature Selection (SelectKBest)

In [9]:
from sklearn.feature_selection import SelectKBest, chi2

# Preprocessing
X_feature = SelectKBest(chi2).fit_transform(X, y)
X_train_feature, X_test_feature, y_train_feature, y_test_feature = train_test_split(X_feature, y, test_size=0.2, random_state=42, stratify=y)

# Train
pipe_feature = Pipeline([("scaler", stdScaler), ("model", model)])
pipe_feature.fit(X_train_feature, y_train_feature)
y_pred = pipe_feature.predict(X_test_feature)

In [11]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test_feature, y_pred))
print("precision_score:", precision_score(y_test_feature, y_pred))
print("recall_score:", recall_score(y_test_feature, y_pred))
print("f1_score:", f1_score(y_test_feature, y_pred))
print("---")
print(confusion_matrix(y_test_feature, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe_feature, X_feature, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.847457627118644
precision_score: 0.8181818181818182
recall_score: 0.972972972972973
f1_score: 0.8888888888888888
---
[[14  8]
 [ 1 36]]
---
Cross-val scores:
[0.8        0.86842105 0.86842105 0.80519481 0.86111111]
0.8406296043138148 - 0.031210930954354205


### 6) Standardized, feature selected, model - Hyperparameter optimalization

In [12]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Grid Search
param_grid = {
    "model__C": np.arange(0.1, 2.0, 0.1),
    "model__kernel": ["linear", "poly", "rbf", "sigmoid"],
    "model__gamma": ["scale", "auto"]
}

gridsearch = GridSearchCV(pipe_feature, param_grid, n_jobs=-1, scoring="f1", verbose=5)
gridsearch.fit(X_train_feature, y_train_feature)

print(gridsearch.best_params_)
print(gridsearch.best_score_)

best_model = gridsearch.best_estimator_

y_pred = best_model.predict(X_test_feature)

Fitting 5 folds for each of 152 candidates, totalling 760 fits
{'model__C': np.float64(1.8000000000000003), 'model__gamma': 'scale', 'model__kernel': 'rbf'}
0.8323934525832712


In [13]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(best_model, X_feature, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.8135593220338984
precision_score: 0.8095238095238095
recall_score: 0.918918918918919
f1_score: 0.8607594936708861
---
[[14  8]
 [ 3 34]]
---
Cross-val scores:
[0.83116883 0.88       0.85714286 0.8        0.85714286]
0.8450909090909091 - 0.02733553682701663
