In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2

# Load & Pre-process
df = pd.read_csv("JGM.csv")
df = df.drop(columns=["team_nr", "tijd"])
df["gehaald"] = df["gehaald"].map({1: 0, 2: 1})

X = df.drop("gehaald", axis=1)
y = df["gehaald"]

# Preprocessing
X_feature = SelectKBest(chi2).fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_feature, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# Train
RFC_model = RandomForestClassifier(random_state=42, max_depth=2, min_samples_leaf=1, min_samples_split=2, n_estimators=210)
SVC_model = SVC(random_state=42, C=1.8)
MLP_model = MLPClassifier(random_state=42, activation="identity", hidden_layer_sizes=(50,), max_iter=100)
LR_model = LogisticRegression(random_state=42, C=0.2, max_iter=25, penalty="l2")

scaler = StandardScaler()

voting_model = VotingClassifier(
    estimators=[
        ("RFC", RFC_model),
        ("SVC", Pipeline([("scaler", scaler), ("SVC", SVC_model)])),
        ("MLP", MLP_model),
        ("LR", Pipeline([("scaler", scaler), ("LR", LR_model)]))
        ]
    )

voting_model.fit(X_train, y_train)

y_pred = voting_model.predict(X_test)



In [8]:
# Eval

print("Test scores:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("precision_score:", precision_score(y_test, y_pred))
print("recall_score:", recall_score(y_test, y_pred))
print("f1_score:", f1_score(y_test, y_pred))
print("---")
print(confusion_matrix(y_test, y_pred))
print("---")
print("Cross-val scores:")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(voting_model, X_feature, y, cv=cv, scoring="f1")

print(cv_scores)
print(f"{cv_scores.mean()} - {cv_scores.std()}")

Test scores:
Accuracy: 0.8135593220338984
precision_score: 0.825
recall_score: 0.8918918918918919
f1_score: 0.8571428571428571
---
[[15  7]
 [ 4 33]]
---
Cross-val scores:




[0.80519481 0.87671233 0.89189189 0.78378378 0.84507042]
0.8405306464345632 - 0.0410768288338184


