In [2]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

df = pd.read_csv("JGM.csv")
df = df.drop(columns=["team_nr", "tijd"])
df["gehaald"] = df['gehaald'].map({1: 0, 2: 1})

X = df.drop("gehaald", axis=1)
y = df["gehaald"]

# Standardize the dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# List of classifiers to test
classifiers = [
    LogisticRegression(max_iter=200),
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    RidgeClassifier(),
    LinearDiscriminantAnalysis(),
    MLPClassifier(max_iter=500)
]

# Function to evaluate classifiers
def evaluate_classifiers(classifiers, X_train, X_test, y_train, y_test):
    results = []

    for clf in classifiers:

        clf_name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        
        # Accuracy on test data
        y_pred = clf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # Cross-validation score
        cv_score = cross_val_score(clf, X_train, y_train, cv=5, scoring="f1").mean()
        
        results.append({
            'Classifier': clf_name,
            'Test - Accuracy': accuracy,
            "Test - F1": f1,
            'Cross-validation F1': cv_score
        })
    
    # Create DataFrame to display results
    results_df = pd.DataFrame(results)
    return results_df

# Run the evaluation
results_df = evaluate_classifiers(classifiers, X_train, X_test, y_train, y_test)
results_df_scaled = evaluate_classifiers(classifiers, X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled)

# Print the results
print("\nDefault results:")
print(results_df.sort_values(by='Cross-validation F1', ascending=False))

print("\nScaled results:")
print(results_df_scaled.sort_values(by='Cross-validation F1', ascending=False))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Default results:
                    Classifier  Test - Accuracy  Test - F1  \
10               MLPClassifier         0.728814   0.794872   
7                   GaussianNB         0.661017   0.729730   
4       RandomForestClassifier         0.677966   0.765432   
2                          SVC         0.610169   0.757895   
0           LogisticRegression         0.661017   0.743590   
5   GradientBoostingClassifier         0.779661   0.839506   
6           AdaBoostClassifier         0.644068   0.734177   
3       DecisionTreeClassifier         0.661017   0.729730   
8              RidgeClassifier         0.745763   0.805195   
9   LinearDiscriminantAnalysis         0.745763   0.805195   
1         KNeighborsClassifier         0.644068   0.727273   

    Cross-validation F1  
10             0.791706  
7              0.771710  
4              0.771637  
2              0.768967  
0              0.747543  
5              0.740835  
6              0.725363  
3              0.721240  
8  