In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics as mtr

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the per-match table and discard the columns that are not used as
# model inputs. The first CSV column becomes the index in both frames.
dataset = (
    pd.read_csv('../SerieA/Season21_22/diffs_dataset.csv', index_col=0)
    .drop(columns=['date', 'matchday', 'goals'])
)

# League table; later cells read its 'Team' and 'Rank' columns.
rank = pd.read_csv('../SerieA/Season21_22/rank.csv', index_col=0)

In [3]:
# Attach the league rank of the home and away team to every match row.
#
# A single Team -> Rank lookup Series is built once and applied with
# Series.map, instead of filtering `rank` twice per row inside iterrows().
# drop_duplicates keeps the first row per team, matching the previous
# "take the first matching row" behaviour.
rank_by_team = rank.drop_duplicates(subset='Team').set_index('Team')['Rank']

for team_col, rank_col in (('h_team', 'rank_h'), ('a_team', 'rank_a')):
    # Fail loudly, naming the offending teams, rather than surfacing an opaque
    # positional-indexing error when a team is absent from the rank table.
    unknown = ~dataset[team_col].isin(rank_by_team.index)
    if unknown.any():
        missing_teams = sorted(map(str, dataset.loc[unknown, team_col].unique()))
        raise KeyError(
            f"No rank found for team(s) in '{team_col}': {missing_teams}"
        )
    # The new column keeps the dtype of rank['Rank'].
    dataset[rank_col] = dataset[team_col].map(rank_by_team)

In [4]:
# Encode team names as integer codes so the classifiers can consume them.
#
# Both columns share one category list (the sorted union of every team name
# seen in either column), so a given team always receives the same code
# whether it plays at home or away. Encoding each column independently only
# agrees when both columns happen to contain exactly the same set of teams.
# Missing team names, if any, are encoded as -1 in both columns.
all_teams = pd.concat([dataset["h_team"], dataset["a_team"]]).dropna().unique()
team_dtype = pd.CategoricalDtype(categories=sorted(all_teams))

dataset["home_code"] = dataset["h_team"].astype(team_dtype).cat.codes
dataset["away_code"] = dataset["a_team"].astype(team_dtype).cat.codes

# The raw name columns are no longer needed once the codes exist.
dataset = dataset.drop(columns=['h_team', 'a_team'])

In [5]:
# Candidate classifiers, keyed by the display name used in the results table.
# The tree-based estimators are randomised, so they get a fixed random_state
# (the same seed value as the train/test split) to make a Restart & Run All
# reproduce the same metrics.
models = {
    # max_iter defaults to 100; it was raised because the solver did not
    # converge otherwise.
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Support Vector Machine': SVC(),
    # Multinomial Naive Bayes is left out: it does not work with negative
    # feature values.
    #'Multinomial Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [6]:
# Every column except the target 'result' is used as a predictor.
features = dataset.columns[dataset.columns != 'result'].tolist()
X = dataset[features]
y = dataset['result'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Phase 1: train every candidate on the training split (fit mutates the
# estimator objects stored in `models`).
for model_name, model in models.items():
    model.fit(X_train, y_train)

# Phase 2: collect each fitted model's predictions on the held-out split,
# keyed by the same display name.
predictions = {
    fitted_name: fitted_model.predict(X_test)
    for fitted_name, fitted_model in models.items()
}

In [8]:
# Build one summary row per model from scikit-learn's classification report.
# zero_division=0 scores undefined precision/recall as 0 rather than warning.
rows = []
for estimator, y_pred in predictions.items():
    report = mtr.classification_report(
        y_test, y_pred, output_dict=True, zero_division=0
    )
    macro = report['macro avg']
    weighted = report['weighted avg']

    row = {'Model': estimator, 'Accuracy': report['accuracy']}
    row['Avg Precision (macro)'] = macro['precision']
    row['Avg Recall (macro)'] = macro['recall']
    row['Avg F1-score (macro)'] = macro['f1-score']
    row['Avg Precision (weighted)'] = weighted['precision']
    row['Avg Recall (weighted)'] = weighted['recall']
    row['Avg F1-score (weighted)'] = weighted['f1-score']
    rows.append(row)

# Final table: one row per model, indexed by model name.
E = pd.DataFrame(rows).set_index('Model')

In [9]:
# Display the per-model metrics table (one row per classifier, indexed by 'Model').
E

Unnamed: 0_level_0,Accuracy,Avg Precision (macro),Avg Recall (macro),Avg F1-score (macro),Avg Precision (weighted),Avg Recall (weighted),Avg F1-score (weighted)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.539474,0.472768,0.47601,0.466391,0.536766,0.539474,0.529989
Support Vector Machine,0.513158,0.343497,0.432099,0.381241,0.413038,0.513158,0.455947
Decision Tree,0.355263,0.341872,0.335859,0.337262,0.369782,0.355263,0.360601
Random Forest,0.473684,0.412023,0.416526,0.411508,0.465941,0.473684,0.467246
K-Nearest Neighbors,0.434211,0.423008,0.424663,0.423671,0.435639,0.434211,0.434797
