In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics as mtr

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Classificazione rank
Il rank è un valore tra 0 e 20 calcolato nello script ranks.ipynb.
In questo notebook i nomi delle squadre possono essere gestiti in due modi:
- con variabili dummy (one-hot encoding)
- con codici numerici associati a ciascuna squadra

In [2]:
# Read diffs_dataset.csv (first CSV column becomes the index) and discard the
# columns that are not used as model inputs, in a single chained expression.
#
# Alternative source kept for reference (raw matches file):
#   dataset = pd.read_csv('../SerieA/Season21_22/matches.csv', index_col=0)
#   dataset = dataset.drop(columns=['date', 'matchday', 'h_goals', 'a_goals'])
dataset = (
    pd.read_csv('../SerieA/Season21_22/diffs_dataset.csv', index_col=0)
    .drop(columns=['date', 'matchday', 'goals'])
)


## Codici associati

In [3]:
# Encode both team columns against ONE shared category list so that a given
# team always maps to the same integer, whether it appears as the home or the
# away side. Encoding each column on its own only lines up when both columns
# happen to contain exactly the same set of teams.
# The shared categories are the sorted unique non-null names found in either
# column, so when the two sets already coincide the codes are unchanged.
team_categories = (
    pd.concat([dataset["h_team"], dataset["a_team"]])
    .astype("category")
    .cat.categories
)
team_dtype = pd.CategoricalDtype(categories=team_categories)

# Missing team names (if any) are encoded as -1, as with plain category codes.
dataset["home_code"] = dataset["h_team"].astype(team_dtype).cat.codes
dataset["away_code"] = dataset["a_team"].astype(team_dtype).cat.codes

# The raw name columns are no longer needed once the numeric codes exist.
dataset = dataset.drop(columns=['h_team', 'a_team'])

## Parte comune (prosegue sia dal metodo Dummy sia da quello dei Codici associati)

In [4]:
# Candidate classifiers, keyed by the display name used in the results table.
# The tree-based estimators draw random numbers while fitting, so they are
# seeded (same seed as the train/test split) to make the reported metrics
# reproducible across kernel restarts.
models = {
    # Default max_iter is 100; it had to be raised, otherwise the solver
    # does not converge on this data.
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Support Vector Machine': SVC(),
    # Disabled. NOTE(review): presumably because MultinomialNB rejects negative
    # feature values -- confirm before re-enabling.
    #'Multinomial Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [5]:
# Every column except the target 'result' is used as a predictor.
features = [column for column in dataset.columns if column != 'result']
X = dataset[features]
y = dataset['result'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Fit every candidate classifier in place on the training split.
for model in models.values():
    model.fit(X_train, y_train)

# Collect each fitted model's predictions on the held-out split, keyed by the
# same display name used in `models`.
predictions = {
    model_name: model.predict(X_test)
    for model_name, model in models.items()
}

In [7]:
def _metrics_row(model_name, y_hat):
    """Build one summary-table row for a model from its test-set predictions.

    Runs ``classification_report`` against ``y_test`` (with ``zero_division=0``)
    and picks out the overall accuracy plus the macro and weighted averages of
    precision, recall and F1-score.
    """
    scores = mtr.classification_report(
        y_test, y_hat, output_dict=True, zero_division=0
    )
    macro = scores['macro avg']
    weighted = scores['weighted avg']
    return {
        'Model': model_name,
        'Accuracy': scores['accuracy'],
        'Avg Precision (macro)': macro['precision'],
        'Avg Recall (macro)': macro['recall'],
        'Avg F1-score (macro)': macro['f1-score'],
        'Avg Precision (weighted)': weighted['precision'],
        'Avg Recall (weighted)': weighted['recall'],
        'Avg F1-score (weighted)': weighted['f1-score'],
    }


# One row per model, indexed by model name.
E = pd.DataFrame(
    [_metrics_row(name, y_hat) for name, y_hat in predictions.items()]
).set_index('Model')

In [8]:
# Display the per-model metrics table (one row per classifier).
E

Unnamed: 0_level_0,Accuracy,Avg Precision (macro),Avg Recall (macro),Avg F1-score (macro),Avg Precision (weighted),Avg Recall (weighted),Avg F1-score (weighted)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.486842,0.426942,0.435606,0.42169,0.474016,0.486842,0.471169
Support Vector Machine,0.513158,0.343497,0.432099,0.381241,0.413038,0.513158,0.455947
Decision Tree,0.381579,0.381419,0.365039,0.362297,0.424056,0.381579,0.389925
Random Forest,0.473684,0.44283,0.433993,0.426418,0.475326,0.473684,0.463176
K-Nearest Neighbors,0.434211,0.43117,0.443883,0.432851,0.442559,0.434211,0.434926
