In [285]:
# Setup and library import check
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Random forest

XGBoost

In [286]:
# Load the cleaned mushroom dataset from a path relative to the working directory.
mushrooms = pd.read_csv("mushrooms_cleaned.csv")

# Separate the feature columns from the "is-edible" target column.
X = mushrooms.drop(columns=["is-edible"])
y = mushrooms["is-edible"]

# Print column names, non-null counts and dtypes as a quick sanity check.
mushrooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18356 entries, 0 to 18355
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   is-edible             18356 non-null  bool   
 1   cap-diameter          18356 non-null  float64
 2   cap-shape             18356 non-null  int64  
 3   cap-surface           18356 non-null  int64  
 4   cap-color             18356 non-null  int64  
 5   does-bruise-or-bleed  18356 non-null  bool   
 6   gill-attachment       18356 non-null  int64  
 7   gill-spacing          18356 non-null  int64  
 8   gill-color            18356 non-null  int64  
 9   stem-height           18356 non-null  float64
 10  stem-width            18356 non-null  float64
 11  stem-root             18356 non-null  int64  
 12  stem-surface          18356 non-null  int64  
 13  stem-color            18356 non-null  int64  
 14  veil-color            18356 non-null  int64  
 15  has-ring           

In [287]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out one third of the rows for validation; the fixed seed keeps the
# split identical across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=1/3, random_state=42)

In [288]:
def print_eval(X, y, model):
    """Print the accuracy and the classification report of ``model`` on ``(X, y)``.

    Args:
        X: Feature matrix passed to ``model.predict``.
        y: True labels the predictions are compared against.
        model: Fitted estimator exposing ``predict``.
    """
    predictions = model.predict(X)
    score = accuracy_score(y, predictions)
    # ".5" formats the score with five significant digits.
    print(f"Accuracy/Score: {score:.5}")
    print("Classification report")
    print(classification_report(y, predictions))

In [289]:
def print_coefficients(classifier, feature_names=None):
    """Print each feature name next to its coefficient in a fitted linear classifier.

    Args:
        classifier: Fitted estimator exposing ``coef_``; only the first row
            (``coef_[0]``) is printed.
        feature_names: Optional sequence of names, one per coefficient. When
            omitted, the columns of the notebook-level ``X`` are used, which
            is the original behaviour.
    """
    if feature_names is None:
        # Fall back to the notebook-global feature frame for existing callers.
        feature_names = X.columns
    weights = classifier.coef_[0]
    print("Coefficients")
    for i, name in enumerate(feature_names):
        print(f"{name}: {weights[i]}")

In [290]:
from sklearn.linear_model import LogisticRegression

# Baseline: standardized features fed into a logistic regression.
# The "saga" solver shuffles the data while fitting, so the seed is pinned
# to keep the validation scores reproducible across kernel restarts.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(solver="saga", random_state=42))
])
model.fit(X_train, y_train)

# print_coefficients(model.named_steps["lr"])
print_eval(X_val, y_val, model)

Accuracy/Score: 0.80585
Classification report
              precision    recall  f1-score   support

       False       0.83      0.88      0.85      3933
        True       0.75      0.68      0.71      2186

    accuracy                           0.81      6119
   macro avg       0.79      0.78      0.78      6119
weighted avg       0.80      0.81      0.80      6119



In [291]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth capped at 5) on standardized features.
# The tree permutes features at every split, so equally good splits can be
# chosen differently between runs; the seed makes the fitted tree repeatable.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("dtc", DecisionTreeClassifier(max_depth=5, random_state=42))
])
model.fit(X_train, y_train)

print_eval(X_val, y_val, model)

Accuracy/Score: 0.94296
Classification report
              precision    recall  f1-score   support

       False       0.97      0.94      0.95      3933
        True       0.89      0.95      0.92      2186

    accuracy                           0.94      6119
   macro avg       0.93      0.95      0.94      6119
weighted avg       0.94      0.94      0.94      6119



In [292]:
from sklearn.linear_model import RidgeClassifier
# from sklearn.linear_model import RidgeClassifierCV

# Ridge classifier (alpha=0.5) on standardized features, fitted on the
# training split and scored on the validation split.
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("rc", RidgeClassifier(alpha=0.5)),
]).fit(X_train, y_train)

# print_coefficients(model.named_steps["rc"])
print_eval(X_val, y_val, model)

Accuracy/Score: 0.80324
Classification report
              precision    recall  f1-score   support

       False       0.83      0.87      0.85      3933
        True       0.75      0.68      0.71      2186

    accuracy                           0.80      6119
   macro avg       0.79      0.78      0.78      6119
weighted avg       0.80      0.80      0.80      6119



In [293]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold

# Cross-validation splitters: 3 shuffled folds with a fixed seed. The
# stratified variant is the one passed to the grid searches in this notebook.
kf = KFold(3, shuffle=True, random_state=42)
skf = StratifiedKFold(3, shuffle=True, random_state=42)

# The scaler step is a placeholder that the grid below fills in. The "saga"
# solver is stochastic, so it is seeded like the splitters above to keep
# cross-validation scores reproducible.
model = Pipeline([
    ("scaler", None),
    ("lr", LogisticRegression(solver="saga", random_state=42))
])

# One sub-grid per penalty family, so that C and l1_ratio are only varied
# together with the penalties listed alongside them.
grid = [
    {
        # No regularization: only the scaler choice varies.
        "scaler": [None, StandardScaler(), MinMaxScaler()],
        "lr__penalty": [None]
    },
    {
        # L2 / L1 penalties with C from 1e-2 to 1e2 (five log-spaced values).
        "scaler": [None, StandardScaler(), MinMaxScaler()],
        "lr__penalty": ["l2", "l1"],
        "lr__C": np.logspace(-2, 2, 5)
    },
    {
        # Elastic net: same C range, plus the L1/L2 mixing ratio.
        "scaler": [None, StandardScaler(), MinMaxScaler()],
        "lr__penalty": ["elasticnet"],
        "lr__C": np.logspace(-2, 2, 5),
        "lr__l1_ratio": [0.2, 0.5]
    }
]
# Search currently disabled.
# NOTE(review): as written it fits on the full (X, y), which includes the
# validation rows; consider X_train / y_train before re-enabling.
# gs = GridSearchCV(model, grid, cv=skf)
# gs.fit(X, y)
# pd.DataFrame(gs.cv_results_).sort_values("rank_test_score").head(5)

Note: setting the batch size helps a lot.

In [294]:
from sklearn.neural_network import MLPClassifier

# MLP on standardized features; mini-batches of 50 samples, seeded for
# reproducible weight initialization and shuffling.
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("mlp", MLPClassifier(batch_size=50, activation="relu", random_state=42)),
])

# Candidate hidden-layer layouts to compare.
grid = {"mlp__hidden_layer_sizes": [6, (6, 4), (20, 20, 20)]}

# Stratified 3-fold search over the layouts, fitted on the training split only.
gs = GridSearchCV(estimator=model, param_grid=grid, cv=skf)
gs.fit(X_train, y_train)

# Show the candidates ordered by rank (best first).
pd.DataFrame(gs.cv_results_).sort_values(by="rank_test_score").head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,3.66001,0.311435,0.006516,0.002934,6,{'mlp__hidden_layer_sizes': 6},1.0,1.0,1.0,1.0,0.0,1
2,1.908324,0.062062,0.006004,0.000329,"(20, 20, 20)","{'mlp__hidden_layer_sizes': (20, 20, 20)}",0.999755,1.0,1.0,0.999918,0.000116,2
1,3.094453,0.658354,0.004618,0.00038,"(6, 4)","{'mlp__hidden_layer_sizes': (6, 4)}",0.99951,0.999755,1.0,0.999755,0.0002,3
