# Esercitazione 4: classificazione cani e gatti

## Caricamento librerie

In [2]:
%matplotlib inline
import time
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
from joblib import Memory
from skimage import feature, color, transform
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import ml_utilities
import ml_visualization

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Caricamento dataset

In [45]:
db_path = 'DBs/CaniGatti_ML18'
exp_path = 'Experiments'
train_filelist = 'BinaryTrainingSet.txt'  

# Predisposizione di un'area di caching su disco che velocizza la riesecuzione di chiamate di funzioni con gli stessi parametri
memory = Memory(exp_path, verbose=0)  

# Caricamento delle immagini
print('Caricamento in corso ...')
start = time.time()
train_raw_x, train_y = ml_utilities.load_labeled_dataset(train_filelist, db_path, cache=memory)

print('Caricate %d immagini in %.2f s.' % (len(train_raw_x), time.time() - start))
print('Gatti:', np.count_nonzero(train_y == 0))
print('Cani:', np.count_nonzero(train_y == 1))

# Shuffle del training set
ml_utilities.shuffle_in_unison([train_raw_x, train_y], seed=1234)


Caricamento in corso ...
Caricate 900 immagini in 0.18 s.
Gatti: 509
Cani: 391


## Resizing immagini

In [46]:
image_side = 128
train_raw_x = ml_utilities.resize_images(train_raw_x, image_side, image_side, cache=memory)

## Estrazione delle feature HOG

In [5]:
def HogRider(orientation,side):
     train_feature_x = ml_utilities.extract_hog(train_raw_x, 
                                           convert_to_gray=True, orientations=orientation,
                                           pixels_per_cell=(side,side), cells_per_block=(1, 1),
                                           cache=memory)
     return train_feature_x

In [6]:
def bestHogRider(model,grid,min_i=4,max_i=10,min_j=1,max_j=2):
    best_score = 0
    best_params = None
    for i in range(min_i, max_i):
        print(i)
        for j in range(min_j, max_j):
            print("--", j)
            train_feature_x = HogRider(i, j)
            #faccio grid search
            gs = GridSearchCV(model, grid, cv=4, n_jobs=-1)
            gs.fit(train_feature_x, train_y)
            score = gs.best_score_
            
            if score > best_score:
               best_score = score
               best_params = (i, j)
    return best_params

## GridSearch sui classificatori

### SVM

In [7]:
max = 0


In [None]:
#SVM
from sklearn.preprocessing import RobustScaler
svm_model = Pipeline([
    ("scaler", RobustScaler()),
    ("svm", SVC(random_state=1234))
])

svm_grid = [
    #{
    #    'svm__kernel': ['linear'],
     #   'svm__C': np.linspace(1, 5, 10) #iperparametro di regolarizzazione
    #},
    {
        'svm__C': [2.4434343434343435], 
        'svm__gamma': [0.0008121212121212122], #iperparametro del kernel rbf
        'svm__kernel': ['poly'],
        'svm__degree': [2]}

]
train_feature_x = HogRider(9,6)
svm_gs = GridSearchCV(svm_model, svm_grid, cv=4, n_jobs=-1)
svm_gs.fit(train_feature_x, train_y)

print('Parametri scelti:', svm_gs.best_params_)
print('Score:', svm_gs.best_score_)
if(svm_gs.best_score_ > max):
    max = svm_gs.best_score_
    print('Migliorato')

Parametri scelti: {'svm__C': 2.4434343434343435, 'svm__degree': 2, 'svm__gamma': 0.0008121212121212122, 'svm__kernel': 'poly'}
Score: 0.7688888888888888


In [49]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Supponiamo che 'HogRider' sia una funzione che genera le caratteristiche
# La funzione è chiamata per ottenere le caratteristiche di addestramento
train_feature_x = HogRider(9, 6)  # Genera le caratteristiche di addestramento

# Definizione del GridSearch per ogni SVM
svm_grid_rbf = {
    'C': [2.4434343434343435],
    'gamma': [0.0008121212121212122],
    'kernel': ['rbf']
}

svm_grid_poly = {
    'C': [2.4434343434343435],
    'kernel': ['poly'],
    'degree': [2]
}

svm_grid_linear = {
    'C': [1.0],
    'kernel': ['linear']
}

# Ottimizzazione dei parametri degli SVM con GridSearchCV
svm_rbf = GridSearchCV(SVC(probability=True), svm_grid_rbf, cv=4, n_jobs=-1)
svm_poly = GridSearchCV(SVC(probability=True), svm_grid_poly, cv=4, n_jobs=-1)
svm_linear = GridSearchCV(SVC(probability=True), svm_grid_linear, cv=4, n_jobs=-1)

# Addestramento dei modelli base
svm_rbf.fit(train_feature_x, train_y)
svm_poly.fit(train_feature_x, train_y)
svm_linear.fit(train_feature_x, train_y)

# Estimatori base ottimizzati con GridSearchCV
base_estimators = [
    ('svm_rbf', svm_rbf.best_estimator_),
    ('svm_poly', svm_poly.best_estimator_),
    ('svm_linear', svm_linear.best_estimator_)
]

# Meta-model
meta_model = LogisticRegression()

# Stacking
stacking = StackingClassifier(estimators=base_estimators, final_estimator=meta_model, cv=4, n_jobs=-1)

print(cross_val_score(stacking, train_feature_x, train_y, cv=4, n_jobs=-1).mean())

0.7277777777777777


In [42]:
print(bestHogRider(svm_model,svm_grid, 4,12,4,8))

4
-- 4
-- 5
-- 6
-- 7
5
-- 4
-- 5
-- 6
-- 7
6
-- 4
-- 5
-- 6
-- 7
7
-- 4
-- 5
-- 6
-- 7
8
-- 4
-- 5
-- 6
-- 7
9
-- 4
-- 5
-- 6
-- 7
10
-- 4
-- 5
-- 6
-- 7
11
-- 4
-- 5
-- 6
-- 7
(9, 6)


In [93]:
len(train_feature_x)
train_feature_x[0].shape

(2304,)

### Random Forest

In [44]:
#Random forest

rfc_model = Pipeline([
    ("rfc", RandomForestClassifier(random_state=1234))
])

rfc_grid = {
    "rfc__max_depth": [10],
    "rfc__max_samples": np.linspace(0.1, 1, 10),
    "rfc__max_features": ["sqrt"],
    'rfc__min_samples_split': [2],
    "rfc__n_estimators": [1200],
    "rfc__min_samples_leaf": [2],
    'rfc__bootstrap': [True],
}

rfc_gs = GridSearchCV(rfc_model, rfc_grid, cv=4, n_jobs=-1)
rfc_gs.fit(train_feature_x, train_y)

print('Parametri scelti:', rfc_gs.best_params_)
print('Score:', rfc_gs.best_score_)

Parametri scelti: {'rfc__bootstrap': True, 'rfc__max_depth': 10, 'rfc__max_features': 'sqrt', 'rfc__max_samples': np.float64(0.6), 'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 2, 'rfc__n_estimators': 1200}
Score: 0.7422222222222222


In [None]:
min_samples_split': [2],
    "rfc__n_estimators": np.linspace(755-10, 755+00, 10, dtype=int)
}
train_feature_x = HogRider(9,8)

rfc_gs = GridSearchCV(rfc_model, rfc_grid, cv=4, n_jobs=-1)
rfc_gs.fit(train_feature_x, train_y)

print('Parametri scelti:', rfc_gs.best_params_)
print('Score:', rfc_gs.best_score_)

In [30]:
from sklearn.tree import DecisionTreeClassifier
decision_model =  DecisionTreeClassifier(random_state=1234)

decision_grid = {
     "max_depth": [7],
     "min_samples_split": [5]

     }

decision_gs = GridSearchCV(decision_model, decision_grid, cv=4, n_jobs=-1)
decision_gs.fit(train_feature_x, train_y)

print('Parametri scelti:', decision_gs.best_params_)
print('Score:', decision_gs.best_score_)

Parametri scelti: {'max_depth': 7, 'min_samples_split': 5}
Score: 0.5888888888888889


### AdaBoost

In [35]:
#Adaboost

ada_model = Pipeline([
    ("scaler", StandardScaler()),
    ("ada", AdaBoostClassifier(random_state=1234))
])

ada_grid = {
    'ada__estimator': [decision_model],
    "ada__n_estimators": range(1, 10, 1),
    "ada__learning_rate": np.logspace(-4,-1,4)
}

ada_gs = GridSearchCV(ada_model, ada_grid, cv=4, n_jobs=-1)
ada_gs.fit(train_feature_x, train_y)

print('Parametri scelti:', ada_gs.best_params_)
print('Score:', ada_gs.best_score_)



Parametri scelti: {'ada__estimator': DecisionTreeClassifier(random_state=1234), 'ada__learning_rate': np.float64(0.0001), 'ada__n_estimators': 1}
Score: 0.56


### XGBoost

In [11]:
#%pip install xgboost

In [12]:
from xgboost import XGBClassifier

In [13]:
#XGBoost

xgb_model = Pipeline([
    ("scaler", StandardScaler()),
    ("xgb", XGBClassifier(objective='binary:logistic'))
])

xgb_grid = {
    "xgb__max_depth": [2, 4, 6, 8, 10],
    "xgb__n_estimators": [100, 200, 300],
    "xgb__alpha": [0.001, 0.01, 0.1, 1]
}

xgb_gs = GridSearchCV(xgb_model, xgb_grid, cv=4, n_jobs=-1)
xgb_gs.fit(train_feature_x, train_y)

print('Parametri scelti:', xgb_gs.best_params_)
print('Score:', xgb_gs.best_score_)

Parametri scelti: {'xgb__alpha': 0.01, 'xgb__max_depth': 2, 'xgb__n_estimators': 300}
Score: 0.75


### Regressione logistica

In [143]:
from sklearn.linear_model import LogisticRegression

In [150]:
log_model = Pipeline([
    ("scaler", StandardScaler()),
    ("linreg", LogisticRegression(random_state=1234))
])

log_grid = [
    
    {
        "linreg__penalty": ['elasticnet'],
        "linreg__solver": ['saga'],
        "linreg__C": [0.8416666666666666],
        "linreg__l1_ratio": np.linspace(0.18888888888888886-0.1, 0.18888888888888886+0.1, 10)
    }
]

log_gs = GridSearchCV(log_model, log_grid, cv=4, n_jobs=-1)
log_gs.fit(train_feature_x, train_y)

print('Parametri scelti:', log_gs.best_params_)
print('Score:', log_gs.best_score_)

Parametri scelti: {'linreg__C': 0.8416666666666666, 'linreg__l1_ratio': np.float64(0.18888888888888886), 'linreg__penalty': 'elasticnet', 'linreg__solver': 'saga'}
Score: 0.731111111111111




## Test

In [None]:
# Esercizio 2: valutazione su test set

# Path contenente i pattern di test
image_side = 128
db_path = 'DBs/CaniGatti_ML18'
exp_path = 'Experiments'
train_filelist = 'BinaryTrainingSet.txt' 
test_filelist = 'Unlabeled_BinaryTestSet.txt'
result_path = 'Es4Predictions.txt'
memory = Memory(exp_path, verbose=0) 

# Caricamento dei pattern di training
train_raw_x, train_y = ml_utilities.load_labeled_dataset(train_filelist, db_path, cache=memory)
train_raw_x = ml_utilities.resize_images(train_raw_x, image_side, image_side, cache=memory)

# Preprocessing ed estrazione HOG (training)
train_feature_x = ml_utilities.extract_hog(train_raw_x, 
                                           convert_to_gray=True, orientations=9,
                                           pixels_per_cell=(8, 8), cells_per_block=(1, 1),
                                           cache=memory)

# Creazione del classificatore
clf = ... #TODO: trovare il migliore classificatore con i migliori iperparametri

# Addestramento del classificatore
clf.fit(train_feature_x, train_y)

# Caricamento dei pattern di test
test_raw_x = ml_utilities.load_unlabeled_dataset(test_filelist, db_path, cache=memory)
test_raw_x = ml_utilities.resize_images(test_raw_x, image_side, image_side, cache=memory)

# Preprocessing ed estrazione HOG (test)
test_feature_x = ml_utilities.extract_hog(test_raw_x, 
                                           convert_to_gray=True, orientations=9,
                                           pixels_per_cell=(8, 8), cells_per_block=(1, 1),
                                           cache=memory)

# Salvataggio delle predictions
predictions = clf.predict(test_feature_x)

with open(result_path, "w") as f:
    for prediction in predictions:
        f.write(str(int(prediction)) + '\n')
print('Ok')