# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
#to find the best set of parameter setting, we can run a grid search
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn import tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
import tensorflow as tf

from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

RANDOM_STATE = 42

2024-12-01 15:46:19.636053: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 15:46:20.119581: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733064380.266116     875 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733064380.293933     875 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 15:46:20.595345: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
dev_data = pd.read_csv('../data/ml_datasets/dev_set.csv').sample(frac = 1, random_state=RANDOM_STATE) # shuffling the data so not to introduce bias
testing_data = pd.read_csv('../data/ml_datasets/test_set.csv')

In [3]:
dev_label = dev_data.pop('label')
test_label = testing_data.pop('label')

dev_set = dev_data
test_set = testing_data

## Decision Tree

In [6]:
#define the parameters' values you want to try
scoring_metrics = {
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

param_dist = {"max_depth": [2,3,5,6,7,10,12,None],
              "max_features": sp_randint(1, len(dev_set.iloc[0]) + 1),
              "min_samples_split": sp_randint(10, 51),
              "min_samples_leaf": sp_randint(10, 51),
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.3, 1: 0.7}]}
#define the number of iters
n_iter_search = 200
#define the model
clf = tree.DecisionTreeClassifier()
#define the grid search
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist, 
                            n_iter=n_iter_search, 
                            n_jobs=10, 
                            scoring='f1_macro')
#run the grid search
rand_search.fit(dev_set, dev_label);

In [8]:
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
124,0.6931,0.048065,0.016787,0.001898,,entropy,,8,11,32,"{'class_weight': None, 'criterion': 'entropy',...",0.495647,0.501157,0.511965,0.501459,0.505496,0.503145,0.00541,1
118,0.82217,0.075483,0.013566,0.001981,balanced,entropy,,10,20,47,"{'class_weight': 'balanced', 'criterion': 'ent...",0.50529,0.495474,0.504218,0.503114,0.500845,0.501788,0.003484,2
47,0.442645,0.017986,0.011699,0.001717,balanced,entropy,12.0,9,10,26,"{'class_weight': 'balanced', 'criterion': 'ent...",0.494188,0.4937,0.503363,0.508668,0.503272,0.500638,0.005806,3
28,0.720582,0.078936,0.011691,0.001256,balanced,gini,,10,38,23,"{'class_weight': 'balanced', 'criterion': 'gin...",0.497571,0.496518,0.504316,0.5014,0.499492,0.499859,0.002785,4
61,0.215958,0.028282,0.013872,0.002418,,entropy,,3,25,35,"{'class_weight': None, 'criterion': 'entropy',...",0.495979,0.495824,0.49906,0.502563,0.505363,0.499758,0.003729,5


## SVM

In [None]:
#define the parameters' values you want to try
param_dist = {"kernel": ['poly'],
              "C": sp_uniform(0.1, 10.0),
              "gamma": ['scale']}
#define the number of iters
n_iter_search = 1
#define the model
clf = SVC()
#define the grid search
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist, 
                            n_iter=n_iter_search, 
                            n_jobs=10, 
                            scoring='f1_macro')
#run the grid search
rand_search.fit(dev_set, dev_label);

In [None]:
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.head()

## Naive Bayes

In [4]:
#define the parameters' values you want to try
scoring_metrics = {
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

param_dist = {}
#define the number of iters
n_iter_search = 1
#define the model
clf = GaussianNB()
#define the grid search
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist, #CrossValidation per confrontabilità, non model selection
                            n_iter=n_iter_search, 
                            n_jobs=1, 
                            scoring='f1_macro')
#run the grid search
rand_search.fit(dev_set, dev_label);

In [5]:
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.040518,0.010634,0.012115,0.000611,{},0.447988,0.449436,0.449699,0.445574,0.449861,0.448512,0.001612,1


## KNN

- Rischiamo che il mapping degli attributi categorici ordinali (senza one-hot) crei problemi nel K-NN

In [5]:
param_dist = {'n_neighbors': sp_randint(10, 20),
              'algorithm': ['ball_tree', 'kd_tree', 'brute'],}

tmp_dev_set = dev_set.drop(columns=['cyclist_age_group_num', 'race_season%autumn', 'race_season%spring', 'race_season%summer', 'race_season%winter'])
#define the number of iters
n_iter_search = 10
#define the model
clf = KNeighborsClassifier()
#define the grid search
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search, 
                            n_jobs=5, 
                            scoring='f1_macro')
#run the grid search
rand_search.fit(tmp_dev_set, dev_label);

  _data = np.array(data, dtype=dtype, copy=copy,


In [6]:
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.118903,0.008053,2.06297,0.015597,ball_tree,11,"{'algorithm': 'ball_tree', 'n_neighbors': 11}",0.52424,0.523677,0.51981,0.516649,0.519594,0.520794,0.002821,1
5,0.022907,0.009683,10.405033,0.092103,brute,11,"{'algorithm': 'brute', 'n_neighbors': 11}",0.523778,0.523622,0.51981,0.51666,0.519838,0.520742,0.002678,2
4,0.161294,0.012134,1.432873,0.108844,kd_tree,13,"{'algorithm': 'kd_tree', 'n_neighbors': 13}",0.517266,0.516298,0.51304,0.51239,0.514624,0.514723,0.001857,3
0,0.195944,0.00811,1.734364,0.044677,kd_tree,15,"{'algorithm': 'kd_tree', 'n_neighbors': 15}",0.51273,0.514391,0.507889,0.510771,0.509649,0.511086,0.002281,4
7,0.02381,0.009146,10.296053,0.139886,brute,17,"{'algorithm': 'brute', 'n_neighbors': 17}",0.507981,0.508362,0.502046,0.506062,0.504549,0.5058,0.002326,5


## Rule-Based

In [19]:
param_dist = {"prune_size": [0.5, 0.6], "k": [1, 3, 5]}

#define the number of iters
n_iter_search = 10
#define the model
clf = lw.RIPPER()
#define the grid search
rand_search = GridSearchCV(estimator=clf, param_grid=param_dist, scoring='f1_macro')
#run the grid search
rand_search.fit(dev_set, dev_label);

Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples.

Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples.

Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples.

Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples.

Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples.

Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples.

Ruleset is empty

In [20]:
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_k,param_prune_size,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,36.909321,16.776181,0.212072,0.002196,3,0.5,"{'k': 3, 'prune_size': 0.5}",0.398909,0.398898,0.398998,0.398898,0.398905,0.398922,3.9e-05,1
0,35.686131,16.155444,0.215591,0.007009,1,0.5,"{'k': 1, 'prune_size': 0.5}",0.398909,0.398898,0.398898,0.398898,0.398905,0.398901,5e-06,2
1,36.182507,16.355361,0.213467,0.003026,1,0.6,"{'k': 1, 'prune_size': 0.6}",0.398909,0.398898,0.398898,0.398898,0.398905,0.398901,5e-06,2
3,35.615997,16.148655,0.210887,0.003867,3,0.6,"{'k': 3, 'prune_size': 0.6}",0.398909,0.398898,0.398898,0.398898,0.398905,0.398901,5e-06,2
4,34.403325,15.518429,0.21602,0.009103,5,0.5,"{'k': 5, 'prune_size': 0.5}",0.398909,0.398898,0.398898,0.398898,0.398905,0.398901,5e-06,2


## Random Forest

In [None]:
clf = RandomForestClassifier()

param_dist = {"max_depth": [2,3,5,6,7,10,12,None],
              "max_features": sp_randint(1, len(dev_set.iloc[0]) + 1),
              "min_samples_split": sp_randint(10, 51),
              "min_samples_leaf": sp_randint(10, 51),
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.3, 1: 0.7}],
              "n_estimators": [33, 100, 250]}

n_iter_search = 10
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist, 
                            n_iter=n_iter_search, 
                            n_jobs=10, 
                            scoring='f1_macro')
rand_search.fit(dev_set, dev_label);

In [13]:
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,18.260699,0.134818,0.149433,0.021049,balanced,gini,10,9,24,18,33,"{'class_weight': 'balanced', 'criterion': 'gin...",0.602649,0.598116,0.591557,0.589197,0.599099,0.596124,0.004985,1
14,82.551927,0.816631,0.598162,0.027558,balanced,entropy,7,8,17,38,250,"{'class_weight': 'balanced', 'criterion': 'ent...",0.585775,0.57937,0.582903,0.576987,0.581477,0.581303,0.002999,2
16,27.004289,0.225192,0.208939,0.006951,balanced,gini,6,8,23,37,100,"{'class_weight': 'balanced', 'criterion': 'gin...",0.579894,0.575508,0.573466,0.568881,0.577443,0.575038,0.003741,3
1,106.032092,0.859685,1.092117,0.046557,"{0: 0.3, 1: 0.7}",gini,12,6,22,44,250,"{'class_weight': {0: 0.3, 1: 0.7}, 'criterion'...",0.560268,0.558286,0.561903,0.557129,0.563072,0.560132,0.002201,4
19,1.888186,0.091144,0.069076,0.009992,balanced,entropy,5,1,32,32,33,"{'class_weight': 'balanced', 'criterion': 'ent...",0.564336,0.565646,0.555925,0.558594,0.555011,0.559902,0.004338,5


## XGBoost

In [8]:
clf = XGBClassifier()
param_dist = {
    "n_estimators": [25, 100, 250],
    "max_depth": [2, 3],
    "learning_rate": [1, 0.1, 0.001, 0.0001]
}
n_iter_search = 20
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist, 
                            n_iter=n_iter_search, 
                            n_jobs=10, 
                            scoring='f1_macro')
rand_search.fit(dev_set, dev_label);

  _data = np.array(data, dtype=dtype, copy=copy,


In [9]:
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,2.840749,0.038625,0.116295,0.017171,250,3,1.0,"{'n_estimators': 250, 'max_depth': 3, 'learnin...",0.599068,0.596427,0.592772,0.593344,0.588721,0.594066,0.003503,1
2,1.296019,0.027099,0.064693,0.005104,100,3,1.0,"{'n_estimators': 100, 'max_depth': 3, 'learnin...",0.57834,0.577261,0.576057,0.575633,0.568354,0.575129,0.003518,2
11,2.286848,0.043136,0.082392,0.004865,250,2,1.0,"{'n_estimators': 250, 'max_depth': 2, 'learnin...",0.561102,0.561651,0.562644,0.560468,0.55333,0.559839,0.003332,3
17,0.981538,0.0096,0.031947,0.004097,100,2,1.0,"{'n_estimators': 100, 'max_depth': 2, 'learnin...",0.539781,0.543231,0.542826,0.542249,0.537396,0.541097,0.002204,4
15,0.479458,0.011165,0.032264,0.004304,25,3,1.0,"{'n_estimators': 25, 'max_depth': 3, 'learning...",0.54319,0.536616,0.533579,0.54012,0.529593,0.53662,0.004776,5


## AdaBoost

In [16]:
clf = AdaBoostClassifier()
param_dist = {
    "n_estimators": [25, 100, 250],
    "learning_rate": [1, 0.1, 0.001, 0.0001],
    "algorithm": ['SAMME']
}
n_iter_search = 10
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist, 
                            n_iter=n_iter_search, 
                            n_jobs=10, 
                            scoring='f1_macro')
rand_search.fit(dev_set, dev_label);

In [17]:
df = pd.DataFrame(rand_search.cv_results_)
df.sort_values(by='rank_test_score', inplace=True)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_learning_rate,param_algorithm,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,24.61644,0.380028,0.342425,0.044615,250,1.0,SAMME,"{'n_estimators': 250, 'learning_rate': 1, 'alg...",0.404902,0.40644,0.405923,0.405314,0.4053,0.405576,0.000541,1
5,11.69624,0.322762,0.305693,0.016539,100,1.0,SAMME,"{'n_estimators': 100, 'learning_rate': 1, 'alg...",0.402267,0.402327,0.401861,0.405053,0.40158,0.402618,0.001248,2
0,3.414718,0.139261,0.108733,0.007507,25,1.0,SAMME,"{'n_estimators': 25, 'learning_rate': 1, 'algo...",0.402089,0.401542,0.401684,0.401449,0.40138,0.401629,0.000252,3
1,31.216062,1.081341,0.768117,0.014998,250,0.0001,SAMME,"{'n_estimators': 250, 'learning_rate': 0.0001,...",0.398909,0.398898,0.398898,0.398898,0.398905,0.398901,5e-06,4
3,11.557509,0.343536,0.310554,0.018283,100,0.0001,SAMME,"{'n_estimators': 100, 'learning_rate': 0.0001,...",0.398909,0.398898,0.398898,0.398898,0.398905,0.398901,5e-06,4


## Neural Network

In [5]:
def build_model(hp):
  model = keras.Sequential()

  units_layer1 = hp.Choice('units_layer1', [32, 64, 128])
  model.add(keras.layers.Dense(
      units_layer1,
      activation='relu')),
  model.add(keras.layers.Dropout(
        hp.Float('rate', 0, 0.5, step=0.1)
  ))
  model.add(keras.layers.Dense(
      units_layer1//2,
      activation='relu'))
  model.add(keras.layers.Dense(1, activation='sigmoid'))

  model.compile(
    optimizer='adam',
    loss='binary_crossentropy')
  
  return model


In [11]:
class MyHyperModel(keras_tuner.HyperModel):
    def build(self, hp):
        model = keras.Sequential()

        units_layer1 = hp.Choice('units_layer1', [32, 64, 128])
        model.add(keras.layers.Dense(
            units_layer1,
            activation='relu')),
        model.add(keras.layers.Dropout(
                hp.Float('rate', 0, 0.5, step=0.1)
        ))
        model.add(keras.layers.Dense(
            units_layer1//2,
            activation='relu'))
        model.add(keras.layers.Dense(1, activation='sigmoid'))

        model.compile(
            optimizer='adam',
            loss='binary_crossentropy')
  
        return model

    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=hp.Choice("batch_size", [512, 1024]),
            epochs=hp.Choice("epochs", [10, 20, 30]),
            **kwargs,
        )

In [15]:
tuner = keras_tuner.tuners.SklearnTuner(
    oracle=keras_tuner.oracles.BayesianOptimizationOracle(
        objective=keras_tuner.Objective('score', 'max'),
        max_trials=2),
    hypermodel=MyHyperModel(),
    scoring=metrics.make_scorer(metrics.f1_score, average='macro'),
    cv=model_selection.KFold(5))

tuner.search(dev_set, dev_label.to_numpy())

Trial 2 Complete [00h 00m 29s]

Best score So Far: None
Total elapsed time: 00h 00m 59s
