# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import wittgenstein as lw
import keras_tuner
import keras
from keras_tuner import HyperParameters
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import f1_score
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from scipy.stats import loguniform as sp_loguniform
from sklearn.metrics import make_scorer

import warnings
warnings.filterwarnings('ignore')

RANDOM_STATE = 42

2024-12-19 11:29:38.494145: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-19 11:29:38.550414: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734604178.576421   56529 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734604178.584572   56529 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-19 11:29:38.613071: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Read the train / validation / test partitions from the `oversampling` data folder.
# Only the training rows are permuted (with a fixed seed) so that their
# on-disk ordering cannot introduce bias.
train_data = (
    pd.read_csv('../../data/ml_datasets/oversampling/train_set.csv')
    .sample(frac=1, random_state=RANDOM_STATE)
)
val_data = pd.read_csv('../../data/ml_datasets/oversampling/val_set.csv')
testing_data = pd.read_csv('../../data/ml_datasets/oversampling/test_set.csv')

In [3]:
# Detach the target from every partition: `pop` removes the 'label' column
# from the frame in place and hands it back as a Series.
train_label = train_data.pop('label')
val_label = val_data.pop('label')
test_label = testing_data.pop('label')

# The *_set names are aliases of the (now label-free) frames, not copies.
train_set, val_set, test_set = train_data, val_data, testing_data

# Cast the four season indicator columns to int, in place, on each partition.
for _partition in (train_set, val_set, test_set):
    for _season_column in ('race_season%autumn',
                           'race_season%spring',
                           'race_season%summer',
                           'race_season%winter'):
        _partition[_season_column] = _partition[_season_column].astype(int)

N_JOBS = 4       # passed as `n_jobs` to the hyper-parameter searches
USER = 'Jacopo'  # prefix of the result files written by this notebook

In [4]:
# Number of input features = number of columns of the label-free training data.
# Reading it from `shape` avoids materialising a row, does not raise on an
# empty frame (as `iloc[0]` would) and keeps working if this cell is re-run
# after `train_set` has been converted to a NumPy array.
N_FEATURES = train_set.shape[1]

In [5]:
# Switch the training and validation partitions (features and labels alike)
# from pandas objects to plain NumPy arrays.
train_set, train_label = train_set.to_numpy(), train_label.to_numpy()
val_set, val_label = val_set.to_numpy(), val_label.to_numpy()

In [6]:
# Express the fixed train/validation hold-out as a single predefined "CV" split:
# rows marked -1 are training rows, rows marked 0 make up validation fold 0.
n_train_rows, n_val_rows = len(train_set), len(val_set)
split_index = np.concatenate((
    -np.ones(n_train_rows),  # -1 for training
    np.zeros(n_val_rows),    # 0 for validation
))

# Training rows first, validation rows after, in the same order as `split_index`.
X_combined = np.concatenate((train_set, val_set), axis=0)
y_combined = np.concatenate((train_label, val_label))

ps = PredefinedSplit(test_fold=split_index)

In [7]:
# Factory for scorers that report the F1 score of one single class
def f1_class_scorer(class_index):
    """Build a scikit-learn scorer returning the F1 score of one class.

    Args:
        class_index: position of the wanted class inside the per-class array
            returned by ``f1_score(..., average=None)``.

    Returns:
        The ``make_scorer``-wrapped callable, usable as a ``scoring`` entry.
    """
    def _single_class_f1(y_true, y_pred):
        # average=None yields one F1 value per class; keep only the requested one
        per_class = f1_score(y_true, y_pred, average=None)
        return per_class[class_index]

    return make_scorer(_single_class_f1)

# One dedicated scorer per class
f1_class_0, f1_class_1 = f1_class_scorer(0), f1_class_scorer(1)

# Metrics handed to the searches through their `scoring=` argument
scoring = {
    'f1_macro': 'f1_macro',  # built-in macro-averaged F1
    'f1_micro': 'f1_micro',  # built-in micro-averaged F1
    'f1_0': f1_class_0,      # F1 of class 0 only
    'f1_1': f1_class_1,      # F1 of class 1 only
}

## Decision Tree

In [8]:
# Search space of the decision tree
param_dist = {
    "max_depth": [3, 5, 10, 15, 20, None],
    "max_features": sp_randint(3, N_FEATURES + 1),
    "min_samples_split": [20, 30, 50, 100],
    "min_samples_leaf": [10, 20, 30, 50, 100],
    "criterion": ["entropy", "gini"],
    # Class weights are related to the over/undersampling chosen
    "class_weight": ['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}],
}
n_iter_search = 400  # number of sampled configurations
clf = tree.DecisionTreeClassifier()

# Randomized search scored on the predefined validation split;
# refit=False because only the score table is needed afterwards.
rand_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=scoring,
    refit=False,
    cv=ps,
    n_jobs=N_JOBS,
)
rand_search.fit(X_combined, y_combined);

In [9]:
# Tabulate the search results, best macro-F1 rank first, and store them as CSV
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/oversampling/model_selection/{USER}_decision_tree_results.csv',
    index=False,
)

## SVM

In [10]:
# Search space of the linear SVM: C drawn log-uniformly from [1e-4, 1e2]
param_dist = {"C": sp_loguniform(1e-4, 1e2)}
n_iter_search = 100  # number of sampled configurations
clf = LinearSVC()

# Randomized search scored on the predefined validation split
rand_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=scoring,
    refit=False,
    cv=ps,
    n_jobs=N_JOBS,
)
rand_search.fit(X_combined, y_combined);

In [11]:
# Tabulate the search results, best macro-F1 rank first, and store them as CSV
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/oversampling/model_selection/{USER}_svm_results.csv',
    index=False,
)

## Naive Bayes

In [12]:
# GaussianNB is evaluated with its default settings only: the search space is
# empty, so a single candidate is scored. Going through RandomizedSearchCV
# keeps the result table in the same format as for the other models.
# The shared `scoring` dict is used so that `rank_test_f1_macro` is available
# (a separate, never-used recall/precision/f1 dict was removed from this cell).
param_dist = {}
n_iter_search = 1  # Number of iterations
clf = GaussianNB()  # Model

# Single-candidate search scored on the predefined validation split
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=1,
                            scoring=scoring,
                            refit=False,
                            cv=ps)
rand_search.fit(X_combined, y_combined);

In [13]:
# Tabulate the search results, best macro-F1 rank first, and store them as CSV
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/oversampling/model_selection/{USER}_naive_bayes_results.csv',
    index=False,
)

## KNN

Note: K-NN relies on distances between feature values, so the integer mapping of ordinal categorical attributes (used in place of one-hot encoding) may cause issues for this model.

In [14]:
# Exhaustive grid for K-NN.
# NOTE: 'n_neighbors' used to be declared twice in this dict literal; Python
# keeps only the last value of a duplicated key, so the wider list
# [5, 15, 25, 40, 50] was silently discarded. Only the grid that was actually
# searched is kept here; widen the list to explore smaller neighbourhoods.
param_dist = {'n_neighbors': [40, 50],
              'algorithm': ['ball_tree', 'kd_tree', 'brute'],}

# Columns left out of the K-NN feature matrix
knn_excluded_columns = ['cyclist_age_group', 'race_season%autumn', 'race_season%spring', 'race_season%summer', 'race_season%winter']

# `train_data` / `val_data` are the pandas frames; drop() returns new frames,
# so the originals are left untouched.
tmp_train_set = train_data.drop(columns=knn_excluded_columns).to_numpy()
tmp_val_set = val_data.drop(columns=knn_excluded_columns).to_numpy()

# Same hold-out scheme as for the other models, rebuilt on the reduced matrices
split_index_knn = np.concatenate([
    np.full(len(tmp_train_set), -1),  # -1 for training
    np.zeros(len(tmp_val_set))        # 0 for validation
])

X_combined_knn = np.vstack((tmp_train_set, tmp_val_set))
y_combined_knn = np.concatenate((train_label, val_label))

ps_knn = PredefinedSplit(test_fold=split_index_knn)

clf = KNeighborsClassifier() # Model

# Define the grid search (every combination in `param_dist` is evaluated)
rand_search = GridSearchCV(clf, param_grid=param_dist,
                            n_jobs=N_JOBS,
                            scoring=scoring,
                            refit=False,
                            cv=ps_knn)
rand_search.fit(X_combined_knn, y_combined_knn);

In [15]:
# Tabulate the search results, best macro-F1 rank first, and store them as CSV
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/oversampling/model_selection/{USER}_knn_results.csv',
    index=False,
)

## Random Forest

In [16]:
# Search space of the random forest
param_dist = {
    "max_depth": [5, 10, 20, None],
    "max_features": sp_randint(3, N_FEATURES + 1),
    "min_samples_split": [20, 50, 100],
    "min_samples_leaf": [10, 30, 50, 100],
    "criterion": ["entropy", "gini"],
    "class_weight": ['balanced', None, {0: 0.8, 1: 0.2}, {0: 0.6, 1: 0.4}],
    "n_estimators": [50, 100, 150],
}
n_iter_search = 100  # number of sampled configurations
clf = RandomForestClassifier()

# Randomized search scored on the predefined validation split
rand_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=scoring,
    refit=False,
    cv=ps,
    n_jobs=N_JOBS,
)
rand_search.fit(X_combined, y_combined);

In [17]:
# Tabulate the search results, best macro-F1 rank first, and store them as CSV
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/oversampling/model_selection/{USER}_random_forest_results.csv',
    index=False,
)

## XGBoost

In [18]:
# Gradient-boosted trees, evaluated on a full grid
clf = XGBClassifier()
param_dist = {
    "n_estimators": [25, 50, 100, 250, 500],
    "max_depth": [2, 3, 4, 5],
    "learning_rate": [1, 0.1, 0.01, 0.001, 0.0001],
}

# Exhaustive grid search scored on the predefined validation split
rand_search = GridSearchCV(
    estimator=clf,
    param_grid=param_dist,
    scoring=scoring,
    refit=False,
    cv=ps,
    n_jobs=N_JOBS,
)
rand_search.fit(X_combined, y_combined);

In [19]:
# Tabulate the search results, best macro-F1 rank first, and store them as CSV
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/oversampling/model_selection/{USER}_xgb_results.csv',
    index=False,
)

## AdaBoost

In [20]:
# AdaBoost, evaluated on a full grid (the boosting algorithm is pinned to SAMME)
clf = AdaBoostClassifier()
param_dist = {
    "n_estimators": [25, 50, 100, 250, 500],
    "learning_rate": [1, 0.1, 0.01, 0.001, 0.0001],
    "algorithm": ['SAMME'],
}

# Exhaustive grid search scored on the predefined validation split
rand_search = GridSearchCV(
    estimator=clf,
    param_grid=param_dist,
    scoring=scoring,
    refit=False,
    cv=ps,
    n_jobs=N_JOBS,
)
rand_search.fit(X_combined, y_combined);

In [21]:
# Tabulate the search results, best macro-F1 rank first, and store them as CSV
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/oversampling/model_selection/{USER}_ada_boost_results.csv',
    index=False,
)

## Neural Network

In [8]:
class MyHyperModel(keras_tuner.HyperModel):
    """Two-hidden-layer binary classifier whose hyper-parameters are passed in explicitly.

    `build` and `fit` take the concrete values (units, dropout rate, learning
    rate, epochs, batch size) as arguments instead of reading them from `hp`.
    """

    def build(self, hp, units, dropout_rate, learning_rate):
        """Create and compile the network.

        Args:
            hp: hyper-parameter container; accepted but not read here.
            units: width of the first hidden layer; the second one has `units // 2`.
            dropout_rate: rate of the dropout layer placed after the first hidden layer.
            learning_rate: learning rate of the Adam optimizer.

        Returns:
            The compiled `keras.Sequential` model.
        """
        model = keras.Sequential()

        model.add(keras.layers.Dense(
            units,
            activation='relu'))
        model.add(keras.layers.Dropout(rate=dropout_rate))
        model.add(keras.layers.Dense(
            units//2,
            activation='relu'))
        # Single sigmoid unit: the output is a score in [0, 1]
        model.add(keras.layers.Dense(1, activation='sigmoid'))

        # Configures the optimizer with the chosen learning rate
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        f1 = keras.metrics.F1Score(average='macro', threshold=0.5, name="f1_macro", dtype=None)
        model.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=[f1])

        return model

    def fit(self, hp, model, x, y, epochs, batch_size, **kwargs):
        """Train `model` silently and return whatever `model.fit` returns.

        Args:
            hp: hyper-parameter container; accepted but not read here.
            model: compiled model, as produced by `build`.
            x: training inputs.
            y: training targets.
            epochs: number of training epochs.
            batch_size: mini-batch size.
            **kwargs: forwarded unchanged to `model.fit`.
        """
        return model.fit(
            x=x,
            y=y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=False,
            **kwargs,
        )

In [9]:
rounds = 50          # number of randomly drawn configurations
config_results = []  # one dict per evaluated configuration

# Labels as column vectors; they do not depend on the drawn configuration,
# so they are prepared once, outside the loop.
y_train = train_label.reshape(-1, 1)
y_val = val_label.reshape(-1, 1)

hyper_ae = MyHyperModel()

for _ in range(rounds):
    # A new container every round, so each `Fixed` call registers the value
    # drawn in this round (presumably a reused container would hand back the
    # value stored under the same name earlier - verify before hoisting).
    hp = HyperParameters()
    batch_size = hp.Fixed("batch_size", random.choice([256, 512, 1024]))
    epochs = hp.Fixed("epochs", random.choice([10, 20, 30]))
    units_layer1 = hp.Fixed('units_layer1', random.choice([32, 64, 128, 256]))
    drop_rate = hp.Fixed('rate', random.choice(np.arange(0., 0.9, 0.2)))
    learning_rate = hp.Fixed("learning_rate", random.choice(np.logspace(-5, -3.5, num=10))) # Jacopo
    #learning_rate = hp.Fixed("learning_rate", random.choice(np.logspace(-3.5, -2, num=10))) Simone

    print(f"Training with batch_size={batch_size}, epochs={epochs}, units_layer1={units_layer1}, drop_rate={drop_rate}, learning_rate={learning_rate}")

    model = hyper_ae.build(hp, units_layer1, drop_rate, learning_rate)

    # Train on the training partition only
    hyper_ae.fit(hp, model, train_set, y_train, epochs=epochs, batch_size=batch_size)

    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
    val_out = model.predict(val_set, verbose=False)
    val_out = (val_out >= 0.5).astype(int)

    # One call gives the F1 of every class (index 0 -> class 0, index 1 -> class 1);
    # this avoids reaching into the private `_score_func` of the sklearn scorers.
    per_class_f1 = f1_score(y_val, val_out, average=None)
    f1_0 = per_class_f1[0]
    f1_1 = per_class_f1[1]
    f1_micro = f1_score(y_val, val_out, average='micro')
    f1_macro = f1_score(y_val, val_out, average='macro')

    config_results.append({
        "batch_size": batch_size,
        "epochs": epochs,
        "units_layer1": units_layer1,
        "units_layer2": units_layer1//2,
        "drop_rate": drop_rate,
        "learning_rate": learning_rate,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "f1_0": f1_0,
        "f1_1": f1_1,
    })

df = pd.DataFrame(config_results)
# Rows are stored in decreasing micro-F1 order. In reality the macro average
# was exploited for model selection, calculated afterwards.
# NOTE(review): the other result tables are ordered by macro-F1 rank; consider
# sorting by 'f1_macro' here as well.
df.sort_values(by='f1_micro', inplace=True, ascending=False)
df.to_csv(f'../../data/ml_datasets/oversampling/model_selection/{USER}_nn_results.csv', index=False)
    

Training with batch_size=512, epochs=20, units_layer1=64, drop_rate=0.8, learning_rate=0.0001467799267622069


I0000 00:00:1734604188.190632   56529 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3539 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
I0000 00:00:1734604190.089631   56819 service.cc:148] XLA service 0x7f38380049d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1734604190.089758   56819 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4050 Laptop GPU, Compute Capability 8.9
2024-12-19 11:29:50.128266: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1734604190.330105   56819 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1734604191.538145   56819 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Training with batch_size=1024, epochs=20, units_layer1=256, drop_rate=0.6000000000000001, learning_rate=4.641588833612782e-05
Training with batch_size=256, epochs=30, units_layer1=256, drop_rate=0.2, learning_rate=0.00031622776601683794
Training with batch_size=256, epochs=30, units_layer1=128, drop_rate=0.2, learning_rate=1.4677992676220705e-05
Training with batch_size=512, epochs=10, units_layer1=32, drop_rate=0.0, learning_rate=6.812920690579608e-05
Training with batch_size=256, epochs=20, units_layer1=128, drop_rate=0.0, learning_rate=2.1544346900318823e-05





Training with batch_size=256, epochs=20, units_layer1=64, drop_rate=0.6000000000000001, learning_rate=6.812920690579608e-05
Training with batch_size=512, epochs=10, units_layer1=32, drop_rate=0.6000000000000001, learning_rate=0.00031622776601683794
Training with batch_size=1024, epochs=30, units_layer1=64, drop_rate=0.2, learning_rate=6.812920690579608e-05
Training with batch_size=1024, epochs=30, units_layer1=256, drop_rate=0.0, learning_rate=0.00031622776601683794

















Training with batch_size=256, epochs=10, units_layer1=128, drop_rate=0.8, learning_rate=1.4677992676220705e-05
Training with batch_size=256, epochs=20, units_layer1=64, drop_rate=0.0, learning_rate=0.00031622776601683794
Training with batch_size=1024, epochs=30, units_layer1=64, drop_rate=0.8, learning_rate=1e-05
Training with batch_size=512, epochs=30, units_layer1=32, drop_rate=0.4, learning_rate=1e-05
Training with batch_size=256, epochs=10, units_layer1=128, drop_rate=0.6000000000000001, learning_rate=0.00031622776601683794
Training with batch_size=1024, epochs=30, units_layer1=256, drop_rate=0.4, learning_rate=2.1544346900318823e-05
Training with batch_size=512, epochs=10, units_layer1=64, drop_rate=0.6000000000000001, learning_rate=6.812920690579608e-05
Training with batch_size=256, epochs=10, units_layer1=256, drop_rate=0.6000000000000001, learning_rate=3.1622776601683795e-05
Training with batch_size=1024, epochs=30, units_layer1=128, drop_rate=0.0, learning_rate=1.4677992676220













Training with batch_size=512, epochs=30, units_layer1=32, drop_rate=0.4, learning_rate=1.4677992676220705e-05
Training with batch_size=512, epochs=20, units_layer1=256, drop_rate=0.6000000000000001, learning_rate=0.0001
Training with batch_size=1024, epochs=20, units_layer1=128, drop_rate=0.6000000000000001, learning_rate=0.00031622776601683794
Training with batch_size=256, epochs=10, units_layer1=256, drop_rate=0.6000000000000001, learning_rate=3.1622776601683795e-05
Training with batch_size=512, epochs=10, units_layer1=256, drop_rate=0.6000000000000001, learning_rate=3.1622776601683795e-05
Training with batch_size=1024, epochs=10, units_layer1=64, drop_rate=0.8, learning_rate=1.4677992676220705e-05
Training with batch_size=1024, epochs=10, units_layer1=32, drop_rate=0.8, learning_rate=6.812920690579608e-05
Training with batch_size=512, epochs=10, units_layer1=64, drop_rate=0.4, learning_rate=0.00021544346900318823
Training with batch_size=256, epochs=30, units_layer1=256, drop_rate=0





Training with batch_size=256, epochs=20, units_layer1=128, drop_rate=0.8, learning_rate=0.00031622776601683794
Training with batch_size=1024, epochs=20, units_layer1=128, drop_rate=0.2, learning_rate=4.641588833612782e-05
Training with batch_size=256, epochs=20, units_layer1=256, drop_rate=0.0, learning_rate=6.812920690579608e-05





Training with batch_size=512, epochs=30, units_layer1=256, drop_rate=0.4, learning_rate=0.0001467799267622069
Training with batch_size=256, epochs=10, units_layer1=128, drop_rate=0.8, learning_rate=3.1622776601683795e-05
Training with batch_size=512, epochs=30, units_layer1=128, drop_rate=0.8, learning_rate=0.00021544346900318823


## Rule-Based

In [10]:
# Search space of the RIPPER rule learner
param_dist = {
    'prune_size': sp_uniform(0.1, 0.4),  # loc=0.1, scale=0.4 -> uniform on [0.1, 0.5]
    'k': sp_randint(1, 11),              # integers from 1 to 10
}
n_iter_search = 20  # number of sampled configurations
clf = lw.RIPPER(
    max_rules=10,        # Moderate rule complexity
    max_rule_conds=7,    # Enough room for moderately complex conditions
    max_total_conds=35,  # Cap total conditions to avoid runaway complexity
)

# Randomized search scored on the predefined validation split
rand_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=scoring,
    refit=False,
    cv=ps,
    n_jobs=2,
)
rand_search.fit(X_combined, y_combined);

In [11]:
# Tabulate the search results, best macro-F1 rank first, and store them as CSV
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_f1_macro')
df.to_csv(
    f'../../data/ml_datasets/oversampling/model_selection/{USER}_rule_based_results.csv',
    index=False,
)