# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [None]:
# To find the best hyperparameter settings, we run a hyperparameter search (randomized or grid search)
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn import tree
from sklearn.svm import SVC

from sklearn.metrics import classification_report
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

In [None]:
# Read the train, validation and test CSV files into one DataFrame each.
training_data, validation_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/ml_datasets/train_set.csv',
        '../data/ml_datasets/val_set.csv',
        '../data/ml_datasets/test_set.csv',
    )
)

In [None]:
# Detach the 'label' column from every split. `pop` removes the column
# in place, so the original frames are left holding only the remaining columns.
train_label, val_label, test_label = (
    frame.pop('label')
    for frame in (training_data, validation_data, testing_data)
)

# The label-free frames are reused (not copied) as the feature sets.
train_set, val_set, test_set = training_data, validation_data, testing_data

# Development set = training rows followed by validation rows; it is what the
# cross-validated searches below are fitted on.
dev_label = pd.concat([train_label, val_label])
dev_set = pd.concat([train_set, val_set])

## Decision Tree

In [None]:
# Metrics of interest for this task.
# NOTE(review): this dict is not passed to the search in this cell, which
# optimizes 'f1_macro' only -- confirm whether it is still needed.
scoring_metrics = {
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

# Fixed seed so that the sampled candidates (and the trees' random feature
# sub-sampling when max_features < n_features) are identical on every run.
RANDOM_STATE = 42

# Search space for the randomized search: lists are sampled uniformly,
# scipy distributions are drawn from. sp_randint(low, high) excludes `high`,
# so max_features spans 1..n_features and the min_samples_* values span 10..50.
param_dist = {"max_depth": [2,3,5,6,7,10,12,None],
              "max_features": sp_randint(1, train_set.shape[1] + 1),
              "min_samples_split": sp_randint(10, 51),
              "min_samples_leaf": sp_randint(10, 51),
              "criterion": ["entropy", "gini"],
              "class_weight":['balanced', None, {0: 0.3, 1: 0.7}]}
# Number of parameter settings sampled from the space above
n_iter_search = 400
# Base estimator whose hyperparameters are tuned
clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
# Randomized (not exhaustive grid) search with cross-validation, ranked by macro-averaged F1
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=10,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# Run the search on the development set (train + validation)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results held by `rand_search`, ordered by
# rank_test_score (rank 1 first), and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()

## SVM

In [None]:
# Fixed seed so that the value of C drawn by the randomized search is the
# same on every run.
RANDOM_STATE = 42

# Search space for the randomized search. scipy's uniform(loc, scale) covers
# [loc, loc + scale], so C is drawn from [0.1, 10.1].
# NOTE(review): no feature scaling is applied in this cell -- confirm the CSV
# features are already standardized, since SVC is sensitive to feature scale.
param_dist = {"kernel": ['sigmoid'],
              "C": sp_uniform(0.1, 10.0),
              "gamma": ['scale']}
# Number of parameter settings sampled from the space above
n_iter_search = 1
# Base estimator whose hyperparameters are tuned
clf = SVC()
# Randomized (not exhaustive grid) search with cross-validation, ranked by macro-averaged F1
rand_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=10,
                            scoring='f1_macro',
                            random_state=RANDOM_STATE)
# Run the search on the development set (train + validation)
rand_search.fit(dev_set, dev_label);

In [None]:
# Tabulate the cross-validation results held by `rand_search`, ordered by
# rank_test_score (rank 1 first), and show the top rows.
df = pd.DataFrame(rand_search.cv_results_).sort_values(by='rank_test_score')
df.head()