# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import ClusterCentroids
from scipy.stats import ks_2samp

import pandas as pd
import numpy 
from sklearn.model_selection import train_test_split
import seaborn as sb

In [2]:
# Seed reused by the stochastic steps below (train/validation split, undersampling).
RANDOM_STATE = 42

# Load the dataset; cyclist_team is read explicitly as a string column.
dataset = pd.read_csv(
    '../data/complete_dataset_fe.csv',
    dtype={'cyclist_team': str},
)

In [3]:
# Binary target: 1 when raw_position <= 20 (top 20), 0 otherwise.
# NOTE(review): "top 20" assumes raw_position is 1-based - confirm against the data.
in_top_20 = dataset['raw_position'] <= 20
dataset['label'] = in_top_20.astype(int)

In [4]:
# Print column names, non-null counts and dtypes to spot the columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589739 entries, 0 to 589738
Data columns (total 31 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   cyclist_url           589739 non-null  object 
 1   cyclist_name          589739 non-null  object 
 2   birth_year            589739 non-null  int64  
 3   weight                480007 non-null  float64
 4   height                480007 non-null  float64
 5   nationality           589739 non-null  object 
 6   race_url              589739 non-null  object 
 7   race_name             589739 non-null  object 
 8   points                589739 non-null  float64
 9   uci_points            422184 non-null  float64
 10  length                589739 non-null  float64
 11  climb_total           475338 non-null  float64
 12  profile               475338 non-null  float64
 13  startlist_quality     589739 non-null  int64  
 14  date                  589739 non-null  object 
 15  

Feature da eliminare:

- cyclist_url (non generalizzabili)
- cyclist_name (non generalizzabili)
- birth_year (già incluso in age)
- nationality (troppo sbilanciamento nel dataset)
- weight, height (già inclusi in BMI)
- uci_points (troppo pochi e già inclusi in points)
- race_url (non generalizzabili)
- race_name (non generalizzabili)
- climb_total, profile (già inclusi in race_physical_effort)
- mostly_tarmac (sbilanciato)
- cyclist_team (non generalizzabili)
- date (non generalizzabili)
- raw_position, position (obv)
- delta (obv) 
- race_year (non generalizzabili)
- race_stage (non generalizzabili)
- cyclist_age oppure cyclist_age_group (ne teniamo solo una; vedi la scelta motivata sotto)
- cyclist_climb_power (feature generata con delta, obv)
- points, startlist_quality (già inclusi in race_prestige)

We must choose between cyclist_age and cyclist_age_group, as both features represent the same information but bring different advantages and disadvantages to our classification process: 
- cyclist_age -> lower dimensionality (no one-hot) but unbalanced representation
- cyclist_age_group -> higher dimensionality (one-hot) but balanced representation 

We chose to drop cyclist_age and keep cyclist_age_group, as the dimensionality issue can be mitigated by the amount of data available.

In [5]:
# Columns kept for modelling. race_year is kept only to drive the temporal
# split in the next cell; label is the target.
columns = [
    'length',
    'race_season',
    'cyclist_bmi',
    'cyclist_age_group',
    'climb_percentage',
    'race_physical_effort',
    'race_prestige',
    'race_year',
    'label',
]

# Keep only those columns and discard every row with a missing value in any of them.
dataset = dataset.loc[:, columns].dropna()

In [6]:
# Temporal hold-out: races before 2022 are used for development (train + validation),
# races from 2022 onwards form the test set.
before_2022 = dataset['race_year'] < 2022
from_2022 = dataset['race_year'] >= 2022

# race_year only drives the split, so it is removed from both partitions.
development_dataset = dataset.loc[before_2022].drop(columns=['race_year'])
test_dataset = dataset.loc[from_2022]
test_set = test_dataset.drop(columns=['race_year'])

# Separate the target from the test features (pop removes it from test_set in place).
test_label = test_set.pop('label')

In [7]:
# Detach the target from the development features (pop edits development_dataset
# in place, so this cell cannot be re-run without re-running the previous one).
label = development_dataset.pop('label')

# 80/20 split, stratified on the target so both parts keep the same class ratio.
train_set, val_set, train_label, val_label = train_test_split(
    development_dataset,
    label,
    test_size=0.20,
    stratify=label,
    random_state=RANDOM_STATE,
)

In [8]:
# Relative frequency of each class in the training labels (shows the class imbalance).
train_label.value_counts(normalize=True)

label
0    0.854317
1    0.145683
Name: proportion, dtype: float64

In [9]:
# Undersampling strategies to compare on the training split. Only random
# undersampling is enabled; the two alternatives are left disabled.
models = [
    RandomUnderSampler(random_state=RANDOM_STATE),
    #CondensedNearestNeighbour(random_state=RANDOM_STATE),
    #ClusterCentroids(random_state=RANDOM_STATE)
]
undersampling_algorithms = [
    "random",
    #"condensed_rule",
    #"centroids"
]

# Make the cell safe to re-run: never pass the target to the sampler as a
# feature (a later cell may have attached a 'label' column to train_set), and
# take the targets from the development labels by index, because train_label
# is rebound to the undersampled labels further down the notebook.
train_features = train_set.drop(columns=['label'], errors='ignore')
train_targets = label.loc[train_features.index]

undersampled_datasets = list()
for algorithm, model in zip(undersampling_algorithms, models):
    print(algorithm)
    if algorithm in ("condensed_rule", "centroids"):
        # These samplers are only given the numeric columns
        # (presumably because they are distance-based - confirm).
        sampler_input = train_features.select_dtypes(include="number")
    else:
        sampler_input = train_features

    undersampled_data, undersampled_labels = model.fit_resample(sampler_input, train_targets)

    # Build the result as a DataFrame so every column keeps its dtype. Stacking
    # features and labels with numpy.hstack cast everything to object, and
    # `Index + ['label']` concatenated strings element-wise instead of
    # appending a column name.
    undersampled_dataset = pd.DataFrame(
        undersampled_data, columns=sampler_input.columns
    ).reset_index(drop=True)
    undersampled_dataset["label"] = numpy.asarray(undersampled_labels)
    undersampled_dataset["algorithm"] = algorithm

    undersampled_datasets.append(undersampled_dataset)

random


In [10]:
# Two-sample Kolmogorov-Smirnov test, column by column, between the original
# training split and each undersampled dataset, to check how much the
# undersampling changed each distribution.

# Build the reference frame locally instead of writing 'label' into train_set:
# mutating train_set would leak the target into the features if the
# undersampling cell were re-run. assign() aligns `label` on the index.
reference_set = train_set.assign(label=label)

tests_per_algorithm = list()
for algorithm, undersampled_dataset in zip(undersampling_algorithms, undersampled_datasets):
    # 'algorithm' is only a tag column, not a feature to test.
    tested_columns = [
        column for column in undersampled_dataset.columns if column != "algorithm"
    ]
    test_results = [
        ks_2samp(
            reference_set[column],
            undersampled_dataset[column],
            alternative="two-sided"
        )
        for column in tested_columns
    ]
    test_data = pd.DataFrame(
        [
            (test.statistic, test.pvalue, test.statistic_location)
            for test in test_results
        ],
        columns=["KS_test", "p_value", "margin"],
    )
    test_data["algorithm"] = algorithm

    tests_per_algorithm.append(test_data)

# One row per tested column, summarised per undersampling algorithm.
validation = pd.concat(tests_per_algorithm, axis="rows")
validation.groupby("algorithm").describe()

Unnamed: 0_level_0,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,p_value,p_value,p_value,p_value,p_value,p_value,p_value,p_value
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
algorithm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
random,8.0,0.051896,0.12229,0.002948,0.003991,0.010981,0.012712,0.354317,8.0,0.141153,0.22367,0.0,5.14044e-08,3.202165e-07,0.241755,0.610258


In [11]:
# Use the first (and only enabled) undersampled dataset as the training data;
# the 'algorithm' tag column is not a feature, so it is removed.
selected_undersampled = undersampled_datasets[0]
train_undersample = selected_undersampled.drop(columns='algorithm')

# Class proportions after undersampling.
train_undersample['label'].value_counts(normalize=True)

label
0    0.5
1    0.5
Name: proportion, dtype: float64

In [12]:
# Split the undersampled training frame into features (train_undersample) and
# target. This rebinds train_label, which until now held the labels of the
# full training split. pop() removes the column in place, so re-running this
# cell alone raises a KeyError.
train_label = train_undersample.pop('label')

In [13]:
def discretize_data(dataset, variables, mappings=None):
    """Add an integer-coded ``<variable>_num`` column for each categorical variable.

    Codes follow the sorted order of the category values (0 for the smallest),
    so the encoding is meaningful for ordinal attributes whose values sort in
    their natural order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns listed in ``variables``. It is not
        modified; a copy with the extra columns is returned.
    variables : iterable of str
        Names of the categorical columns to encode.
    mappings : dict, optional
        ``{variable: {category: code}}``. When a variable is present here, its
        mapping is used instead of being derived from ``dataset``. Pass the
        mapping fitted on the training data to give validation and test data
        exactly the same codes, even if a category is missing from one split.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` with one ``<variable>_num`` integer column per
        variable. The original categorical columns are kept.

    Raises
    ------
    ValueError
        If a column contains missing values or categories without a code.
    """
    encoded = dataset.copy()
    for variable in variables:
        mapping = None if mappings is None else mappings.get(variable)
        if mapping is None:
            # Derive the codes from the sorted distinct values in this frame.
            levels = sorted(encoded[variable].dropna().unique())
            mapping = {level: code for code, level in enumerate(levels)}

        codes = encoded[variable].map(mapping)
        if codes.isna().any():
            # Fail with a clear message instead of an opaque int-cast error.
            unmapped = encoded.loc[codes.isna(), variable].unique().tolist()
            raise ValueError(
                f"Cannot discretize '{variable}': missing or unmapped values {unmapped}"
            )

        # Add a new column with the numeric representation of the variable.
        encoded[variable + '_num'] = codes.astype(int)
    return encoded

In [14]:
# Integer-encode the ordinal age group (adds cyclist_age_group_num), then
# drop the original categorical column, which is no longer needed.
variables = ['cyclist_age_group']
train_undersample = discretize_data(train_undersample, variables).drop(columns=variables)

# One-hot encode the season; dummy columns are named 'race_season%<value>'.
cat_columns = ['race_season']
train_undersample_encoded = pd.get_dummies(
    train_undersample,
    columns=cat_columns,
    prefix_sep='%',
)
train_undersample_encoded

Unnamed: 0,length,cyclist_bmi,climb_percentage,race_physical_effort,race_prestige,cyclist_age_group_num,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
0,182.1,21.604938,0.012169,0.090942,0.060309,3,False,True,False,False
1,220.0,22.724403,0.010786,0.117654,0.10743,2,False,True,False,False
2,163.2,21.266541,0.017304,0.103865,0.100131,0,False,False,True,False
3,179.0,20.286508,0.017804,0.128565,0.264701,2,False,False,True,False
4,206.0,22.145329,0.016757,0.400652,0.108454,4,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...
85687,159.0,20.528922,0.019522,0.111226,0.11985,1,False,True,False,False
85688,174.0,20.227026,0.010356,0.035331,0.151221,0,False,False,True,False
85689,192.5,20.79673,0.022894,0.477973,0.071673,2,False,True,False,False
85690,196.0,21.545091,0.012332,0.160144,0.085918,4,False,False,False,True


In [15]:
# Apply the same encoding steps to the validation split.
# NOTE(review): discretize_data derives the age-group codes from the values
# present in this split; they match the training codes only if both splits
# contain the same set of age groups - confirm.
val_set = discretize_data(val_set, variables).drop(columns=variables)
cat_columns = ['race_season']

# Align the one-hot schema with the training matrix: a season missing from
# this split becomes an all-False column, and a column unknown to the training
# data is dropped. 'label' is excluded in case it was already attached to the
# training frame by a later cell.
train_feature_columns = train_undersample_encoded.columns.drop('label', errors='ignore')
val_set_encoded = (
    pd.get_dummies(val_set, columns=cat_columns, prefix_sep='%')
    .reindex(columns=train_feature_columns, fill_value=False)
)
val_set_encoded

Unnamed: 0,length,cyclist_bmi,climb_percentage,race_physical_effort,race_prestige,cyclist_age_group_num,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
314538,208.5,23.116165,0.023981,0.469889,0.314159,4,False,False,True,False
345310,164.5,20.227026,0.011477,0.104989,0.238292,2,False,False,True,False
511941,206.0,20.380435,0.009951,0.095171,0.119338,0,False,False,True,False
388741,27.2,19.817677,0.013088,0.001090,0.258459,2,False,True,False,False
376326,203.5,23.413111,0.005715,0.026668,0.281667,3,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
501235,219.2,21.857278,0.023805,0.644428,0.144179,3,False,False,True,False
281368,169.0,23.765432,0.015657,0.201556,0.285028,2,False,False,True,False
158868,198.0,19.605192,0.012207,0.161778,0.345410,4,False,True,False,False
371671,190.5,21.329640,0.008892,0.036363,0.285028,1,False,False,True,False


In [16]:
# Apply the same encoding steps to the test split.
# NOTE(review): discretize_data derives the age-group codes from the values
# present in this split; they match the training codes only if both splits
# contain the same set of age groups - confirm.
test_set = discretize_data(test_set, variables).drop(columns=variables)
cat_columns = ['race_season']

# Align the one-hot schema with the training matrix: a season missing from
# this split becomes an all-False column, and a column unknown to the training
# data is dropped. 'label' is excluded in case it was already attached to the
# training frame by a later cell.
train_feature_columns = train_undersample_encoded.columns.drop('label', errors='ignore')
test_set_encoded = (
    pd.get_dummies(test_set, columns=cat_columns, prefix_sep='%')
    .reindex(columns=train_feature_columns, fill_value=False)
)
test_set_encoded

Unnamed: 0,length,cyclist_bmi,climb_percentage,race_physical_effort,race_prestige,cyclist_age_group_num,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
4146,254.1,21.952479,0.013617,0.198139,0.248896,4,False,True,False,False
4147,254.1,20.061728,0.013617,0.198139,0.248896,1,False,True,False,False
4148,254.1,19.944598,0.013617,0.198139,0.248896,1,False,True,False,False
4149,254.1,22.152647,0.013617,0.198139,0.248896,1,False,True,False,False
4150,254.1,21.389756,0.013617,0.198139,0.248896,2,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...
589734,266.9,20.763187,0.015613,0.250646,0.584334,3,False,False,True,False
589735,266.9,19.662026,0.015613,0.250646,0.584334,3,False,False,True,False
589736,266.9,19.805342,0.015613,0.250646,0.584334,0,False,False,True,False
589737,266.9,20.598608,0.015613,0.250646,0.584334,3,False,False,True,False


Abbiamo trasformato gli attributi categorici in equivalenti numerici per poter essere processati dai modelli di learning: 
- one-hot encoding: è stato utilizzato quando l'attributo categorico non è di tipo ordinale, in questo caso non siamo interessati a preservare ordine o relazioni di vicinanza tra i valori di questo attributo e pertanto vogliamo massimizzare la dissimilarità tra le classi
- discretizzazione: abbiamo usato questa tecnica solo nel caso dell'attributo cyclist_age_group, in quanto queste classi possiedono relazioni di ordine e vicinanza (attributo ordinale)

In [17]:
# Re-attach each target to its encoded feature frame before export.
# Column assignment aligns the label Series with the frame on the index.
for encoded_frame, target in (
    (train_undersample_encoded, train_label),
    (val_set_encoded, val_label),
    (test_set_encoded, test_label),
):
    encoded_frame['label'] = target

In [18]:
# Write the three encoded datasets to CSV, without the index column.
exports = (
    (train_undersample_encoded, '../data/ml_datasets/train_undersample_encoded.csv'),
    (val_set_encoded, '../data/ml_datasets/val_set_encoded.csv'),
    (test_set_encoded, '../data/ml_datasets/test_set_encoded.csv'),
)
for encoded_frame, csv_path in exports:
    encoded_frame.to_csv(csv_path, index=False)