# Classification Dataset

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import ClusterCentroids
from scipy.stats import ks_2samp

import pandas as pd
import numpy 
from sklearn.model_selection import train_test_split
import seaborn as sb

In [2]:
# Seed passed to every step below that takes a random_state
RANDOM_STATE = 42
# cyclist_team is read as str explicitly (presumably to avoid mixed-type inference — confirm)
dataset = pd.read_csv(
    '../../data/complete_dataset_fe.csv',
    dtype={'cyclist_team': str},
)

In [3]:
# Binary target: 1 when raw_position <= 20, 0 otherwise.
# NOTE(review): this is "top 20" only if raw_position is 1-based — confirm.
dataset['label'] = dataset['raw_position'].le(20).astype(int)

In [4]:
# Column names, non-null counts and dtypes of the loaded frame (new label column included)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589739 entries, 0 to 589738
Data columns (total 38 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   cyclist_url                  589739 non-null  object 
 1   cyclist_name                 589739 non-null  object 
 2   birth_year                   589739 non-null  int64  
 3   weight                       480007 non-null  float64
 4   height                       480007 non-null  float64
 5   nationality                  589739 non-null  object 
 6   race_url                     589739 non-null  object 
 7   race_name                    589739 non-null  object 
 8   points                       589739 non-null  float64
 9   uci_points                   422184 non-null  float64
 10  length                       589739 non-null  float64
 11  climb_total                  475338 non-null  float64
 12  profile                      475338 non-null  float64
 13 

Feature da eliminare:

- cyclist_url (non generalizzabili)
- cyclist_name (non generalizzabili)
- birth_year (già incluso in age)
- nationality (troppo sbilanciamento nel dataset)
- weight, height (già inclusi in BMI)
- uci_points (troppo pochi e già inclusi in points)
- race_url (non generalizzabili)
- race_name (non generalizzabili)
- climb_total, profile (già inclusi in race_physical_effort)
- mostly_tarmac (sbilanciato)
- cyclist_team (non generalizzabili)
- date (non generalizzabili)
- raw_position, position (obv)
- delta (obv) 
- race_year (non generalizzabile; usata solo per lo split development/test e poi rimossa)
- race_stage (non generalizzabili)
- cyclist_age (sostituita da cyclist_age_group, vedi sotto)
- cyclist_climb_power (feature generata con delta, obv)
- points, startlist_quality (già inclusi in race_prestige)

We must choose between cyclist_age and cyclist_age_group, as both features carry the same information but come with different advantages and disadvantages for our classification process: 
- cyclist_age -> lower dimensionality (no one-hot encoding needed) but unbalanced representation
- cyclist_age_group -> arbitrary encoding but balanced representation 

We chose to drop cyclist_age and keep cyclist_age_group: although the numeric encoding of the groups is arbitrary, it still respects the "order" of the age groups.

In [5]:
# Retained features, plus race_year (used by the split on year below) and the target
columns = [
    'length',
    'race_season',
    'cyclist_bmi',
    'cyclist_age_group',
    'climb_percentage',
    'race_physical_effort',
    'race_prestige',
    'race_year',
    'label',
]
# Keep only the rows that are complete on the selected columns
dataset = dataset.loc[:, columns].dropna()

In [6]:
# Split on year: races before 2022 are used for development, 2022 onward for testing.
# race_year only serves the split, so it is removed from both parts afterwards.
development_dataset = dataset.loc[dataset['race_year'] < 2022].drop(columns=['race_year'])

test_dataset = dataset.loc[dataset['race_year'] >= 2022]
test_set = test_dataset.drop(columns=['race_year'])
# pop() removes the target from test_set and returns it
test_label = test_set.pop('label')

In [7]:
# Separate the target from the development features, then make a stratified 80/20
# train/validation split so both parts keep the same class proportions.
label = development_dataset.pop('label')
train_set, val_set, train_label, val_label = train_test_split(
    development_dataset,
    label,
    test_size=0.20,
    stratify=label,
    random_state=RANDOM_STATE,
)

In [8]:
# Independent copy of the development features (the label was popped in the previous cell)
development_set = development_dataset.copy()
# NOTE(review): "developmente_label" is misspelled, but later cells use this exact name.
# It is a second reference to `label`, not a copy.
developmente_label = label

In [9]:
# Class proportions of the development labels before any resampling
developmente_label.value_counts(normalize=True)

label
0    0.854316
1    0.145684
Name: proportion, dtype: float64

In [10]:
# Balance the two classes by random undersampling (seeded for reproducibility).
model = RandomUnderSampler(random_state=RANDOM_STATE)
undersampled_data, undersampled_labels = model.fit_resample(development_set, developmente_label)
# Kept as an (n, 1) column vector, exactly as before, for any later use of this name.
undersampled_labels = undersampled_labels.to_numpy().reshape(-1, 1)
# Attach the label as a regular column instead of going through numpy.hstack:
# stacking a frame that mixes strings and numbers produces an object array, so every
# column (numeric ones included) used to come back with dtype=object.
# The index is reset so that rows are numbered 0..n-1, as they were with hstack.
undersampled_dataset = (
    undersampled_data
    .reset_index(drop=True)
    .assign(label=undersampled_labels.ravel())
)

In [34]:
# Two-sample Kolmogorov-Smirnov test, column by column, between the full development
# data and its undersampled version. The label is put back on development_set first,
# so it is compared as well.
development_set['label'] = label
columns = undersampled_dataset.columns
test_results = [
    ks_2samp(development_set[name], undersampled_dataset[name], alternative="two-sided")
    for name in columns
]
# One row per tested column: statistic, p-value and the location of the statistic
test_data = pd.DataFrame(
    {
        "KS_test": [result.statistic for result in test_results],
        "p_value": [result.pvalue for result in test_results],
        "margin": [result.statistic_location for result in test_results],
    }
)

test_data.describe()

Unnamed: 0,KS_test,p_value
count,8.0,8.0
mean,0.051858,0.09602672
std,0.122303,0.1983946
min,0.002717,0.0
25%,0.004456,2.400411e-10
50%,0.010436,6.464247e-07
75%,0.012913,0.07882611
max,0.354316,0.5718545


In [35]:
# Work on a copy: plain assignment would only give the same frame a second name, so
# the in-place pop/encode/drop steps in the following cells would silently modify
# undersampled_dataset as well.
dev_undersample = undersampled_dataset.copy()
# Class proportions after undersampling
dev_undersample['label'].value_counts(normalize=True)

label
0    0.5
1    0.5
Name: proportion, dtype: float64

In [36]:
# pop() removes the target column from dev_undersample in place and returns it
dev_label = dev_undersample.pop('label')

In [37]:
def discretize_data(dataset, variables, categories=None):
    """Add an integer-coded ``<variable>_num`` column for each listed variable.

    Every distinct value of a variable is replaced by its position in an ordered
    list of values (0, 1, 2, ...). The frame is modified in place and also returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the columns to encode; new columns are added to it.
    variables : iterable of str
        Names of the columns to encode.
    categories : dict, optional
        Maps a variable name to the ordered list of its values. When an entry is
        given, codes are taken from that list instead of from the values present
        in ``dataset``, so frames encoded separately (e.g. a development and a
        test split) share the same codes. Variables without an entry fall back
        to the sorted unique values found in ``dataset``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with the new ``*_num`` columns.

    Raises
    ------
    ValueError
        If a column holds a value that has no code, i.e. one that is not listed
        in the supplied ``categories`` entry.
    """
    categories = categories or {}
    for variable in variables:
        # ordered values that define the codes for this variable
        if variable in categories:
            levels = list(categories[variable])
        else:
            levels = sorted(dataset[variable].unique())

        # value -> position in the ordered list
        mapping = {level: code for code, level in enumerate(levels)}

        codes = dataset[variable].map(mapping)
        if codes.isna().any():
            # fail with a readable message instead of an opaque NaN-to-int cast error
            unknown = dataset.loc[codes.isna(), variable].unique().tolist()
            raise ValueError(
                f"no code available for values {unknown} of variable '{variable}'"
            )

        # add a new column with the number representation of the variable
        dataset[variable + '_num'] = codes.astype(int)
    return dataset

In [38]:
# Integer-encode cyclist_age_group into a new cyclist_age_group_num column
variables = ['cyclist_age_group']
dev_undersample = discretize_data(dev_undersample, variables)
# The original column is no longer needed once its numeric counterpart exists
dev_undersample.drop(columns=variables, inplace=True)
# race_season is expanded into one indicator column per value ("race_season%<value>")
cat_columns = ['race_season']
dev_undersample_encoded = pd.get_dummies(
    dev_undersample,
    columns=cat_columns,
    prefix_sep='%',
)
dev_undersample_encoded

Unnamed: 0,length,cyclist_bmi,climb_percentage,race_physical_effort,race_prestige,cyclist_age_group_num,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
0,194.0,20.303698,0.012588,0.16015,0.104101,4,False,False,True,False
1,203.0,20.406081,0.008419,0.039092,0.253017,0,False,False,True,False
2,15.1,22.992624,0.014901,0.000764,0.081597,0,False,False,False,True
3,204.0,21.366869,0.00976,0.045767,0.294632,1,False,False,True,False
4,178.0,19.84127,0.013573,0.242296,0.276385,1,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
107111,256.0,20.830818,0.006281,0.046385,0.788245,0,True,False,False,False
107112,143.0,20.830818,0.012441,0.057332,0.074522,1,False,True,False,False
107113,184.0,20.830818,0.010522,0.08028,0.074522,1,False,True,False,False
107114,214.0,20.830818,0.011561,0.119316,0.074522,1,False,True,False,False


In [39]:
# Apply the same two encodings to the held-out test split.
# NOTE(review): called this way, discretize_data derives its codes from the values
# present in the frame it receives, so the test codes match the development ones only
# if both frames contain the same set of age groups — confirm.
test_set = discretize_data(test_set, variables)
test_set.drop(columns=variables, inplace=True)
cat_columns = ['race_season']
test_set_encoded = pd.get_dummies(test_set, columns=cat_columns, prefix_sep='%')
# Align the test features with the development features: an indicator column that is
# absent from the test data is added as all-False, and columns unknown to the
# development set are dropped. Without this, the two sets may not share the same schema.
test_set_encoded = test_set_encoded.reindex(
    columns=dev_undersample_encoded.columns.drop('label', errors='ignore'),
    fill_value=False,
)
test_set_encoded

Unnamed: 0,length,cyclist_bmi,climb_percentage,race_physical_effort,race_prestige,cyclist_age_group_num,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
201,203.9,21.705129,0.009858,0.092363,0.225487,3,False,False,False,True
202,254.1,21.705129,0.013617,0.198139,0.248896,3,False,True,False,False
204,204.2,20.756387,0.008159,0.076668,0.264741,0,False,False,False,True
205,203.9,20.756387,0.009858,0.092363,0.225487,0,False,False,False,True
206,183.7,20.756387,0.008977,0.068267,0.228368,0,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...
589373,154.0,20.305175,0.010779,0.028805,0.078556,4,False,False,False,True
589374,294.0,20.305175,0.007327,0.142718,0.455848,4,False,False,False,True
589375,204.1,20.305175,0.009882,0.092776,0.304715,4,False,False,False,True
589376,273.4,20.305175,0.007992,0.134628,0.399068,4,False,True,False,False


Abbiamo trasformato gli attributi categorici in equivalenti numerici per poter essere processati dai modelli di learning: 
- one-hot encoding: è stato utilizzato quando l'attributo categorico non è di tipo ordinale, in questo caso non siamo interessati a preservare ordine o relazioni di vicinanza tra i valori di questo attributo e pertanto vogliamo massimizzare la dissimilarità tra le classi
- discretizzazione: abbiamo usato questa tecnica solo nel caso dell'attributo cyclist_age_group, in quanto queste classi possiedono relazioni di ordine e vicinanza (attributo ordinale)

In [40]:
# Re-attach the targets to the encoded feature frames (assigning a Series aligns it on the row index)
dev_undersample_encoded['label'] = dev_label
test_set_encoded['label'] = test_label

In [41]:
# Export of the two encoded sets, currently disabled.
# NOTE(review): these paths start at '../data', while the input CSV is read from
# '../../data' — confirm the intended location before re-enabling.
#dev_undersample_encoded.to_csv('../data/ml_datasets/undersampling/dev_set.csv', index=False)
#test_set_encoded.to_csv('../data/ml_datasets/undersampling/test_set_encoded.csv', index=False)