# Classification Dataset

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [69]:
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.over_sampling import ADASYN
from scipy.stats import ks_2samp

import pandas as pd
import numpy 
from sklearn.model_selection import train_test_split
import seaborn as sb

In [70]:
dataset = pd.read_csv('../../data/complete_dataset_fe.csv', dtype={'cyclist_team': str})
RANDOM_STATE = 42   

In [71]:
dataset['label'] = (dataset['raw_position'] <= 20).astype(int) #1 if top 20, 0 otherwise 

In [72]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589739 entries, 0 to 589738
Data columns (total 38 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   cyclist_url                  589739 non-null  object 
 1   cyclist_name                 589739 non-null  object 
 2   birth_year                   589739 non-null  int64  
 3   weight                       480007 non-null  float64
 4   height                       480007 non-null  float64
 5   nationality                  589739 non-null  object 
 6   race_url                     589739 non-null  object 
 7   race_name                    589739 non-null  object 
 8   points                       589739 non-null  float64
 9   uci_points                   422184 non-null  float64
 10  length                       589739 non-null  float64
 11  climb_total                  475338 non-null  float64
 12  profile                      475338 non-null  float64
 13 

Feature da eliminare:

- cyclist_url (non generalizzabili)
- cyclist_name (non generalizzabili)
- birth_year (gia incluso in age)
- nationality (troppo sbilanciamento nel dataset)
- weight, height (gia incluso in BMI)
- uci_points (troppo pochi e gia inclusi in points)
- race_url (non generalizzabili)
- race_name (non generalizzabili)
- climb_total, profile (gia inclusi in race_physical_effort)
- mostly_tarmac (sbilanciato)
- cyclist_team (non generalizzabili)
- date (non generalizzabili)
- raw_position, position (obv)
- delta (obv) 
- race_year (non generalizzabili)
- race_stage (non generalizzabili)
- cyclist_age_group (??)
- cyclist_climb_power (feature generata con delta, obv)
- points, startlist_quality (gia inclusti in race_prestige)

We must choose between cyclist_age and cyclist_age_group as both these features represent the same information bu give different advantage/disadvantage to our classification process: 
- cyclist_age -> lower dimensionality (no one-hot) but unbalanced representation
- cyclist_age_group -> arbitraty encoding but balanced representation 

we chose to delete cyclist_age as the encoding is arbitraty but it still respects the "order" of the age_groups

In [73]:
columns = ['length', 'race_season', 'cyclist_bmi', 'cyclist_age_group', 'climb_percentage', 'race_physical_effort', 'race_prestige', 'race_year', 'label']
dataset = dataset[columns].dropna()

In [74]:
development_dataset = dataset[dataset['race_year'] < 2022]
test_dataset = dataset[dataset['race_year'] >= 2022]

development_dataset = development_dataset.drop(columns=['race_year'])
test_set = test_dataset.drop(columns=['race_year'])
test_label = test_set.pop('label')

In [75]:
label = development_dataset.pop('label')
train_set, val_set, train_label, val_label = train_test_split(development_dataset, label, stratify=label, test_size=0.20, random_state=RANDOM_STATE)

In [76]:
development_set = development_dataset.copy()
developmente_label = label

In [77]:
developmente_label.value_counts(normalize=True)

label
0    0.854316
1    0.145684
Name: proportion, dtype: float64

In [78]:
development_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 367631 entries, 4 to 589737
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   length                367631 non-null  float64
 1   race_season           367631 non-null  object 
 2   cyclist_bmi           367631 non-null  float64
 3   cyclist_age_group     367631 non-null  object 
 4   climb_percentage      367631 non-null  float64
 5   race_physical_effort  367631 non-null  float64
 6   race_prestige         367631 non-null  float64
dtypes: float64(5), object(2)
memory usage: 22.4+ MB


In [79]:
def discretize_data(dataset, variables):
    df = dataset.copy()
    for variable in variables:
        #get the unique variable's values
        var = sorted(df[variable].unique())
        
        #generate a mapping from the variable's values to the number representation  
        mapping = dict(zip(var, range(0, len(var) + 1)))

        #add a new colum with the number representation of the variable
        df[variable] = df[variable].map(mapping).astype(int)
    return df

In [80]:
categorical_features = dataset.select_dtypes(exclude="number").columns.tolist()
models = [
    RandomOverSampler(random_state=RANDOM_STATE),
    SMOTE(random_state=RANDOM_STATE, k_neighbors=10),
    SMOTENC(random_state=RANDOM_STATE, k_neighbors=10, categorical_features=['race_season']),
    ADASYN(random_state=RANDOM_STATE, n_neighbors=10)
]
oversampling_algorithms = [
    "random",
    "smote_interpolation",
    "smote_interpolation_w_categorical",
    "adasyn"
]
oversampled_datasets = list()

for algorithm, model in zip(oversampling_algorithms, models):
    if algorithm in ("smote_interpolation", "adasyn"):
        oversampled_data, oversampled_labels = model.fit_resample(development_set.select_dtypes(include="number"), developmente_label)
        oversampled_labels = oversampled_labels.to_numpy().reshape(-1, 1)
        oversampled_dataset = pd.DataFrame(numpy.hstack((oversampled_data, oversampled_labels)), columns=list(development_set.select_dtypes(include="number").columns) + ['label'])

    elif algorithm == "smote_interpolation_w_categorical":
        df = discretize_data(development_set, ['cyclist_age_group'])
        oversampled_data, oversampled_labels = model.fit_resample(df, developmente_label)
        oversampled_labels = oversampled_labels.to_numpy().reshape(-1, 1)
        oversampled_dataset = pd.DataFrame(numpy.hstack((oversampled_data, oversampled_labels)), columns=(list(df.columns) + ['label']))
    
    else:
        oversampled_data, oversampled_labels = model.fit_resample(development_set, developmente_label)
        oversampled_labels = oversampled_labels.to_numpy().reshape(-1, 1)
        oversampled_dataset = pd.DataFrame(numpy.hstack((oversampled_data, oversampled_labels)), columns=(list(development_set.columns) + ['label']))
    
    oversampled_dataset["algorithm"] = algorithm

    oversampled_datasets.append(oversampled_dataset)

In [82]:
tests_per_algorithm = list()
development_dataset['label'] = label
for algorithm, oversampled_dataset in zip(oversampling_algorithms, oversampled_datasets):
    print(algorithm)
    columns = oversampled_dataset.columns
    test_results = list()
    for column in columns:
        if column != "algorithm":
            if not (column == 'cyclist_age_group' and algorithm == "smote_interpolation_w_categorical"):
                test_results.append(ks_2samp(development_dataset[column], oversampled_dataset[column], alternative="two-sided"))
            else:
                df = discretize_data(development_dataset, ['cyclist_age_group'])
                test_results.append(ks_2samp(df[column], oversampled_dataset[column], alternative="two-sided"))

    test_data = [(
        test.statistic,
        test.pvalue,
        test.statistic_location
        )
        for test in test_results        
    ]
    test_data = pd.DataFrame(test_data, columns=["KS_test", "p_value", "margin"])
    test_data["algorithm"] = algorithm

    tests_per_algorithm.append(test_data)

validation = pd.concat(tests_per_algorithm, axis="rows")
validation.groupby("algorithm").describe()

random
smote_interpolation
smote_interpolation_w_categorical
adasyn


Unnamed: 0_level_0,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,p_value,p_value,p_value,p_value,p_value,p_value,p_value,p_value
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
algorithm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
adasyn,6.0,0.069661,0.138384,0.00791,0.009586,0.015453,0.018514,0.351986,6.0,9.088186e-14,2.105817e-13,0.0,1.204922e-62,7.466517e-37,1.878594e-14,5.202432e-13
random,8.0,0.051825,0.122321,0.002998,0.00405,0.010453,0.01327,0.354316,8.0,0.005663544,0.01131402,0.0,3.0253699999999998e-30,3.1095469999999995e-19,0.003779986,0.03091219
smote_interpolation,6.0,0.070539,0.139053,0.009836,0.012258,0.01448,0.017577,0.354316,6.0,1.080129e-20,2.645764e-20,0.0,9.352613999999999e-56,4.306074e-32,1.875026e-30,6.480773e-20
smote_interpolation_w_categorical,8.0,0.060544,0.119019,0.00904,0.010767,0.02013,0.027268,0.354316,8.0,8.606971000000001e-18,2.434303e-17,0.0,8.102964e-124,1.565896e-79,7.181693e-22,6.885290000000001e-17


In [60]:
dev_oversample = oversampled_datasets[0] #TODO: choose the best one
dev_oversample['label'].value_counts(normalize=True)

label
0    0.5
1    0.5
Name: proportion, dtype: float64

In [61]:
dev_label = dev_oversample.pop('label')

In [64]:
#drop the categorical variables since we don't need them anymore 
cat_columns = ['race_season']
dev_oversample_encoded = pd.get_dummies(dev_oversample, columns = cat_columns, prefix_sep='%').drop(columns=['algorithm'])
dev_oversample_encoded

Unnamed: 0,length,cyclist_bmi,cyclist_age_group,climb_percentage,race_physical_effort,race_prestige,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
0,5.2,22.256908,27-29,0.005192,0.000015,0.193636,False,True,False,False
1,135.0,22.256908,27-29,0.003244,0.006662,0.193636,False,True,False,False
2,100.0,22.256908,27-29,0.00398,0.004484,0.193636,False,True,False,False
3,199.0,22.256908,27-29,0.009281,0.041416,0.193636,False,False,True,False
4,244.0,22.256908,27-29,0.006988,0.140635,0.193636,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
628141,155.7,23.054562,29-32,0.008985,0.049089,0.127661,False,False,True,False
628142,167.2,21.887076,<25,0.016184,0.152948,0.058228,False,True,False,False
628143,173.0,19.834711,25-27,0.019682,0.331889,0.122667,False,True,False,False
628144,177.0,21.366869,25-27,0.009147,0.03229,0.09152,False,False,False,True


In [63]:
test_set = discretize_data(test_set, ['cyclist_age_group'])
test_set.drop(columns=['cyclist_age_group'], axis=1,inplace=True)
cat_columns = ['race_season']
test_set_encoded = pd.get_dummies(test_set, columns = cat_columns, prefix_sep='%')
test_set_encoded

Unnamed: 0,length,cyclist_bmi,climb_percentage,race_physical_effort,race_prestige,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
201,203.9,21.705129,0.009858,0.092363,0.225487,False,False,False,True
202,254.1,21.705129,0.013617,0.198139,0.248896,False,True,False,False
204,204.2,20.756387,0.008159,0.076668,0.264741,False,False,False,True
205,203.9,20.756387,0.009858,0.092363,0.225487,False,False,False,True
206,183.7,20.756387,0.008977,0.068267,0.228368,False,False,False,True
...,...,...,...,...,...,...,...,...,...
589373,154.0,20.305175,0.010779,0.028805,0.078556,False,False,False,True
589374,294.0,20.305175,0.007327,0.142718,0.455848,False,False,False,True
589375,204.1,20.305175,0.009882,0.092776,0.304715,False,False,False,True
589376,273.4,20.305175,0.007992,0.134628,0.399068,False,True,False,False


Abbiamo trasformato gli attributi categorici in equivalenti numerici per poter essere processati dai modelli di learning: 
- one-hot encoding: è stato utilizzato quando l'attributo categorico non è di tipo ordinale, in questo caso non siamo interessati a preservare ordine o relazioni di vicinanza tra i valori di questo attributo e pertanto vogliamo massimizzare la dissimilarità tra le classi
- discretizzazione: abbiamo usato questa tecnica solo nel caso dell'attributo cyclist_age_group, in quanto queste classi possiedono relazioni di ordine e vicinanza (attributo ordinale)

In [None]:
dev_oversample_encoded['label'] = dev_label
test_set_encoded['label'] = test_label

In [None]:
#dev_oversample_encoded.to_csv('../data/ml_datasets/oversampling/dev_set.csv', index=False)
#test_set_encoded.to_csv('../data/ml_datasets/oversampling/test_set.csv', index=False)