# Classification Dataset

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.over_sampling import ADASYN
from scipy.stats import ks_2samp

import pandas as pd
import numpy 
from sklearn.model_selection import train_test_split
import seaborn as sb

In [2]:
# Seed reused by every stochastic step below (train/validation split, oversamplers).
RANDOM_STATE = 42

# cyclist_team is forced to str so pandas does not have to infer its dtype.
dataset = pd.read_csv(
    '../../data/complete_dataset_no_outliers.csv',
    dtype={'cyclist_team': str},
)

In [3]:
# Binary target: 1 when raw_position <= 20, 0 otherwise.
# NOTE(review): "top 20" holds only if raw_position is 1-based — confirm against the data.
dataset['label'] = dataset['raw_position'].le(20).astype(int)

In [4]:
# Inspect column names, non-null counts and dtypes of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566936 entries, 0 to 566935
Data columns (total 39 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   cyclist_url                  566936 non-null  object 
 1   cyclist_name                 566936 non-null  object 
 2   birth_year                   566936 non-null  int64  
 3   weight                       457204 non-null  float64
 4   height                       457204 non-null  float64
 5   nationality                  566936 non-null  object 
 6   race_url                     566936 non-null  object 
 7   race_name                    566936 non-null  object 
 8   points                       566936 non-null  float64
 9   uci_points                   399381 non-null  float64
 10  length                       566936 non-null  float64
 11  climb_total                  452535 non-null  float64
 12  profile                      452535 non-null  float64
 13 

Feature da eliminare:

- cyclist_url (non generalizzabili)
- cyclist_name (non generalizzabili)
- birth_year (già incluso in age)
- nationality (troppo sbilanciamento nel dataset)
- weight, height (già inclusi in BMI)
- uci_points (troppo pochi e già inclusi in points)
- race_url (non generalizzabili)
- race_name (non generalizzabili)
- climb_total, profile (già inclusi in race_physical_effort)
- mostly_tarmac (sbilanciato)
- cyclist_team (non generalizzabili)
- date (non generalizzabili)
- raw_position, position (obv)
- delta (obv) 
- race_year (non generalizzabili)
- race_stage (non generalizzabili)
- cyclist_age_group (??)
- cyclist_climb_power (feature generata con delta, obv)
- points, startlist_quality (già inclusi in race_prestige)

We must choose between cyclist_age and cyclist_age_group, as both features represent the same information but give different advantages/disadvantages to our classification process: 
- cyclist_age -> lower dimensionality (no one-hot) but unbalanced representation
- cyclist_age_group -> arbitrary encoding but balanced representation 

We chose to delete cyclist_age and keep cyclist_age_group: although its integer encoding is arbitrary, it still respects the "order" of the age groups.

In [5]:
# Features kept for classification, plus race_year (needed for the temporal
# split performed afterwards) and the target column.
columns = [
    'length',
    'race_season',
    'cyclist_bmi',
    'cyclist_age_group',
    'climb_percentage',
    'race_physical_effort',
    'race_prestige',
    'race_year',
    'previous_mean_position',
    'previous_mean_delta',
    'previous_mean_cp',
    'cyclist_previous_experience',
    'num_participants',
    'label',
]
# Keep only the selected columns and discard every row with a missing value in any of them.
dataset = dataset.loc[:, columns].dropna()

In [6]:
# Temporal hold-out: races before 2022 are used for development,
# races from 2022 onwards form the test set.
before_2022 = dataset['race_year'] < 2022
from_2022 = dataset['race_year'] >= 2022

# race_year is only needed to build the split, so it is removed right after.
development_dataset = dataset[before_2022].drop(columns=['race_year'])
test_dataset = dataset[from_2022]
test_set = test_dataset.drop(columns=['race_year'])

# Separate the target from the test features.
test_label = test_set.pop('label')

In [7]:
# pop() removes the target from development_dataset in place, leaving only features.
label = development_dataset.pop('label')

# 80/20 split, stratified on the target so both parts keep the same class proportions.
train_set, val_set, train_label, val_label = train_test_split(
    development_dataset,
    label,
    test_size=0.20,
    stratify=label,
    random_state=RANDOM_STATE,
)

In [8]:
# Relative class frequencies of the training labels.
train_label.value_counts(normalize=True)

label
0    0.86524
1    0.13476
Name: proportion, dtype: float64

In [9]:
# Relative class frequencies of the validation labels.
val_label.value_counts(normalize=True)

label
0    0.865244
1    0.134756
Name: proportion, dtype: float64

In [10]:
# Relative class frequencies of the test labels.
test_label.value_counts(normalize=True)

label
0    0.8672
1    0.1328
Name: proportion, dtype: float64

In [9]:
# Inspect the columns, non-null counts and dtypes of the training features.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 269076 entries, 297965 to 104278
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   length                       269076 non-null  float64
 1   race_season                  269076 non-null  object 
 2   cyclist_bmi                  269076 non-null  float64
 3   cyclist_age_group            269076 non-null  object 
 4   climb_percentage             269076 non-null  float64
 5   race_physical_effort         269076 non-null  float64
 6   race_prestige                269076 non-null  float64
 7   previous_mean_position       269076 non-null  float64
 8   previous_mean_delta          269076 non-null  float64
 9   previous_mean_cp             269076 non-null  float64
 10  cyclist_previous_experience  269076 non-null  float64
 11  num_participants             269076 non-null  int64  
dtypes: float64(9), int64(1), object(2)
memory usage: 26.7+ MB


In [10]:
def discretize_data(dataset, variables, mappings=None):
    """Return a copy of ``dataset`` with the given columns replaced by integer codes.

    By default the codes of each column are derived from the frame itself:
    its distinct values are sorted and numbered 0, 1, 2, ... in that order.
    Because the numbering depends on which values are present, two frames
    encoded independently only agree when they contain the same set of values.
    Pass ``mappings`` to apply one fixed encoding to several frames.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is not modified.
    variables : iterable of str
        Names of the columns to encode.
    mappings : dict, optional
        ``{column name: {value: integer code}}``. Columns listed here use the
        supplied mapping; the others fall back to the sorted-unique numbering.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` whose ``variables`` columns hold integer codes.

    Raises
    ------
    ValueError
        If a column contains a value that its mapping does not cover.
    """
    df = dataset.copy()
    fixed_mappings = {} if mappings is None else mappings

    for variable in variables:
        if variable in fixed_mappings:
            mapping = fixed_mappings[variable]
        else:
            # number the distinct values in sorted order, starting from 0
            distinct_values = sorted(df[variable].unique())
            mapping = {value: code for code, value in enumerate(distinct_values)}

        codes = df[variable].map(mapping)

        # values absent from the mapping become NaN; report them explicitly
        # instead of failing later with an opaque integer-cast error
        not_covered = codes.isna()
        if not_covered.any():
            unseen = sorted(set(df.loc[not_covered, variable].unique()), key=str)
            raise ValueError(f"no code available for values {unseen} of column '{variable}'")

        # overwrite the column with its integer representation
        df[variable] = codes.astype(int)
    return df

In [11]:
# Oversamplers compared on the training split; each entry is paired by
# position with its name in `oversampling_algorithms`.
models = [
    RandomOverSampler(random_state=RANDOM_STATE),
    SMOTE(random_state=RANDOM_STATE, k_neighbors=10),
    SMOTENC(random_state=RANDOM_STATE, k_neighbors=10, categorical_features=['race_season']),
    ADASYN(random_state=RANDOM_STATE, n_neighbors=10)
]
oversampling_algorithms = [
    "random",
    "smote_interpolation",
    "smote_interpolation_w_categorical",
    "adasyn"
]
oversampled_datasets = list()

for algorithm, model in zip(oversampling_algorithms, models):
    # Choose the feature frame each oversampler can work with.
    if algorithm in ("smote_interpolation", "adasyn"):
        # these two receive the numeric columns only
        features = train_set.select_dtypes(include="number")
    elif algorithm == "smote_interpolation_w_categorical":
        # race_season is declared categorical to SMOTENC; cyclist_age_group
        # is turned into integer codes beforehand
        features = discretize_data(train_set, ['cyclist_age_group'])
    else:
        # the random oversampler is given the features unchanged
        features = train_set

    oversampled_data, oversampled_labels = model.fit_resample(features, train_label)

    # Attach target and algorithm name column-wise. Unlike numpy.hstack, which
    # packs everything into a single array (object dtype for mixed frames),
    # assign() keeps the dtype of every feature column.
    oversampled_dataset = oversampled_data.assign(
        label=oversampled_labels.to_numpy(),
        algorithm=algorithm,
    )

    oversampled_datasets.append(oversampled_dataset)

In [12]:
# Compare every oversampled dataset with the original training data using a
# two-sample Kolmogorov-Smirnov test per column.
tests_per_algorithm = list()

# Reference frames are built as copies: train_set itself is left without a
# 'label' column, so re-running the oversampling cell cannot pick up the
# target as a feature.
train_reference = train_set.assign(label=train_label)
# Same frame with cyclist_age_group as integer codes, to match the dataset
# produced by "smote_interpolation_w_categorical".
train_reference_discretized = discretize_data(train_reference, ['cyclist_age_group'])

for algorithm, oversampled_dataset in zip(oversampling_algorithms, oversampled_datasets):
    print(algorithm)
    if algorithm == "smote_interpolation_w_categorical":
        reference = train_reference_discretized
    else:
        reference = train_reference

    # NOTE(review): for "random" this also tests the string-valued columns
    # (race_season, cyclist_age_group); a KS statistic on unordered categories
    # is hard to interpret — consider restricting to numeric columns.
    test_results = [
        ks_2samp(reference[column], oversampled_dataset[column], alternative="two-sided")
        for column in oversampled_dataset.columns
        if column != "algorithm"
    ]

    test_data = [(
        test.statistic,
        test.pvalue,
        test.statistic_location
        )
        for test in test_results
    ]
    test_data = pd.DataFrame(test_data, columns=["KS_test", "p_value", "margin"])
    test_data["algorithm"] = algorithm

    tests_per_algorithm.append(test_data)

# One row per (algorithm, column); summarise the statistics per algorithm.
validation = pd.concat(tests_per_algorithm, axis="rows")
validation.groupby("algorithm").describe()

random
smote_interpolation
smote_interpolation_w_categorical
adasyn


Unnamed: 0_level_0,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,KS_test,p_value,p_value,p_value,p_value,p_value,p_value,p_value,p_value
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
algorithm,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
adasyn,11.0,0.071334,0.101582,0.00398,0.022093,0.037125,0.069958,0.366114,11.0,0.000811,0.002691,0.0,0.0,7.204361e-205,5.293136000000001e-66,0.008924
random,13.0,0.05502,0.098115,0.002658,0.003976,0.020585,0.06157,0.36389,13.0,0.031696,0.065552,0.0,0.0,3.6563999999999995e-63,0.009100322,0.179475
smote_interpolation,11.0,0.075947,0.100417,0.004604,0.024649,0.037732,0.080203,0.36389,11.0,0.000132,0.000437,0.0,0.0,2.901017e-211,2.760838e-61,0.00145
smote_interpolation_w_categorical,13.0,0.072113,0.092874,0.004475,0.022016,0.037582,0.080104,0.36389,13.0,0.000166,0.000599,0.0,0.0,1.38792e-209,3.422946e-72,0.002161


In [15]:
# Index 0 is the dataset produced by the first entry of `models` (RandomOverSampler).
train_oversample = oversampled_datasets[0]
# Relative class frequencies after oversampling.
train_oversample['label'].value_counts(normalize=True)

label
0    0.5
1    0.5
Name: proportion, dtype: float64

In [16]:
# Encode the categorical features of the oversampled training set:
# cyclist_age_group becomes integer codes, race_season becomes one-hot columns,
# and the bookkeeping 'algorithm' column is removed.
cat_columns = ['race_season']
train_oversample_encoded = (
    pd.get_dummies(
        discretize_data(train_oversample, ['cyclist_age_group']),
        columns=cat_columns,
        prefix_sep='%',
    )
    .drop(columns=['algorithm'])
)
train_oversample_encoded

Unnamed: 0,length,cyclist_bmi,cyclist_age_group,climb_percentage,race_physical_effort,race_prestige,previous_mean_position,previous_mean_delta,previous_mean_cp,cyclist_previous_experience,num_participants,label,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
0,184.5,20.305175,2,0.015989,0.122661,0.053187,0.347688,175.977695,0.004965,0.508931,156,0,False,True,False,False
1,189.3,21.329438,4,0.010909,0.088096,0.071833,0.591226,515.807571,0.001473,0.570698,194,0,False,False,True,False
2,199.5,19.655451,2,0.012997,0.116582,0.110119,0.473325,380.292135,0.018633,0.564302,179,0,False,False,True,False
3,28.0,22.229062,3,0.000429,0.000037,0.130478,0.543691,1808.685714,0.000678,0.045057,141,0,False,True,False,False
4,20.1,21.222808,4,0.020199,0.000918,0.047825,0.333908,216.910853,0.021283,0.73723,141,0,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464899,173.7,21.46915,1,0.014157,0.09626,0.138801,0.407372,864.225,0.00328,0.227586,175,1,False,False,True,False
464900,158.5,21.534908,2,0.01371,0.038809,0.267582,0.471775,498.337931,0.002434,0.269113,152,1,False,False,True,False
464901,178.7,20.160701,2,0.012938,0.186222,0.059909,0.295722,170.483283,0.008042,0.44944,151,1,False,False,False,True
464902,202.0,21.55102,2,0.008827,0.040584,0.09232,0.487357,492.120567,0.001941,0.247494,157,1,False,False,False,True


In [17]:
# Encode the validation split with the ordinal codes derived from the TRAINING
# split, so that a given cyclist_age_group value gets the same integer in both.
# Deriving the codes from the validation rows alone would shift them whenever
# a group is absent from this split.
cat_columns = ['race_season']
age_group_codes = {
    group: code
    for code, group in enumerate(sorted(train_set['cyclist_age_group'].unique()))
}

# Fail loudly if the validation split contains a group the training split never saw.
unseen_groups = set(val_set['cyclist_age_group'].unique()) - set(age_group_codes)
if unseen_groups:
    raise ValueError(f"cyclist_age_group values missing from the training split: {sorted(unseen_groups)}")

# val_set is left untouched so that this cell can be re-run safely.
val_set_labeled = val_set.assign(
    label=val_label,
    cyclist_age_group=val_set['cyclist_age_group'].map(age_group_codes).astype(int),
)
val_set_encoded = pd.get_dummies(val_set_labeled, columns=cat_columns, prefix_sep='%')
val_set_encoded

Unnamed: 0,length,cyclist_bmi,cyclist_age_group,climb_percentage,race_physical_effort,race_prestige,previous_mean_position,previous_mean_delta,previous_mean_cp,cyclist_previous_experience,num_participants,label,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
423629,173.0,21.733535,4,0.002329,0.007855,0.109222,0.572124,511.289474,0.001593,0.483521,181,0,False,True,False,False
69029,191.5,21.672110,4,0.008846,0.109663,0.285028,0.444177,364.053333,0.004331,0.574693,188,0,False,False,True,False
356685,203.4,20.987654,2,0.007040,0.065641,0.231249,0.660182,503.238095,0.001238,0.017938,162,0,False,False,False,True
308027,181.0,20.558647,4,0.017829,0.329084,0.055988,0.523625,476.236948,0.033896,0.415779,139,0,False,True,False,False
253992,9.7,20.233553,4,0.015979,0.000507,0.102308,0.558513,565.878277,0.008049,0.810517,174,0,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115858,175.0,21.877551,1,0.007640,0.026364,0.151221,0.531247,363.287449,0.003730,0.312296,148,0,False,False,True,False
234101,183.5,22.160665,4,0.012044,0.137090,0.258459,0.665553,727.026756,0.002956,0.336063,180,0,False,True,False,False
207601,207.5,20.199470,4,0.017634,0.427770,0.244214,0.434589,341.524017,0.004337,0.727773,158,0,False,False,True,False
336570,217.7,21.847009,2,0.021943,0.585927,0.059909,0.553596,625.113636,0.004602,0.107992,157,0,False,False,False,True


In [18]:
# Encode the test split with the ordinal codes derived from the TRAINING split,
# so that a given cyclist_age_group value gets the same integer in both.
# Deriving the codes from the test rows alone would shift them whenever a
# group is absent from this split.
cat_columns = ['race_season']
age_group_codes = {
    group: code
    for code, group in enumerate(sorted(train_set['cyclist_age_group'].unique()))
}

# Fail loudly if the test split contains a group the training split never saw.
unseen_groups = set(test_set['cyclist_age_group'].unique()) - set(age_group_codes)
if unseen_groups:
    raise ValueError(f"cyclist_age_group values missing from the training split: {sorted(unseen_groups)}")

# test_set is left untouched so that this cell can be re-run safely.
test_set_labeled = test_set.assign(
    label=test_label,
    cyclist_age_group=test_set['cyclist_age_group'].map(age_group_codes).astype(int),
)
test_set_encoded = pd.get_dummies(test_set_labeled, columns=cat_columns, prefix_sep='%')
test_set_encoded

Unnamed: 0,length,cyclist_bmi,cyclist_age_group,climb_percentage,race_physical_effort,race_prestige,previous_mean_position,previous_mean_delta,previous_mean_cp,cyclist_previous_experience,num_participants,label,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
196,203.9,21.705129,3,0.009858,0.092363,0.225487,0.729362,470.333333,0.000047,0.009954,111,0,False,False,False,True
197,254.1,21.705129,3,0.013617,0.198139,0.248896,0.666310,458.285714,0.000047,0.012297,126,0,False,True,False,False
199,183.7,20.756387,0,0.008977,0.068267,0.228368,0.828177,559.000000,0.000032,0.006341,108,0,False,False,False,True
431,123.4,18.827160,3,0.018476,0.158518,0.000000,0.454167,211.500000,0.000211,0.000813,116,0,False,False,False,True
432,164.6,18.827160,3,0.012704,0.116349,0.051346,0.424452,156.250000,0.000243,0.001490,172,0,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568453,218.0,20.305175,4,0.011482,0.184458,0.078556,0.416825,464.256065,0.002305,0.598772,170,0,False,False,False,True
568454,165.6,20.305175,4,0.022234,0.343537,0.078556,0.417961,466.758065,0.002295,0.599307,163,0,False,False,False,True
568455,193.0,20.305175,4,0.018705,0.235529,0.078556,0.419099,469.501340,0.002286,0.600163,153,0,False,False,False,True
568456,154.0,20.305175,4,0.010779,0.028805,0.078556,0.420047,468.336898,0.002282,0.600797,147,0,False,False,False,True


Abbiamo trasformato gli attributi categorici in equivalenti numerici per poter essere processati dai modelli di learning: 
- one-hot encoding: è stato utilizzato quando l'attributo categorico non è di tipo ordinale, in questo caso non siamo interessati a preservare ordine o relazioni di vicinanza tra i valori di questo attributo e pertanto vogliamo massimizzare la dissimilarità tra le classi
- discretizzazione: abbiamo usato questa tecnica solo nel caso dell'attributo cyclist_age_group, in quanto queste classi possiedono relazioni di ordine e vicinanza (attributo ordinale)

In [19]:
# Persist the three encoded splits; the index is not written to the files.
encoded_splits = {
    '../../data/ml_datasets/oversampling/train_set.csv': train_oversample_encoded,
    '../../data/ml_datasets/oversampling/val_set.csv': val_set_encoded,
    '../../data/ml_datasets/oversampling/test_set.csv': test_set_encoded,
}
for csv_path, split_frame in encoded_splits.items():
    split_frame.to_csv(csv_path, index=False)