# Classification Dataset

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [1]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import ClusterCentroids
from scipy.stats import ks_2samp
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import numpy 
from sklearn.model_selection import train_test_split
import seaborn as sb
from IPython.display import display

RANDOM_STATE = 42  

In [2]:
dataset = pd.read_csv('../../data/complete_dataset_fe.csv', dtype={'cyclist_team': str})
dataset['label'] = (dataset['raw_position'] <= 20).astype(int) #1 if top 20, 0 otherwise
dataset.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589739 entries, 0 to 589738
Data columns (total 39 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   cyclist_url                  589739 non-null  object 
 1   cyclist_name                 589739 non-null  object 
 2   birth_year                   589739 non-null  int64  
 3   weight                       480007 non-null  float64
 4   height                       480007 non-null  float64
 5   nationality                  589739 non-null  object 
 6   race_url                     589739 non-null  object 
 7   race_name                    589739 non-null  object 
 8   points                       589739 non-null  float64
 9   uci_points                   422184 non-null  float64
 10  length                       589739 non-null  float64
 11  climb_total                  475338 non-null  float64
 12  profile                      475338 non-null  float64
 13 

Feature da eliminare:

- cyclist_url (non generalizzabili)
- cyclist_name (non generalizzabili)
- birth_year (gia incluso in age)
- nationality (troppo sbilanciamento nel dataset)
- weight, height (gia incluso in BMI)
- uci_points (troppo pochi e gia inclusi in points)
- race_url (non generalizzabili)
- race_name (non generalizzabili)
- climb_total, profile (gia inclusi in race_physical_effort)
- mostly_tarmac (sbilanciato)
- cyclist_team (non generalizzabili)
- date (non generalizzabili)
- raw_position, position (obv)
- delta (obv) 
- race_year (non generalizzabili)
- race_stage (non generalizzabili)
- cyclist_age_group (??)
- cyclist_climb_power (feature generata con delta, obv)
- points, startlist_quality (gia inclusti in race_prestige)

We must choose between cyclist_age and cyclist_age_group as both these features represent the same information bu give different advantage/disadvantage to our classification process: 
- cyclist_age -> lower dimensionality (no one-hot) but unbalanced representation
- cyclist_age_group -> arbitraty encoding but balanced representation 

we chose to delete cyclist_age as the encoding is arbitraty but it still respects the "order" of the age_groups

In [3]:
columns = ['length', 'race_season', 'cyclist_bmi', 'cyclist_age_group', 'climb_percentage', 'race_physical_effort', 'race_prestige', 'race_year', 
           'previous_mean_position', 'previous_mean_delta', 'previous_mean_cp', 'cyclist_previous_experience', 'label']
dataset = dataset[columns].dropna()

In [4]:
development_dataset = dataset[dataset['race_year'] < 2022]
test_dataset = dataset[dataset['race_year'] >= 2022]

development_set = development_dataset.drop(columns=['race_year'])
developmente_label = development_set.pop('label')
test_set = test_dataset.drop(columns=['race_year'])
test_label = test_set.pop('label')

In [5]:
developmente_label.value_counts(normalize=True)

label
0    0.855363
1    0.144637
Name: proportion, dtype: float64

In [6]:
test_label.value_counts(normalize=True)

label
0    0.851987
1    0.148013
Name: proportion, dtype: float64

In [7]:
def discretize_data(dataset, variables):
    for variable in variables:
        #get the unique variable's values
        var = sorted(dataset[variable].unique())
        
        #generate a mapping from the variable's values to the number representation  
        mapping = dict(zip(var, range(0, len(var) + 1)))

        #add a new colum with the number representation of the variable
        dataset[variable] = dataset[variable].map(mapping).astype(int)
    return dataset

In [8]:
variables = ['cyclist_age_group']
development_set = discretize_data(development_set, variables)
cat_columns = ['race_season']
development_set = pd.get_dummies(development_set, columns = cat_columns, prefix_sep='%')
development_set

Unnamed: 0,length,cyclist_bmi,cyclist_age_group,climb_percentage,race_physical_effort,race_prestige,previous_mean_position,previous_mean_delta,previous_mean_cp,cyclist_previous_experience,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
5,135.0,22.256908,1,0.003244,0.006662,0.193636,0.823060,161.333333,0.000118,0.024713,False,True,False,False
6,100.0,22.256908,1,0.003980,0.004484,0.193636,0.780877,138.285714,0.001042,0.049022,False,True,False,False
7,199.0,22.256908,1,0.009281,0.041416,0.193636,0.738174,121.000000,0.003588,0.071902,False,False,True,False
8,244.0,22.256908,1,0.006988,0.140635,0.193636,0.739753,128.111111,0.002734,0.097660,False,False,True,False
9,244.0,22.256908,1,0.009520,0.063869,0.193636,0.743873,115.300000,0.003841,0.131921,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589733,142.0,20.830818,1,0.006106,0.027745,0.074522,0.590384,510.512500,0.001754,0.146172,False,True,False,False
589734,146.0,20.830818,1,0.015171,0.072880,0.074522,0.594460,514.234568,0.001715,0.147058,False,True,False,False
589735,228.0,20.830818,1,0.010531,0.123371,0.074522,0.598033,520.695122,0.001689,0.148332,False,True,False,False
589736,210.0,20.830818,1,0.000814,0.004045,0.074522,0.591069,514.421687,0.001688,0.150036,False,True,False,False


In [9]:
def ks_test(undersampled_dataset, dataset):    
    columns = undersampled_dataset.columns
    test_results = [
        ks_2samp(
            dataset[column],
            undersampled_dataset[column],
            alternative="two-sided"
        )
        for column in columns if column != 'label'
    ]
    test_data = [(
        test.statistic,
        test.pvalue,
        test.statistic_location
        )
        for test in test_results        
    ]
    test_data = pd.DataFrame(test_data, columns=["KS_test", "p_value", "margin"])

    return test_data.describe()

In [10]:
scv = StratifiedKFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)
ks_tests = []

for train_index, val_index in scv.split(development_set, developmente_label):
    X_train, X_val = development_set.iloc[train_index], development_set.iloc[val_index]
    y_train, y_val = developmente_label.iloc[train_index], developmente_label.iloc[val_index]

    rus = RandomUnderSampler(random_state=RANDOM_STATE)
    X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
    y_resampled = y_resampled.to_numpy().reshape(-1, 1)

    undersampled_train_set = pd.DataFrame(numpy.hstack((X_resampled, y_resampled)), columns=(list(X_train.columns) + ['label']))

    ks_tests.append(ks_test(undersampled_train_set, X_train))
    

In [11]:
for i, test in enumerate(ks_tests, start=1):
    print(f"KS_Test Fold {i}:")
    display(test)
    print("\n")

KS_Test Fold 1:


Unnamed: 0,KS_test,p_value
count,14.0,14.0
mean,0.024873,0.2735942
std,0.03377,0.4002973
min,0.00045,0.0
25%,0.003315,1.200613e-18
50%,0.009396,0.000100343
75%,0.028808,0.4894856
max,0.106386,1.0




KS_Test Fold 2:


Unnamed: 0,KS_test,p_value
count,14.0,14.0
mean,0.024984,0.2923578
std,0.034256,0.403749
min,0.001401,0.0
25%,0.003022,1.2678300000000001e-17
50%,0.009163,0.001563463
75%,0.028864,0.6128003
max,0.107759,0.9996316




KS_Test Fold 3:


Unnamed: 0,KS_test,p_value
count,14.0,14.0
mean,0.025058,0.2546855
std,0.034066,0.3902587
min,0.000349,0.0
25%,0.003736,2.539229e-10
50%,0.010703,1.768629e-06
75%,0.028626,0.3405756
max,0.106312,1.0




KS_Test Fold 4:


Unnamed: 0,KS_test,p_value
count,14.0,14.0
mean,0.025261,0.2117167
std,0.033893,0.3659932
min,0.001126,0.0
25%,0.004118,9.825125e-12
50%,0.010127,1.137677e-05
75%,0.02868,0.2403337
max,0.107721,0.9999979




KS_Test Fold 5:


Unnamed: 0,KS_test,p_value
count,14.0,14.0
mean,0.025015,0.2904432
std,0.033811,0.40915
min,0.001226,0.0
25%,0.002791,5.93811e-17
50%,0.010177,3.894749e-06
75%,0.028927,0.7043987
max,0.106657,0.999979






In [12]:
test_set['label'] = test_label
test_set = discretize_data(test_set, variables)
test_set.drop(columns=variables, axis=1,inplace=True)
cat_columns = ['race_season']
test_set_encoded = pd.get_dummies(test_set, columns = cat_columns, prefix_sep='%')
test_set_encoded

Unnamed: 0,length,cyclist_bmi,climb_percentage,race_physical_effort,race_prestige,previous_mean_position,previous_mean_delta,previous_mean_cp,cyclist_previous_experience,label,race_season%autumn,race_season%spring,race_season%summer,race_season%winter
201,203.9,21.705129,0.009858,0.092363,0.225487,0.729362,470.333333,0.000047,0.009954,0,False,False,False,True
202,254.1,21.705129,0.013617,0.198139,0.248896,0.666310,458.285714,0.000047,0.012297,0,False,True,False,False
204,204.2,20.756387,0.008159,0.076668,0.264741,0.906142,591.500000,0.000036,0.002329,0,False,False,False,True
205,203.9,20.756387,0.009858,0.092363,0.225487,0.876822,643.666667,0.000031,0.003379,0,False,False,False,True
206,183.7,20.756387,0.008977,0.068267,0.228368,0.828177,559.000000,0.000032,0.006341,0,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589373,154.0,20.305175,0.010779,0.028805,0.078556,0.420047,468.336898,0.002282,0.600797,0,False,False,False,True
589374,294.0,20.305175,0.007327,0.142718,0.455848,0.419732,467.264000,0.002279,0.611609,0,False,False,False,True
589375,204.1,20.305175,0.009882,0.092776,0.304715,0.420876,467.380319,0.002274,0.746805,0,False,False,False,True
589376,273.4,20.305175,0.007992,0.134628,0.399068,0.421826,468.395225,0.002271,0.604920,0,False,True,False,False


Abbiamo trasformato gli attributi categorici in equivalenti numerici per poter essere processati dai modelli di learning: 
- one-hot encoding: è stato utilizzato quando l'attributo categorico non è di tipo ordinale, in questo caso non siamo interessati a preservare ordine o relazioni di vicinanza tra i valori di questo attributo e pertanto vogliamo massimizzare la dissimilarità tra le classi
- discretizzazione: abbiamo usato questa tecnica solo nel caso dell'attributo cyclist_age_group, in quanto queste classi possiedono relazioni di ordine e vicinanza (attributo ordinale)

In [13]:
development_set['label'] = developmente_label

In [14]:
development_dataset['label'].value_counts(normalize=True)

label
0    0.855363
1    0.144637
Name: proportion, dtype: float64

In [15]:
development_set.to_csv('../../data/ml_datasets/undersampling/dev_set.csv', index=False)
test_set_encoded.to_csv('../../data/ml_datasets/undersampling/test_set.csv', index=False)