# Classification

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import ClusterCentroids

import pandas as pd
import numpy 
from sklearn.model_selection import train_test_split
import seaborn as sb

In [None]:
# Seed reused by every randomised step below so that runs are repeatable.
RANDOM_STATE = 42

# Load the feature-engineered dataset; cyclist_team is read explicitly as str.
dataset = pd.read_csv(
    '../data/complete_dataset_fe.csv',
    dtype={'cyclist_team': str},
)

In [None]:
# Binary target: 1 when raw_position is at most 20, 0 otherwise.
# NOTE(review): if raw_position is 0-based this selects 21 riders - confirm.
dataset['label'] = dataset['raw_position'].le(20).astype(int)

In [None]:
# Print a summary of the loaded frame (column names, non-null counts, dtypes).
dataset.info()

Feature da eliminare:

- cyclist_url (non generalizzabili)
- cyclist_name (non generalizzabili)
- birth_year (già incluso in age)
- nationality (troppo sbilanciamento nel dataset)
- weight, height (già inclusi in BMI)
- uci_points (troppo pochi e già inclusi in points)
- race_url (non generalizzabili)
- race_name (non generalizzabili)
- climb_total, profile (già inclusi in race_physical_effort)
- mostly_tarmac (sbilanciato)
- cyclist_team (non generalizzabili)
- date (non generalizzabili)
- raw_position, position (obv)
- delta (obv)
- race_year (non generalizzabili)
- race_stage (non generalizzabili)
- cyclist_age_group (??)
- cyclist_climb_power (feature generata con delta, obv)
- points, startlist_quality (già inclusi in race_prestige)

We must choose between cyclist_age and cyclist_age_group, as both features represent the same information but give different advantages and disadvantages to our classification process:
- cyclist_age -> lower dimensionality (no one-hot) but unbalanced representation
- cyclist_age_group -> higher dimensionality (one-hot) but balanced representation 

We chose to delete cyclist_age, as the dimensionality issue can be mitigated by the amount of data available.

In [None]:
# Features kept for classification, plus the target. race_year is retained
# here only because the development/test split below is based on it.
columns = [
    'length',
    'race_season',
    'cyclist_bmi',
    'cyclist_age_group',
    'climb_percentage',
    'race_physical_effort',
    'race_prestige',
    'race_year',
    'label',
]

# Restrict to the chosen columns and discard every row with a missing value.
dataset = dataset.loc[:, columns].dropna()

In [None]:
# Temporal hold-out: races before 2022 form the development set, races from
# 2022 onwards form the test set. race_year only drives the split, so it is
# removed from both resulting frames.
development_dataset = dataset.loc[dataset['race_year'] < 2022].drop(columns=['race_year'])
test_dataset = dataset.loc[dataset['race_year'] >= 2022].drop(columns=['race_year'])

In [None]:
# Detach the target; after this development_dataset holds features only.
label = development_dataset.pop('label')

# Stratified 80/20 split of the development data into training and validation.
train_set, val_set, train_label, val_label = train_test_split(
    development_dataset,
    label,
    test_size=0.20,
    stratify=label,
    random_state=RANDOM_STATE,
)

In [None]:
# Relative frequency of each class among the training labels.
train_label.value_counts(normalize=True)

In [None]:
# Undersampling strategies to compare. `models` and `undersampling_algorithms`
# are parallel lists: the i-th name labels the i-th sampler.
models = [
    RandomUnderSampler(random_state=RANDOM_STATE),
    CondensedNearestNeighbour(random_state=RANDOM_STATE),
    ClusterCentroids(random_state=RANDOM_STATE)
]
undersampling_algorithms = [
    "random",
    "condensed_rule",
    "centroids"
]

# One frame per strategy: resampled features, then "label", then "algorithm".
undersampled_datasets = list()

# Loop-invariant: the numeric-only view of the training features.
numeric_train_set = train_set.select_dtypes(include="number")

for algorithm, model in zip(undersampling_algorithms, models):
    print(algorithm)

    # The condensed-rule and centroid samplers are fed numeric columns only
    # (presumably because they rely on distances); random undersampling can
    # work on every column.
    if algorithm in ("condensed_rule", "centroids"):
        features = numeric_train_set
    else:
        features = train_set

    undersampled_data, undersampled_labels = model.fit_resample(features, train_label)

    # Build the result from the resampled features and attach the label as a
    # column. The previous numpy.hstack approach failed when stacking the 2-D
    # features with the 1-D labels, used a column list without "label", and
    # collapsed mixed-type columns to object dtype.
    undersampled_dataset = pd.DataFrame(
        undersampled_data, columns=features.columns
    ).reset_index(drop=True)
    # Assign by position (not by index) so labels stay aligned with the rows.
    undersampled_dataset["label"] = numpy.asarray(undersampled_labels).ravel()
    undersampled_dataset["algorithm"] = algorithm

    undersampled_datasets.append(undersampled_dataset)