# Feature Engineering

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

In [None]:
# Read the complete dataset and convert the 'date' column from
# year-month-day strings into pandas datetimes in a single chain.
dataset = pd.read_csv('../data/complete_dataset.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d')
)
# Last expression of the cell: shows the first rows as the cell output.
dataset.head()

## Normalizzazione delle posizioni

In [None]:
# Normalize each finishing position by the largest position recorded in the
# same race, so that the largest position of every race maps to 1 and results
# are comparable across races with different numbers of riders.
max_position = dataset.groupby('race_url')['position'].max().reset_index()
max_pos_dict = max_position.set_index('race_url')['position'].to_dict()

# The per-race maximum must be looked up through each row's own 'race_url'.
# Mapping max_position['race_url'] instead produces one value per race,
# indexed 0..n_races-1; the division then aligns on that index, dividing rows
# by an unrelated race's maximum and leaving every other row as NaN.
dataset['position'] = dataset['position'] / dataset['race_url'].map(max_pos_dict)

## Scaling Lunghezza e Climb Total

In [None]:
# Divide both columns by 1000 (presumably metres -> kilometres; confirm the
# source units) so they live on a smaller, comparable scale.
for column in ('length', 'climb_total'):
    dataset[column] = dataset[column] / 1000

## Stagione

In [None]:
def get_season(month):
    """Return the season name associated with a month number.

    Seasons follow calendar quarters rather than astronomical dates:
    1-3 -> 'winter', 4-6 -> 'spring', 7-9 -> 'summer'. Any other value
    (including 10-12) falls through to 'autumn'.
    """
    quarters = (
        ((7, 8, 9), 'summer'),
        ((1, 2, 3), 'winter'),
        ((4, 5, 6), 'spring'),
    )
    for months, season in quarters:
        if month in months:
            return season
    # Everything not listed above is treated as autumn.
    return 'autumn'
    
# Label every row with the season derived from the month of its race date.
race_month = dataset['date'].dt.month
dataset['season'] = race_month.map(get_season)

## BMI

In [None]:
# Body-mass index: weight divided by the square of height. Height is divided
# by 100 first (presumably centimetres -> metres; confirm the source units).
dataset['cyclist_bmi'] = dataset['weight'].div(dataset['height'].div(100).pow(2))

## Fascia Età

In [None]:
# Age brackets. Because right=False the intervals are closed on the left:
# [0, 18), [18, 25), [25, 30), [30, inf). Note that the '>30' label therefore
# also contains riders aged exactly 30.
labels = ['<18', '18-25', '25-30', '>30']
bins = [0, 18, 25, 30, float('inf')]

dataset['cyclist_age_group'] = pd.cut(
    dataset['cyclist_age'],
    bins=bins,
    right=False,
    labels=labels,
)

## Climb Power (Power-Weight Ratio Proxy)

In [None]:
# Proxy for climbing power: (climb_total * profile) over ((delta + 1) * BMI).
# The +1 keeps the delta factor non-zero when delta is 0.
dataset['cyclist_climb_power'] = (
    dataset['climb_total']
    .mul(dataset['profile'])
    .div(dataset['delta'].add(1).mul(dataset['cyclist_bmi']))
)

## Difficoltà Fisica

In [None]:
# Raw effort score: length * climb_total * (profile + 1). The +1 prevents a
# profile of 0 from zeroing the whole product.
raw_effort = dataset['length'] * dataset['climb_total'] * (dataset['profile'] + 1)

# Min-max scale the score so the stored column spans [0, 1].
# NOTE: if every score is identical the range is 0 and the result is NaN.
min_value, max_value = raw_effort.min(), raw_effort.max()
dataset['race_physical_effort'] = (raw_effort - min_value) / (max_value - min_value)

## Prestigio

In [None]:
# Raw prestige score: race points weighted by the quality of the start list.
raw_prestige = dataset['points'] * dataset['startlist_quality']

# Min-max scale the score so the stored column spans [0, 1].
# NOTE: if every score is identical the range is 0 and the result is NaN.
min_value, max_value = raw_prestige.min(), raw_prestige.max()
dataset['race_prestige'] = (raw_prestige - min_value) / (max_value - min_value)

## Esperienza

In [None]:
# Weights of the two terms in the formula given in the comment that follows.
physical_effort_coeff = 0.2
prestige_coeff = 1

# Put rows in chronological order and renumber the index from 0
# (presumably required by a cumulative, date-ordered computation; confirm
# against the code that consumes it).
dataset = dataset.sort_values(by='date', ignore_index=True)

# prestige_coeff * prestige + physical_effort_coeff * physical_effort