# Feature Engineering

### Data Mining Project 2024/25

Authors: Nicola Emmolo, Simone Marzeddu, Jacopo Raffi

In [18]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import pandas as pd

In [20]:
# Read the complete dataset and parse the 'date' column (YYYY-MM-DD strings)
# into datetimes in a single chained expression.
dataset = pd.read_csv('../data/complete_dataset.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d')
)
dataset.head()

Unnamed: 0,race_url,race_name,points,uci_points,length,climb_total,profile,startlist_quality,date,position,cyclist_url,cyclist_age,mostly_tarmac,cyclist_team,delta,cyclist_name,birth_year,weight,height,nationality
0,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,1978-07-05,0,sean-kelly,22.0,True,vini-ricordi-pinarello-sidermec-1986,0.0,Sean Kelly,1956.0,77.0,180.0,Ireland
1,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,1978-07-05,1,gerrie-knetemann,27.0,True,norway-1987,0.0,Gerrie Knetemann,1951.0,,,Netherlands
2,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,1978-07-05,2,rene-bittinger,24.0,True,,0.0,René Bittinger,1954.0,69.0,174.0,France
3,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,1978-07-05,3,joseph-bruyere,30.0,True,navigare-blue-storm-1993,0.0,Joseph Bruyère,1948.0,,,Belgium
4,tour-de-france/1978/stage-6,Tour de France,100.0,,162000.0,1101.0,1.0,1241,1978-07-05,4,sven-ake-nilsson,27.0,True,spain-1991,0.0,Sven-Åke Nilsson,1951.0,63.0,172.0,Sweden


## Position Normalization

In [4]:
# Normalise each finishing position by the largest position recorded in the
# same race, so the result is comparable across races of different sizes
# (0 = winner, 1 = last classified rider).
max_position = dataset.groupby('race_url')['position'].max().reset_index()
max_pos_dict = max_position.set_index('race_url')['position'].to_dict()

# The divisor must be looked up per dataset row (dataset['race_url']).
# Mapping max_position['race_url'] instead yields one value per *race*, indexed
# 0..n_races-1; pandas then aligns the division on the row label, pairing rows
# with unrelated races and turning every row with label >= n_races into NaN.
# NOTE: a race whose maximum position is 0 (single classified rider) gives 0/0 = NaN.
dataset['position'] = dataset['position'] / dataset['race_url'].map(max_pos_dict)

## Scaling Length and Climb Total

In [5]:
# Divide both distance-like columns by 1000 in one step.
# NOTE: this overwrites the columns in place, so running the cell twice
# rescales them twice.
dataset[['length', 'climb_total']] = dataset[['length', 'climb_total']].div(1000)

## Season

In [6]:
def get_season(month):
    """Return a coarse season label for a month number.

    Months are grouped by calendar quarter: 1-3 -> 'winter', 4-6 -> 'spring',
    7-9 -> 'summer'. Any other value (10-12, but also NaN or out-of-range
    input) falls through to 'autumn'.
    """
    quarters = (
        ((1, 2, 3), 'winter'),
        ((4, 5, 6), 'spring'),
        ((7, 8, 9), 'summer'),
    )
    for months, label in quarters:
        if month in months:
            return label
    return 'autumn'
    
# Label every row with the season of its race date (element-wise lookup on the month).
dataset['season'] = dataset['date'].dt.month.map(get_season)

## BMI

In [7]:
# BMI = weight / height^2, with height divided by 100 before squaring
# (assumes weight in kg and height in cm — TODO confirm units).
dataset['cyclist_bmi'] = dataset['weight'].div(dataset['height'].div(100).pow(2))

## Age Group

In [8]:
# Bucket cyclist age into four brackets. With right=False the intervals are
# left-closed / right-open: [0, 18), [18, 25), [25, 30), [30, inf) — so an age
# of exactly 25 lands in '25-30' and exactly 30 lands in '>30'.
labels = ['<18', '18-25', '25-30', '>30']
bins = [0, 18, 25, 30, float('inf')]

dataset['cyclist_age_group'] = pd.cut(
    x=dataset['cyclist_age'],
    bins=bins,
    labels=labels,
    right=False,
)

## Climb Power (Power-Weight Ratio Proxy)

In [9]:
# Climb-power proxy: (climb_total * profile) divided by ((delta + 1) * BMI).
# The +1 keeps the denominator non-zero when delta is 0.
dataset['cyclist_climb_power'] = (
    dataset['climb_total']
    .mul(dataset['profile'])
    .div(dataset['delta'].add(1).mul(dataset['cyclist_bmi']))
)

## Physical Difficulty

In [10]:
# Raw effort score: length * climb_total * (profile + 1).
dataset['race_physical_effort'] = (
    dataset['length']
    .mul(dataset['climb_total'])
    .mul(dataset['profile'].add(1))
)

# Min-max scale the score to the [0, 1] range over the whole dataset.
min_value = dataset['race_physical_effort'].min()
max_value = dataset['race_physical_effort'].max()
dataset['race_physical_effort'] = (
    dataset['race_physical_effort'].sub(min_value).div(max_value - min_value)
)

## Prestige

In [11]:
# Raw prestige score: race points multiplied by the start-list quality.
dataset['race_prestige'] = dataset['points'].mul(dataset['startlist_quality'])

# Min-max scale the score to the [0, 1] range over the whole dataset.
min_value = dataset['race_prestige'].min()
max_value = dataset['race_prestige'].max()
dataset['race_prestige'] = (
    dataset['race_prestige'].sub(min_value).div(max_value - min_value)
)

## Experience

In [None]:
# Chronological order is required because the cumulative sum below is
# order-dependent. A stable sort keeps the file order for rows that share a
# date instead of leaving ties in an implementation-defined order.
dataset = dataset.sort_values(by=['date'], kind='stable')

# Weights of the two components of the per-race experience score.
prestige_coeff = 1
physical_effort_coeff = 0.2

# Step 1: per-race experience score (weighted sum of prestige and physical effort).
dataset['previous_experience'] = (
    dataset['race_prestige'] * prestige_coeff
    + dataset['race_physical_effort'] * physical_effort_coeff
)

# Step 2: for every cyclist, accumulate the scores of the races that came
# *before* the current row. shift() excludes the current race; fill_value=0
# makes a cyclist's first race start from 0 experience instead of NaN.
#TODO: when the dataset is cleaned, fill the remaining NaN values (from missing
# race_prestige / race_physical_effort) with 0
dataset['previous_experience'] = dataset.groupby('cyclist_url')['previous_experience'].transform(
    lambda scores: scores.shift(fill_value=0).cumsum()
)