In [1]:
%load_ext autoreload
%autoreload 2

Using simpler models with the same results. Can this be considered a victory?

In [2]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('..')))
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, ParameterSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import classification_report

from utility.classification_utility import *

2025-01-02 15:18:31.414611: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735827511.501181    6189 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735827511.530545    6189 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 15:18:31.704175: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Let's start by building our beautiful dataset

In [3]:
# Paths (relative to this notebook) of the cleaned cyclists and races CSV files
cyc = '../dataset/cyclists_cleaned.csv'
races = '../dataset/races_cleaned.csv'
# Build the working dataframe through the project helper.
# NOTE(review): make_stage_type=True presumably adds the 'stage_type' column
# that a later cell reads to derive 'is_ITT' — confirm in utility.classification_utility.
df = make_dataset_for_classification(races, cyc, make_stage_type=True)

100.00%  


In [4]:
# dumb imputations: fill gaps with simple column-level statistics.
# Order matters: 'bmi' and 'steepness' are derived from columns imputed just before them.
# Missing height / weight -> column mean
df['height'] = df['height'].fillna(df['height'].mean())
df['weight'] = df['weight'].fillna(df['weight'].mean())
# Recompute BMI from the imputed columns
# (height is divided by 100 before squaring — assumes height in cm and weight in kg, TODO confirm)
df['bmi'] = df['weight']/np.square(df['height']/100)
# Missing age at race time -> column mean
df['cyclist_age_rac'] = df['cyclist_age_rac'].fillna(df['cyclist_age_rac'].mean())
# Missing climb_total -> 5% of the stage length
df['climb_total'] = df['climb_total'].fillna(df['length']*0.05) # could be differentiated by lengths
# NOTE(review): this is length divided by climb, so larger values mean *less* climbing per unit length;
# rows with climb_total == 0 would produce inf here — verify none exist.
df['steepness'] = df['length'] / df['climb_total']

In [5]:
# 'profile' has gaps: fit a random forest on the rows where it is known so the
# missing values can be predicted afterwards.

# Binary flag: 1 when stage_type is 'ITT', 0 otherwise
df['is_ITT'] = (df['stage_type'] == 'ITT').astype(int)

profile_features = ['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']

# Keep only rows where every feature and the label are present
df_filtered = df[profile_features + ['profile']].dropna()
X = df_filtered[profile_features]
y = df_filtered['profile']

# Hold out 20% of the labelled rows for evaluation.
# NOTE(review): if several rows share the same stage-level feature values, a
# purely random split can put near-identical rows in both train and test and
# inflate the score — verify, and consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Fit the classifier with a fixed seed
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [6]:
# Evaluate the profile classifier on the held-out split
y_pred = clf.predict(X_test)
accuracy = np.mean(y_pred == y_test)
importances = clf.feature_importances_
print(f"Accuracy: {accuracy}")
print(f"Feature importance: {importances}")

Accuracy: 0.9991312777770783
Feature importance: [0.21644605 0.27903885 0.30263699 0.20187812 0.        ]


Well... this might mean that the values inferred for the climb total and the length were quite accurate

In [7]:
# Predict missing profiles with the classifier trained above
missing_profiles = df[df['profile'].isna()]
# Guard against an empty selection: scikit-learn estimators reject inputs with
# zero samples, so without this check the cell fails when it is re-run after
# the profiles have already been filled (or when nothing is missing).
if not missing_profiles.empty:
    df.loc[missing_profiles.index, 'profile'] = clf.predict(
        missing_profiles[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']]
    )

In [8]:
# Columns kept for the main classification task, grouped by origin.
# Aggregates accumulated over time
HISTORY_COLS = [
    'total_points',
    'avg_points_per_race',
    'average_position',
    'avg_speed_cyclist',
    'mean_stamina_index',
    'race_count',
]
# Race-related columns
RACE_COLS = [
    'length',
    'climb_total',
    'profile',
    'startlist_quality',
    'cyclist_age_rac',
    'steepness',
    'is_tarmac',
    'stage_type',
]
# Cyclist-related columns ('height' and 'weight' are left out; 'bmi' is used instead)
CYCLIST_COLS = [
    'bmi',
    'home_game',
]
# 'date' is needed for the split, 'target' is the label
SPLIT_AND_TARGET_COLS = [
    'date',
    'target',
]

TO_USE_COLS = HISTORY_COLS + RACE_COLS + CYCLIST_COLS + SPLIT_AND_TARGET_COLS

# Restrict the dataframe to the selected columns (same order as TO_USE_COLS)
df_to_use = df[TO_USE_COLS]

In [9]:
# Train / validation / test partitions from the project helper; 'date' is only
# needed for the split itself, so it is dropped from each partition right away.
df_tr, df_vl, df_ts = (
    part.drop(columns=['date']) for part in get_data_split(df_to_use)
)

# Separate features from the target in each partition (project helper)
(X_tr, y_tr), (X_vl, y_vl), (X_ts, y_ts) = (
    split_features_target(part) for part in (df_tr, df_vl, df_ts)
)

Normalize the features before predicting

In [10]:
# Standardise features: statistics are learned on the training split only and
# then applied unchanged to the validation and test splits.
# Note: this overwrites X_tr / X_vl / X_ts, so re-running the cell rescales already-scaled data.
scal = StandardScaler()
scal.fit(X_tr)
X_tr, X_vl, X_ts = (scal.transform(split) for split in (X_tr, X_vl, X_ts))

# NN test

Let's build this model

In [None]:
# Baseline feed-forward binary classifier: two hidden ReLU layers (64 and 32
# units) followed by a single sigmoid output unit.
n_features = X_tr.shape[1]
layer_stack = [
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.models.Sequential(layer_stack)

# Adam with default settings; accuracy and recall are tracked during training
optimizer = tf.keras.optimizers.Adam()
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', 'recall'],
)

Now fit it

In [None]:
# Early stopping on validation recall, restoring the best weights seen.
# Defined in this cell because the fit below uses it and, in the original cell
# order, `early_stopping` is only created much further down the notebook — on a
# fresh kernel (Restart & Run All) this cell would otherwise raise NameError.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_recall',
    patience=4,
    mode='max',
    restore_best_weights=True
)

# Train the baseline model, monitoring the validation split after every epoch
history = model.fit(
    X_tr, y_tr, 
    validation_data=(X_vl, y_vl), 
    epochs=10, 
    batch_size=64,
    verbose=2,
    callbacks=[early_stopping]
)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 threshold and
# report per-class metrics on the validation split.
val_scores = model.predict(X_vl)
val_pred = (val_scores > 0.5).astype(int)
val_report = classification_report(y_vl, val_pred)
print(val_report)

# Grid Search

In [11]:
def build_model(num_layers=2, num_units=64, optimizer='adam', activation='relu', dropout=0.2, input_dim=None):
    """Build and compile a fully connected binary classifier.

    The network is a stack of ``Dense(num_units) -> Dropout(dropout)`` pairs
    followed by a single sigmoid output unit.

    Args:
        num_layers: Number of hidden Dense layers. At least one hidden layer
            is always created, even for values below 1.
        num_units: Number of units in every hidden layer.
        optimizer: Optimizer name: 'adam', 'sgd' or 'lion'. The optimizer is
            created with its default settings.
        activation: Activation function of the hidden layers.
        dropout: Rate of the Dropout layer placed after every hidden layer.
        input_dim: Number of input features. When None (default), it is read
            from the notebook-level training matrix ``X_tr``.

    Returns:
        A compiled ``tf.keras`` Sequential model using binary cross-entropy
        loss and tracking the 'accuracy' and 'recall' metrics.

    Raises:
        KeyError: If ``optimizer`` is not one of the supported names.
    """
    # Map names to optimizer classes and instantiate only the requested one;
    # resolving it first also makes an unknown name fail before any layer is built.
    optimizer_classes = {
        'adam': tf.keras.optimizers.Adam,
        'sgd': tf.keras.optimizers.SGD,
        'lion': tf.keras.optimizers.Lion,
    }
    optimizer_cls = optimizer_classes[optimizer]

    if input_dim is None:
        input_dim = X_tr.shape[1]

    model = tf.keras.models.Sequential()
    # Declare the input with an explicit Input object instead of passing
    # `input_shape` to the first Dense layer, which Keras warns against.
    model.add(tf.keras.layers.Input(shape=(input_dim,)))

    for _ in range(max(num_layers, 1)):
        model.add(tf.keras.layers.Dense(num_units, activation=activation))
        model.add(tf.keras.layers.Dropout(dropout))

    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer=optimizer_cls(), loss='binary_crossentropy', metrics=['accuracy', 'recall'])
    return model

# Search space for the first random search (architecture, optimizer, batch size).
# Dropout is not part of this grid, so build_model's default rate applies.
param_grid = dict(
    num_layers=[1, 2],
    num_units=[32, 64],
    optimizer=['adam', 'sgd', 'lion'],
    activation=['relu', 'tanh'],
    batch_size=[16, 32, 64],
    epochs=[5],
)

This proto grid search has already been run over various parameters, without verbosity. I won't be uploading the long output of the last run to git.

In [None]:
# Draw 5 random configurations from the grid (fixed seed) and train one model per configuration
param_list = list(ParameterSampler(param_grid, n_iter=5, random_state=42))

for params in param_list:
    print(f"Training model with parameters: {params}")
    model = build_model(
        num_layers=params['num_layers'],
        num_units=params['num_units'],
        optimizer=params['optimizer'],
        activation=params['activation'],
    )
    history = model.fit(
        X_tr, y_tr,
        validation_data=(X_vl, y_vl),
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        verbose=2,
    )
    # Threshold the sigmoid outputs at 0.5 to get hard validation labels
    val_pred = (model.predict(X_vl) > 0.5).astype(int)
    report = classification_report(y_vl, val_pred)
    # Append the parameters and the validation report to the running log file
    with open('model_eval.txt', 'a') as log_file:
        log_file.write(f"Parameters: {params}\n")
        log_file.write(report)
        log_file.write('\n\n' * 2)


Nothing much changes between these different hyperparameters. Maybe the answer lies elsewhere, like in regularization: oversampling and dropout?

In [12]:
# Second search space: optimizer, activation and batch size are fixed, while
# dropout rate, depth and width vary.
param_grid = dict(
    dropout=[0.2, 0.5],
    num_layers=[1, 2],
    num_units=[32, 64],
    optimizer=['adam'],
    activation=['relu'],
    batch_size=[64],
    epochs=[5],
)

In [None]:
# Re-sample from the grid defined in the previous cell. Without this, the loop
# would reuse the `param_list` drawn from the earlier grid, whose entries have
# no 'dropout' key, and `params['dropout']` below would raise KeyError.
# The grid has 2 * 2 * 2 = 8 combinations, so n_iter=8 covers all of them.
param_list = list(ParameterSampler(param_grid, n_iter=8, random_state=42))

for params in param_list:
    print(f"Training model with parameters: {params} (no oversampling)")
    model = build_model(params['num_layers'], params['num_units'], params['optimizer'], params['activation'], params['dropout'])
    # Train on the original (not oversampled) training split
    history = model.fit(
        X_tr, y_tr, 
        validation_data=(X_vl, y_vl), 
        epochs=params['epochs'], 
        batch_size=params['batch_size'],
        verbose=2,
    )
    # Threshold the sigmoid outputs at 0.5 to get hard validation labels
    val_pred = model.predict(X_vl)
    val_pred = (val_pred > 0.5).astype(int)
    # write metrics and params in a file
    with open('model_eval.txt', 'a') as f:
        f.write(f"Parameters: {params}\n")
        f.write(classification_report(y_vl, val_pred))
        f.write('\n\n')
        f.write('\n\n')

In [24]:
# Everything this cell needs is created here so it runs on a fresh kernel: in
# the original cell order `ratio`, `oversampled_X`, `oversampled_y` and
# `early_stopping` were only defined in cells placed further down the notebook.

# Oversample the training split only (validation data stays untouched).
# With a float, `sampling_strategy` is the minority/majority ratio after resampling.
# (RandomOverSampler with ratio = 0.3 was the earlier alternative tried here.)
ratio = 0.4
oversample = SMOTE(sampling_strategy=ratio, random_state=42)
oversampled_X, oversampled_y = oversample.fit_resample(X_tr, y_tr)

# Stop when validation recall stops improving and restore the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_recall',
    patience=4,
    mode='max',
    restore_best_weights=True
)

param_list = list(ParameterSampler(param_grid, n_iter=8, random_state=42))

for params in param_list:
    print(f"Training model with parameters: {params}")
    model = build_model(params['num_layers'], params['num_units'], params['optimizer'], params['activation'], params['dropout'])
    # Train on the oversampled data, validate on the untouched validation split
    history = model.fit(
        oversampled_X, oversampled_y, 
        validation_data=(X_vl, y_vl), 
        epochs=params['epochs'], 
        batch_size=params['batch_size'],
        verbose=2,
        callbacks=[early_stopping]
    )
    # Threshold the sigmoid outputs at 0.5 to get hard validation labels
    val_pred = model.predict(X_vl)
    val_pred = (val_pred > 0.5).astype(int)
    # write metrics and params in a file
    with open('model_eval.txt', 'a') as f:
        f.write(f"Parameters: {params}, oversampled: {ratio}\n")
        f.write(classification_report(y_vl, val_pred))
        f.write('\n\n')
        f.write('\n\n')


Training model with parameters: {'optimizer': 'adam', 'num_units': 64, 'num_layers': 2, 'epochs': 10, 'dropout': 0.2, 'batch_size': 64, 'activation': 'relu'}
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


6639/6639 - 11s - 2ms/step - accuracy: 0.7520 - loss: 0.5102 - recall: 0.3380 - val_accuracy: 0.8413 - val_loss: 0.3919 - val_recall: 0.4315
Epoch 2/10
6639/6639 - 10s - 1ms/step - accuracy: 0.7583 - loss: 0.4996 - recall: 0.3591 - val_accuracy: 0.8486 - val_loss: 0.3733 - val_recall: 0.4044
Epoch 3/10
6639/6639 - 9s - 1ms/step - accuracy: 0.7599 - loss: 0.4968 - recall: 0.3637 - val_accuracy: 0.8390 - val_loss: 0.4023 - val_recall: 0.4568
Epoch 4/10
6639/6639 - 10s - 1ms/step - accuracy: 0.7607 - loss: 0.4948 - recall: 0.3692 - val_accuracy: 0.8517 - val_loss: 0.3786 - val_recall: 0.3856
Epoch 5/10
6639/6639 - 10s - 2ms/step - accuracy: 0.7612 - loss: 0.4930 - recall: 0.3733 - val_accuracy: 0.8503 - val_loss: 0.3806 - val_recall: 0.4018
Epoch 6/10
6639/6639 - 9s - 1ms/step - accuracy: 0.7625 - loss: 0.4919 - recall: 0.3771 - val_accuracy: 0.8425 - val_loss: 0.3882 - val_recall: 0.4470
Epoch 7/10
6639/6639 - 10s - 1ms/step - accuracy: 0.7635 - loss: 0.4908 - recall: 0.3828 - val_accura

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


6639/6639 - 13s - 2ms/step - accuracy: 0.7514 - loss: 0.5116 - recall: 0.3330 - val_accuracy: 0.8482 - val_loss: 0.3805 - val_recall: 0.4109
Epoch 2/10
6639/6639 - 10s - 2ms/step - accuracy: 0.7582 - loss: 0.5003 - recall: 0.3728 - val_accuracy: 0.8473 - val_loss: 0.3821 - val_recall: 0.3976
Epoch 3/10
6639/6639 - 10s - 2ms/step - accuracy: 0.7596 - loss: 0.4975 - recall: 0.3797 - val_accuracy: 0.8498 - val_loss: 0.3724 - val_recall: 0.4058
Epoch 4/10
6639/6639 - 11s - 2ms/step - accuracy: 0.7603 - loss: 0.4951 - recall: 0.3861 - val_accuracy: 0.8435 - val_loss: 0.3737 - val_recall: 0.4358
Epoch 5/10
6639/6639 - 11s - 2ms/step - accuracy: 0.7615 - loss: 0.4936 - recall: 0.3925 - val_accuracy: 0.8506 - val_loss: 0.3669 - val_recall: 0.4029
Epoch 6/10
6639/6639 - 10s - 2ms/step - accuracy: 0.7618 - loss: 0.4925 - recall: 0.3927 - val_accuracy: 0.8403 - val_loss: 0.3874 - val_recall: 0.4736
Epoch 7/10
6639/6639 - 10s - 2ms/step - accuracy: 0.7624 - loss: 0.4914 - recall: 0.3935 - val_accu

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
6639/6639 - 11s - 2ms/step - accuracy: 0.7432 - loss: 0.5241 - recall: 0.2804 - val_accuracy: 0.8478 - val_loss: 0.3988 - val_recall: 0.3880
Epoch 2/10
6639/6639 - 9s - 1ms/step - accuracy: 0.7524 - loss: 0.5100 - recall: 0.3336 - val_accuracy: 0.8532 - val_loss: 0.3850 - val_recall: 0.3640
Epoch 3/10
6639/6639 - 9s - 1ms/step - accuracy: 0.7536 - loss: 0.5070 - recall: 0.3409 - val_accuracy: 0.8571 - val_loss: 0.3823 - val_recall: 0.3101
Epoch 4/10
6639/6639 - 9s - 1ms/step - accuracy: 0.7537 - loss: 0.5053 - recall: 0.3459 - val_accuracy: 0.8487 - val_loss: 0.3816 - val_recall: 0.4002
Epoch 5/10
6639/6639 - 10s - 1ms/step - accuracy: 0.7541 - loss: 0.5047 - recall: 0.3461 - val_accuracy: 0.8430 - val_loss: 0.3937 - val_recall: 0.4255
Epoch 6/10
6639/6639 - 9s - 1ms/step - accuracy: 0.7553 - loss: 0.5040 - recall: 0.3497 - val_accuracy: 0.8485 - val_loss: 0.3767 - val_recall: 0.4018
Epoch 7/10
6639/6639 - 9s - 1ms/step - accuracy: 0.7554 - loss: 0.5037 - recall: 0.3513 - va

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


6639/6639 - 12s - 2ms/step - accuracy: 0.7385 - loss: 0.5298 - recall: 0.2567 - val_accuracy: 0.8499 - val_loss: 0.4017 - val_recall: 0.3793
Epoch 2/10
6639/6639 - 11s - 2ms/step - accuracy: 0.7513 - loss: 0.5121 - recall: 0.3359 - val_accuracy: 0.8535 - val_loss: 0.3942 - val_recall: 0.3449
Epoch 3/10
6639/6639 - 12s - 2ms/step - accuracy: 0.7520 - loss: 0.5096 - recall: 0.3407 - val_accuracy: 0.8517 - val_loss: 0.3932 - val_recall: 0.3750
Epoch 4/10
6639/6639 - 11s - 2ms/step - accuracy: 0.7522 - loss: 0.5074 - recall: 0.3473 - val_accuracy: 0.8453 - val_loss: 0.3942 - val_recall: 0.4066
Epoch 5/10
6639/6639 - 11s - 2ms/step - accuracy: 0.7526 - loss: 0.5068 - recall: 0.3447 - val_accuracy: 0.8487 - val_loss: 0.3962 - val_recall: 0.3919
Epoch 6/10
6639/6639 - 11s - 2ms/step - accuracy: 0.7538 - loss: 0.5060 - recall: 0.3515 - val_accuracy: 0.8483 - val_loss: 0.3937 - val_recall: 0.3987
Epoch 7/10
6639/6639 - 10s - 2ms/step - accuracy: 0.7535 - loss: 0.5053 - recall: 0.3491 - val_accu

Oversampling boosts recall to good levels but mutilates precision, while dropout does the opposite. Tuning them seems the best option.
Two layers and more units seem to improve the overall metrics a bit.
As Professor Micheli said, I have to listen to the neural network and move where it tells me.
Also, early stopping and more epochs might help. Let's also shrink the parameter grid.

In [25]:
# Narrowed search space: only the depth varies (2 or 3 hidden layers); wider
# layers (128 units) and more epochs (10) than in the previous grids.
param_grid = dict(
    dropout=[0.2],
    num_layers=[2, 3],
    num_units=[128],
    optimizer=['adam'],
    activation=['relu'],
    batch_size=[64],
    epochs=[10],
)

# Early-stopping callback: watch validation recall (higher is better), allow
# 4 epochs without improvement, then roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_recall', mode='max', patience=4, restore_best_weights=True
)

In [23]:
# oversample with SMOTE
ratio = 0.4
from imblearn.over_sampling import SMOTE
oversample = SMOTE(sampling_strategy=ratio, random_state=42)
oversampled_X, oversampled_y = oversample.fit_resample(X_tr, y_tr)