In [1]:
%load_ext autoreload
%autoreload 2

Using simpler models with the same results. Can this be considered a victory?

In [2]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('..')))
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, ParameterSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import classification_report

from utility.classification_utility import *

2025-01-02 18:47:39.946739: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735840060.028073  107237 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735840060.047647  107237 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 18:47:40.108614: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Let's start by building our beautiful dataset

In [3]:
# Paths to the cleaned cyclists and races CSV files, relative to this notebook.
cyc = '../dataset/cyclists_cleaned.csv'
races = '../dataset/races_cleaned.csv'
# Build the working dataframe with the project helper (presumably provided by the
# star import of utility.classification_utility).
# NOTE(review): the two flags presumably add the 'stage_type' and
# 'total_participants' columns used further down — confirm against the helper.
df = make_dataset_for_classification(races, cyc, make_stage_type=True, make_race_participants=True)

100.00%  


In [4]:
# Naive imputation: missing values are replaced by a global statistic.
for mean_filled in ('height', 'weight', 'cyclist_age_rac'):
    df[mean_filled] = df[mean_filled].fillna(df[mean_filled].mean())

# BMI is recomputed from the (now complete) weight and height columns.
height_scaled = df['height'] / 100
df['bmi'] = df['weight'] / np.square(height_scaled)

# Missing climb totals fall back to 5% of the stage length
# (could be differentiated by length range).
fallback_climb = df['length'] * 0.05
df['climb_total'] = df['climb_total'].fillna(fallback_climb)

# Note: "steepness" is defined here as length divided by climb_total.
df['steepness'] = df['length'] / df['climb_total']

In [5]:
# The missing 'profile' values are imputed with a model instead of a statistic:
# a random forest is trained on the rows where the profile is known.
profile_features = ['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']

# Binary flag: 1 when stage_type is 'ITT', 0 otherwise.
df['is_ITT'] = df['stage_type'].eq('ITT').astype(int)

# Keep only the rows where every feature and the target are available.
df_filtered = df[profile_features + ['profile']].dropna()
X = df_filtered[profile_features]
y = df_filtered['profile']

# Hold out 20% of the labelled rows for evaluation.
# NOTE(review): if several rows describe the same stage, a row-level random split
# may place the same stage in both train and test — confirm before trusting the score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the classifier; the fitted estimator is the cell output.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [6]:
# Hold-out accuracy and per-feature importances of the profile classifier.
y_pred = clf.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy: {accuracy}")
importances = clf.feature_importances_
print(f"Feature importance: {importances}")

Accuracy: 0.9991312777770783
Feature importance: [0.21644605 0.27903885 0.30263699 0.20187812 0.        ]


Well... this might mean that the inferences made over the climb total and the length were quite accurate.

In [7]:
# Fill the profiles that are still missing with the classifier's predictions.
missing_profiles = df[df['profile'].isna()]

# Guard against an empty selection: once the profiles have been filled (e.g. when
# this cell is re-run), there is nothing left to predict and clf.predict would
# raise on a frame with zero rows.
if not missing_profiles.empty:
    df.loc[missing_profiles.index, 'profile'] = clf.predict(
        missing_profiles[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']]
    )

In [8]:
# Columns kept for the classification task, grouped by what they describe.
# The concatenation order below matches the column order of df_to_use.

# Cyclist statistics accumulated over time.
_over_time_cols = [
    'total_points',
    'avg_points_per_race',
    'average_position',
    'avg_speed_cyclist',
    'mean_stamina_index',
    'race_count',
]

# Race-related features.
_race_cols = [
    'length',
    'climb_total',
    'profile',
    'startlist_quality',
    'cyclist_age_rac',
    'steepness',
    'is_tarmac',
    'stage_type',
    'season',
    'total_participants',
]

# Cyclist-related features.
_cyclist_cols = ['height', 'weight', 'bmi', 'home_game']

# 'date' is kept for the split, 'target' is the label.
_split_cols = ['date', 'target']

TO_USE_COLS = [*_over_time_cols, *_race_cols, *_cyclist_cols, *_split_cols]

df_to_use = df[TO_USE_COLS]

In [9]:
# useless
# season_dummies = pd.get_dummies(df['season'], prefix='season')
# df_to_use = pd.concat([df_to_use.drop(columns=['season']), season_dummies], axis=1)

In [10]:
# Split into train / validation / test partitions, then drop 'date' from each
# one: the column was carried along only for the split.
df_tr, df_vl, df_ts = (
    part.drop(columns=['date']) for part in get_data_split(df_to_use)
)

# Separate the features from the target in every partition.
(X_tr, y_tr), (X_vl, y_vl), (X_ts, y_ts) = (
    split_features_target(part) for part in (df_tr, df_vl, df_ts)
)

Normalize the features before predicting

In [11]:
# Fit the scaler on the training features only, then apply the same
# transformation to the training, validation and test features.
scal = StandardScaler().fit(X_tr)
X_tr, X_vl, X_ts = (scal.transform(features) for features in (X_tr, X_vl, X_ts))

# Neural network test

Let's build this model

In [None]:
# Baseline network: two ReLU hidden layers (64 then 32 units) followed by a
# single sigmoid unit for the binary target.
n_features = X_tr.shape[1]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Input(shape=(n_features,)))
for width in (64, 32):
    model.add(tf.keras.layers.Dense(width, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

optimizer = tf.keras.optimizers.Adam()

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', 'recall'],
)

Now fit it

In [None]:
# The callback must exist before the first fit: it used to be defined only in a
# later cell, so this cell raised NameError on a fresh kernel (Restart & Run All).
# Training stops when validation recall has not improved for 5 epochs and the
# best weights seen so far are restored.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_recall',
    patience=5,
    mode='max',
    restore_best_weights=True
)

# Train the baseline model, monitoring the validation split after every epoch.
history = model.fit(
    X_tr, y_tr,
    validation_data=(X_vl, y_vl),
    epochs=10,
    batch_size=64,
    verbose=2,
    callbacks=[early_stopping]
)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels (threshold 0.5) and report the
# per-class metrics on the validation split.
val_prob = model.predict(X_vl)
val_pred = (val_prob > 0.5).astype(int)
val_report = classification_report(y_vl, val_pred)
print(val_report)

# Grid Search

In [16]:
# Optimizer registry, keyed by the short names used for the 'optimizer'
# hyper-parameter; each entry is created with its default settings.
dict_learning = dict(
    adam=tf.keras.optimizers.Adam(),
    sgd=tf.keras.optimizers.SGD(),
    lion=tf.keras.optimizers.Lion(),
)

def build_model(num_layers=2, num_units=64, optimizer='adam', activation='relu', dropout=0.2):
    """Build and compile a fully connected binary classifier.

    The network takes inputs with ``X_tr.shape[1]`` features, stacks hidden
    blocks made of a Dense layer followed by Dropout, and ends with a single
    sigmoid unit. It is compiled with binary cross-entropy and tracks accuracy
    and recall.

    Args:
        num_layers: Number of hidden Dense+Dropout blocks. At least one block
            is always created, even for values below 1.
        num_units: Units in every hidden Dense layer.
        optimizer: Key of ``dict_learning`` selecting the optimizer.
        activation: Activation of the hidden Dense layers.
        dropout: Dropout rate applied after every hidden Dense layer.

    Returns:
        The compiled ``tf.keras`` Sequential model.

    Raises:
        KeyError: If ``optimizer`` is not a key of ``dict_learning``.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Input(shape=(X_tr.shape[1],)))

    # One hidden block is always present; the remaining num_layers - 1 follow.
    for _ in range(max(num_layers, 1)):
        model.add(tf.keras.layers.Dense(num_units, activation=activation))
        model.add(tf.keras.layers.Dropout(dropout))

    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # Use the registered optimizer (it was looked up but never used before).
    # A fresh copy is created from its config because an optimizer instance is
    # tied to the variables of the first model it trains and must not be shared
    # between the models of a search.
    template = dict_learning[optimizer]
    opt = type(template).from_config(template.get_config())

    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy', 'recall'])
    return model

# param_grid = {
#     'num_layers': [1, 2],
#     'num_units': [32, 64],
#     'optimizer': ['adam', 'sgd', 'lion'],
#     'activation': ['relu', 'tanh'],
#     'batch_size': [16, 32, 64],
#     'epochs': [5],
# }

This proto grid search has already been run over various parameters, without verbosity. A long version of the output of the last run won't be uploaded to git.

Nothing much changes between these different hyperparameters. Maybe the answer lies elsewhere, like in regularization: oversampling and dropout?

In [17]:
# Search space: only the depth and the width of the network vary, every other
# hyper-parameter is pinned to a single value.
param_grid = dict(
    dropout=[0.2],
    num_layers=[2, 3],
    num_units=[64, 128],
    optimizer=['adam'],
    activation=['relu'],
    batch_size=[64],
    # batch_norm=[True],
    epochs=[10],
)

# Stop when validation recall has not improved for 5 epochs and roll back to
# the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_recall', mode='max', patience=5, restore_best_weights=True
)

In [18]:
# Oversample the training split with SMOTE; `ratio` is passed as the
# sampling_strategy and is also logged next to the results of each run.
ratio = 0.35
# oversample = RandomOverSampler(sampling_strategy=ratio, random_state=42)
oversample = SMOTE(random_state=42, sampling_strategy=ratio)
resampled = oversample.fit_resample(X_tr, y_tr)
oversampled_X, oversampled_y = resampled

In [19]:
# Sample the configurations to try. The request is capped at the size of the
# grid: asking for more samples than there are combinations only makes
# ParameterSampler emit a warning and fall back to the full grid.
grid_size = int(np.prod([len(values) for values in param_grid.values()]))
param_list = list(ParameterSampler(param_grid, n_iter=min(8, grid_size), random_state=42))

for params in param_list:
    print(f"Training model with parameters: {params}")

    # Release the global Keras state left by the previous model so that memory
    # does not keep growing over the search.
    tf.keras.backend.clear_session()

    model = build_model(
        num_layers=params['num_layers'],
        num_units=params['num_units'],
        optimizer=params['optimizer'],
        activation=params['activation'],
        dropout=params['dropout'],
    )

    # Train on the oversampled data, validate on the untouched validation split.
    history = model.fit(
        oversampled_X, oversampled_y,
        validation_data=(X_vl, y_vl),
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        verbose=0,
        callbacks=[early_stopping]
    )

    # Hard 0/1 predictions at a 0.5 threshold.
    val_pred = model.predict(X_vl)
    val_pred = (val_pred > 0.5).astype(int)

    # Append the parameters and the metrics of this run to the results file.
    with open('model_eval.txt', 'a') as f:
        f.write(f"Parameters: {params}, oversampled: {ratio}\n")
        f.write(classification_report(y_vl, val_pred))
        f.write('\n\n')




Training model with parameters: {'optimizer': 'adam', 'num_units': 64, 'num_layers': 2, 'epochs': 10, 'dropout': 0.2, 'batch_size': 64, 'activation': 'relu'}
[1m1359/1359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 605us/step
Training model with parameters: {'optimizer': 'adam', 'num_units': 128, 'num_layers': 2, 'epochs': 10, 'dropout': 0.2, 'batch_size': 64, 'activation': 'relu'}
[1m1359/1359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 662us/step
Training model with parameters: {'optimizer': 'adam', 'num_units': 64, 'num_layers': 3, 'epochs': 10, 'dropout': 0.2, 'batch_size': 64, 'activation': 'relu'}
[1m1359/1359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 559us/step
Training model with parameters: {'optimizer': 'adam', 'num_units': 128, 'num_layers': 3, 'epochs': 10, 'dropout': 0.2, 'batch_size': 64, 'activation': 'relu'}
[1m1359/1359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 692us/step


Oversampling boosts recall to good levels but severely hurts precision, while dropout does the opposite. Tuning the two together seems the best option.
Two layers and more units seem to slightly improve the overall metrics.
As professor Micheli said, I have to listen to the neural network and move where it tells me.
Also, early stopping and more epochs might help. Maybe let's reduce the number of parameters.

In [17]:
# Ask TensorFlow to run tf.function-decorated code eagerly from this point on.
# NOTE(review): the execution count (In [17]) shows this cell was run out of
# order; it looks like a debugging leftover — confirm whether it is still needed.
tf.config.run_functions_eagerly(True)
