In [1]:
%load_ext autoreload
%autoreload 2

Using simpler models with the same results. Can this be considered a victory?

In [1]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('..')))
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from utility.classification_utility import *

2024-12-31 18:31:48.704683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735666308.818533   68460 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735666308.851408   68460 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-31 18:31:49.089376: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Let's start by building our beautiful dataset

In [2]:
cyc = '../dataset/cyclists_cleaned.csv'
races = '../dataset/races_cleaned.csv'
df = make_dataset_for_classification(races, cyc, make_stage_type=True)

100.00%  


In [3]:
# dumb imputations
df['height'] = df['height'].fillna(df['height'].mean())
df['weight'] = df['weight'].fillna(df['weight'].mean())
df['bmi'] = df['weight']/np.square(df['height']/100)
df['cyclist_age_rac'] = df['cyclist_age_rac'].fillna(df['cyclist_age_rac'].mean())
df['climb_total'] = df['climb_total'].fillna(df['length']*0.05) # could be differentiated by lengths
df['steepness'] = df['length'] / df['climb_total']

In [None]:
# for profile, let's use the big stick
# Features and target
# Add a column 'is_ITT' where 1 indicates stage_type is 'ITT', 0 otherwise
df['is_ITT'] = (df['stage_type'] == 'ITT').astype(int)
df_filtered = df[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT', 'profile']].dropna()
X = df_filtered[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']]
y = df_filtered['profile']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [40]:
# TODO: test the model
y_pred = clf.predict(X_test)
print(f"Accuracy: {np.mean(y_pred == y_test)}")
print(f"Feature importance: {clf.feature_importances_}")

Accuracy: 0.9991312777770783
Feature importance: [0.21644605 0.27903885 0.30263699 0.20187812 0.        ]


Well.. this might mean that the inferences over the climb total and the length were quite accurate

In [6]:
# Predict missing profiles
missing_profiles = df[df['profile'].isna()]
df.loc[missing_profiles.index, 'profile'] = clf.predict(missing_profiles[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']])

In [48]:
TO_USE_COLS = [
    # over time
    'total_points',
    'avg_points_per_race', 
    'average_position',
    'avg_speed_cyclist', 
    'mean_stamina_index',
    'race_count',
    # race related
    'length',
    'climb_total', 
    'profile', 
    'startlist_quality', 
    'cyclist_age_rac', 
    'steepness', 
    'is_tarmac', 
    'stage_type',
    # cyclist related
    # 'height',
    # 'weight',
    'bmi',
    'home_game',
    # for the split
    'date',
    'target'
]

df_to_use = df[TO_USE_COLS]

In [49]:
df_tr, df_vl, df_ts = get_data_split(df_to_use)

df_tr = df_tr.drop(columns=['date'])
df_vl = df_vl.drop(columns=['date'])
df_ts = df_ts.drop(columns=['date'])

X_tr, y_tr = split_features_target(df_tr)
X_vl, y_vl = split_features_target(df_vl)
X_ts, y_ts = split_features_target(df_ts)

Normalize the features before predicting

In [50]:
scal = StandardScaler()
X_tr = scal.fit_transform(X_tr)
X_vl = scal.transform(X_vl)
X_ts = scal.transform(X_ts)

Let's build this model

In [51]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_tr.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    # tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam()

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Metric to monitor (validation recall)
    patience=6,            # Number of epochs with no improvement before stopping
    mode='max',            # We want to maximize recall
    restore_best_weights=True  # Restore the model weights from the epoch with the best recall
)


model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[
    'accuracy',
    'recall'
    ])

Now fit it

In [None]:
history = model.fit(
    X_tr, y_tr, 
    validation_data=(X_vl, y_vl), 
    epochs=30, 
    batch_size=64,
    verbose=2,
    callbacks=[early_stopping]
)

Epoch 1/30
5550/5550 - 9s - 2ms/step - accuracy: 0.8629 - loss: 0.3452 - recall: 0.1511 - val_accuracy: 0.8665 - val_loss: 0.3336 - val_recall: 0.2299
Epoch 2/30
5550/5550 - 9s - 2ms/step - accuracy: 0.8631 - loss: 0.3449 - recall: 0.1511 - val_accuracy: 0.8650 - val_loss: 0.3334 - val_recall: 0.2156
Epoch 3/30
5550/5550 - 8s - 1ms/step - accuracy: 0.8632 - loss: 0.3445 - recall: 0.1543 - val_accuracy: 0.8650 - val_loss: 0.3336 - val_recall: 0.1478
Epoch 4/30
5550/5550 - 8s - 1ms/step - accuracy: 0.8634 - loss: 0.3444 - recall: 0.1537 - val_accuracy: 0.8651 - val_loss: 0.3333 - val_recall: 0.1934
Epoch 5/30
5550/5550 - 8s - 1ms/step - accuracy: 0.8632 - loss: 0.3440 - recall: 0.1542 - val_accuracy: 0.8665 - val_loss: 0.3322 - val_recall: 0.1876
Epoch 6/30


In [53]:
from sklearn.metrics import classification_report

val_pred = model.predict(X_vl)
val_pred = (val_pred > 0.5).astype(int)
print(classification_report(y_vl, val_pred))

[1m1359/1359[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 610us/step
              precision    recall  f1-score   support

       False       0.88      0.97      0.93     37243
        True       0.58      0.22      0.32      6237

    accuracy                           0.87     43480
   macro avg       0.73      0.60      0.62     43480
weighted avg       0.84      0.87      0.84     43480



In [15]:
print(y_vl)

449164    False
449165     True
449166    False
449167    False
449168    False
          ...  
492639    False
492640    False
492641    False
492642    False
492643    False
Name: target, Length: 43480, dtype: bool
