In [1]:
%load_ext autoreload
%autoreload 2

Using simpler models with the same results. Can this be considered a victory?

In [2]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('..')))
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from utility.classification_utility import *

2024-12-31 16:21:00.156365: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735658460.228879   23424 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735658460.251512   23424 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-31 16:21:00.408026: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Let's start by building our beautiful dataset

In [3]:
cyc = '../dataset/cyclists_cleaned.csv'
races = '../dataset/races_cleaned.csv'
df = make_dataset_for_classification(races, cyc, make_stage_type=True)

100.00%  


In [17]:
df.columns

Index(['_url_rac', 'name_rac', 'stage', 'stage_type', 'points', 'length',
       'climb_total', 'profile', 'startlist_quality', 'date', 'position',
       'cyclist', 'cyclist_age_rac', 'is_tarmac', 'delta', 'time',
       'time_seconds', 'average_speed', 'steepness', 'season', 'is_staged',
       'race_country', 'age_performance_index', 'quality_adjusted_points',
       'stamina_index', 'birth_year', 'weight', 'height', 'nationality', 'bmi',
       'race_count', 'experience_level', 'total_points', 'avg_points_per_race',
       'average_position', 'avg_speed_cyclist', 'cyclist_age_cyc',
       'mean_stamina_index', 'elapsed_from_last', 'target', 'home_game'],
      dtype='object')

In [None]:
# dumb imputations
df['height'] = df['height'].fillna(df['height'].mean())
df['weight'] = df['weight'].fillna(df['weight'].mean())
df['bmi'] = df['weight']/np.square(df['height']/100)
df['cyclist_age_rac'] = df['cyclist_age_rac'].fillna(df['cyclist_age_rac'].mean())
df['climb_total'] = df['climb_total'].fillna(df['length']*0.05) # could be differentiated by lengths
df['steepness'] = df['length'] / df['climb_total']

In [None]:
# for profile, let's use the big stick
# Features and target
# Add a column 'is_ITT' where 1 indicates stage_type is 'ITT', 0 otherwise
df['is_ITT'] = (df['stage_type'] == 'ITT').astype(int)
X = df[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']].dropna()
y = df['profile'].dropna()

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict missing profiles
missing_profiles = df[df['profile'].isna()]
df.loc[missing_profiles.index, 'profile'] = clf.predict(missing_profiles[['length', 'climb_total', 'steepness', 'average_speed']])


In [4]:
TO_USE_COLS = [
    # over time
    'total_points',
    'avg_points_per_race', 
    'average_position',
    'avg_speed_cyclist', 
    'mean_stamina_index',
    'race_count',
    # race related
    'length',
    'climb_total', 
    'profile', 
    'startlist_quality', 
    'cyclist_age_rac', 
    'steepness', 
    'is_tarmac', 
    'stage_type',
    # cyclist related
    'height',
    'weight',
    'bmi',
    'home_game',
    # for the split
    'date',
    'target'
]

df_to_use = df[TO_USE_COLS]

In [None]:
df_tr, df_vl, df_ts = get_data_split(df_to_use)

df_tr = df_tr.drop(columns=['date'])
df_vl = df_vl.drop(columns=['date'])
df_ts = df_ts.drop(columns=['date'])

X_tr, y_tr = split_features_target(df_tr)
X_vl, y_vl = split_features_target(df_vl)
X_ts, y_ts = split_features_target(df_ts)

In [None]:
print(df_to_use.isna().sum())

NaNs in df_to_use
total_points                0
avg_points_per_race         0
average_position            0
avg_speed_cyclist           0
mean_stamina_index          0
race_count                  0
length                      0
climb_total            137267
profile                125939
startlist_quality           0
cyclist_age_rac            43
steepness              137267
is_tarmac                   0
stage_type                  0
height                 102628
weight                 103761
bmi                    104035
home_game                   0
date                        0
target                      0
dtype: int64


dio cane

Normalize the features before predicting

In [5]:
scal = StandardScaler()
X_tr = scal.fit_transform(X_tr)
X_vl = scal.transform(X_vl)
X_ts = scal.transform(X_ts)

Let's build this model

In [10]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_tr.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    # tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[
    'accuracy',
    'recall',
    'f1_score'
    ])

Now fit it

In [11]:
history = model.fit(
    X_tr, y_tr, 
    validation_data=(X_vl, y_vl), 
    epochs=5, 
    batch_size=32,
    verbose=2
)

Epoch 1/30
11099/11099 - 15s - 1ms/step - accuracy: 0.8545 - f1_score: 3.8699e-05 - loss: nan - recall: 1.9357e-05 - val_accuracy: 0.8566 - val_f1_score: 0.0000e+00 - val_loss: nan - val_recall: 0.0000e+00
Epoch 2/30
11099/11099 - 15s - 1ms/step - accuracy: 0.8545 - f1_score: 0.0000e+00 - loss: nan - recall: 0.0000e+00 - val_accuracy: 0.8566 - val_f1_score: 0.0000e+00 - val_loss: nan - val_recall: 0.0000e+00
Epoch 3/30
11099/11099 - 17s - 2ms/step - accuracy: 0.8545 - f1_score: 0.0000e+00 - loss: nan - recall: 0.0000e+00 - val_accuracy: 0.8566 - val_f1_score: 0.0000e+00 - val_loss: nan - val_recall: 0.0000e+00
Epoch 4/30
11099/11099 - 17s - 1ms/step - accuracy: 0.8545 - f1_score: 0.0000e+00 - loss: nan - recall: 0.0000e+00 - val_accuracy: 0.8566 - val_f1_score: 0.0000e+00 - val_loss: nan - val_recall: 0.0000e+00
Epoch 5/30
11099/11099 - 15s - 1ms/step - accuracy: 0.8545 - f1_score: 0.0000e+00 - loss: nan - recall: 0.0000e+00 - val_accuracy: 0.8566 - val_f1_score: 0.0000e+00 - val_loss:

KeyboardInterrupt: 