In [1]:
%load_ext autoreload
%autoreload 2

Using simpler models with the same results. Can this be considered a victory?

In [1]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('..')))
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from utility.classification_utility import *

2024-12-31 18:31:48.704683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735666308.818533   68460 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735666308.851408   68460 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-31 18:31:49.089376: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Let's start by building our beautiful dataset

In [2]:
cyc = '../dataset/cyclists_cleaned.csv'
races = '../dataset/races_cleaned.csv'
df = make_dataset_for_classification(races, cyc, make_stage_type=True)

100.00%  


In [3]:
# dumb imputations
df['height'] = df['height'].fillna(df['height'].mean())
df['weight'] = df['weight'].fillna(df['weight'].mean())
df['bmi'] = df['weight']/np.square(df['height']/100)
df['cyclist_age_rac'] = df['cyclist_age_rac'].fillna(df['cyclist_age_rac'].mean())
df['climb_total'] = df['climb_total'].fillna(df['length']*0.05) # could be differentiated by lengths
df['steepness'] = df['length'] / df['climb_total']

In [None]:
# for profile, let's use the big stick
# Features and target
# Add a column 'is_ITT' where 1 indicates stage_type is 'ITT', 0 otherwise
df['is_ITT'] = (df['stage_type'] == 'ITT').astype(int)
df_filtered = df[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT', 'profile']].dropna()
X = df_filtered[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']]
y = df_filtered['profile']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# TODO: test the model


In [6]:
# Predict missing profiles
missing_profiles = df[df['profile'].isna()]
df.loc[missing_profiles.index, 'profile'] = clf.predict(missing_profiles[['length', 'climb_total', 'steepness', 'average_speed', 'is_ITT']])

In [9]:
TO_USE_COLS = [
    # over time
    'total_points',
    'avg_points_per_race', 
    'average_position',
    'avg_speed_cyclist', 
    'mean_stamina_index',
    'race_count',
    # race related
    'length',
    'climb_total', 
    'profile', 
    'startlist_quality', 
    'cyclist_age_rac', 
    'steepness', 
    'is_tarmac', 
    'stage_type',
    # cyclist related
    'height',
    'weight',
    'bmi',
    'home_game',
    # for the split
    'date',
    'target'
]

df_to_use = df[TO_USE_COLS]

In [10]:
df_tr, df_vl, df_ts = get_data_split(df_to_use)

df_tr = df_tr.drop(columns=['date'])
df_vl = df_vl.drop(columns=['date'])
df_ts = df_ts.drop(columns=['date'])

X_tr, y_tr = split_features_target(df_tr)
X_vl, y_vl = split_features_target(df_vl)
X_ts, y_ts = split_features_target(df_ts)

Normalize the features before predicting

In [11]:
scal = StandardScaler()
X_tr = scal.fit_transform(X_tr)
X_vl = scal.transform(X_vl)
X_ts = scal.transform(X_ts)

Let's build this model

In [12]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_tr.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    # tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[
    'accuracy',
    'recall',
    'f1_score'
    ])

2024-12-31 18:38:29.770090: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Now fit it

In [13]:
history = model.fit(
    X_tr, y_tr, 
    validation_data=(X_vl, y_vl), 
    epochs=5, 
    batch_size=32,
    verbose=2
)

Epoch 1/5
11099/11099 - 29s - 3ms/step - accuracy: 0.8569 - f1_score: 0.2540 - loss: 0.3598 - recall: 0.0964 - val_accuracy: 0.8643 - val_f1_score: 0.2509 - val_loss: 0.3383 - val_recall: 0.1289
Epoch 2/5
11099/11099 - 26s - 2ms/step - accuracy: 0.8595 - f1_score: 0.2540 - loss: 0.3535 - recall: 0.1110 - val_accuracy: 0.8633 - val_f1_score: 0.2509 - val_loss: 0.3361 - val_recall: 0.1550
Epoch 3/5
11099/11099 - 27s - 2ms/step - accuracy: 0.8598 - f1_score: 0.2540 - loss: 0.3524 - recall: 0.1150 - val_accuracy: 0.8648 - val_f1_score: 0.2509 - val_loss: 0.3357 - val_recall: 0.2113
Epoch 4/5
11099/11099 - 27s - 2ms/step - accuracy: 0.8599 - f1_score: 0.2540 - loss: 0.3517 - recall: 0.1181 - val_accuracy: 0.8644 - val_f1_score: 0.2509 - val_loss: 0.3340 - val_recall: 0.1975
Epoch 5/5
11099/11099 - 26s - 2ms/step - accuracy: 0.8602 - f1_score: 0.2540 - loss: 0.3509 - recall: 0.1228 - val_accuracy: 0.8644 - val_f1_score: 0.2509 - val_loss: 0.3339 - val_recall: 0.1621


In [None]:
from sklearn.metrics import classification_report, f1_score

val_pred = model.predict(X_vl)
print(classification_report(y_vl, val_pred))

In [15]:
print(y_vl)

449164    False
449165     True
449166    False
449167    False
449168    False
          ...  
492639    False
492640    False
492641    False
492642    False
492643    False
Name: target, Length: 43480, dtype: bool


In [16]:
print(val_pred)

[[0.01081997]
 [0.56833375]
 [0.01587189]
 ...
 [0.17953698]
 [0.1327488 ]
 [0.07120073]]
