In [1]:
import numpy as np
import pandas as pd
import joblib
from thefuzz import process

In [None]:
# Load All Necessary Assets
# --- Load the pre-trained model, scaler, and column list ---
model = joblib.load('best_rf_model_2025.joblib')
scaler = joblib.load('scaler_2025.joblib')
model_columns = joblib.load('model_columns_2025.joblib')

# --- Load the new 2025 startlist data ---
# On Monday, this file will come from your scraper.
df_2025 = pd.read_csv(r'C:\Users\anonym\Documents\Bootcamp\tour-de-france-project\notebooks\tdf_2025_startlist_full_details_FINAL.csv')

# --- Load historical data (REQUIRED for feature engineering) ---
df_raw = pd.read_csv(r'C:\Users\raclo\Documents\Bootcamp\tour-de-france-project\data\TDF_full_ranking_2014_2024_encoded.csv')

# --- Load UCI and PCS ranking files ---
uci_df = pd.read_csv(r'C:\Users\raclo\Documents\Bootcamp\tour-de-france-project\notebooks\uci_rankings.csv')
pcs_df = pd.read_csv(r'C:\Users\raclo\Documents\Bootcamp\tour-de-france-project\notebooks\pcs_rankings.csv')

print("All assets loaded successfully.")

All assets loaded successfully.


In [4]:
# Full Feature engineering for the 2025 data
# --- Rename columns to match historical data ---
df_2025 = df_2025.rename(columns={
    'one_day_races': 'speciality_one_day_races', 'gc': 'speciality_gc',
    'time_trial': 'speciality_time_trial', 'sprint': 'speciality_sprint',
    'climber': 'speciality_climber', 'hills': 'speciality_hills', 'name': 'rider_name'
})

# --- Initial Feature Creation ---
df_2025['year_edition'] = 2025
df_2025['age'] = 2025 - pd.to_datetime(df_2025['birthdate']).dt.year
df_2025['bmi'] = df_2025['weight'] / (df_2025['height'] ** 2)

# --- Experience Feature ---
first_year_map = df_raw.groupby('rider_name')['year_edition'].min()
df_2025['first_year'] = df_2025['rider_name'].map(first_year_map).fillna(2025)
df_2025['experience'] = df_2025['year_edition'] - df_2025['first_year']

# --- Lagged Team Median Rank Feature ---
team_rank_2024 = df_raw[df_raw['year_edition'] == 2024].groupby('team')['rank'].median()
df_2025['team_median_rank_last_year'] = df_2025['team'].map(team_rank_2024)
df_2025['team_median_rank_last_year'].fillna(team_rank_2024.median(), inplace=True)

# --- Nationality Grouping Feature ---
top_countries = ['FR', 'ES', 'BE', 'NL', 'IT', 'CO', 'SI', 'DK', 'DE', 'GB', 'US']
region_map = {
    'PT': 'Southern Europe', 'BY': 'Eastern Europe', 'CH': 'Western Europe', 'PL': 'Eastern Europe', 'AU': 'Oceania',
    'EC': 'South America', 'AT': 'Central Europe', 'LU': 'Western Europe', 'KZ': 'Central Asia', 'EE': 'Baltic',
    'IE': 'Western Europe', 'CA': 'North America', 'CZ': 'Central Europe', 'SK': 'Central Europe', 'NO': 'Nordic',
    'ER': 'Africa', 'ZA': 'Africa', 'CR': 'Central America', 'ET': 'Africa', 'HR': 'Balkan', 'JP': 'Asia',
    'LV': 'Baltic', 'NZ': 'Oceania', 'IL': 'Middle East', 'AR': 'South America', 'LT': 'Baltic',
    'RU': 'Eastern Europe', 'SE': 'Nordic', 'CN': 'Asia'
}
conditions = [
    df_2025['nationality'].isin(top_countries),
    df_2025['nationality'].isin(region_map.keys())
]
choices = [df_2025['nationality'], df_2025['nationality'].map(region_map)]
df_2025['nationality_group'] = np.select(conditions, choices, default='Other')
nationality_dummies_2025 = pd.get_dummies(df_2025['nationality_group'], prefix='nat', dtype=int)
df_2025 = pd.concat([df_2025, nationality_dummies_2025], axis=1)

# --- Final Column Alignment ---
# Use the loaded model_columns list to ensure perfect alignment
df_2025_features = df_2025.reindex(columns=model_columns, fill_value=0)

print("2025 data prepared for prediction.")
print("Shape of prediction data:", df_2025_features.shape)

2025 data prepared for prediction.
Shape of prediction data: (117, 39)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_2025['team_median_rank_last_year'].fillna(team_rank_2024.median(), inplace=True)


In [5]:
# Make ML Predictions
# 1. Scale the 2025 features using the loaded scaler
X_2025_scaled = scaler.transform(df_2025_features)

# 2. Make the predictions using the loaded model
predictions_2025 = model.predict(X_2025_scaled)

# 3. Add the predictions to your 2025 DataFrame
df_2025['predicted_rank'] = predictions_2025

# 4. Create the initial ML-only ranking
ml_ranking_2025 = df_2025[['rider_name', 'team', 'predicted_rank']].sort_values(by='predicted_rank')
ml_ranking_2025['predicted_position'] = range(1, len(ml_ranking_2025) + 1)

print("ML predictions generated successfully.")
display(ml_ranking_2025.head())

ML predictions generated successfully.


Unnamed: 0,rider_name,team,predicted_rank,predicted_position
4,POGAČAR Tadej,UAE Team Emirates - XRG (WT),13.916354,1
41,VINGEGAARD Jonas,Team Visma | Lease a Bike (WT),14.727362,2
6,YATES Adam,UAE Team Emirates - XRG (WT),15.277802,3
5,ALMEIDA João,UAE Team Emirates - XRG (WT),16.97844,4
68,GAUDU David,Groupama - FDJ (WT),17.219819,5


In [7]:
# Create Hybrid Ranking
# --- Fuzzy Name Matching ---
uci_name_choices = uci_df['rider_name'].tolist()
pcs_name_choices = pcs_df['rider_name'].tolist()

def get_fuzzy_rank(name, choices_df, name_choices, rank_column_name):
    best_match, score = process.extractOne(name, name_choices)
    if score > 85:
        return choices_df[choices_df['rider_name'] == best_match][rank_column_name].iloc[0]
    return None

hybrid_ranking_df = ml_ranking_2025.copy()

print("Applying fuzzy matching...")
hybrid_ranking_df['uci_rank'] = hybrid_ranking_df['rider_name'].apply(
    lambda name: get_fuzzy_rank(name, uci_df, uci_name_choices, 'uci_rank')
)
hybrid_ranking_df['pcs_rank'] = hybrid_ranking_df['rider_name'].apply(
    lambda name: get_fuzzy_rank(name, pcs_df, pcs_name_choices, 'pcs_rank')
)

hybrid_ranking_df['uci_rank'] = hybrid_ranking_df['uci_rank'].fillna(500)
hybrid_ranking_df['pcs_rank'] = hybrid_ranking_df['pcs_rank'].fillna(500)

# --- Calculate Final Hybrid Score ---
weight_ml, weight_uci, weight_pcs = 0.60, 0.20, 0.20
hybrid_ranking_df['hybrid_score'] = (
    (hybrid_ranking_df['predicted_position'] * weight_ml) +
    (hybrid_ranking_df['uci_rank'] * weight_uci) +
    (hybrid_ranking_df['pcs_rank'] * weight_pcs)
)

final_hybrid_ranking = hybrid_ranking_df.sort_values(by='hybrid_score')
final_hybrid_ranking['hybrid_position'] = range(1, len(final_hybrid_ranking) + 1)

print("Hybrid score calculated successfully.")
display(final_hybrid_ranking.head())

Applying fuzzy matching...
Hybrid score calculated successfully.


Unnamed: 0,rider_name,team,predicted_rank,predicted_position,uci_rank,pcs_rank,hybrid_score,hybrid_position
4,POGAČAR Tadej,UAE Team Emirates - XRG (WT),13.916354,1,1.0,1.0,1.0,1
41,VINGEGAARD Jonas,Team Visma | Lease a Bike (WT),14.727362,2,7.0,6.0,3.8,2
5,ALMEIDA João,UAE Team Emirates - XRG (WT),16.97844,4,6.0,3.0,4.2,3
20,EVENEPOEL Remco,Soudal Quick-Step (WT),21.448155,11,2.0,2.0,7.4,4
14,ROGLIČ Primož,Red Bull - BORA - hansgrohe (WT),20.317863,8,8.0,11.0,8.6,5


In [8]:
# Prepare and save final output for Streamlit
# Merge with df_2025 to get details like 'nationality' and 'age'
final_hybrid_ranking_details = pd.merge(
    final_hybrid_ranking,
    df_2025[['rider_name', 'nationality', 'age']],
    on='rider_name',
    how='left'
)

# Add the placeholder 'image_url' column
final_hybrid_ranking_details['image_url'] = ""

# Define a clean order for the columns
column_order = [
    'hybrid_position', 'rider_name', 'team', 'nationality', 'age', 'image_url',
    'hybrid_score', 'predicted_position', 'uci_rank', 'pcs_rank'
]
final_app_data = final_hybrid_ranking_details.reindex(columns=column_order)

# Save the final data to a CSV
output_filename = 'tour_de_france_2025_app_data.csv'
final_app_data.to_csv(output_filename, index=False)

print(f"✅ Pipeline complete! Final data saved to '{output_filename}'")
display(final_app_data.head())

✅ Pipeline complete! Final data saved to 'tour_de_france_2025_app_data.csv'


Unnamed: 0,hybrid_position,rider_name,team,nationality,age,image_url,hybrid_score,predicted_position,uci_rank,pcs_rank
0,1,POGAČAR Tadej,UAE Team Emirates - XRG (WT),SI,27,,1.0,1,1.0,1.0
1,2,VINGEGAARD Jonas,Team Visma | Lease a Bike (WT),DK,29,,3.8,2,7.0,6.0
2,3,ALMEIDA João,UAE Team Emirates - XRG (WT),PT,27,,4.2,4,6.0,3.0
3,4,EVENEPOEL Remco,Soudal Quick-Step (WT),BE,25,,7.4,11,2.0,2.0
4,5,ROGLIČ Primož,Red Bull - BORA - hansgrohe (WT),SI,36,,8.6,8,8.0,11.0


In [None]:
# Visualize the ML Model's Top 15 Predictions

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# --- Step 1: Data Preparation ---

# To ensure 'nationality' and 'team' are present, we merge the ML predictions
# with the detailed df_2025 dataframe.
# The variable 'ml_ranking_2025' was created in Cell 4 of your pipeline.
ml_ranking_details = pd.merge(
    ml_ranking_2025, # This is the corrected variable name
    df_2025[['rider_name', 'nationality']],
    on='rider_name',
    how='left'
)

# Let's select the Top 15 for the plot and create a safe copy
top_15_predictions = ml_ranking_details.head(15).copy()

# Create the new label for the y-axis
top_15_predictions['rider_label'] = top_15_predictions['rider_name'] + ' (' + top_15_predictions['nationality'] + ')'


# --- Step 2: Create the Plot ---

plt.style.use('seaborn-v0_8-whitegrid')
plt.figure(figsize=(12, 10))

# Create the horizontal bar plot, using the new rider_label
ax = sns.barplot(
    x='predicted_rank',
    y='rider_label', # Use the new combined label
    data=top_15_predictions,
    hue='team',
    dodge=False,
    palette='tab20'
)

# Add the position number as an annotation on each bar
for i, row in enumerate(top_15_predictions.itertuples()):
    ax.text(
        row.predicted_rank + 0.1,
        i,
        f'#{row.predicted_position}',
        ha='left',
        va='center',
        fontsize=12,
        fontweight='bold',
        color='black'
    )

# Improve layout and labels
ax.set_title('Final ML Model Predictions - Tour de France 2025 (Top 15)', fontsize=18, fontweight='bold')
ax.set_xlabel('Predicted Rank Score (Lower is Better)', fontsize=12)
ax.set_ylabel('Rider (Nationality)', fontsize=12)

# Adjust the x-axis limit to make sure the labels fit
ax.set_xlim(right=max(top_15_predictions['predicted_rank']) * 1.15)

# Move the legend outside of the plot area
ax.legend(
    bbox_to_anchor=(1.02, 1),
    loc='upper left',
    borderaxespad=0,
    title='Team'
)

plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

In [None]:
# Visualize the Final Hybrid Ranking (with Nationalities)

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# --- Step 1: Data Preparation ---

# First, create the top_15_hybrid DataFrame from the final ranking
# The 'final_hybrid_ranking' DataFrame was created in Cell 5 of your pipeline.
top_15_hybrid = final_hybrid_ranking.head(15).copy()

# Now, your original data preparation code will work perfectly.
# It merges the nationality data and creates the labels for the plot.
if 'rider_label' not in top_15_hybrid.columns:
    if 'nationality' not in top_15_hybrid.columns:
        # Merge nationality from df_2025 if missing
        top_15_hybrid = top_15_hybrid.merge(
            df_2025[['rider_name', 'nationality']],
            on='rider_name',
            how='left'
        )
    top_15_hybrid['rider_label'] = top_15_hybrid['rider_name'] + ' (' + top_15_hybrid['nationality'] + ')'

# --- Step 2: Create the Plot ---

plt.style.use('seaborn-v0_8-whitegrid')
plt.figure(figsize=(12, 10))

ax = sns.barplot(
    x='hybrid_score',
    y='rider_label',
    data=top_15_hybrid,
    hue='team',
    dodge=False,
    palette='tab20'
)

# Add the final position number as a label on each bar
for i, row in enumerate(top_15_hybrid.itertuples()):
    ax.text(
        row.hybrid_score + 0.5,
        i,
        f'#{row.hybrid_position}',
        ha='left', va='center', fontsize=12, fontweight='bold', color='black'
    )

# Improve layout and labels
ax.set_title('Final Hybrid Model Predictions - Tour de France 2025 (Top 15)', fontsize=18, fontweight='bold')
ax.set_xlabel('Hybrid Score (Lower is Better)', fontsize=12)
ax.set_ylabel('Rider (Nationality)', fontsize=12)

# Adjust the x-axis limit to give labels more space
ax.set_xlim(right=max(top_15_hybrid['hybrid_score']) * 1.2)

# Move the legend outside of the plot area
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0, title='Team')

plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

In [None]:
# Visualize the Most Represented Nationalities

import matplotlib.pyplot as plt
import seaborn as sns

# The 'df_2025' DataFrame from your pipeline contains all the rider details.
# We will use it to count the nationalities.
nationality_counts = df_2025['nationality'].value_counts()

# Plot the top 10
plt.figure(figsize=(10, 7))
ax = sns.barplot(
    x=nationality_counts.head(10).values,
    y=nationality_counts.head(10).index,
    palette='viridis',
    hue=nationality_counts.head(10).index,
    legend=False
)

ax.set_title('Top 10 Most Represented Nationalities in the 2025 Peloton', fontsize=16)
ax.set_xlabel('Number of Riders', fontsize=12)
ax.set_ylabel('Country', fontsize=12)

# Add count labels to the bars
for i, v in enumerate(nationality_counts.head(10).values):
    ax.text(v + 0.1, i, str(v), va='center', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Visualise team strength by average predicted rank
import matplotlib.pyplot as plt
import seaborn as sns

# Use 'ml_ranking_2025' from Cell 4, which contains 'team' and 'predicted_rank'
team_strength = ml_ranking_2025.groupby('team')['predicted_rank'].mean().sort_values(ascending=True)

# Plot the top 10 strongest teams
plt.figure(figsize=(10, 7))
ax = sns.barplot(
    x=team_strength.head(10).values,
    y=team_strength.head(10).index,
    palette='plasma',
    hue=team_strength.head(10).index,
    legend=False
)

ax.set_title('Top 10 Strongest Teams by Average Predicted Rank (ML Model)', fontsize=16)
ax.set_xlabel('Average Predicted Rank (Lower is Better)', fontsize=12)
ax.set_ylabel('Team', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Visualise age distribution of top contenders
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step 1: Prepare the analysis_df ---
# This dataframe combines the age from df_2025 with the ML model's predicted position.
analysis_df = pd.merge(
    df_2025[['rider_name', 'age']],
    ml_ranking_2025[['rider_name', 'predicted_position']],
    on='rider_name'
)

# --- Step 2: Create the plot ---
# Create a flag for top contenders
analysis_df['is_top_20'] = analysis_df['predicted_position'] <= 20

# Create a box plot to compare age distributions
plt.figure(figsize=(10, 7))
sns.boxplot(
    x='is_top_20',
    y='age',
    hue='is_top_20',
    legend=False,
    data=analysis_df,
    palette='coolwarm'
)
plt.title('Age Distribution: Top 20 vs. The Rest of the Peloton', fontsize=16)
plt.xlabel('', fontsize=12)
plt.ylabel('Age', fontsize=12)
plt.xticks(ticks=[0, 1], labels=['Rest of Peloton', 'Predicted Top 20'])
plt.show()

# --- Step 3: Print the exact average ages ---
avg_age_top20 = analysis_df[analysis_df['is_top_20']]['age'].mean()
avg_age_rest = analysis_df[~analysis_df['is_top_20']]['age'].mean()
print(f"Average age of predicted Top 20: {avg_age_top20:.1f} years")
print(f"Average age of the rest of the peloton: {avg_age_rest:.1f} years")