In [None]:
# Scrape the Final Master Startlist from ProCyclingStats (first July week)

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

def scrape_latest_startlist(year=2025):
    """
    Scrape the Tour de France master startlist for ``year`` from ProCyclingStats.

    Only the team name, the rider name and the rider profile URL are collected.

    Args:
        year: Edition of the race; interpolated into the startlist URL.

    Returns:
        A DataFrame with the columns ``team``, ``rider_name`` and
        ``rider_url`` (one row per recognised rider link). An empty DataFrame
        without columns is returned when the request fails, the startlist
        container is not found, or no rider links are recognised.
    """
    from urllib.parse import urljoin

    base_url = "https://www.procyclingstats.com/"
    url = f"https://www.procyclingstats.com/race/tour-de-france/{year}/startlist"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    print(f"Fetching LATEST startlist from: {url}")
    try:
        # requests has no default timeout; without one a stalled server
        # would block the notebook indefinitely.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.content, "html.parser")
    all_riders = []
    team_list_container = soup.find('ul', class_='startlist_v4')
    if not team_list_container:
        print("Could not find startlist container. Website structure may have changed.")
        return pd.DataFrame()

    # Only direct <li> children are teams; nested list items are skipped.
    teams = team_list_container.find_all('li', recursive=False)
    print(f"Found {len(teams)} teams. Processing...")

    for team in teams:
        # A list item without an <h4> heading (or without a link inside it)
        # carries no team name, so it is skipped instead of raising.
        heading = team.find('h4')
        team_name_tag = heading.find('a') if heading is not None else None
        if team_name_tag is None:
            continue
        team_name = team_name_tag.text.strip()

        for rider_tag in team.find_all('a'):
            rider_name = rider_tag.text.strip()
            # Heuristic: a rider link has a multi-word text that differs from
            # the team name. A stripped text containing a space is always
            # longer than two characters, so no separate length check is needed.
            if " " not in rider_name or rider_name == team_name:
                continue
            rider_href = rider_tag.get('href')
            if not rider_href:
                # An anchor without a target cannot be followed later.
                continue
            all_riders.append({
                "team": team_name,
                "rider_name": rider_name,
                # urljoin copes with relative ("rider/x"), root-relative
                # ("/rider/x") and absolute hrefs alike.
                "rider_url": urljoin(base_url, rider_href)
            })

    if not all_riders:
        print("No riders were scraped.")
        return pd.DataFrame()

    return pd.DataFrame(all_riders)

# --- Step 1 driver: fetch the current startlist and report the outcome ---
print("--- Step 1: Scraping latest official startlist ---")
latest_startlist_df = scrape_latest_startlist()

# An empty frame is the scraper's signal that nothing could be collected.
if latest_startlist_df.empty:
    print("❌ Scraping failed.")
else:
    print(f"✅ Success! Found {len(latest_startlist_df)} riders on the latest list.")
    display(latest_startlist_df.head())

In [None]:
# Compare, Enrich, and Create Final Dataset

def _first_measurement(texts, unit):
    """Return the first number directly followed by ``unit`` in ``texts`` as a float, else None."""
    import re

    pattern = re.compile(r'(\d+(?:\.\d+)?)\s*' + re.escape(unit) + r'\b')
    for text in texts:
        match = pattern.search(text)
        if match:
            return float(match.group(1))
    return None


def _text_after(tag):
    """Return the first non-blank text that follows ``tag`` among its siblings, else None."""
    for sibling in tag.next_siblings:
        if isinstance(sibling, str):
            text = sibling.strip()
        else:
            text = sibling.get_text(strip=True)
        if text:
            return text
        if not isinstance(sibling, str):
            # An element without text (e.g. a line break) ends the field.
            break
    return None


def scrape_rider_details(rider_url):
    """
    Scrape the detailed information for a single rider from their profile page.

    Args:
        rider_url: Absolute URL of the rider's profile page.

    Returns:
        A dict that always contains ``rider_url``. When the info container
        (``div.rdr-info-cont``) is present it also holds ``birthdate``,
        ``nationality``, ``height`` and ``weight`` (each ``None`` when not
        found). One integer entry per specialty found under ``div.pps`` is
        added as well. If the request fails, only ``rider_url`` is returned
        so the caller can identify the failure.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    try:
        # requests has no default timeout; set one so a stalled server
        # cannot hang the whole enrichment loop.
        response = requests.get(rider_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"  - Could not scrape details for {rider_url}. Error: {e}")
        return {'rider_url': rider_url} # Return URL to identify failure

    details = {'rider_url': rider_url}
    try:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract birthdate, nationality, height, weight
        info_div = soup.find('div', class_='rdr-info-cont')
        if info_div is not None:
            texts = list(info_div.stripped_strings)

            # Heuristic: the first text fragment mentioning a 19xx/20xx year
            # is treated as the birthdate; a trailing "(age)" part is dropped.
            # NOTE(review): if the page splits the date across several
            # elements, only one fragment is captured -- confirm on a live page.
            birthdate = next((t for t in texts if '19' in t or '20' in t), None)
            details['birthdate'] = birthdate.split('(')[0].strip() if birthdate else None

            flag_span = info_div.find('span', class_='flag')
            details['nationality'] = _text_after(flag_span) if flag_span is not None else None

            # Match "<number> m" / "<number> kg"; the previous height check
            # demanded an 'm' and an all-digit string at once, so it never matched.
            details['height'] = _first_measurement(texts, 'm')
            details['weight'] = _first_measurement(texts, 'kg')

        # Extract specialties
        specialties_div = soup.find('div', class_='pps')
        if specialties_div is not None:
            # find_all is recursive, so leaf <div>s are visited too; entries
            # lacking a label or a points element are skipped, not fatal.
            for specialty in specialties_div.find_all('div'):
                name_div = specialty.find('div')
                points_div = specialty.find('div', class_='pnt')
                if name_div is None or points_div is None:
                    continue
                try:
                    value = int(points_div.text)
                except ValueError:
                    continue
                details[name_div.text.lower().replace(' ', '_')] = value
    except Exception as e:
        # Best-effort scrape: report the problem but keep what was extracted.
        print(f"  - Could not scrape details for {rider_url}. Error: {e}")

    return details

# --- Main Logic ---
# Reconcile the previously saved detailed rider list with the freshly scraped
# startlist: drop riders no longer listed, scrape details for newly listed
# riders, and write the merged result to a new CSV file.
print("\n--- Step 2: Updating your detailed rider list ---")

# Without a usable startlist the comparison below is meaningless (and the
# column lookup would fail with an opaque KeyError), so stop before anything
# is written.
if latest_startlist_df.empty or 'rider_name' not in latest_startlist_df.columns:
    raise RuntimeError(
        "The latest startlist is empty. Re-run the Step 1 scrape successfully "
        "before updating the detailed rider list."
    )

# 1. Load your existing detailed dataset
# NOTE(review): this input path and the output path in step 6 point to
# different user directories ('anonym' vs 'raclo') -- confirm both are intended.
try:
    existing_detailed_df = pd.read_csv(r'C:\Users\anonym\Documents\Bootcamp\tour-de-france-project\notebooks\tdf_2025_startlist_full_details.csv')
    print(f"Loaded {len(existing_detailed_df)} riders from your existing file.")
except FileNotFoundError:
    print("Error: Could not find your existing detailed CSV file. Please check the path.")
    # An empty frame with the key column makes every scraped rider count as new.
    existing_detailed_df = pd.DataFrame(columns=['rider_name'])


# 2. Identify riders to remove and add
existing_names = set(existing_detailed_df['rider_name'])
latest_names = set(latest_startlist_df['rider_name'])

riders_to_remove = existing_names - latest_names
riders_to_add = latest_names - existing_names

print(f"Riders to REMOVE: {len(riders_to_remove)}. Names: {list(riders_to_remove)}")
print(f"Riders to ADD: {len(riders_to_add)}. Names: {list(riders_to_add)}")


# 3. Remove withdrawn riders
df_after_removals = existing_detailed_df[~existing_detailed_df['rider_name'].isin(riders_to_remove)]


# 4. Scrape details for new riders
new_riders_data = []
if riders_to_add:
    print("\nScraping details for new riders...")
    new_rider_info = latest_startlist_df[latest_startlist_df['rider_name'].isin(riders_to_add)]

    for _, rider in new_rider_info.iterrows():
        print(f" - Scraping {rider['rider_name']}...")
        details = scrape_rider_details(rider['rider_url'])
        # Scraped details win over the basic startlist fields on key clashes.
        full_details = {**rider, **details}
        new_riders_data.append(full_details)
        time.sleep(random.uniform(1, 2)) # Be polite to the server

    new_riders_df = pd.DataFrame(new_riders_data)
else:
    new_riders_df = pd.DataFrame()


# 5. Combine old and new data to create the final, updated DataFrame
# Only non-empty additions are concatenated; passing an empty frame adds no
# rows and can trigger a pandas deprecation warning.
frames_to_combine = [df_after_removals]
if not new_riders_df.empty:
    frames_to_combine.append(new_riders_df)
final_updated_df = pd.concat(frames_to_combine, ignore_index=True)

# 6. Save the final, updated startlist for your pipeline
output_filename = r'C:\Users\raclo\Documents\Bootcamp\tour-de-france-project\notebooks\tdf_2025_startlist_full_details_FINAL.csv'
final_updated_df.to_csv(output_filename, index=False)

print("\n" + "="*50)
print(f"✅ FINAL STARTLIST CREATED! Saved to '{output_filename}'")
print(f"Total riders in final list: {len(final_updated_df)}")
print("This is the file you will use as the input for your main prediction pipeline.")
print("="*50)
display(final_updated_df.head())