In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import math
import csv
import re

from dotenv import load_dotenv

# Scraping

### Environment variables

In [2]:
# Load variables from a .env file (if one is found) into os.environ before the lookups below.
load_dotenv()

# Data locations, all supplied through the environment.
# os.environ[...] (rather than .get) raises KeyError as soon as a variable is missing.
BASE_PATH = os.environ['BASE_PATH']  # folder holding rider_names.csv
RESULTS_PATH = os.environ['RESULTS_PATH']  # per-rider results: <rider>/<year>.csv
RANKINGS_PATH = os.environ['RANKINGS_PATH']
SPECIALITY_RANKINGS_PATH = os.environ['SPECIALITY_RANKINGS_PATH']  # e.g. ITT/<year>/<date>/itt.csv
TEAMS_PATH = os.environ['TEAMS_PATH']
CALENDARS_PATH = os.environ['CALENDARS_PATH']
STARTLISTS_PATH = os.environ['STARTLISTS_PATH']
RACERESULTS_PATH = os.environ['RACERESULTS_PATH']
IMG_PATH = os.environ['IMG_PATH']
RIDERSTATS_PATH = os.environ['RIDERSTATS_PATH']
RACENAMES_PATH = os.environ['RACENAMES_PATH']

# Load data

In [3]:
# Iterating a DataFrame yields its column labels, so this collects the header
# of rider_names.csv rather than its rows.
# NOTE(review): confirm the rider names really are stored in the header row.
rider_names = pd.read_csv(
    os.path.join(BASE_PATH, 'rider_names.csv')
).columns.tolist()

# Goals

- Compute the average speed of top professionals in individual time trials (ITTs) for a given type of course (distance, elevation)
- Select which riders count as top professionals
- Select junior riders
- Compute the junior riders' ITT speed and compare it with the professionals'
- For successful ITT specialists, visualize the progression from junior to pro

# Top pros

- top 20 riders by PCS points

In [4]:
# Ranking snapshot to analyse, stored as <root>/ITT/<year>/<date>/itt.csv.
year = '2022'
date = '2022-03-25'

itt_ranking_file = os.path.join(SPECIALITY_RANKINGS_PATH, 'ITT', year, date, 'itt.csv')
itt_ranking = pd.read_csv(itt_ranking_file)

# Preview the first five riders of the ranking.
itt_ranking['Rider'].head(5)

0         Küng Stefan
1       Ganna Filippo
2      Asgreen Kasper
3    Bissegger Stefan
4       Roglič Primož
Name: Rider, dtype: object

In [5]:
def normalize_name(name: str) -> str:
    """Convert a rider name into a lowercase, hyphen-joined, reversed form.

    The whitespace-separated words of ``name`` are reversed, joined with
    ``'-'`` and lower-cased, e.g. ``'Küng Stefan'`` -> ``'stefan-küng'``.

    Leading, trailing and repeated whitespace is ignored, so
    ``'  Küng   Stefan '`` gives the same result instead of stray hyphens.

    NOTE(review): every word is reversed individually, so a name with a
    multi-word surname such as ``'Van Aert Wout'`` becomes
    ``'wout-aert-van'`` -- confirm this matches the expected naming.

    Args:
        name: Rider name as whitespace-separated words.

    Returns:
        The words in reverse order, joined by hyphens, in lower case.
        An empty or whitespace-only name yields an empty string.
    """
    # split() without an argument drops empty tokens caused by extra whitespace.
    return '-'.join(reversed(name.split())).lower()

In [6]:
# Load one season of results for the first rider listed in itt_ranking.
year = '2022'
rider_name = normalize_name(itt_ranking['Rider'][0])

# Results are stored per rider and season: <RESULTS_PATH>/<rider_name>/<year>.csv
results_file = os.path.join(RESULTS_PATH, rider_name, f'{year}.csv')
df = pd.read_csv(results_file, encoding='UTF-8')

In [7]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Date,Result,GC,Race,Distance,PointsPCS,PointsUCI
0,25.03,3.0,,E3 Saxo Bank Classic (1.UWT),203.9,110.0,260.0
1,06.03 » 13.03,,,Paris - Nice (2.UWT),,,
2,,10.0,,Points classification,,,
3,,21.0,,General classification,,18.0,20.0
4,13.03,6.0,,Stage 8 - Nice › Nice,115.6,7.0,
5,12.03,38.0,33.0,Stage 7 - Nice › Col de Turini,155.2,,
6,11.03,66.0,32.0,Stage 6 - CourthÃ©zon › Aubagne,213.6,,
7,10.03,49.0,26.0,Stage 5 - Saint-Just-Saint-Rambert › Saint-Sau...,188.8,,
8,09.03,4.0,24.0,Stage 4 (ITT) - DomÃ©rat › MontluÃ§on,13.4,13.0,
9,08.03,40.0,45.0,Stage 3 - Vierzon › Dun-le-Palestel,190.8,,


### Correct encoding errors

In [8]:
# UTF-8 text decoded with a single-byte codec shows up as 'Ã' followed by one
# more character; map each such pair back to the letter it stands for.
# Keys are regular expressions (regex=True below). The two-character pairs are
# listed before the 'Ã' fallback -- keep that order.
replace_dict = {'Ã©': 'é',
                'Ã§': 'ç',
                'Ã³': 'ó',
                'Ã´': 'ô',
                'Ã£': 'ã',
                # 'É' is the byte pair C3 89: the second byte decodes to U+0089
                # (Latin-1) or U+2030 '‰' (cp1252). Consume it when present so
                # no stray character is left behind the restored 'É'.
                'Ã[\x89\u2030]?': 'É'}

# Reassign instead of mutating in place; later cells keep using the name `df`.
df = df.replace(replace_dict, regex=True)

### Clean data

In [12]:
import sys

# Make the helper modules in ../scripts importable. The membership check keeps
# sys.path free of duplicate entries when this cell is run more than once.
SCRIPTS_DIR = '../scripts'
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(1, SCRIPTS_DIR)

In [13]:
# NOTE(review): wildcard import -- clean_data, called in the next cell, is presumably
# defined in data_prep_utils; an explicit import list would make that visible.
from data_prep_utils import *

In [14]:
# Clean the raw results for this rider and season.
# save=False presumably skips writing the cleaned data to disk -- TODO confirm in data_prep_utils.
# NOTE(review): the return value is only displayed, not assigned, yet later cells read
# the cleaned columns from `df`, so clean_data appears to modify `df` in place -- confirm.
clean_data(df, rider_name, year, save=False)

Unnamed: 0,FromDate,Result,GC,Race,Distance,PointsPCS,PointsUCI,Type,ToDate,RaceName
0,2022-03-25,3,,E3 Saxo Bank Classic (1.UWT),203.9,110.0,260.0,OneDayRace,2022-03-25,E3 Saxo Bank Classic (1.UWT)
1,2022-03-06,0,,Paris - Nice (2.UWT),,,,StageRace,2022-03-13,Paris - Nice (2.UWT)
2,2022-03-13,10,,Points classification,,,,StageRace,2022-03-13,Paris - Nice (2.UWT)
3,2022-03-13,21,,General classification,,18.0,20.0,StageRace,2022-03-13,Paris - Nice (2.UWT)
4,2022-03-13,6,,Stage 8 - Nice › Nice,115.6,7.0,,StageRace,2022-03-13,Paris - Nice (2.UWT)
5,2022-03-12,38,33.0,Stage 7 - Nice › Col de Turini,155.2,,,StageRace,2022-03-12,Paris - Nice (2.UWT)
6,2022-03-11,66,32.0,Stage 6 - Courthézon › Aubagne,213.6,,,StageRace,2022-03-11,Paris - Nice (2.UWT)
7,2022-03-10,49,26.0,Stage 5 - Saint-Just-Saint-Rambert › Saint-Sau...,188.8,,,StageRace,2022-03-10,Paris - Nice (2.UWT)
8,2022-03-09,4,24.0,Stage 4 (ITT) - Domérat › Montluçon,13.4,13.0,,StageRace,2022-03-09,Paris - Nice (2.UWT)
9,2022-03-08,40,45.0,Stage 3 - Vierzon › Dun-le-Palestel,190.8,,,StageRace,2022-03-08,Paris - Nice (2.UWT)


### Select ITTs

In [15]:
# Keep the time trials: rows whose Race text contains 'ITT' or 'Chrono des Nations'.
# na=False counts rows with a missing Race as non-matches, so the mask stays
# strictly boolean and is always valid for indexing.
df[df['Race'].str.contains('ITT|Chrono des Nations', na=False)]

Unnamed: 0,FromDate,Result,GC,Race,Distance,PointsPCS,PointsUCI,Type,ToDate,RaceName
8,2022-03-09,4,24.0,Stage 4 (ITT) - Domérat › Montluçon,13.4,13.0,,StageRace,2022-03-09,Paris - Nice (2.UWT)
17,2022-02-19,2,5.0,Stage 4 (ITT) - Vila Real de Santo António › T...,32.2,18.0,10.0,StageRace,2022-02-19,Volta ao Algarve em Bicicleta (2.Pro)


In [21]:
# Print the RaceName of every time-trial row, one per line.
# na=False keeps the mask boolean even when a row has no Race value.
itt_rows = df[df['Race'].str.contains('ITT|Chrono des Nations', na=False)]

# Iterating the column directly avoids building a Series per row with iterrows().
for race_name in itt_rows['RaceName']:
    print(race_name)


Paris - Nice (2.UWT)
Volta ao Algarve em Bicicleta (2.Pro)
