### Imports

In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### Base functions

In [5]:
def chunks(lst, n, start):
    """Return the single slice ``lst[start:start + n]``.

    Despite the name this is not a generator: only the one chunk that begins
    at ``start`` is produced.  ``None`` is returned when
    ``range(start, len(lst), n)`` is empty, which for a positive ``n`` means
    ``start`` lies at or beyond the end of ``lst``.
    """
    # An empty range is falsy; a zero step still raises ValueError here.
    if not range(start, len(lst), n):
        return None
    return lst[start:start + n]

### Scrape team status before the tour

In [6]:
def clean_scrape_teams(row, year, race):
    """Turn one group of table cells into a flat team record.

    ``row`` is indexed at positions 1-3 and the ``.text`` of each element is
    read; position 0 is ignored.  Leading whitespace is stripped from the
    team name only.  ``year`` and ``race`` are copied into the record as-is.

    Returns:
        dict with keys ``team``, ``team_class``, ``team_points``, ``year``
        and ``race``.
    """
    return {
        'team': row[1].text.lstrip(),
        'team_class': row[2].text,
        'team_points': row[3].text,
        'year': year,
        'race': race,
    }

In [7]:
def get_teams(race):
    """Scrape the ranked-teams startlist of ``race`` for 2016-2022 into a CSV.

    For every year the ``teams-ranked`` startlist page is downloaded, all of
    its ``<td>`` cells are split into groups of four, and each group is turned
    into a record by ``clean_scrape_teams``.  The records of all years are
    concatenated and written to ``data/data_teams_{race}.csv``.

    Args:
        race: Race slug inserted into the procyclingstats URL, e.g.
            ``'tour-de-france'``.
    """
    years = [2016, 2017, 2018, 2019, 2020, 2021, 2022]
    master_df_ls = []
    for year in years:
        url = f'https://www.procyclingstats.com/race/{race}/{year}/gc/startlist/teams-ranked'
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        td = soup.find_all('td')
        ls = []
        for chunk in range(0, len(td), 4):
            try:
                di = clean_scrape_teams(chunks(td, 4, chunk), year, race)
            except IndexError:
                # The group has too few cells to index positions 1-3; skip it.
                continue
            ls.append(di)
        # Build the frame once per year, after every group has been parsed.
        # Doing this inside the chunk loop would append a growing cumulative
        # frame for each group and duplicate rows in the final CSV.
        sub_df = pd.DataFrame.from_dict(ls)
        master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv(f'data/data_teams_{race}.csv', index=False)

# Export the team rankings for each of the three races.
for race in ('giro-d-italia', 'vuelta-a-espana', 'tour-de-france'):
    get_teams(race)

### Get age of participants

In [8]:
def clean_age(row, year, race):
    """Turn one group of table cells into a flat rider-age record.

    ``row`` is indexed at positions 1-3 and the ``.text`` of each element is
    read; position 0 is ignored.  Leading whitespace is stripped from the
    rider name, and only the first two characters of the age cell are kept.

    Returns:
        dict with keys ``name``, ``team``, ``age``, ``year`` and ``race``.
    """
    return {
        'name': row[1].text.lstrip(),
        'team': row[2].text,
        'age': row[3].text[:2],
        'year': year,
        'race': race,
    }

In [9]:
def get_age(race):
    """Scrape the youngest-competitors startlist of ``race`` for 2016-2022.

    For every year the ``youngest-competitors`` startlist page is downloaded,
    all of its ``<td>`` cells are split into groups of four, and each group is
    turned into a record by ``clean_age``.  The records of all years are
    concatenated and written to ``data/data_age_{race}.csv``.

    Args:
        race: Race slug inserted into the procyclingstats URL, e.g.
            ``'tour-de-france'``.
    """
    years = [2016, 2017, 2018, 2019, 2020, 2021, 2022]
    master_df_ls = []
    for year in years:
        url = f'https://www.procyclingstats.com/race/{race}/{year}/gc/startlist/youngest-competitors'
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        td = soup.find_all('td')
        ls = []
        for chunk in range(0, len(td), 4):
            try:
                di = clean_age(chunks(td, 4, chunk), year, race)
            except IndexError:
                # The group has too few cells to index positions 1-3; skip it.
                continue
            ls.append(di)
        # Build the frame once per year, after every group has been parsed.
        # Doing this inside the chunk loop would append a growing cumulative
        # frame for each group and duplicate rows in the final CSV.
        sub_df = pd.DataFrame.from_dict(ls)
        master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv(f'data/data_age_{race}.csv', index=False)
    
# Export the rider ages for each of the three races.
for race in ('giro-d-italia', 'vuelta-a-espana', 'tour-de-france'):
    get_age(race)

### Get rider profiles

In [10]:
def clean_scrape_profile(row, category, year, race):
    """Turn one group of table cells into a flat rider-profile record.

    ``row`` is indexed at positions 1-4 and the ``.text`` of each element is
    read; position 0 is ignored.  Leading whitespace is stripped from the
    rider name.  ``category`` is stored under the ``type`` key.  ``race`` is
    accepted but not written into the record.

    Returns:
        dict with keys ``name``, ``team``, ``points``, ``ranking``, ``type``
        and ``year``.
    """
    return {
        'name': row[1].text.lstrip(),
        'team': row[2].text,
        'points': row[3].text,
        'ranking': row[4].text,
        'type': category,
        'year': year,
    }

In [11]:
def get_profile(race):
    """Scrape four rider-profile startlist rankings of ``race`` for 2016-2022.

    For each year and each of the four ranking categories, the filtered
    startlist page is downloaded, its ``<td>`` cells are split into groups of
    five, and every group is converted by ``clean_scrape_profile``.  All
    records are concatenated and written to ``data/data_profile_{race}.csv``.
    """
    frames = []
    for year in (2016, 2017, 2018, 2019, 2020, 2021, 2022):
        for category in ('top-gc-riders', 'top-sprinters',
                         'top-time-trial-riders', 'best-classic-riders'):
            url = f'https://www.procyclingstats.com/race.php?missing=0&filter=Filter&id1={race}&id2={year}&id3=gc&id4=startlist&id5={category}'
            page = requests.get(url)
            cells = BeautifulSoup(page.content, 'html.parser').find_all('td')
            # One record per group of five consecutive cells.
            records = [
                clean_scrape_profile(chunks(cells, 5, offset), category, year, race)
                for offset in range(0, len(cells), 5)
            ]
            frames.append(pd.DataFrame.from_dict(records))
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(f'data/data_profile_{race}.csv', index=False)

# Export the rider profiles for each of the three races.
for race in ('giro-d-italia', 'vuelta-a-espana', 'tour-de-france'):
    get_profile(race)

### Get number of finishes

In [12]:
def get_finishes(race):
    """Scrape the most-starts-finishes startlist of ``race`` for 2017-2022.

    For each year the body rows of the ``table.basic`` element are read.  For
    every row the rider name comes from cell 1 and ``finished`` is cell 3
    divided by cell 2 of the *first* row.  All records are concatenated and
    written to ``data/data_finishes_{race}.csv``.
    """
    frames = []
    for year in (2017, 2018, 2019, 2020, 2021, 2022):
        url = f'https://www.procyclingstats.com/race/{race}/{year}/stage-1/startlist/most-starts-finishes'
        page = requests.get(url)
        table = BeautifulSoup(page.content, 'html.parser').find('table', class_='basic')
        tr = table.tbody.find_all('tr')
        records = []
        for table_row in tr:
            cells = table_row.find_all('td')
            rider = cells[1].text.strip()
            # NOTE(review): the denominator is always taken from the first
            # row, not from the current rider's row -- confirm this is the
            # intended normalisation.
            ratio = int(cells[3].text) / int(tr[0].find_all('td')[2].text)
            records.append({
                'rider': rider,
                'finished': ratio,
                'year': str(year),
                'race': race,
            })
        frames.append(pd.DataFrame.from_dict(records))
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(f'data/data_finishes_{race}.csv', index=False)

# Export the finish ratios for each of the three races.
for race in ('giro-d-italia', 'vuelta-a-espana', 'tour-de-france'):
    get_finishes(race)

In [14]:
def clean_scrape_performance(row, year):
    """Turn one group of table cells into a previous-performance record.

    ``row`` is indexed at positions 1-7 and the ``.text`` of each element is
    read; position 0 is ignored.  The rider name is stripped, split on
    whitespace, and every word is passed through ``str.capitalize`` before
    being re-joined with single spaces.

    Args:
        row: Sequence of at least eight cell objects exposing ``.text``.
        year: Value stored unchanged under the ``year`` key.

    Returns:
        dict with keys ``name``, ``gc_result``, ``gc_w``, ``stage_result``,
        ``stage_wins``, ``top10``, ``gc_total`` and ``year``.

    Raises:
        IndexError: If ``row`` has fewer than eight elements.
    """
    name = ' '.join(word.capitalize() for word in row[1].text.split())
    return {
        'name': name,
        'gc_result': row[2].text,
        'gc_w': row[3].text,
        'stage_result': row[4].text,
        'stage_wins': row[5].text,
        # Previously read from row[5] again, which duplicated ``stage_wins``
        # and left cell 6 unused.  Assumes cell 6 is the top-10 column --
        # TODO confirm against the page layout.
        'top10': row[6].text,
        'gc_total': row[7].text,
        'year': year,
    }

In [16]:
def get_performance(race):
    """Scrape the previous-performance startlist of ``race`` for 2017-2022.

    For each year the page's ``<td>`` cells are split into groups of nine and
    every group is converted by ``clean_scrape_performance``.  All records are
    concatenated and written to ``data_performance_{race}.csv`` in the current
    working directory.
    """
    frames = []
    for year in (2017, 2018, 2019, 2020, 2021, 2022):
        url = f'https://www.procyclingstats.com/race/{race}/{year}/gc/startlist/previous-performance'
        page = requests.get(url)
        cells = BeautifulSoup(page.content, 'html.parser').find_all('td')
        # One record per group of nine consecutive cells.
        records = [
            clean_scrape_performance(chunks(cells, 9, offset), year)
            for offset in range(0, len(cells), 9)
        ]
        frames.append(pd.DataFrame.from_dict(records))
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(f'data_performance_{race}.csv', index=False)


# Export the previous-performance tables for each of the three races.
for race in ('giro-d-italia', 'vuelta-a-espana', 'tour-de-france'):
    get_performance(race)

In [17]:
def clean_scrape_stages(row, year, stage):
    """Turn one group of stage-result elements into a flat record.

    Args:
        row: Sequence whose first element exposes ``.text``.
        year: Used to build the ``key`` field.
        stage: Used to build the ``key`` field.

    Returns:
        dict with keys ``rnk``, ``name`` and ``key``, where ``key`` has the
        form ``"<year>-<stage>-stage"``.  The original built this dict but
        never returned it, so callers received ``None``.
    """
    di = {}
    rnk = row[0].text
    # NOTE(review): ``name`` reads the same element as ``rnk``; this looks
    # unfinished, but the correct source element cannot be determined from
    # this file -- confirm against the page layout.
    name = row[0].text
    di['rnk'] = rnk
    di['name'] = name
    di['key'] = str(year) + '-' + str(stage) + '-stage'
    return di

In [18]:
def get_stages(race):
    """Scrape stage pages 20 and 21 of ``race`` for 2017-2022 into a CSV.

    For each year/stage the page URL is printed and downloaded, all ``<tr>``
    elements found with ``class_=None`` are split into groups of four, and
    each group is passed to ``clean_scrape_stages``.  The group offsets and a
    completion marker are printed as progress output.  All results are
    concatenated and written to ``data_stages_{race}.csv`` in the current
    working directory.
    """
    group_size = 4
    frames = []
    for year in (2017, 2018, 2019, 2020, 2021, 2022):
        for stage in range(20, 22):
            url = f'https://www.procyclingstats.com/race/{race}/{year}/stage-{stage}'
            print(url)
            page = requests.get(url)
            rows = BeautifulSoup(page.content, 'html.parser').find_all('tr', class_=None)
            offsets = np.arange(0, len(rows), group_size)
            print(offsets)
            records = [
                clean_scrape_stages(chunks(rows, group_size, offset), year, stage)
                for offset in offsets
            ]
            frames.append(pd.DataFrame.from_dict(records))
            print(f"{year}-{stage}—done-norm")
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(f'data_stages_{race}.csv', index=False)
    
# Export the stage results for each of the three races.
for race in ('giro-d-italia', 'vuelta-a-espana', 'tour-de-france'):
    get_stages(race)

https://www.procyclingstats.com/race/giro-d-italia/2017/stage-20
[  0   4   8  12  16  20  24  28  32  36  40  44  48  52  56  60  64  68
  72  76  80  84  88  92  96 100 104 108 112 116 120 124 128 132 136 140
 144 148 152 156 160 164 168 172 176 180 184 188 192 196 200 204 208 212
 216 220 224 228 232 236 240 244 248 252 256 260 264 268 272 276 280 284
 288 292 296 300 304 308 312 316 320 324 328 332 336 340 344 348 352 356
 360 364 368 372 376 380 384 388 392 396 400 404 408 412 416 420 424 428
 432 436 440 444 448 452 456 460 464 468 472 476 480 484 488 492 496 500
 504 508 512 516 520 524 528 532 536 540 544 548 552 556 560 564 568 572]
2017-20—done-norm
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-21
[  0   4   8  12  16  20  24  28  32  36  40  44  48  52  56  60  64  68
  72  76  80  84  88  92  96 100 104 108 112 116 120 124 128 132 136 140
 144 148 152 156 160 164 168 172 176 180 184 188 192 196 200 204 208 212
 216 220 224 228 232 236 240 244 248 252 256 260