In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [105]:
def scrape_participants(tour, year):
    # define url for startlist
    url = f'https://www.procyclingstats.com/race/{tour}/{year}/stage-21/startlist'
    
    #scrape page
    response = requests.get(url).content
    soup = BeautifulSoup(response)
    
    #get all_teams
    all_teams = soup.find_all('li', class_='team')
    
    #loop over teams
    master_ls = []
    for t in all_teams:
        team = t.a.text
        riders = t.find_all('a', class_='blue')
        for r in riders:
            dict = {}
            rider = r.span.text
            href = r['href']
            dict['rider'] = href.split('/')[-1]
            dict['team'] = team
            dict['href'] = href
            dict['tour'] = tour
            dict['year'] = year
            master_ls.append(dict)
            
    return master_ls

In [106]:
scrape_participants("tour-de-france", 2021)

[{'rider': 'tadej-pogacar',
  'team': 'UAE Team Emirates',
  'href': 'rider/tadej-pogacar',
  'tour': 'tour-de-france',
  'year': 2021},
 {'rider': 'mikkel-bjerg',
  'team': 'UAE Team Emirates',
  'href': 'rider/mikkel-bjerg',
  'tour': 'tour-de-france',
  'year': 2021},
 {'rider': 'rui-costa',
  'team': 'UAE Team Emirates',
  'href': 'rider/rui-costa',
  'tour': 'tour-de-france',
  'year': 2021},
 {'rider': 'davide-formolo',
  'team': 'UAE Team Emirates',
  'href': 'rider/davide-formolo',
  'tour': 'tour-de-france',
  'year': 2021},
 {'rider': 'marc-hirschi',
  'team': 'UAE Team Emirates',
  'href': 'rider/marc-hirschi',
  'tour': 'tour-de-france',
  'year': 2021},
 {'rider': 'vegard-stake-laengen',
  'team': 'UAE Team Emirates',
  'href': 'rider/vegard-stake-laengen',
  'tour': 'tour-de-france',
  'year': 2021},
 {'rider': 'rafal-majka',
  'team': 'UAE Team Emirates',
  'href': 'rider/rafal-majka',
  'tour': 'tour-de-france',
  'year': 2021},
 {'rider': 'brandon-mcnulty',
  'team': '

In [178]:
def scrape_performance(rider, endpoint, year):
    
    #set up
    base_url = 'https://www.procyclingstats.com/'
    url = base_url+endpoint+'/'+str(year)
    
    response = requests.get(url).content
    soup = BeautifulSoup(response)
    
    result_ls = []
    
    #get stage_race results
    stage_races = soup.find_all('tr', {'data-main': '0'})
    
    for o in stage_races:
        dict = {}
        o = o.find_all('td')
        dict['year'] = str(year)
        dict['type'] = 'etappe'
        dict['date'] = o[0].text
        if len(dict['date']) == 0:
            dict['type'] = 'gc'
        dict['result'] = o[1].text
        dict['gc'] = o[2].text
        try:
            dict['icon'] = o[3].find('span', class_='icon')['class'][-1]
        except TypeError:
            dict['icon'] = 'stage'
        dict['race_ref'] = o[4].a['href']
        dict['race_name'] = dict['race_ref'].split('/')[1]
        dict['race_detail'] = o[4].a.text
        dict['race_rank'] = o[4].a.span
        dict['distance'] = o[5].text
        result_ls.append(dict)
    
    #get one day race results
    one_day_races = soup.find_all('tr', {'data-main': '1'})
    
    for o in one_day_races:
        dict = {}
        o = o.find_all('td')
        dict['year'] = str(year)
        dict['type'] = 'one_day'
        dict['date'] = o[0].text
        dict['result'] = o[1].text
        dict['gc'] = o[2].text
        try:
            dict['icon'] = o[3].find('span', class_='icon')['class'][-1]
        except TypeError:
            dict['icon'] = 'stage'
        dict['race_ref'] = o[4].a['href']
        dict['race_name'] = dict['race_ref'].split('/')[1]
        dict['race_detail'] = o[4].a.text
        dict['race_rank'] = o[4].a.span
        dict['distance'] = o[5].text
        result_ls.append(dict)
    
    return result_ls

In [179]:
tadej = scrape_performance('tadej-pogacar', 'rider/tadej-pogacar', 2021)

In [180]:
tadej_df = pd.DataFrame(tadej)
tadej_df.tail(40)

Unnamed: 0,year,type,date,result,gc,icon,race_ref,race_name,race_detail,race_rank,distance
44,2021,gc,,1,,st7,race/tirreno-adriatico/2021/stage-7-kom,tirreno-adriatico,Mountains classification,,
45,2021,gc,,2,,st5,race/tirreno-adriatico/2021/stage-7-points,tirreno-adriatico,Points classification,,
46,2021,gc,,1,,st4,race/tirreno-adriatico/2021/gc,tirreno-adriatico,General classification,,
47,2021,etappe,16.03,4,,chrono,race/tirreno-adriatico/2021/stage-7,tirreno-adriatico,Stage 7 (ITT) - San Benedetto del Tronto › San...,,10.1
48,2021,etappe,15.03,25,1.0,stage,race/tirreno-adriatico/2021/stage-6,tirreno-adriatico,Stage 6 - Castelraimondo › Lido di Fermo,,169.0
49,2021,etappe,14.03,2,1.0,stage,race/tirreno-adriatico/2021/stage-5,tirreno-adriatico,Stage 5 - Castellalto › Castelfidardo,,205.0
50,2021,etappe,13.03,1,1.0,stage,race/tirreno-adriatico/2021/stage-4,tirreno-adriatico,Stage 4 - Terni › Prati di Tivo,,148.0
51,2021,etappe,12.03,8,5.0,stage,race/tirreno-adriatico/2021/stage-3,tirreno-adriatico,Stage 3 - Monticiano › Gualdo Tadino,,219.0
52,2021,etappe,11.03,4,9.0,stage,race/tirreno-adriatico/2021/stage-2,tirreno-adriatico,Stage 2 - Camaiore › Chiusdino,,202.0
53,2021,etappe,10.03,29,31.0,stage,race/tirreno-adriatico/2021/stage-1,tirreno-adriatico,Stage 1 - Lido di Camaiore › Lido di Camaiore,,156.0


In [164]:
tadej[len(tadej_df['date']) > 5]

{'year': '2021',
 'type': 'gc',
 'date': '',
 'result': '1',
 'gc': '',
 'icon': 'st7',
 'race_ref': 'race/tour-de-france/2021/stage-21-kom',
 'race_name': 'tour-de-france',
 'race_detail': 'Mountains classification',
 'race_rank': '',
 'distance': ''}

In [67]:
base_url = 'https://www.procyclingstats.com/'

endpoint = 'rider/tadej-pogacar/'

year = '2021'

rider = 'POGAČAR Tadej'

url = base_url+endpoint+year

In [68]:
response = requests.get(url).content

soup = BeautifulSoup(response)

In [77]:
stage_races = soup.find_all('tr', {'data-main': '0'})

one_day_races = soup.find_all('tr', {'data-main': '1'})

In [88]:
master_ls = []
for o in one_day_races:
    dict = {}
    o = o.find_all('td')
    dict['type'] = 'one_day'
    dict['day'] = o[0].text
    dict['result'] = o[1].text
    dict['race_ref'] = o[4].a['href']
    dict['race_name'] = o[4].a.text
    dict['race_rank'] = o[4].find_all('span')[-1].text
    dict['distance'] = o[5].text
    master_ls.append(dict)

In [89]:
pd.DataFrame(master_ls)

Unnamed: 0,day,result,race_ref,race_name,race_rank,distance
0,09.10,1,race/il-lombardia/2021/result,Il Lombardia (1.UWT),(1.UWT),239.0
1,06.10,4,race/milano-torino/2021/result,Milano - Torino (1.Pro),51k,190.0
2,05.10,3,race/tre-valli-varesine/2021/result,Tre Valli Varesine (1.Pro),86k,196.7
3,02.10,DNF,race/giro-dell-emilia/2021/result,Giro dell'Emilia (1.Pro),(1.Pro),195.3
4,26.09,37,race/world-championship/2021/result,World Championships - Road Race (WC),(WC),268.3
5,19.09,10,race/world-championship-itt/2021/result,World Championships - ITT (WC),(WC),43.3
6,12.09,5,race/uec-road-european-championships/2021/result,European Continental Championships - Road Race...,68k,179.2
7,09.09,12,race/uec-road-european-championships-itt/2021/...,European Continental Championships - ITT (CC),(CC),22.4
8,29.08,DNF,race/bretagne-classic/2021/result,Bretagne Classic - Ouest-France (1.UWT),(1.UWT),251.0
9,24.07,3,race/olympic-games/2021/result,Olympic Games Road Race (Olympics),(Olympics),234.0


In [113]:
stage_races[0].find('span', class_='icon')['class'][-1]

'st6'

In [90]:
master_ls = []
for o in stage_races:
    dict = {}
    o = o.find_all('td')
    dict['type'] = 'stage_race'
    dict['day'] = o[0].text
    dict['result'] = o[1].text
    dict['race_ref'] = o[4].a['href']
    dict['race_name'] = o[4].a.text
    dict['race_rank'] = o[4].find_all('span')[-1].text
    dict['distance'] = o[5].text
    master_ls.append(dict)

In [91]:
pd.DataFrame(master_ls)

Unnamed: 0,day,result,race_ref,race_name,race_rank,distance
0,,1,race/tour-de-france/2021/stage-21-youth,Youth classification,,
1,,1,race/tour-de-france/2021/stage-21-kom,Mountains classification,,
2,,8,race/tour-de-france/2021/stage-21-points,Points classification,,
3,,1,race/tour-de-france/2021/gc,General classification,,
4,18.07,72,race/tour-de-france/2021/stage-21,Stage 21 - Chatou › Paris Champs-Élysées,,108.4
...,...,...,...,...,...,...
59,25.02,2,race/uae-tour/2021/stage-5,Stage 5 - Fujairah Marine Club › Jebel Jais,,170
60,24.02,20,race/uae-tour/2021/stage-4,Stage 4 - Al Marjan Island › Al Marjan Island,,204
61,23.02,1,race/uae-tour/2021/stage-3,Stage 3 - Strata Manufactoring › Jebel Hafeet,,166
62,22.02,4,race/uae-tour/2021/stage-2,Stage 2 (ITT) - Al Hudayriat Island › Al Huday...,,13
