In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import itertools
import numpy as np

In [2]:
#scraper

tours = ['tour-de-france', 'giro-d-italia', 'vuelta-a-espana']
#years = [2020, 2019, 2018, 2017, 2016, 2015, 2014]
years = [2021, 2022, 2013, 2012, 2011, 2010]

In [3]:
def scrape_participants(tour, year):
    # define url for startlist
    
    url = f'https://www.procyclingstats.com/race/{tour}/{year}/stage-21/startlist'
    print(url)
    
    #scrape page
    response = requests.get(url).content
    soup = BeautifulSoup(response)
    
    #get all_teams
    all_teams = soup.find_all('li', class_='team')
    
    #loop over teams
    master_ls = []
    for t in all_teams:
        team = t.a.text
        riders = t.find_all('a', class_='blue')
        for r in riders:
            dict = {}
            rider = r.span.text
            href = r['href']
            dict['rider'] = href.split('/')[-1]
            dict['team'] = team
            dict['href'] = href
            dict['tour'] = tour
            dict['year'] = int(year)
            master_ls.append(dict)
            
    return master_ls

In [4]:
participants_ls = []

for y, t in list(itertools.product(years, tours)):
    participants_ls.append(scrape_participants(t, y))

https://www.procyclingstats.com/race/tour-de-france/2021/stage-21/startlist
https://www.procyclingstats.com/race/giro-d-italia/2021/stage-21/startlist
https://www.procyclingstats.com/race/vuelta-a-espana/2021/stage-21/startlist
https://www.procyclingstats.com/race/tour-de-france/2022/stage-21/startlist
https://www.procyclingstats.com/race/giro-d-italia/2022/stage-21/startlist
https://www.procyclingstats.com/race/vuelta-a-espana/2022/stage-21/startlist
https://www.procyclingstats.com/race/tour-de-france/2013/stage-21/startlist
https://www.procyclingstats.com/race/giro-d-italia/2013/stage-21/startlist
https://www.procyclingstats.com/race/vuelta-a-espana/2013/stage-21/startlist
https://www.procyclingstats.com/race/tour-de-france/2012/stage-21/startlist
https://www.procyclingstats.com/race/giro-d-italia/2012/stage-21/startlist
https://www.procyclingstats.com/race/vuelta-a-espana/2012/stage-21/startlist
https://www.procyclingstats.com/race/tour-de-france/2011/stage-21/startlist
https://www.

In [5]:
pd.DataFrame(list(itertools.chain(*participants_ls))).to_csv('new_data/participants_2.csv')

In [6]:
participants_df = pd.DataFrame(list(itertools.chain(*participants_ls)))

In [7]:
from matplotlib.pyplot import text


def scrape_performance(rider, endpoint, year):
    
    #set up
    base_url = 'https://www.procyclingstats.com/'
    url = base_url+endpoint+'/'+str(year)
    
    response = requests.get(url).content
    soup = BeautifulSoup(response)
    
    result_ls = []
    
    #get stage_race results
    stage_races = soup.find_all('tr', {'data-main': '0'})
    
    for o in stage_races:
        dict = {}
        o = o.find_all('td')
        dict['name'] = rider
        dict['year'] = str(year)
        dict['type'] = 'etappe'
        dict['date'] = o[0].text
        if len(dict['date']) == 0:
            dict['type'] = 'gc'
        dict['result'] = o[1].text
        dict['gc'] = o[2].text
        try:
            dict['icon'] = o[3].find('span', class_='icon')['class'][-1]
        except TypeError:
            dict['icon'] = 'stage'
        dict['race_ref'] = o[4].a['href']
        dict['race_name'] = dict['race_ref'].split('/')[1]
        dict['race_detail'] = o[4].a.text
        try:
            dict['race_rank'] = o[4].a.span.text
        except AttributeError:
            dict['race_rank'] = o[4].a.span
        dict['distance'] = o[5].text
        result_ls.append(dict)
    
    #get one day race results
    one_day_races = soup.find_all('tr', {'data-main': '1'})
    
    for o in one_day_races:
        dict = {}
        o = o.find_all('td')
        dict['name'] = rider
        dict['year'] = str(year)
        dict['type'] = 'one_day'
        dict['date'] = o[0].text
        dict['result'] = o[1].text
        dict['gc'] = o[2].text
        try:
            dict['icon'] = o[3].find('span', class_='icon')['class'][-1]
        except TypeError:
            dict['icon'] = 'stage'
        dict['race_ref'] = o[4].a['href']
        dict['race_name'] = dict['race_ref'].split('/')[1]
        dict['race_detail'] = o[4].a.text
        try:
            dict['race_rank'] = o[4].a.span.text
        except AttributeError:
            dict['race_rank'] = o[4].a.span
        dict['distance'] = o[5].text
        result_ls.append(dict)
    
    return result_ls

In [8]:
performance_ls = []

for index, row in participants_df.iterrows():
    performance_ls.append(scrape_performance(row['rider'], row['href'], row['year']))

In [9]:
performance_df = pd.DataFrame(list(itertools.chain(*performance_ls)))

In [10]:
performance_df.to_csv('new_data/raw_performance_2.csv')

In [13]:
stage_s = list(np.arange(2,32,2))+list(np.arange(32,48,4))+[50]
stage_s_i = list(np.arange(1,21,1))
stage_s_dict = dict(zip(stage_s_i, stage_s[::-1]))

def clean_df(ls):
    df = pd.DataFrame(ls)
    
    index_drop = df[df['result']==''].index

    dropped_df = df.drop(index_drop)

    index_drop = dropped_df[dropped_df['type']=='gc'].index

    dropped_df = dropped_df.drop(index_drop)
    
    dropped_df['date'] = pd.to_datetime(dropped_df['date'] + '.' + dropped_df['year'], infer_datetime_format=True)
    
    dropped_df['result'] =  dropped_df['result'].replace('DNF', 0).replace('DNS', 0).replace('OTL', 0).replace('DSQ', 0).replace('DF', 0).astype('int')
    
    dropped_df['points'] = dropped_df['result'].map(stage_s_dict).fillna('0').astype('int')
    
    #depreciated -> for gc 
    #stages_df = df.loc[index_drop][['race_name', 'race_rank']]#.to_dict(orient='records')
    #stages_df = stages_df.set_index('race_name').to_dict()['race_rank']
    
    return dropped_df


In [14]:
performance_clean = clean_df(list(itertools.chain(*performance_ls)))
performance_clean.to_csv('new_data/performance_clean_2.csv')

In [15]:
performance_clean.race_ref.unique()

array(['race/tour-de-france/2021/stage-21',
       'race/tour-de-france/2021/stage-20',
       'race/tour-de-france/2021/stage-19', ...,
       'race/course-cycliste-de-solidarnosc/2010/stage-3',
       'race/course-cycliste-de-solidarnosc/2010/stage-2',
       'race/course-cycliste-de-solidarnosc/2010/stage-1'], dtype=object)

In [16]:


def get_profile(list):
    extra_info_ls = []
    i=0
    
    for ref in list:
        print(i/len(performance_clean.race_ref.unique()))
        #create url
        base_url = 'https://www.procyclingstats.com/'
        url = base_url + ref
        response = requests.get(url).content
        soup = BeautifulSoup(response)
        
        #get al info
        dict = {}
        stage = soup.find('ul', class_='infolist').find_all('li')
            
        dict['href'] = ref
        #get speed
        try:
            dict[stage[2].find_all('div')[0].text] = float(stage[2].find_all('div')[1].text.strip(' km/h'))
        except ValueError:
            dict[stage[2].find_all('div')[0].text] = 0.0
        #get distance
        dict[stage[4].find_all('div')[0].text.strip()] = float(stage[4].find_all('div')[1].text.strip(' km'))
        #get parcours type
        dict[stage[6].find_all('div')[0].text.strip()] = stage[6].find_all('div')[1].span['class'][-1]
        #get profile score
        try:
            dict[stage[7].find_all('div')[0].text.strip()] = int(stage[7].find_all('div')[1].text)
        except ValueError:
            dict['ProfileScore:'] = np.nan
        #get vert meters
        try:
            dict[stage[8].find_all('div')[0].text.strip()] = int(stage[8].find_all('div')[1].text)
        except ValueError:
            dict['Vert. meters:'] = np.nan
        #get startlist
        try:
            dict[stage[12].find_all('div')[0].text.strip()] = int(stage[12].find_all('div')[1].text)
        except ValueError:
            dict['Startlist quality score:'] = np.nan
        #get won how
        try:
            dict[stage[13].find_all('div')[0].text]= stage[13].find_all('div')[1].text
        except ValueError:
            dict['Won how:'] = np.nan
        
        extra_info_ls.append(dict)
        
        i += 1
    return extra_info_ls

In [17]:
extra_info_ls = get_profile(performance_clean.race_ref.unique())

0.0
0.00028042624789680314
0.0005608524957936063
0.0008412787436904094
0.0011217049915872126
0.0014021312394840158
0.0016825574873808188
0.001962983735277622
0.002243409983174425
0.0025238362310712283
0.0028042624789680315
0.0030846887268648347
0.0033651149747616375
0.0036455412226584407
0.003925967470555244
0.004206393718452047
0.00448681996634885
0.0047672462142456535
0.005047672462142457
0.00532809871003926
0.005608524957936063
0.005888951205832866
0.0061693774537296695
0.006449803701626472
0.006730229949523275
0.007010656197420078
0.007291082445316881
0.007571508693213685
0.007851934941110488
0.008132361189007292
0.008412787436904094
0.008693213684800897
0.0089736399326977
0.009254066180594503
0.009534492428491307
0.00981491867638811
0.010095344924284913
0.010375771172181716
0.01065619742007852
0.010936623667975322
0.011217049915872126
0.011497476163768929
0.011777902411665733
0.012058328659562535
0.012338754907459339
0.012619181155356141
0.012899607403252944
0.013180033651149748
0

In [20]:
stages_df = pd.DataFrame(extra_info_ls).rename(columns={'href':'race_ref'})
stages_df.to_csv('new_data/stages_2.csv')

In [21]:
performance_df.drop_duplicates(inplace=True)

In [22]:
stages_df

Unnamed: 0,race_ref,Avg. speed winner:,Distance:,Parcours type:,ProfileScore:,Vert. meters:,Startlist quality score:,Won how:
0,race/tour-de-france/2021/stage-21,40.748,108.4,p1,14.0,697.0,1646,Sprint of large group
1,race/tour-de-france/2021/stage-20,51.500,30.8,p1,8.0,246.0,1646,Time Trial
2,race/tour-de-france/2021/stage-19,47.901,207.0,p2,16.0,1181.0,1646,26 km solo
3,race/tour-de-france/2021/stage-18,36.407,129.7,p5,367.0,3561.0,1646,0.6 km solo
4,race/tour-de-france/2021/stage-17,35.267,178.4,p5,450.0,4375.0,1646,Sprint of small group
...,...,...,...,...,...,...,...,...
3561,race/course-cycliste-de-solidarnosc/2010/stage-5,39.080,183.7,p0,,,65,? - let us know!
3562,race/course-cycliste-de-solidarnosc/2010/stage-4,41.310,188.2,p0,,,65,? - let us know!
3563,race/course-cycliste-de-solidarnosc/2010/stage-3,43.560,157.2,p0,,,65,? - let us know!
3564,race/course-cycliste-de-solidarnosc/2010/stage-2,44.300,78.0,p0,,,65,? - let us know!


In [23]:
merged = performance_clean.merge(stages_df, on='race_ref')
merged['points'] = merged['points'].astype('float')
merged['adjusted_points'] = merged['points'] * merged['ProfileScore:']  * merged['Startlist quality score:']


In [24]:
merged.to_csv('new_data/merged_clean_2.csv')

In [150]:
merged['adjusted_points'] = merged['points'] * merged['ProfileScore:']  * merged['Startlist quality score:']
merged.dropna(subset='ProfileScore:',inplace=True)
merged.drop_duplicates(inplace=True)
merged = merged[merged['adjusted_points'] != 0]
merged.sort_values(by='adjusted_points').tail(50)

Unnamed: 0,name,year,type,date,result,gc,icon,race_ref,race_name,race_detail,...,distance,points,Avg. speed winner:,Distance:,Parcours type:,ProfileScore:,Vert. meters:,Startlist quality score:,Won how:,adjusted_points
55448,primoz-roglic,2018,etappe,2018-07-25,4,4,stage,race/tour-de-france/2018/stage-17,tour-de-france,Stage 17 - Bagnères-de-Luchon › Col Du Portet,...,65.0,36.0,27.57,65.0,p5,428.0,3274.0,1702,8 km solo,26224416.0
99071,darwin-atapuma,2017,etappe,2017-07-20,2,41,stage,race/tour-de-france/2017/stage-18,tour-de-france,Stage 18 - Briancon › Izoard,...,179.5,44.0,38.39,179.5,p5,338.0,4059.0,1764,1 km solo,26234208.0
404,richie-porte,2020,etappe,2020-09-13,3,6,stage,race/tour-de-france/2020/stage-15,tour-de-france,Stage 15 - Lyon › Grand Colombier,...,174.5,40.0,38.18,174.5,p5,390.0,3724.0,1685,Sprint a deux,26286000.0
238412,vincenzo-nibali,2014,etappe,2014-07-23,3,1,stage,race/tour-de-france/2014/stage-17,tour-de-france,Stage 17 - Saint-Gaudens › Saint-Lary-Soulan (...,...,124.5,40.0,34.68,124.5,p5,410.0,3810.0,1612,2.4 km solo,26436800.0
5330,michal-kwiatkowski,2020,etappe,2020-09-17,1,29,stage,race/tour-de-france/2020/stage-18,tour-de-france,Stage 18 - Méribel › La Roche-sur-Foron,...,175.0,50.0,36.52,175.0,p4,314.0,5166.0,1685,Sprint a deux,26454500.0
100422,fabio-aru,2017,etappe,2017-07-13,3,1,stage,race/tour-de-france/2017/stage-12,tour-de-france,Stage 12 - Pau › Peyragudes,...,214.5,40.0,36.81,214.5,p5,375.0,4903.0,1764,Sprint of small group,26460000.0
99367,primoz-roglic,2017,etappe,2017-07-19,1,30,stage,race/tour-de-france/2017/stage-17,tour-de-france,Stage 17 - La Mure › Serre-Chevalier,...,183.0,50.0,35.69,183.0,p4,300.0,5064.0,1764,34.4 km solo,26460000.0
181543,nairo-quintana,2015,etappe,2015-07-25,2,2,stage,race/tour-de-france/2015/stage-20,tour-de-france,Stage 20 - Modane › l'Alpe d'Huez,...,110.5,44.0,33.6,110.5,p5,332.0,3164.0,1812,6.4 km solo,26469696.0
142312,ion-izagirre,2016,etappe,2016-07-23,1,47,stage,race/tour-de-france/2016/stage-20,tour-de-france,Stage 20 - Megève › Morzine,...,146.5,50.0,35.62,146.5,p4,314.0,4127.0,1704,8.5 km solo,26752800.0
143044,jarlinson-pantano,2016,etappe,2016-07-20,2,23,stage,race/tour-de-france/2016/stage-17,tour-de-france,Stage 17 - Berne › Finhaut-Emosson,...,184.5,44.0,40.03,184.5,p5,359.0,3891.0,1704,6.5 km solo,26916384.0


In [149]:
merged.groupby(['race_ref']).mean().sort_values(by='adjusted_points').tail(50)

Unnamed: 0_level_0,result,points,Avg. speed winner:,Distance:,ProfileScore:,Vert. meters:,Startlist quality score:,adjusted_points
race_ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
race/paris-nice/2017/stage-7,10.5,22.1,35.21,177.0,354.0,4487.0,1179.0,9223789.0
race/tour-de-france/2020/stage-16,10.5,22.1,38.91,164.0,250.0,3903.0,1685.0,9309625.0
race/dauphine/2019/stage-7,10.526316,22.105263,33.159,133.5,413.0,4343.0,1021.0,9321193.0
race/tirreno-adriatico/2018/stage-4,9.352941,24.588235,34.38,219.0,316.0,4704.0,1208.0,9386018.0
race/dauphine/2016/stage-6,10.5,22.1,32.01,141.0,344.0,4528.0,1238.0,9411771.0
race/tour-de-france/2017/stage-13,10.5,22.1,38.73,101.0,243.0,2856.0,1764.0,9473209.0
race/giro-d-italia/2014/stage-20,10.5,22.1,35.54,167.0,480.0,4150.0,921.0,9769968.0
race/vuelta-a-espana/2017/stage-20,10.5,22.1,33.33,117.5,504.0,3817.0,878.0,9779515.0
race/dauphine/2017/stage-8,10.055556,23.111111,33.44,115.0,411.0,4113.0,1032.0,9802624.0
race/tour-de-france/2019/stage-20,10.5,22.1,31.908,59.5,267.0,2228.0,1704.0,10054790.0


In [133]:
sum(merged['ProfileScore:'].isna())

75274

In [282]:
diction = {}

stage = soup.find('ul', class_='infolist').find_all('li')
stage

[<li><div>Date:</div> <div>18 July 2021</div></li>,
 <li><div>Start time:</div> <div>16:30 </div></li>,
 <li><div>Avg. speed winner:</div> <div>40.748 km/h</div></li>,
 <li><div>Race category:</div> <div>ME - Men Elite</div></li>,
 <li><div>Distance: </div> <div>108.4 km</div></li>,
 <li><div>Points scale:</div> <div><a href="info.php?s=point-scales&amp;season=2021&amp;category=1&amp;scale=7">GT.A.Stage</a></div></li>,
 <li><div>Parcours type: </div> <div><span class="icon profile p1"></span></div></li>,
 <li><div>ProfileScore: </div> <div>14</div></li>,
 <li><div>Vert. meters:</div> <div>697</div></li>,
 <li><div>Departure:</div> <div><a href="location/chatou">Chatou</a></div></li>,
 <li><div>Arrival:</div> <div><a href="location/paris">Paris Champs-Élysées</a></div></li>,
 <li><div>Race ranking:</div> <div>1</div></li>,
 <li><div>Startlist quality score:</div> <div><a href="race/tour-de-france/2021/stage-21/startlist/lineup-quality">1646</a></div></li>,
 <li><div>Won how: </div> <div

In [292]:
diction['href'] = endpoint
#get speed
diction[stage[2].find_all('div')[0].text] = float(stage[2].find_all('div')[1].text.strip(' km/h'))
#get distance
diction[stage[4].find_all('div')[0].text.strip()] = float(stage[4].find_all('div')[1].text.strip(' km'))
#get parcours type
diction[stage[6].find_all('div')[0].text.strip()] = stage[6].find_all('div')[1].span['class'][-1]
#get profile score
diction[stage[7].find_all('div')[0].text.strip()] = int(stage[7].find_all('div')[1].text)
#get vert meters
diction[stage[8].find_all('div')[0].text.strip()] = int(stage[8].find_all('div')[1].text)
#get vert meters
diction[stage[12].find_all('div')[0].text.strip()] = int(stage[12].find_all('div')[1].text)
#get won how
diction[stage[13].find_all('div')[0].text]= stage[13].find_all('div')[1].text
diction

{'href': 'race/tour-de-france/2021/stage-21',
 'Avg. speed winner:': 40.748,
 'Distance: ': 108.4,
 'Parcours type:': 'p1',
 'ProfileScore:': 14,
 'Distance:': 108.4,
 'Vert. meters:': 697,
 'Startlist quality score:': 1646,
 'Won how: ': 'Sprint of large group'}

In [294]:
pd.DataFrame(dict)

AttributeError: 'dict' object has no attribute 'to_records'

In [259]:
stage[2].find_all('div')[0].text, stage[2].find_all('div')[1].text

('Avg. speed winner:', '40.748 km/h')

In [232]:
stages_df.set_index('race_name').to_dict()['race_rank']

{'tour-de-france': '(2.UWT)',
 'tour-of-slovenia': '(2.Pro)',
 'itzulia-basque-country': '(2.UWT)',
 'tirreno-adriatico': '(2.UWT)',
 'uae-tour': '(2.UWT)'}

In [187]:
tadej_df.drop(index_drop)

Unnamed: 0,year,type,date,result,gc,icon,race_ref,race_name,race_detail,race_rank,distance
0,2021,gc,,1,,st6,race/tour-de-france/2021/stage-21-youth,tour-de-france,Youth classification,,
1,2021,gc,,1,,st7,race/tour-de-france/2021/stage-21-kom,tour-de-france,Mountains classification,,
2,2021,gc,,8,,st5,race/tour-de-france/2021/stage-21-points,tour-de-france,Points classification,,
3,2021,gc,,1,,st4,race/tour-de-france/2021/gc,tour-de-france,General classification,,
4,2021,etappe,18.07,72,,stage,race/tour-de-france/2021/stage-21,tour-de-france,Stage 21 - Chatou › Paris Champs-Élysées,,108.4
...,...,...,...,...,...,...,...,...,...,...,...
75,2021,one_day,20.06,5,,stage,race/nc-slovenia/2021/result,nc-slovenia,National Championships Slovenia - Road Race (NC),[(NC)],172
76,2021,one_day,17.06,3,,chrono,race/nc-slovenia-itt/2021/result,nc-slovenia-itt,National Championships Slovenia - ITT (NC),[(NC)],31.5
78,2021,one_day,25.04,1,,stage,race/liege-bastogne-liege/2021/result,liege-bastogne-liege,Liège-Bastogne-Liège (1.UWT),[(1.UWT)],259.1
79,2021,one_day,21.04,DNS,,stage,race/la-fleche-wallone/2021/result,la-fleche-wallone,La Flèche Wallonne (1.UWT),[(1.UWT)],193.6


In [164]:
tadej[len(tadej_df['date']) > 5]

{'year': '2021',
 'type': 'gc',
 'date': '',
 'result': '1',
 'gc': '',
 'icon': 'st7',
 'race_ref': 'race/tour-de-france/2021/stage-21-kom',
 'race_name': 'tour-de-france',
 'race_detail': 'Mountains classification',
 'race_rank': '',
 'distance': ''}

In [67]:
base_url = 'https://www.procyclingstats.com/'

endpoint = 'rider/tadej-pogacar/'

year = '2021'

rider = 'POGAČAR Tadej'

url = base_url+endpoint+year

In [68]:
response = requests.get(url).content

soup = BeautifulSoup(response)

In [77]:
stage_races = soup.find_all('tr', {'data-main': '0'})

one_day_races = soup.find_all('tr', {'data-main': '1'})

In [88]:
master_ls = []
for o in one_day_races:
    dict = {}
    o = o.find_all('td')
    dict['type'] = 'one_day'
    dict['day'] = o[0].text
    dict['result'] = o[1].text
    dict['race_ref'] = o[4].a['href']
    dict['race_name'] = o[4].a.text
    dict['race_rank'] = o[4].find_all('span')[-1].text
    dict['distance'] = o[5].text
    master_ls.append(dict)

In [89]:
pd.DataFrame(master_ls)

Unnamed: 0,day,result,race_ref,race_name,race_rank,distance
0,09.10,1,race/il-lombardia/2021/result,Il Lombardia (1.UWT),(1.UWT),239.0
1,06.10,4,race/milano-torino/2021/result,Milano - Torino (1.Pro),51k,190.0
2,05.10,3,race/tre-valli-varesine/2021/result,Tre Valli Varesine (1.Pro),86k,196.7
3,02.10,DNF,race/giro-dell-emilia/2021/result,Giro dell'Emilia (1.Pro),(1.Pro),195.3
4,26.09,37,race/world-championship/2021/result,World Championships - Road Race (WC),(WC),268.3
5,19.09,10,race/world-championship-itt/2021/result,World Championships - ITT (WC),(WC),43.3
6,12.09,5,race/uec-road-european-championships/2021/result,European Continental Championships - Road Race...,68k,179.2
7,09.09,12,race/uec-road-european-championships-itt/2021/...,European Continental Championships - ITT (CC),(CC),22.4
8,29.08,DNF,race/bretagne-classic/2021/result,Bretagne Classic - Ouest-France (1.UWT),(1.UWT),251.0
9,24.07,3,race/olympic-games/2021/result,Olympic Games Road Race (Olympics),(Olympics),234.0


In [113]:
stage_races[0].find('span', class_='icon')['class'][-1]

'st6'

In [90]:
master_ls = []
for o in stage_races:
    dict = {}
    o = o.find_all('td')
    dict['type'] = 'stage_race'
    dict['day'] = o[0].text
    dict['result'] = o[1].text
    dict['race_ref'] = o[4].a['href']
    dict['race_name'] = o[4].a.text
    dict['race_rank'] = o[4].find_all('span')[-1].text
    dict['distance'] = o[5].text
    master_ls.append(dict)

In [91]:
pd.DataFrame(master_ls)

Unnamed: 0,day,result,race_ref,race_name,race_rank,distance
0,,1,race/tour-de-france/2021/stage-21-youth,Youth classification,,
1,,1,race/tour-de-france/2021/stage-21-kom,Mountains classification,,
2,,8,race/tour-de-france/2021/stage-21-points,Points classification,,
3,,1,race/tour-de-france/2021/gc,General classification,,
4,18.07,72,race/tour-de-france/2021/stage-21,Stage 21 - Chatou › Paris Champs-Élysées,,108.4
...,...,...,...,...,...,...
59,25.02,2,race/uae-tour/2021/stage-5,Stage 5 - Fujairah Marine Club › Jebel Jais,,170
60,24.02,20,race/uae-tour/2021/stage-4,Stage 4 - Al Marjan Island › Al Marjan Island,,204
61,23.02,1,race/uae-tour/2021/stage-3,Stage 3 - Strata Manufactoring › Jebel Hafeet,,166
62,22.02,4,race/uae-tour/2021/stage-2,Stage 2 (ITT) - Al Hudayriat Island › Al Huday...,,13
