In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import itertools
import numpy as np

In [5]:
#scraper

tours = ['tour-de-france', 'giro-d-italia', 'vuelta-a-espana']
#years = [2020, 2019, 2018, 2017, 2016, 2015, 2014]
years = [2022]#, 2021, 2013, 2012, 2011, 2010]
#years = [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009]

In [6]:
def scrape_participants(tour, year):
    # define url for startlist
    
    url = f'https://www.procyclingstats.com/race/{tour}/{year}/stage-21/startlist'
    print(url)
    
    #scrape page
    response = requests.get(url).content
    soup = BeautifulSoup(response)
    
    #get all_teams
    all_teams = soup.find_all('li', class_='team')
    
    #loop over teams
    master_ls = []
    for t in all_teams:
        team = t.a.text
        riders = t.find_all('a', class_='blue')
        for r in riders:
            dict = {}
            rider = r.span.text
            href = r['href']
            dict['rider'] = href.split('/')[-1]
            dict['team'] = team
            dict['href'] = href
            dict['tour'] = tour
            dict['year'] = int(year)
            master_ls.append(dict)
            
    return master_ls

In [7]:
participants_ls = []

for y, t in list(itertools.product(years, tours)):
    participants_ls.append(scrape_participants(t, y))

https://www.procyclingstats.com/race/tour-de-france/2022/stage-21/startlist
https://www.procyclingstats.com/race/giro-d-italia/2022/stage-21/startlist
https://www.procyclingstats.com/race/vuelta-a-espana/2022/stage-21/startlist


In [8]:
pd.DataFrame(list(itertools.chain(*participants_ls))).to_csv('new_data/participants_10s.csv')

In [8]:
participants_df = pd.DataFrame(list(itertools.chain(*participants_ls)))

In [9]:
participants_df

Unnamed: 0,rider,team,href,tour,year
0,tadej-pogacar,UAE Team Emirates,rider/tadej-pogacar,tour-de-france,2022
1,george-bennett,UAE Team Emirates,rider/george-bennett,tour-de-france,2022
2,mikkel-bjerg,UAE Team Emirates,rider/mikkel-bjerg,tour-de-france,2022
3,vegard-stake-laengen,UAE Team Emirates,rider/vegard-stake-laengen,tour-de-france,2022
4,rafal-majka,UAE Team Emirates,rider/rafal-majka,tour-de-france,2022
...,...,...,...,...,...
446,xabier-mikel-azparren-irurzun,Euskaltel - Euskadi,rider/xabier-mikel-azparren-irurzun,vuelta-a-espana,2022
447,ibai-azurmendi-sagastibel,Euskaltel - Euskadi,rider/ibai-azurmendi-sagastibel,vuelta-a-espana,2022
448,mikel-bizkarra,Euskaltel - Euskadi,rider/mikel-bizkarra,vuelta-a-espana,2022
449,joan-bou,Euskaltel - Euskadi,rider/joan-bou,vuelta-a-espana,2022


In [15]:
# check missing riders
missing_df = pd.read_csv('new_data/missing_riders.csv', index_col=0)
missing_df['href'] = 'rider/' + missing_df['rider']
missing_df

Unnamed: 0,rider,year,tour,href
0,julien-morice,2016,vuelta-a-espana,rider/julien-morice
1,silvio-herklotz,2016,vuelta-a-espana,rider/silvio-herklotz
2,gang-xu,2015,giro-d-italia,rider/gang-xu
3,alessandro-malaguti,2015,giro-d-italia,rider/alessandro-malaguti
4,riccardo-stacchiotti,2015,giro-d-italia,rider/riccardo-stacchiotti
...,...,...,...,...
1465,nathan-o-neill,2001,giro-d-italia,rider/nathan-o-neill
1466,filippo-perfetto,2001,vuelta-a-espana,rider/filippo-perfetto
1467,yauheni-seniushkin,2001,vuelta-a-espana,rider/yauheni-seniushkin
1468,sergiy-matveyev,2001,vuelta-a-espana,rider/sergiy-matveyev


In [61]:
scrape_performance('levi-leipheimer', 'rider/levi-leipheimer', 2001)

[{'name': 'levi-leipheimer',
  'year': '2001',
  'type': 'gc',
  'date': '',
  'result': '19',
  'gc': '',
  'icon': 'st7',
  'race_ref': 'race/vuelta-a-espana/2001/kom',
  'race_name': 'vuelta-a-espana',
  'race_detail': 'Mountains classification',
  'race_rank': None,
  'distance': ''},
 {'name': 'levi-leipheimer',
  'year': '2001',
  'type': 'gc',
  'date': '',
  'result': '3',
  'gc': '',
  'icon': 'st5',
  'race_ref': 'race/vuelta-a-espana/2001/points',
  'race_name': 'vuelta-a-espana',
  'race_detail': 'Points classification',
  'race_rank': None,
  'distance': ''},
 {'name': 'levi-leipheimer',
  'year': '2001',
  'type': 'gc',
  'date': '',
  'result': '3',
  'gc': '',
  'icon': 'st4',
  'race_ref': 'race/vuelta-a-espana/2001/gc',
  'race_name': 'vuelta-a-espana',
  'race_detail': 'General classification',
  'race_rank': None,
  'distance': ''},
 {'name': 'levi-leipheimer',
  'year': '2001',
  'type': 'etappe',
  'date': '30.09',
  'result': '2',
  'gc': '',
  'icon': 'chrono',


In [17]:
from matplotlib.pyplot import text


def scrape_performance(rider, endpoint, year):
    
    #set up
    base_url = 'https://www.procyclingstats.com/'
    url = base_url+endpoint+'/'+str(year)
    
    response = requests.get(url).content
    soup = BeautifulSoup(response)
    
    result_ls = []
    
    #get stage_race results
    stage_races = soup.find_all('tr', {'data-main': '0'})
    
    for o in stage_races:
        dict = {}
        o = o.find_all('td')
        dict['name'] = rider
        dict['year'] = str(year)
        dict['type'] = 'etappe'
        dict['date'] = o[0].text
        if len(dict['date']) == 0:
            dict['type'] = 'gc'
        dict['result'] = o[1].text
        dict['gc'] = o[2].text
        try:
            dict['icon'] = o[3].find('span', class_='icon')['class'][-1]
        except TypeError:
            dict['icon'] = 'stage'
        dict['race_ref'] = o[4].a['href']
        dict['race_name'] = dict['race_ref'].split('/')[1]
        dict['race_detail'] = o[4].a.text
        try:
            dict['race_rank'] = o[4].a.span.text
        except AttributeError:
            dict['race_rank'] = o[4].a.span
        dict['distance'] = o[5].text
        result_ls.append(dict)
    
    #get one day race results
    one_day_races = soup.find_all('tr', {'data-main': '1'})
    
    for o in one_day_races:
        dict = {}
        o = o.find_all('td')
        dict['name'] = rider
        dict['year'] = str(year)
        dict['type'] = 'one_day'
        dict['date'] = o[0].text
        dict['result'] = o[1].text
        dict['gc'] = o[2].text
        try:
            dict['icon'] = o[3].find('span', class_='icon')['class'][-1]
        except TypeError:
            dict['icon'] = 'stage'
        dict['race_ref'] = o[4].a['href']
        dict['race_name'] = dict['race_ref'].split('/')[1]
        dict['race_detail'] = o[4].a.text
        try:
            dict['race_rank'] = o[4].a.span.text
        except AttributeError:
            dict['race_rank'] = o[4].a.span
        dict['distance'] = o[5].text
        result_ls.append(dict)
    
    return result_ls

In [18]:
performance_ls = []

for index, row in missing_df.iterrows():
    performance_ls.append(scrape_performance(row['rider'], row['href'], row['year']))

In [19]:
performance_df = pd.DataFrame(list(itertools.chain(*performance_ls)))
performance_df

Unnamed: 0,name,year,type,date,result,gc,icon,race_ref,race_name,race_detail,race_rank,distance
0,julien-morice,2016,gc,,150,,st4,race/vuelta-a-espana/2016/gc,vuelta-a-espana,General classification,,
1,julien-morice,2016,etappe,11.09,131,,stage,race/vuelta-a-espana/2016/stage-21,vuelta-a-espana,Stage 21 - Las Rozas › Madrid,,104.1
2,julien-morice,2016,etappe,10.09,93,149,stage,race/vuelta-a-espana/2016/stage-20,vuelta-a-espana,Stage 20 - Benidorm › Alto de Aitana. Escuadró...,,193.2
3,julien-morice,2016,etappe,09.09,55,152,chrono,race/vuelta-a-espana/2016/stage-19,vuelta-a-espana,Stage 19 (ITT) - Xàbia › Calp,,37
4,julien-morice,2016,etappe,08.09,91,156,stage,race/vuelta-a-espana/2016/stage-18,vuelta-a-espana,Stage 18 - Requena › Gandía,,200.6
...,...,...,...,...,...,...,...,...,...,...,...,...
121651,joris-nieuwenhuis,2020,one_day,14.10,58,,stage,race/scheldeprijs/2020/result,scheldeprijs,Scheldeprijs (1.Pro),(1.Pro),173.3
121652,joris-nieuwenhuis,2020,one_day,11.10,3,,stage,race/paris-tours/2020/result,paris-tours,Paris - Tours Elite (1.Pro),(1.Pro),213
121653,joris-nieuwenhuis,2020,one_day,29.08 » 20.09,,,stage,race/tour-de-france/2020/youth,tour-de-france,Tour de France (2.UWT),(2.UWT),
121654,joris-nieuwenhuis,2020,one_day,06.08 » 09.08,,,stage,race/sazka-tour/2020/stage-4,sazka-tour,Czech Tour (2.1),(2.1),


In [20]:
performance_df.to_csv('new_data/raw_performance_missing.csv')

In [42]:
stage_s = list(np.arange(2,32,2))+list(np.arange(32,48,4))+[50]
stage_s_i = list(np.arange(1,21,1))
stage_s_dict = dict(zip(stage_s_i, stage_s[::-1]))

def clean_df(ls):
    df = pd.DataFrame(ls)
    
    index_drop = df[df['result']==''].index

    dropped_df = df.drop(index_drop)

    index_drop = dropped_df[dropped_df['type']=='gc'].index

    dropped_df = dropped_df.drop(index_drop)
    
    dropped_df['date'] = pd.to_datetime(dropped_df['date'] + '.' + dropped_df['year'], infer_datetime_format=True)
    
    dropped_df['result'] = dropped_df.result.str.replace('*','')
    
    dropped_df['result'] =  dropped_df['result'].replace('DNF', 0).replace('DNS', 0).replace('OTL', 0).replace('DSQ', 0)\
        .replace('DF', 0).astype('int')
    
    dropped_df['points'] = dropped_df['result'].map(stage_s_dict).fillna('0').astype('int')
    
    #depreciated -> for gc 
    #stages_df = df.loc[index_drop][['race_name', 'race_rank']]#.to_dict(orient='records')
    #stages_df = stages_df.set_index('race_name').to_dict()['race_rank']
    
    return dropped_df


In [56]:
performance_clean = clean_df(list(itertools.chain(*performance_ls)))
performance_clean.to_csv('new_data/performance_clean_missing.csv')

  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

In [44]:
performance_clean.race_ref.unique()

array(['race/vuelta-a-espana/2016/stage-21',
       'race/vuelta-a-espana/2016/stage-20',
       'race/vuelta-a-espana/2016/stage-19', ...,
       'race/scheldeprijs/2020/result', 'race/paris-tours/2020/result',
       'race/strade-bianche/2020/result'], dtype=object)

In [45]:

def get_profile(list):
    extra_info_ls = []
    i=0
    
    for ref in list:
        print(i/len(performance_clean.race_ref.unique()))
        #create url
        base_url = 'https://www.procyclingstats.com/'
        url = base_url + ref
        response = requests.get(url).content
        soup = BeautifulSoup(response)
        
        print(url)
        
        #get al info
        dict = {}
        try:
            stage = soup.find('ul', class_='infolist').find_all('li')
                
            dict['href'] = ref
            #get speed
            try:
                dict[stage[2].find_all('div')[0].text] = float(stage[2].find_all('div')[1].text.strip(' km/h'))
            except ValueError:
                dict[stage[2].find_all('div')[0].text] = np.nan
            #get distance
            try:
                dict[stage[4].find_all('div')[0].text.strip()] = float(stage[4].find_all('div')[1].text.strip(' km'))
            except ValueError:
                dict[stage[4].find_all('div')[0].text.strip()] = np.nan
            #get parcours type
            try:
                dict[stage[6].find_all('div')[0].text.strip()] = stage[6].find_all('div')[1].span['class'][-1]
            except (ValueError, TypeError):
                dict[stage[6].find_all('div')[0].text.strip()] = np.nan
            #get profile score
            try:
                dict[stage[7].find_all('div')[0].text.strip()] = int(stage[7].find_all('div')[1].text)
            except ValueError:
                dict['ProfileScore:'] = np.nan
            #get vert meters
            try:
                dict[stage[8].find_all('div')[0].text.strip()] = int(stage[8].find_all('div')[1].text)
            except (ValueError, IndexError):
                dict['Vert. meters:'] = np.nan
            #get startlist
            try:
                dict[stage[12].find_all('div')[0].text.strip()] = int(stage[12].find_all('div')[1].text)
            except (ValueError, IndexError):
                dict['Startlist quality score:'] = np.nan
            #get won how
            try:
                dict[stage[13].find_all('div')[0].text]= stage[13].find_all('div')[1].text
            except (ValueError, IndexError):
                dict['Won how:'] = np.nan
            
            extra_info_ls.append(dict)
            
            i += 1
        
        except AttributeError:
            print(ref)
            
    return extra_info_ls

In [46]:
extra_info_ls = get_profile(performance_clean.race_ref.unique())

0.0
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-21
0.00016252234682268812
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-20
0.00032504469364537625
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-19
0.00048756704046806434
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-18
0.0006500893872907525
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-17
0.0008126117341134406
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-16
0.0009751340809361287
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-15
0.0011376564277588168
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-14
0.001300178774581505
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-13
0.0014627011214041932
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-12
0.0016252234682268812
https://www.procyclingstats.com/race/vuelta-a-espana/2016/stage-11
0.0017877458150495694
https://www.pro

In [57]:
stages_df = pd.DataFrame(extra_info_ls).rename(columns={'href':'race_ref'})
stages_df.to_csv('new_data/stages_missing.csv')

In [58]:
print(performance_clean.shape)
performance_clean.drop_duplicates(inplace=True)
print(performance_clean.shape)


(97513, 13)
(97506, 13)


In [50]:
stages_df

Unnamed: 0,race_ref,Avg. speed winner:,Distance:,Parcours type:,ProfileScore:,Vert. meters:,Startlist quality score:,Won how:
0,race/vuelta-a-espana/2016/stage-21,36.99,104.1,p1,10.0,847.0,900,Sprint of large group
1,race/vuelta-a-espana/2016/stage-20,36.26,193.2,p5,349.0,5036.0,900,Sprint a deux
2,race/vuelta-a-espana/2016/stage-19,47.69,37.0,p1,19.0,446.0,900,Time Trial
3,race/vuelta-a-espana/2016/stage-18,40.87,200.6,p2,49.0,2550.0,900,Sprint of large group
4,race/vuelta-a-espana/2016/stage-17,38.78,177.5,p5,229.0,3460.0,900,2.7 km solo
...,...,...,...,...,...,...,...,...
6148,race/sazka-tour/2020/stage-1,56.32,18.6,p1,,123.0,111,Time Trial
6149,race/ronde-van-vlaanderen/2020/result,42.52,243.3,p1,79.0,2014.0,700,Sprint a deux
6150,race/scheldeprijs/2020/result,48.45,173.3,p1,,710.0,542,Sprint of large group
6151,race/paris-tours/2020/result,43.81,213.0,p2,32.0,1258.0,259,Sprint a deux


In [55]:
merged = performance_clean.merge(stages_df, on='race_ref')
merged['points'] = merged['points'].astype('float')
merged['adjusted_points'] = merged['points'] * merged['ProfileScore:']  * merged['Startlist quality score:']
merged

Unnamed: 0,name,year,type,date,result,gc,icon,race_ref,race_name,race_detail,...,distance,points,Avg. speed winner:,Distance:,Parcours type:,ProfileScore:,Vert. meters:,Startlist quality score:,Won how:,adjusted_points
0,julien-morice,2016,etappe,2016-11-09,131,,stage,race/vuelta-a-espana/2016/stage-21,vuelta-a-espana,Stage 21 - Las Rozas › Madrid,...,104.1,0.0,36.99,104.1,p1,10.0,847.0,900,Sprint of large group,0.0
1,julien-morice,2016,etappe,2016-10-09,93,149,stage,race/vuelta-a-espana/2016/stage-20,vuelta-a-espana,Stage 20 - Benidorm › Alto de Aitana. Escuadró...,...,193.2,0.0,36.26,193.2,p5,349.0,5036.0,900,Sprint a deux,0.0
2,julien-morice,2016,etappe,2016-09-09,55,152,chrono,race/vuelta-a-espana/2016/stage-19,vuelta-a-espana,Stage 19 (ITT) - Xàbia › Calp,...,37,0.0,47.69,37.0,p1,19.0,446.0,900,Time Trial,0.0
3,julien-morice,2016,etappe,2016-08-09,91,156,stage,race/vuelta-a-espana/2016/stage-18,vuelta-a-espana,Stage 18 - Requena › Gandía,...,200.6,0.0,40.87,200.6,p2,49.0,2550.0,900,Sprint of large group,0.0
4,julien-morice,2016,etappe,2016-07-09,139,158,stage,race/vuelta-a-espana/2016/stage-17,vuelta-a-espana,Stage 17 - Castellón › Llucena. Camins del Pen...,...,177.5,0.0,38.78,177.5,p5,229.0,3460.0,900,2.7 km solo,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97508,joris-nieuwenhuis,2020,etappe,2020-06-08,2,6,chrono,race/sazka-tour/2020/stage-1,sazka-tour,Stage 1 (TTT) - Uničov › Uničov,...,18.6,44.0,56.32,18.6,p1,,123.0,111,Time Trial,
97509,joris-nieuwenhuis,2020,one_day,2020-10-18,33,,stage,race/ronde-van-vlaanderen/2020/result,ronde-van-vlaanderen,Ronde van Vlaanderen - Tour des Flandres ME (1...,...,243.3,0.0,42.52,243.3,p1,79.0,2014.0,700,Sprint a deux,0.0
97510,joris-nieuwenhuis,2020,one_day,2020-10-14,58,,stage,race/scheldeprijs/2020/result,scheldeprijs,Scheldeprijs (1.Pro),...,173.3,0.0,48.45,173.3,p1,,710.0,542,Sprint of large group,
97511,joris-nieuwenhuis,2020,one_day,2020-11-10,3,,stage,race/paris-tours/2020/result,paris-tours,Paris - Tours Elite (1.Pro),...,213,40.0,43.81,213.0,p2,32.0,1258.0,259,Sprint a deux,331520.0


 2001 vuelta-a-espana

In [54]:
merged['ProfileScore:'].isna().sum()

63381

In [62]:
merged[(merged['name']=='michael-rogers'), (merged['year']==2014)]

InvalidIndexError: (0        False
1        False
2        False
3        False
4        False
         ...  
97508    False
97509    False
97510    False
97511    False
97512    False
Name: name, Length: 97513, dtype: bool, 0        False
1        False
2        False
3        False
4        False
         ...  
97508    False
97509    False
97510    False
97511    False
97512    False
Name: year, Length: 97513, dtype: bool)

In [53]:
merged.to_csv('new_data/merged_clean_missing.csv')

In [36]:
merged['adjusted_points'] = merged['points'] * merged['ProfileScore:']  * merged['Startlist quality score:']
merged.dropna(subset='ProfileScore:',inplace=True)
merged.drop_duplicates(inplace=True)
merged = merged[merged['adjusted_points'] != 0]
merged.sort_values(by='adjusted_points').tail(50)

Unnamed: 0,name,year,type,date,result,gc,icon,race_ref,race_name,race_detail,...,distance,points,Avg. speed winner:,Distance:,Parcours type:,ProfileScore:,Vert. meters:,Startlist quality score:,Won how:,adjusted_points
132874,carlos-sastre,2006,etappe,2006-07-19,2,2,stage,race/tour-de-france/2006/stage-16,tour-de-france,Stage 16 - Le Bourg d'Oisans › La Toussuire - ...,...,182.0,44.0,32.49,182.0,p5,393.0,5456.0,1426,70 km solo,24658392.0
62336,jose-vicente-garcia-acosta,2002,etappe,2002-07-23,6,113,stage,race/tour-de-france/2002/stage-15,tour-de-france,Stage 15 - Vaison-le-Romaine › Les deux Alpes,...,226.5,30.0,38.253,226.5,p5,522.0,7679.0,1575,9 km solo,24664500.0
62514,roberto-heras,2002,etappe,2002-07-18,3,14,stage,race/tour-de-france/2002/stage-11,tour-de-france,Stage 11 - Pau › La Mongie,...,158.0,40.0,36.19,158.0,p5,393.0,5312.0,1575,0.4 km solo,24759000.0
32363,igor-gonzalez-de-galdeano,2001,etappe,2001-07-17,8,8,stage,race/tour-de-france/2001/stage-10,tour-de-france,Stage 10 - Aix-Les-Bains › L'Alpe d'Huez,...,209.0,26.0,32.675,209.0,p5,630.0,8559.0,1546,7 km solo,25323480.0
92622,floyd-landis,2004,etappe,2004-07-22,5,27,stage,race/tour-de-france/2004/stage-17,tour-de-france,Stage 17 - Bourg d'Oisans › Le Grand Bornand,...,204.5,32.0,32.996,204.5,p4,505.0,8048.0,1568,Sprint of small group,25338880.0
32164,joseba-beloki,2001,etappe,2001-07-22,6,4,stage,race/tour-de-france/2001/stage-14,tour-de-france,Stage 14 - Tarbes › Luz Ardiden,...,141.5,30.0,32.098,141.5,p5,547.0,5174.0,1546,7 km solo,25369860.0
174420,samuel-sanchez,2008,etappe,2008-07-23,2,8,stage,race/tour-de-france/2008/stage-17,tour-de-france,Stage 17 - Embrun › l'Alpe d'Huez,...,210.5,44.0,34.32,210.5,p5,389.0,5405.0,1491,12.7 km solo,25519956.0
176077,leonardo-piepoli,2008,etappe,2008-07-14,1,24,stage,race/tour-de-france/2008/stage-10,tour-de-france,Stage 10 - Pau › Hautacam,...,156.0,50.0,36.076,156.0,p5,346.0,3794.0,1491,Sprint a deux,25794300.0
176073,juan-jose-cobo,2008,etappe,2008-07-14,1,8,stage,race/tour-de-france/2008/stage-10,tour-de-france,Stage 10 - Pau › Hautacam,...,156.0,50.0,36.076,156.0,p5,346.0,3794.0,1491,Sprint a deux,25794300.0
103843,georg-totschnig,2005,etappe,2005-07-16,1,14,stage,race/tour-de-france/2005/stage-14,tour-de-france,Stage 14 - Agde › Ax-3 Domaines,...,174.0,50.0,30.374,174.0,p5,334.0,4188.0,1549,0 km solo,25868300.0


In [38]:
merged.groupby(['race_ref']).mean().sort_values(by='adjusted_points').tail(50)

Unnamed: 0_level_0,result,points,Avg. speed winner:,Distance:,ProfileScore:,Vert. meters:,Startlist quality score:,Race category:,Points scale:,Arrival:,Won how:,adjusted_points
race_ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
race/la-route-d-occitanie/2022/stage-3,7.0,31.333333,34.989,188.7,268.0,4503.0,298.0,,,,,2502405.0
race/faun-ardeche-classic/2022/result,8.6,26.4,37.944,168.5,216.0,3161.0,453.0,,,,,2583187.0
race/itzulia-basque-country/2022/stage-4,10.375,22.75,43.605,185.6,154.0,2703.0,742.0,,,,,2599597.0
race/tour-de-france/2022/stage-13,10.5,22.1,45.667,192.6,78.0,2109.0,1550.0,,,,,2671890.0
race/vuelta-a-la-comunidad-valenciana/2022/stage-1,7.714286,27.714286,38.989,166.7,162.0,3164.0,610.0,,,,,2738726.0
race/itzulia-basque-country/2022/stage-5,10.769231,21.076923,39.765,163.76,184.0,3485.0,742.0,,,,,2877590.0
race/tour-de-france/2022/stage-6,10.5,22.1,49.376,219.9,85.0,2477.0,1550.0,,,,,2911675.0
race/dauphine/2022/stage-3,8.571429,26.428571,40.62,169.0,140.0,2707.0,788.0,,,,,2915600.0
race/tour-de-france/2022/stage-8,10.5,22.1,44.164,186.3,87.0,2556.0,1550.0,,,,,2980185.0
race/volta-a-catalunya/2022/stage-4,12.0,19.0,38.403,166.7,221.0,3538.0,722.0,,,,,3031678.0


In [133]:
sum(merged['ProfileScore:'].isna())

75274

In [282]:
diction = {}

stage = soup.find('ul', class_='infolist').find_all('li')
stage

[<li><div>Date:</div> <div>18 July 2021</div></li>,
 <li><div>Start time:</div> <div>16:30 </div></li>,
 <li><div>Avg. speed winner:</div> <div>40.748 km/h</div></li>,
 <li><div>Race category:</div> <div>ME - Men Elite</div></li>,
 <li><div>Distance: </div> <div>108.4 km</div></li>,
 <li><div>Points scale:</div> <div><a href="info.php?s=point-scales&amp;season=2021&amp;category=1&amp;scale=7">GT.A.Stage</a></div></li>,
 <li><div>Parcours type: </div> <div><span class="icon profile p1"></span></div></li>,
 <li><div>ProfileScore: </div> <div>14</div></li>,
 <li><div>Vert. meters:</div> <div>697</div></li>,
 <li><div>Departure:</div> <div><a href="location/chatou">Chatou</a></div></li>,
 <li><div>Arrival:</div> <div><a href="location/paris">Paris Champs-Élysées</a></div></li>,
 <li><div>Race ranking:</div> <div>1</div></li>,
 <li><div>Startlist quality score:</div> <div><a href="race/tour-de-france/2021/stage-21/startlist/lineup-quality">1646</a></div></li>,
 <li><div>Won how: </div> <div

In [292]:
diction['href'] = endpoint
#get speed
diction[stage[2].find_all('div')[0].text] = float(stage[2].find_all('div')[1].text.strip(' km/h'))
#get distance
diction[stage[4].find_all('div')[0].text.strip()] = float(stage[4].find_all('div')[1].text.strip(' km'))
#get parcours type
diction[stage[6].find_all('div')[0].text.strip()] = stage[6].find_all('div')[1].span['class'][-1]
#get profile score
diction[stage[7].find_all('div')[0].text.strip()] = int(stage[7].find_all('div')[1].text)
#get vert meters
diction[stage[8].find_all('div')[0].text.strip()] = int(stage[8].find_all('div')[1].text)
#get vert meters
diction[stage[12].find_all('div')[0].text.strip()] = int(stage[12].find_all('div')[1].text)
#get won how
diction[stage[13].find_all('div')[0].text]= stage[13].find_all('div')[1].text
diction

{'href': 'race/tour-de-france/2021/stage-21',
 'Avg. speed winner:': 40.748,
 'Distance: ': 108.4,
 'Parcours type:': 'p1',
 'ProfileScore:': 14,
 'Distance:': 108.4,
 'Vert. meters:': 697,
 'Startlist quality score:': 1646,
 'Won how: ': 'Sprint of large group'}

In [294]:
pd.DataFrame(dict)

AttributeError: 'dict' object has no attribute 'to_records'

In [259]:
stage[2].find_all('div')[0].text, stage[2].find_all('div')[1].text

('Avg. speed winner:', '40.748 km/h')

In [232]:
stages_df.set_index('race_name').to_dict()['race_rank']

{'tour-de-france': '(2.UWT)',
 'tour-of-slovenia': '(2.Pro)',
 'itzulia-basque-country': '(2.UWT)',
 'tirreno-adriatico': '(2.UWT)',
 'uae-tour': '(2.UWT)'}

In [187]:
tadej_df.drop(index_drop)

Unnamed: 0,year,type,date,result,gc,icon,race_ref,race_name,race_detail,race_rank,distance
0,2021,gc,,1,,st6,race/tour-de-france/2021/stage-21-youth,tour-de-france,Youth classification,,
1,2021,gc,,1,,st7,race/tour-de-france/2021/stage-21-kom,tour-de-france,Mountains classification,,
2,2021,gc,,8,,st5,race/tour-de-france/2021/stage-21-points,tour-de-france,Points classification,,
3,2021,gc,,1,,st4,race/tour-de-france/2021/gc,tour-de-france,General classification,,
4,2021,etappe,18.07,72,,stage,race/tour-de-france/2021/stage-21,tour-de-france,Stage 21 - Chatou › Paris Champs-Élysées,,108.4
...,...,...,...,...,...,...,...,...,...,...,...
75,2021,one_day,20.06,5,,stage,race/nc-slovenia/2021/result,nc-slovenia,National Championships Slovenia - Road Race (NC),[(NC)],172
76,2021,one_day,17.06,3,,chrono,race/nc-slovenia-itt/2021/result,nc-slovenia-itt,National Championships Slovenia - ITT (NC),[(NC)],31.5
78,2021,one_day,25.04,1,,stage,race/liege-bastogne-liege/2021/result,liege-bastogne-liege,Liège-Bastogne-Liège (1.UWT),[(1.UWT)],259.1
79,2021,one_day,21.04,DNS,,stage,race/la-fleche-wallone/2021/result,la-fleche-wallone,La Flèche Wallonne (1.UWT),[(1.UWT)],193.6


In [164]:
tadej[len(tadej_df['date']) > 5]

{'year': '2021',
 'type': 'gc',
 'date': '',
 'result': '1',
 'gc': '',
 'icon': 'st7',
 'race_ref': 'race/tour-de-france/2021/stage-21-kom',
 'race_name': 'tour-de-france',
 'race_detail': 'Mountains classification',
 'race_rank': '',
 'distance': ''}

In [67]:
base_url = 'https://www.procyclingstats.com/'

endpoint = 'rider/tadej-pogacar/'

year = '2021'

rider = 'POGAČAR Tadej'

url = base_url+endpoint+year

In [68]:
response = requests.get(url).content

soup = BeautifulSoup(response)

In [77]:
stage_races = soup.find_all('tr', {'data-main': '0'})

one_day_races = soup.find_all('tr', {'data-main': '1'})

In [88]:
master_ls = []
for o in one_day_races:
    dict = {}
    o = o.find_all('td')
    dict['type'] = 'one_day'
    dict['day'] = o[0].text
    dict['result'] = o[1].text
    dict['race_ref'] = o[4].a['href']
    dict['race_name'] = o[4].a.text
    dict['race_rank'] = o[4].find_all('span')[-1].text
    dict['distance'] = o[5].text
    master_ls.append(dict)

In [89]:
pd.DataFrame(master_ls)

Unnamed: 0,day,result,race_ref,race_name,race_rank,distance
0,09.10,1,race/il-lombardia/2021/result,Il Lombardia (1.UWT),(1.UWT),239.0
1,06.10,4,race/milano-torino/2021/result,Milano - Torino (1.Pro),51k,190.0
2,05.10,3,race/tre-valli-varesine/2021/result,Tre Valli Varesine (1.Pro),86k,196.7
3,02.10,DNF,race/giro-dell-emilia/2021/result,Giro dell'Emilia (1.Pro),(1.Pro),195.3
4,26.09,37,race/world-championship/2021/result,World Championships - Road Race (WC),(WC),268.3
5,19.09,10,race/world-championship-itt/2021/result,World Championships - ITT (WC),(WC),43.3
6,12.09,5,race/uec-road-european-championships/2021/result,European Continental Championships - Road Race...,68k,179.2
7,09.09,12,race/uec-road-european-championships-itt/2021/...,European Continental Championships - ITT (CC),(CC),22.4
8,29.08,DNF,race/bretagne-classic/2021/result,Bretagne Classic - Ouest-France (1.UWT),(1.UWT),251.0
9,24.07,3,race/olympic-games/2021/result,Olympic Games Road Race (Olympics),(Olympics),234.0


In [113]:
stage_races[0].find('span', class_='icon')['class'][-1]

'st6'

In [90]:
master_ls = []
for o in stage_races:
    dict = {}
    o = o.find_all('td')
    dict['type'] = 'stage_race'
    dict['day'] = o[0].text
    dict['result'] = o[1].text
    dict['race_ref'] = o[4].a['href']
    dict['race_name'] = o[4].a.text
    dict['race_rank'] = o[4].find_all('span')[-1].text
    dict['distance'] = o[5].text
    master_ls.append(dict)

In [91]:
pd.DataFrame(master_ls)

Unnamed: 0,day,result,race_ref,race_name,race_rank,distance
0,,1,race/tour-de-france/2021/stage-21-youth,Youth classification,,
1,,1,race/tour-de-france/2021/stage-21-kom,Mountains classification,,
2,,8,race/tour-de-france/2021/stage-21-points,Points classification,,
3,,1,race/tour-de-france/2021/gc,General classification,,
4,18.07,72,race/tour-de-france/2021/stage-21,Stage 21 - Chatou › Paris Champs-Élysées,,108.4
...,...,...,...,...,...,...
59,25.02,2,race/uae-tour/2021/stage-5,Stage 5 - Fujairah Marine Club › Jebel Jais,,170
60,24.02,20,race/uae-tour/2021/stage-4,Stage 4 - Al Marjan Island › Al Marjan Island,,204
61,23.02,1,race/uae-tour/2021/stage-3,Stage 3 - Strata Manufactoring › Jebel Hafeet,,166
62,22.02,4,race/uae-tour/2021/stage-2,Stage 2 (ITT) - Al Hudayriat Island › Al Huday...,,13
