In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Extracting Data from Cycling Grand Tours

Using BeautifulSoup to scrape stage-by-stage placings and time gaps. Data is only captured for riders that completed the tour.

In [3]:
def _clock_to_seconds(text):
    """Convert an ``H:MM:SS`` or ``MM:SS`` clock string to whole seconds.

    Surrounding whitespace is ignored. Raises ValueError when the text is not
    two or three colon-separated non-negative integers.
    """
    parts = text.strip().split(":")
    if len(parts) not in (2, 3) or not all(part.isdecimal() for part in parts):
        raise ValueError(f"Unrecognised time format: {text!r}")
    seconds = 0
    for part in parts:
        seconds = seconds * 60 + int(part)
    return seconds


def get_stage(tour,year,stage,timeout=30):
    """Scrape the result table of one stage from procyclingstats.com.

    Args:
        tour: race slug used in the URL, e.g. "giro-d-italia".
        year: edition year used in the URL.
        stage: stage number used in the URL; also copied into every output row.
        timeout: seconds to wait for the HTTP response before requests raises.

    Returns:
        A list of rows ``[rider_name, stage, position, gap, stage_time]`` where
        ``gap`` is the time behind the stage winner and ``stage_time`` is the
        rider's time, both as integer seconds. The winner's gap is 0.

    Raises:
        NotImplementedError: the page contains a ``.results-ttt`` element
            (team time trial), which is not parsed.
        ValueError: a time cell cannot be parsed, or a time gap appears before
            any winner's time has been read.

    Rows whose place cell is not a plain number are skipped. Rows whose place
    does not match their position in the table are reported with print() and
    skipped as well.
    """
    #TODO: pass in dictionary of rider data - will be generated for all riders from stage 21, previous stages will only store for riders in it
    #Example URL: https://www.procyclingstats.com/race/giro-d-italia/2012/stage-21/result/result
    data = requests.get(
        f"https://www.procyclingstats.com/race/{tour}/{year}/stage-{stage}/result/result",
        timeout=timeout,
    )
    soup = BeautifulSoup(data.text,"html.parser")

    #Not processing team time trial (since it's a different format and not a test of the rider)
    if soup.select('.results-ttt'):
        raise NotImplementedError("Team time trial not supported")

    output = []
    for result in soup.select('.result-cont:not(.hide)'):
        leader_seconds = None  #winner's time for this table, in seconds
        last_time = None       #time of the previous accepted rider, in seconds
        last_gap = 0           #gap of the previous accepted rider, in seconds
        for place,row in enumerate(result.select('tr')[1:]): #first is header
            cells = row.select('td')
            if not cells:
                continue #row without data cells; nothing to read

            #strip out non breaking space seen in Vuelta 2019 stage 13
            scraped_place = cells[0].get_text().replace('\xa0', '').strip()
            if not scraped_place.isdecimal():
                continue #place cell is not a finishing position

            scraped_place = int(scraped_place)
            if place != scraped_place-1:
                print(f"{place+1} != {scraped_place} in {tour} stage {stage} {year}")
                continue

            rider_name = row.select('a')[0].get_text()
            #Time will be time taken for the leader, or gap for other riders, but gap is placed in span and hidden div
            span_element = row.select('.time>span')
            if span_element:
                #Happens in some years for 2nd and subsequent place
                time_text = span_element[0].get_text()
            else:
                time_text = row.select('.time')[0].get_text()

            if scraped_place == 1:
                leader_seconds = _clock_to_seconds(time_text)
                gap = 0
                rider_time = leader_seconds
            elif ":" not in time_text:
                #may be ,, meaning same time as the previous rider
                if last_time is None:
                    raise ValueError(
                        f"No previous time to repeat for place {scraped_place} in {tour} stage {stage} {year}"
                    )
                gap = last_gap
                rider_time = last_time
            else:
                if leader_seconds is None:
                    raise ValueError(
                        f"Gap found before the winner's time in {tour} stage {stage} {year}"
                    )
                gap = _clock_to_seconds(time_text)
                rider_time = leader_seconds + gap

            last_time = rider_time
            last_gap = gap
            output.append([rider_name,stage,scraped_place,gap,rider_time])

    print("finished")
    return output


# #220404 detect TTT
#print(get_stage("vuelta-a-espana",2019,13))
#df_vuelta_19 = save_full_tour("vuelta-a-espana",2019)


In [4]:
def save_full_tour(tour,year,last_stage=21):
    """Scrape every stage of a tour, save the results to CSV and return them.

    Stages are fetched from ``last_stage`` down to 1 with get_stage(). Stages
    for which get_stage() raises NotImplementedError (team time trials) are
    left out, including the final stage.

    Args:
        tour: race slug passed to get_stage() and used in the file name.
        year: edition year passed to get_stage() and used in the file name.
        last_stage: number of the final stage to fetch; defaults to 21.

    Returns:
        A DataFrame with columns rider, stage, position, gap, stage_time.
        The same data is written to ``{tour}_{year}.csv`` without the index.
        The index is not reset, so its labels repeat from one stage to the next.
    """
    columns = ["rider","stage","position","gap","stage_time"]
    stage_frames = []

    #Final stage first, then back down to stage 1
    for stage in range(last_stage,0,-1):
        try:
            output = get_stage(tour,year,stage)
        except NotImplementedError: #raised for team time trials
            continue
        if output:
            #Empty stages are left out so they cannot affect the column dtypes
            stage_frames.append(pd.DataFrame(output,columns=columns))

    if stage_frames:
        #DataFrame.append no longer exists in pandas 2.x; concatenate once instead
        df_tour = pd.concat(stage_frames)
    else:
        df_tour = pd.DataFrame(columns=columns)

    df_tour.to_csv(f"{tour}_{year}.csv",index = False)

    return df_tour

#df_vuelta_19 = save_full_tour("vuelta-a-espana",2019)

In [19]:
# Round-trip check: reload the Vuelta 2019 CSV and compare it with the in-memory frame.
# NOTE(review): df_vuelta_19 is only assigned in commented-out lines above and
# check_equality is defined in a later cell, so this cell needs both to exist in the kernel already.
df_from_load = pd.read_csv("vuelta-a-espana_2019.csv")
print(check_equality(df_from_load,df_vuelta_19)) 

True


In [5]:
def check_equality(A, B):
    """Report whether two DataFrames hold the same cells, ignoring row and column order.

    Each frame has its columns sorted by label, its rows sorted by every
    column in that order, and its index replaced by a fresh one. The two
    normalised frames are then compared element by element.

    Returns:
        True when every compared cell is equal, otherwise False.
    """
    def _normalise(frame):
        # Fix the column order first so the row sort uses the same keys for both frames.
        by_column = frame.sort_index(axis=1)
        sort_keys = by_column.columns.tolist()
        return by_column.sort_values(sort_keys).reset_index(drop=True)

    left = _normalise(A)
    right = _normalise(B)
    cell_matches = left == right
    return cell_matches.values.all()

#df_from_load = pd.read_csv("giro-d-italia_2020.csv")
#print(check_equality(df_from_load,df_giro_20)) 
# Load the Giro d'Italia 2020 results from a CSV in the working directory
# (presumably written by an earlier save_full_tour run — TODO confirm).
df_giro_20 = pd.read_csv("giro-d-italia_2020.csv")

In [6]:
#df_giro_21 = save_full_tour("giro-d-italia",2021)

finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished


In [7]:
# Round-trip check: reload the Giro 2021 CSV and compare it with the in-memory frame.
# NOTE(review): df_giro_21 is only assigned in a commented-out line above, so it must already exist in the kernel.
df_from_load = pd.read_csv("giro-d-italia_2021.csv")
print(check_equality(df_from_load,df_giro_21)) 

True


In [8]:
#df_giro_19 = save_full_tour("giro-d-italia",2019)

finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished


In [9]:
# Round-trip check: reload the Giro 2019 CSV and compare it with the in-memory frame.
# NOTE(review): df_giro_19 is only assigned in a commented-out line above, so it must already exist in the kernel.
df_from_load = pd.read_csv("giro-d-italia_2019.csv")
print(check_equality(df_from_load,df_giro_19)) 

True


In [8]:
#df_tour_21 = save_full_tour("tour-de-france",2021) #2019 has team time trial which fails

finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished


In [12]:
# Tour de France 2020 and 2021 are fine; 2019 has a team time trial.
# Round-trip check: reload the Tour 2021 CSV and compare it with the in-memory frame.
# NOTE(review): df_tour_21 is only assigned in a commented-out line above, so it must already exist in the kernel.
df_from_load = pd.read_csv("tour-de-france_2021.csv")
print(check_equality(df_from_load,df_tour_21)) 

True


In [13]:
#vuelta '19 has TTT
#df_vuelta_21 = save_full_tour("vuelta-a-espana",2021)
#df_vuelta_20 = save_full_tour("vuelta-a-espana",2020)

finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished


In [15]:
# Vuelta 2020 and 2021 are fine; 2019 has a team time trial.
# Round-trip check: reload the Vuelta 2020 CSV and compare it with the in-memory frame.
# NOTE(review): df_vuelta_20 is only assigned in a commented-out line above, so it must already exist in the kernel.
df_from_load = pd.read_csv("vuelta-a-espana_2020.csv")
print(check_equality(df_from_load,df_vuelta_20)) 

True


In [20]:
#df_tour_19 = save_full_tour("tour-de-france",2019)

finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished


In [21]:
# Round-trip check: reload the Tour 2019 CSV and compare it with the in-memory frame.
# NOTE(review): df_tour_19 is only assigned in a commented-out line above, so it must already exist in the kernel.
df_from_load = pd.read_csv("tour-de-france_2019.csv")
print(check_equality(df_from_load,df_tour_19)) 

True


In [6]:
# df_tour_18 = save_full_tour("tour-de-france",2018)
# df_vuelta_18 = save_full_tour("vuelta-a-espana",2018)
# df_giro_18 = save_full_tour("giro-d-italia",2018)

finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished
finished


In [7]:
# Round-trip check: reload the Tour 2018 CSV and compare it with the in-memory frame.
# NOTE(review): df_tour_18 is only assigned in a commented-out line above, so it must already exist in the kernel.
df_from_load = pd.read_csv("tour-de-france_2018.csv")
print(check_equality(df_from_load,df_tour_18)) 

True


In [9]:
# Round-trip check: reload the Vuelta 2018 CSV and compare it with the in-memory frame.
# NOTE(review): df_vuelta_18 is only assigned in a commented-out line above, so it must already exist in the kernel.
df_from_load = pd.read_csv("vuelta-a-espana_2018.csv")
print(check_equality(df_from_load,df_vuelta_18)) 

True


In [8]:
# Round-trip check: reload the Giro 2018 CSV and compare it with the in-memory frame.
# NOTE(review): df_giro_18 is only assigned in a commented-out line above, so it must already exist in the kernel.
df_from_load = pd.read_csv("giro-d-italia_2018.csv")
print(check_equality(df_from_load,df_giro_18)) 

True
