In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
def chunks(lst, n, start):
    """Yield successive n-sized chunks from lst."""
    for i in range(start, len(lst), n):
        return lst[i:i + n]

In [3]:
def clean_scrape_profile(row, category, year):
    di = {}
    name = row[1].text.lstrip()
    team = row[2].text
    points = row[3].text
    ranking = row[4].text
    di['name'] = name
    di['team'] = team
    di['points'] = points
    di['ranking'] = ranking
    di['type'] = category
    di['year'] = year
    return di

In [4]:
def clean_scrape_performance(row, year):
    di = {}
    name_s = row[1].text.strip()
    name_ls = name_s.split()
    name = ' '.join([i.capitalize() for i in name_ls])
    gc_result = row[2].text
    giro_w = row[3].text
    stage_result = row[4].text
    stage_wins = row[5].text
    top10 = row[5].text
    total = row[7].text
    di['name'] = name
    di['gc_result'] = gc_result
    di['giro_w'] = giro_w
    di['stage_result'] = stage_result
    di['stage_wins'] = stage_wins
    di['top10'] = top10
    di['giro_total'] = total
    di['year'] = year
    return di

In [5]:
def clean_scrape_teams(row, year):
    di = {}
    team = row[1].text.lstrip()
    division = row[2].text
    points = row[3].text
    di['team'] = team
    di['team_class'] = division
    di['team_points'] = points
    di['year'] = year
    return di

In [6]:
def clean_age(row, year):
    di = {}
    name = row[1].text.lstrip()
    team = row[2].text
    age = row[3].text
    di['name'] = name
    di['team'] = team
    di['age'] = age[0:2]
    di['year'] = year
    return di

In [None]:
def get_age():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/youngest-competitors'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 4)
            ls = []
            for chunk in full_list_chunks:
                try:
                    di = clean_age(chunks(td, 4, chunk), year)
                    ls.append(di)
                except IndexError:
                    continue
                sub_df = pd.DataFrame.from_dict(ls)
                master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_age.csv', index=False)
get_age()

In [None]:
def get_teams():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/teams-ranked'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 4)
            ls = []
            for chunk in full_list_chunks:
                try:
                    di = clean_scrape_teams(chunks(td, 4, chunk), year)
                    ls.append(di)
                except IndexError:
                    continue
                sub_df = pd.DataFrame.from_dict(ls)
                master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_teams.csv', index=False)
get_teams()

In [None]:
def get_performance():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/previous-performance'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 9)
            ls = []
            for chunk in full_list_chunks:
                di = clean_scrape_performance(chunks(td, 9, chunk), year)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_performance.csv', index=False)
get_performance()

In [None]:
def get_profile():
    categories = ['top-gc-riders', 'top-sprinters', 'top-time-trial-riders', 'best-classic-riders']
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
        for category in categories:
            url = f'https://www.procyclingstats.com/race.php?missing=0&filter=Filter&id1=giro-d-italia&id2={year}&id3=gc&id4=startlist&id5={category}'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 5)
            ls = []
            for chunk in full_list_chunks:
                di = clean_scrape_profile(chunks(td, 5, chunk), category, year)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_profile.csv', index=False)
get_profile()
        
        

In [63]:
# Scraper

for year in [2016, 2017, 2018, 2019, 2020, 2021]:

    teams = {}
    url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-1'
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser').find('tbody')
    tr = soup.find_all('tr')
    dictionary = {}
    for i in range(len(tr)):
            td = tr[i].find_all('td', class_=None)[:2]
            name = td[1].find('a').text.split()
            name = ' '.join([x.capitalize() for x in name])
            team = td[1].find_all('span')[-1].text
            teams[name] = team

    teams_dictionary = {}
    class_dictionary = {}
    for day in range(1,22):
        url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-{day}'
        print(url)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser').find_all('table', class_='results basic moblist10')
        print(len(soup))
        if len(soup) == 6:
            for k, m in enumerate([ '-stage', '-gc', '-points', '-kom', '-youth']):
                print(k,m)
                tr = soup[k].find_all('tr')
                dictionary = {}
                for i in range(1,len(tr)):
                    td = tr[i].find_all('td', class_=None)[:2]
                    name = td[1].find('a').text.split()
                    name = ' '.join([x.capitalize() for x in name])
                            #name = ''.join(name.apply(lambda x: x.capitalize for x in name))
                    dictionary[name] = td[0].text
                    if day < 10 :
                        class_dictionary['0'+str(day)+m] = dictionary
                    else:
                        class_dictionary[str(day)+m] = dictionary
            #get teams
            tr = soup[-1].find_all('tr')
            dictionary = {}
            for i in range(1,len(tr)):
                td = tr[i].find_all('td', class_=None)[:2]
                name = td[1].find('a').text
                            #name = ''.join(name.apply(lambda x: x.capitalize for x in name))
                dictionary[name] = td[0].text
                if day < 10 :
                    teams_dictionary["0"+str(day)+'-teams'] = dictionary
                else:
                    teams_dictionary[str(day)+'-teams'] = dictionary
            
        else:
            for k, m in enumerate([ '-stage', '-gc', '-points', '-youth']):
                print(k,m)
                tr = soup[k].find_all('tr')
                dictionary = {}
                for i in range(1,len(tr)):
                    td = tr[i].find_all('td', class_=None)[:2]
                    name = td[1].find('a').text.split()
                    name = ' '.join([x.capitalize() for x in name])
                            #name = ''.join(name.apply(lambda x: x.capitalize for x in name))
                    dictionary[name] = td[0].text
                    if day < 10 :
                        class_dictionary['0'+str(day)+m] = dictionary
                    else:
                        class_dictionary[str(day)+m] = dictionary
            
            tr = soup[-1].find_all('tr')
            dictionary = {}
            for i in range(1,len(tr)):
                td = tr[i].find_all('td', class_=None)[:2]
                name = td[1].find('a').text
                            #name = ''.join(name.apply(lambda x: x.capitalize for x in name))
                dictionary[name] = td[0].text
                if day < 10 :
                    teams_dictionary["0"+str(day)+'-teams'] = dictionary
                else:
                    teams_dictionary[str(day)+'-teams'] = dictionary
                
    #merge frames + csv
    teams_rider = pd.DataFrame.from_dict(teams, orient='index', columns=['team'])
    all_df = pd.DataFrame.from_dict(class_dictionary, orient='index').T
    teams_df = pd.DataFrame.from_dict(teams_dictionary, orient='index').T.reset_index().rename(columns={'index':'team'})
    combo = pd.merge(all_df, teams_rider, left_index=True, right_index=True)
    combo = combo.reset_index().merge(teams_df, on='team', how='left')
    combo['year'] = str(year)
    combo.to_csv(f'data_raw_{year}.csv')

https://www.procyclingstats.com/race/giro-d-italia/2016/stage-1
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-1
5
0 -stage
1 -gc
2 -points
3 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-2
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-3
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-4
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-5
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-6
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-7
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-8
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-9
6
0 -stage
1 -gc
2 -poi

In [58]:
sorted(all_df.columns)

['19-gc',
 '19-kom',
 '19-points',
 '19-stage',
 '19-youth',
 '20-gc',
 '20-kom',
 '20-points',
 '20-stage',
 '20-youth',
 '21-gc',
 '21-kom',
 '21-points',
 '21-stage',
 '21-youth']

Nieve Mikel               1
Kruijswijk Steven        10
Ulissi Diego             11
Dombrowski Joe           12
Nibali Vincenzo          13
                       ... 
Grosu Eduard-michael    NaN
Ruffoni Nicola          NaN
Jim Songezo             NaN
Pelucchi Matteo         NaN
Mareczko Jakub          NaN
Name: 21-youth, Length: 198, dtype: object

In [40]:
teams_rider

Unnamed: 0,team
Dumoulin Tom,Team Giant - Alpecin
Roglič Primož,Team LottoNL-Jumbo
Amador Andrey,Movistar Team
Ludvigsson Tobias,Team Giant - Alpecin
Kittel Marcel,Etixx - Quick Step
...,...
Grosu Eduard-michael,Nippo - Vini Fantini
Ruffoni Nicola,Bardiani - CSF
Jim Songezo,Team Dimension Data
Pelucchi Matteo,IAM Cycling


In [34]:
c = pd.merge(all_df, teams_rider, left_index=True, right_index=True)
c.reset_index().merge(teams_df, how='left', on='team')

Unnamed: 0,index,1-stage,1-gc,1-points,2-stage,2-gc,2-points,1-youth,2-youth,2-kom,team,1-teams,2-teams
0,Dumoulin Tom,1,1,1,36,1,10,,,,Team Giant - Alpecin,,1
1,Roglič Primož,2,2,2,61,2,12,,,,Team LottoNL-Jumbo,,2
2,Amador Andrey,3,3,3,9,4,9,,,,Movistar Team,,5
3,Ludvigsson Tobias,4,4,4,22,5,17,1,1,,Team Giant - Alpecin,,1
4,Kittel Marcel,5,5,5,1,3,1,,,,Etixx - Quick Step,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,Grosu Eduard-michael,194,194,,14,176,26,48,43,,Nippo - Vini Fantini,,22
194,Ruffoni Nicola,195,195,,5,177,11,,,,Bardiani - CSF,,21
195,Jim Songezo,196,196,,183,196,,,,,Team Dimension Data,,19
196,Pelucchi Matteo,197,197,,23,178,,,,,IAM Cycling,,4


In [35]:
teams_df

Unnamed: 0,team,1-teams,2-teams
0,LUDVIGSSON Tobias,1,
1,JUNGELS Bob,2,
2,HOWSON Damien,3,
3,WIŚNIOWSKI Łukasz,4,
4,HEPBURN Michael,5,
...,...,...,...
66,Lampre - Merida,,18
67,Team Dimension Data,,19
68,Wilier - Southeast,,20
69,Bardiani - CSF,,21


In [24]:
c.reset_index(inplace=True)

In [29]:
c[c['index']=='Jungels Bob']['2-youth']

6    2
Name: 2-youth, dtype: object

In [168]:
url = f'https://www.procyclingstats.com/race/giro-d-italia/2021/stage-2'
print(url)
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser').find_all('table', class_='results basic moblist10')

https://www.procyclingstats.com/race/giro-d-italia/2021/stage-2


In [175]:
len(soup)

6

In [126]:
def clean_stages(row, year):
    di = {}
    name = row[0].text.strip()
    value = row[1].text
    di['name'] = name
    if len(value) < 1:
        di['value'] = row[1].select('div span')[0]['class'][-1]
        print(di['value'])
    else:
       di['value'] = value
    
    di['year'] = year
    return di

In [165]:
soup

<ul class="infolist" style=""><li><div>Date:</div> <div>15 October 2020</div></li>
<li><div>Start time:</div> <div>11:15 </div></li>
<li><div>Avg. speed winner:</div> <div>36.93 km/h</div></li>
<li><div>Race category:</div> <div>ME - Men Elite</div></li>
<li><div>Distance: </div> <div>204 km</div></li>
<li><div>Points scale:</div> <div><a href="info.php?s=point-scales&amp;season=2020&amp;category=1&amp;scale=8">GT.B.Stage</a></div></li>
<li><div>Parcours type: </div> <div><span class="icon profile p2"></span></div></li>
<li><div>ProfileScore: </div> <div>164</div></li>
<li><div>Vert. meters:</div> <div>3815</div></li>
<li><div>Departure:</div> <div><a href="location/cesenatico">Cesenatico</a></div></li>
<li><div>Arrival:</div> <div><a href="location/cesenatico">Cesenatico</a></div></li>
<li><div>Race ranking:</div> <div>11</div></li>
<li><div>Startlist quality score:</div> <div><a href="race/giro-d-italia/2020/stage-12/startlist/lineup-quality">732</a></div></li>
<li><div>Won how: </di

In [161]:

#div = soup.find_all('div')
full_list_chunks = np.arange(0,len(div), 2)
ml_ls = []
ls = []
for chunk in full_list_chunks:
    try:
        di = clean_stages(chunks(div, 2, chunk), 2017)
        ls.append(di)
    except IndexError:
        continue
ml_ls.append(ls)

p2


In [162]:
ls

[{'name': 'Date:', 'value': '15 October 2020', 'year': 2017},
 {'name': 'Start time:', 'value': '11:15 ', 'year': 2017},
 {'name': 'Avg. speed winner:', 'value': '36.93 km/h', 'year': 2017},
 {'name': 'Race category:', 'value': 'ME - Men Elite', 'year': 2017},
 {'name': 'Distance:', 'value': '204 km', 'year': 2017},
 {'name': 'Points scale:', 'value': 'GT.B.Stage', 'year': 2017},
 {'name': 'Parcours type:', 'value': 'p2', 'year': 2017},
 {'name': 'ProfileScore:', 'value': '164', 'year': 2017},
 {'name': 'Vert. meters:', 'value': '3815', 'year': 2017},
 {'name': 'Departure:', 'value': 'Cesenatico', 'year': 2017},
 {'name': 'Arrival:', 'value': 'Cesenatico', 'year': 2017},
 {'name': 'Race ranking:', 'value': '11', 'year': 2017},
 {'name': 'Startlist quality score:', 'value': '732', 'year': 2017},
 {'name': 'Won how:', 'value': '25 km solo', 'year': 2017}]

In [164]:
def get_stage_profiles():
    years = [2017]#,2018,2019,2020,2021,2022]
    master_df_ls = []
    for year in years:
        for day in range(1,22):
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-{day}'
            print(url)
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser').find('ul', class_='infolist')
            div = soup.find_all('div')
            full_list_chunks = np.arange(0,len(div), 2)
            ls = []
            for chunk in full_list_chunks:
                #try:
                di = clean_stages(chunks(div, 2, chunk), 2017)
                ls.append(di)
                sub_df = pd.DataFrame(ls)
                master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('parcours.csv', index=False)
get_stage_profiles()

https://www.procyclingstats.com/race/giro-d-italia/2017/stage-1
p2


IndexError: list index out of range

In [152]:
zestien = pd.read_csv('parcours.csv')
zestien


Unnamed: 0,name,value,year
0,Date:,05 May 2017,2017
1,Date:,05 May 2017,2017
2,Start time:,13:25,2017
3,Date:,05 May 2017,2017
4,Start time:,13:25,2017
...,...,...,...
2200,Departure:,Monza,2017
2201,Arrival:,Milano,2017
2202,Race ranking:,23,2017
2203,Startlist quality score:,795,2017


In [None]:
def chunks(lst, n, start):
    """Yield successive n-sized chunks from lst."""
    for i in range(start, len(lst), n):
        return lst[i:i + n]