In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [105]:
def chunks(lst, n, start):
    """Yield successive n-sized chunks from lst."""
    for i in range(start, len(lst), n):
        return lst[i:i + n]

In [106]:
def clean_scrape_profile(row, category, year):
    di = {}
    name = row[1].text.lstrip()
    team = row[2].text
    points = row[3].text
    ranking = row[4].text
    di['name'] = name
    di['team'] = team
    di['points'] = points
    di['ranking'] = ranking
    di['type'] = category
    di['year'] = year
    return di

In [107]:
def clean_scrape_performance(row, year):
    di = {}
    name_s = row[1].text.strip()
    name_ls = name_s.split()
    name = ' '.join([i.capitalize() for i in name_ls])
    gc_result = row[2].text
    giro_w = row[3].text
    stage_result = row[4].text
    stage_wins = row[5].text
    top10 = row[5].text
    total = row[7].text
    di['name'] = name
    di['gc_result'] = gc_result
    di['giro_w'] = giro_w
    di['stage_result'] = stage_result
    di['stage_wins'] = stage_wins
    di['top10'] = top10
    di['giro_total'] = total
    di['year'] = year
    return di

In [108]:
def clean_scrape_teams(row, year):
    di = {}
    team = row[1].text.lstrip()
    division = row[2].text
    points = row[3].text
    di['team'] = team
    di['team_class'] = division
    di['team_points'] = points
    di['year'] = year
    return di

In [109]:
def clean_age(row, year):
    di = {}
    name = row[1].text.lstrip()
    team = row[2].text
    age = row[3].text
    di['name'] = name
    di['team'] = team
    di['age'] = age[0:2]
    di['year'] = year
    return di

In [None]:
def clean_finish(row, year):
    di = {}
    name = row[1].text.lstrip()
    team = row[2].text
    age = row[3].text
    di['name'] = name
    di['team'] = team
    di['age'] = age[0:2]
    di['year'] = year
    return di

In [110]:
def get_age():
    years = [2016, 2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/youngest-competitors'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 4)
            ls = []
            for chunk in full_list_chunks:
                try:
                    di = clean_age(chunks(td, 4, chunk), year)
                    ls.append(di)
                except IndexError:
                    continue
                sub_df = pd.DataFrame.from_dict(ls)
                master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data/data_age.csv', index=False)
get_age()

In [111]:
def get_teams():
    years = [2016, 2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/teams-ranked'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 4)
            ls = []
            for chunk in full_list_chunks:
                try:
                    di = clean_scrape_teams(chunks(td, 4, chunk), year)
                    ls.append(di)
                except IndexError:
                    continue
                sub_df = pd.DataFrame.from_dict(ls)
                master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data/data_teams.csv', index=False)
get_teams()

In [112]:
def get_performance():
    years = [2016,2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/previous-performance'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 9)
            ls = []
            for chunk in full_list_chunks:
                di = clean_scrape_performance(chunks(td, 9, chunk), year)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data/data_performance.csv', index=False)
get_performance()

In [113]:
def get_profile():
    categories = ['top-gc-riders', 'top-sprinters', 'top-time-trial-riders', 'best-classic-riders']
    years = [2016,2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
        for category in categories:
            url = f'https://www.procyclingstats.com/race.php?missing=0&filter=Filter&id1=giro-d-italia&id2={year}&id3=gc&id4=startlist&id5={category}'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 5)
            ls = []
            for chunk in full_list_chunks:
                di = clean_scrape_profile(chunks(td, 5, chunk), category, year)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data/data_profile.csv', index=False)
get_profile()
        
        

In [47]:
from IPython.core.debugger import set_trace


In [5]:
def get_finishes():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-1/startlist/most-starts-finishes'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser').find('table', class_='basic').tbody
            tr = soup.find_all('tr')
            ls = []
            for row in range(len(tr)):
                name = tr[row].find_all('td')[1].text.strip()
                finished = int(tr[row].find_all('td')[3].text) / int(tr[0].find_all('td')[2].text)
                di = {}
                di['rider'] = name
                di['finished'] = finished
                di['year'] = str(year)
                print(di)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data/data_finishes.csv', index=False)
get_finishes()

{'rider': 'Tiralongo Paolo', 'finished': 0.8181818181818182, 'year': '2017'}
{'rider': 'Pellizotti Franco', 'finished': 0.8181818181818182, 'year': '2017'}
{'rider': 'Pozzovivo Domenico', 'finished': 0.5454545454545454, 'year': '2017'}
{'rider': 'Dupont Hubert', 'finished': 0.8181818181818182, 'year': '2017'}
{'rider': 'Bak Lars Ytting', 'finished': 0.5454545454545454, 'year': '2017'}
{'rider': 'Hansen Adam', 'finished': 0.5454545454545454, 'year': '2017'}
{'rider': 'Visconti Giovanni', 'finished': 0.6363636363636364, 'year': '2017'}
{'rider': 'Capecchi Eros', 'finished': 0.5454545454545454, 'year': '2017'}
{'rider': 'Cataldo Dario', 'finished': 0.5454545454545454, 'year': '2017'}
{'rider': 'Siutsou Kanstantsin', 'finished': 0.45454545454545453, 'year': '2017'}
{'rider': 'Agnoli Valerio', 'finished': 0.5454545454545454, 'year': '2017'}
{'rider': 'Pirazzi Stefano', 'finished': 0.6363636363636364, 'year': '2017'}
{'rider': 'Brutt Pavel', 'finished': 0.5454545454545454, 'year': '2017'}
{'

In [43]:
def get_team_strength():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/percentage-of-max-team-strength'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser').find('table', class_='basic').tbody
            tr = soup.find_all('tr')
            ls = []
            for row in range(len(tr)):
                team = tr[row].find('a' , href=True).text.strip()
                strength = tr[row].find_all('span', style=True)[-1].text
                di = {}
                di['team'] = team
                di['strength'] = strength
                di['year'] = str(year)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data/data_team_strength.csv', index=False)
get_team_strength()

In [34]:
year=2021
url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/percentage-of-max-team-strength'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser').find('table', class_='basic').tbody
tr = soup.find_all('tr')
team = tr[0].find('a' , href=True).text.strip()
strength = tr[0].find_all('span', style=True)[-1].text
#finished = int(tr[0].find_all('td')[3].text) / int(tr[0].find_all('td')[2].text)
#finished

In [16]:
int(tr[0].find_all('td')[3].text)

'10'

In [None]:
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-1/startlist/most-starts-finishes

In [None]:
def get_finisrate():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-1/startlist/most-starts-finishes'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 9)
            ls = []
            for chunk in full_list_chunks:
                di = clean_scrape_performance(chunks(td, 9, chunk), year)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data/data_performance.csv', index=False)
get_performance()

In [63]:
# Scraper

for year in [2016, 2017, 2018, 2019, 2020, 2021]:

    teams = {}
    url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-1'
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser').find('tbody')
    tr = soup.find_all('tr')
    dictionary = {}
    for i in range(len(tr)):
            td = tr[i].find_all('td', class_=None)[:2]
            name = td[1].find('a').text.split()
            name = ' '.join([x.capitalize() for x in name])
            team = td[1].find_all('span')[-1].text
            teams[name] = team

    teams_dictionary = {}
    class_dictionary = {}
    for day in range(1,22):
        url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-{day}'
        print(url)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser').find_all('table', class_='results basic moblist10')
        print(len(soup))
        if len(soup) == 6:
            for k, m in enumerate([ '-stage', '-gc', '-points', '-kom', '-youth']):
                print(k,m)
                tr = soup[k].find_all('tr')
                dictionary = {}
                for i in range(1,len(tr)):
                    td = tr[i].find_all('td', class_=None)[:2]
                    name = td[1].find('a').text.split()
                    name = ' '.join([x.capitalize() for x in name])
                            #name = ''.join(name.apply(lambda x: x.capitalize for x in name))
                    dictionary[name] = td[0].text
                    if day < 10 :
                        class_dictionary['0'+str(day)+m] = dictionary
                    else:
                        class_dictionary[str(day)+m] = dictionary
            #get teams
            tr = soup[-1].find_all('tr')
            dictionary = {}
            for i in range(1,len(tr)):
                td = tr[i].find_all('td', class_=None)[:2]
                name = td[1].find('a').text
                            #name = ''.join(name.apply(lambda x: x.capitalize for x in name))
                dictionary[name] = td[0].text
                if day < 10 :
                    teams_dictionary["0"+str(day)+'-teams'] = dictionary
                else:
                    teams_dictionary[str(day)+'-teams'] = dictionary
            
        else:
            for k, m in enumerate([ '-stage', '-gc', '-points', '-youth']):
                print(k,m)
                tr = soup[k].find_all('tr')
                dictionary = {}
                for i in range(1,len(tr)):
                    td = tr[i].find_all('td', class_=None)[:2]
                    name = td[1].find('a').text.split()
                    name = ' '.join([x.capitalize() for x in name])
                            #name = ''.join(name.apply(lambda x: x.capitalize for x in name))
                    dictionary[name] = td[0].text
                    if day < 10 :
                        class_dictionary['0'+str(day)+m] = dictionary
                    else:
                        class_dictionary[str(day)+m] = dictionary
            
            tr = soup[-1].find_all('tr')
            dictionary = {}
            for i in range(1,len(tr)):
                td = tr[i].find_all('td', class_=None)[:2]
                name = td[1].find('a').text
                            #name = ''.join(name.apply(lambda x: x.capitalize for x in name))
                dictionary[name] = td[0].text
                if day < 10 :
                    teams_dictionary["0"+str(day)+'-teams'] = dictionary
                else:
                    teams_dictionary[str(day)+'-teams'] = dictionary
                
    #merge frames + csv
    teams_rider = pd.DataFrame.from_dict(teams, orient='index', columns=['team'])
    all_df = pd.DataFrame.from_dict(class_dictionary, orient='index').T
    teams_df = pd.DataFrame.from_dict(teams_dictionary, orient='index').T.reset_index().rename(columns={'index':'team'})
    combo = pd.merge(all_df, teams_rider, left_index=True, right_index=True)
    combo = combo.reset_index().merge(teams_df, on='team', how='left')
    combo['year'] = str(year)
    combo.to_csv(f'data/data_raw_{year}.csv')

https://www.procyclingstats.com/race/giro-d-italia/2016/stage-1
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-1
5
0 -stage
1 -gc
2 -points
3 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-2
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-3
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-4
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-5
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-6
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-7
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-8
6
0 -stage
1 -gc
2 -points
3 -kom
4 -youth
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-9
6
0 -stage
1 -gc
2 -poi

In [58]:
sorted(all_df.columns)

['19-gc',
 '19-kom',
 '19-points',
 '19-stage',
 '19-youth',
 '20-gc',
 '20-kom',
 '20-points',
 '20-stage',
 '20-youth',
 '21-gc',
 '21-kom',
 '21-points',
 '21-stage',
 '21-youth']

In [69]:
url = f'https://www.procyclingstats.com/race/giro-d-italia/2021/stage-2'
print(url)
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser').find('ul', class_='infolist')

https://www.procyclingstats.com/race/giro-d-italia/2021/stage-2


In [71]:
div = soup.find_all('div')
soup

<ul class="infolist" style=""><li><div>Date:</div> <div>09 May 2021</div></li>
<li><div>Start time:</div> <div>12:55 </div></li>
<li><div>Avg. speed winner:</div> <div>41.126 km/h</div></li>
<li><div>Race category:</div> <div>ME - Men Elite</div></li>
<li><div>Distance: </div> <div>179 km</div></li>
<li><div>Points scale:</div> <div><a href="info.php?s=point-scales&amp;season=2021&amp;category=1&amp;scale=8">GT.B.Stage</a></div></li>
<li><div>Parcours type: </div> <div><span class="icon profile p1"></span></div></li>
<li><div>ProfileScore: </div> <div>9</div></li>
<li><div>Vert. meters:</div> <div>707</div></li>
<li><div>Departure:</div> <div><a href="location/stupinigi">Stupinigi </a></div></li>
<li><div>Arrival:</div> <div><a href="location/novara">Novara</a></div></li>
<li><div>Race ranking:</div> <div>12</div></li>
<li><div>Startlist quality score:</div> <div><a href="race/giro-d-italia/2021/stage-2/startlist/lineup-quality">885</a></div></li>
<li><div>Won how: </div> <div>Sprint o

In [93]:
def clean_stages(row, year, day):
    di = {}
    name = row[0].text.strip()
    value = row[1].text
    di['name'] = name
    if len(value) < 1:
        di['value'] = row[1].select('div span')[0]['class'][-1]
        print(di['value'])
    else:
       di['value'] = value
    
    di['year'] = year
    di['day'] = day
    return di

In [85]:
soup

<ul class="infolist" style=""><li><div>Date:</div> <div>09 May 2021</div></li>
<li><div>Start time:</div> <div>12:55 </div></li>
<li><div>Avg. speed winner:</div> <div>41.126 km/h</div></li>
<li><div>Race category:</div> <div>ME - Men Elite</div></li>
<li><div>Distance: </div> <div>179 km</div></li>
<li><div>Points scale:</div> <div><a href="info.php?s=point-scales&amp;season=2021&amp;category=1&amp;scale=8">GT.B.Stage</a></div></li>
<li><div>Parcours type: </div> <div><span class="icon profile p1"></span></div></li>
<li><div>ProfileScore: </div> <div>9</div></li>
<li><div>Vert. meters:</div> <div>707</div></li>
<li><div>Departure:</div> <div><a href="location/stupinigi">Stupinigi </a></div></li>
<li><div>Arrival:</div> <div><a href="location/novara">Novara</a></div></li>
<li><div>Race ranking:</div> <div>12</div></li>
<li><div>Startlist quality score:</div> <div><a href="race/giro-d-italia/2021/stage-2/startlist/lineup-quality">885</a></div></li>
<li><div>Won how: </div> <div>Sprint o

In [86]:

#div = soup.find_all('div')
full_list_chunks = np.arange(0,len(div), 2)
ml_ls = []
ls = []
for chunk in full_list_chunks:
    try:
        di = clean_stages(chunks(div, 2, chunk), 2017)
        ls.append(di)
    except IndexError:
        pass
ml_ls.append(ls)

p1


In [87]:
ls

[{'name': 'Date:', 'value': '09 May 2021', 'year': 2017},
 {'name': 'Start time:', 'value': '12:55 ', 'year': 2017},
 {'name': 'Avg. speed winner:', 'value': '41.126 km/h', 'year': 2017},
 {'name': 'Race category:', 'value': 'ME - Men Elite', 'year': 2017},
 {'name': 'Distance:', 'value': '179 km', 'year': 2017},
 {'name': 'Points scale:', 'value': 'GT.B.Stage', 'year': 2017},
 {'name': 'Parcours type:', 'value': 'p1', 'year': 2017},
 {'name': 'ProfileScore:', 'value': '9', 'year': 2017},
 {'name': 'Vert. meters:', 'value': '707', 'year': 2017},
 {'name': 'Departure:', 'value': 'Stupinigi ', 'year': 2017},
 {'name': 'Arrival:', 'value': 'Novara', 'year': 2017},
 {'name': 'Race ranking:', 'value': '12', 'year': 2017},
 {'name': 'Startlist quality score:', 'value': '885', 'year': 2017},
 {'name': 'Won how:', 'value': 'Sprint of large group', 'year': 2017}]

In [95]:
np.arange(0,len(div), 2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28])

In [104]:
def get_stage_profiles():
    years = [2016, 2017,2018,2019,2020,2021,2022]
    master_df_ls = []
    for year in years:
        for day in range(1,22):
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-{day}'
            print(url)
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser').find('ul', class_='infolist')
            div = soup.find_all('div')
            full_list_chunks = np.arange(0,len(div), 2)
            ls = []
            for chunk in full_list_chunks:
                try:
                    di = clean_stages(chunks(div, 2, chunk), year, day)
                    ls.append(di)
                    sub_df = pd.DataFrame(ls)
                    master_df_ls.append(sub_df)
                except IndexError:
                    pass
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('parcours.csv', index=False)
get_stage_profiles()

https://www.procyclingstats.com/race/giro-d-italia/2016/stage-1
p1
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-2
p1
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-3
p1
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-4
p2
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-5
p3
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-6
p5
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-7
p2
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-8
p3
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-9
p2
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-10
p5
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-11
p2
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-12
p1
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-13
p4
https://www.procyclingstats.com/race/giro-d-italia/2016/stage-14
p4
https://www.procyclingstats.com/race/giro-d-italia/2016/s

In [102]:
zestien = pd.read_csv('parcours.csv').drop_duplicates()
zestien[zestien['day']==2]


Unnamed: 0,name,value,year,day
105,Date:,06 May 2017,2017,2
107,Start time:,12:50,2017,2
110,Avg. speed winner:,36.3 km/h,2017,2
114,Race category:,ME - Men Elite,2017,2
119,Distance:,221 km,2017,2
125,Points scale:,GT.B.Stage,2017,2
132,Parcours type:,p2,2017,2
140,ProfileScore:,91,2017,2
149,Vert. meters:,3518,2017,2
159,Departure:,Olbia,2017,2


In [None]:
def chunks(lst, n, start):
    """Yield successive n-sized chunks from lst."""
    for i in range(start, len(lst), n):
        return lst[i:i + n]