In [69]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [70]:
years = [2017,2018,2019,2020,2021]

In [71]:
categories = ['top-gc-riders', 'top-sprinters', 'top-time-trial-riders', 'best-classic-riders']

In [72]:
def chunks(lst, n, start):
    """Yield successive n-sized chunks from lst."""
    for i in range(start, len(lst), n):
        return lst[i:i + n]

In [73]:
def clean_scrape_profile(row, category, year):
    di = {}
    name = row[1].text.lstrip()
    team = row[2].text
    points = row[3].text
    ranking = row[4].text
    di['name'] = name
    di['team'] = team
    di['points'] = points
    di['ranking'] = ranking
    di['type'] = category
    di['year'] = year
    return di

In [74]:
def clean_scrape_performance(row, year):
    di = {}
    name_s = row[1].text.strip()
    name_ls = name_s.split()
    name = ' '.join([i.capitalize() for i in name_ls])
    gc_result = row[2].text
    giro_w = row[3].text
    stage_result = row[4].text
    stage_wins = row[5].text
    top10 = row[5].text
    total = row[7].text
    di['name'] = name
    di['gc_result'] = gc_result
    di['giro_w'] = giro_w
    di['stage_result'] = stage_result
    di['stage_wins'] = stage_wins
    di['top10'] = top10
    di['giro_total'] = total
    di['year'] = year
    return di

In [75]:
def clean_scrape_teams(row, year):
    di = {}
    team = row[1].text.lstrip()
    division = row[2].text
    points = row[3].text
    di['team'] = team
    di['team_class'] = division
    di['team_points'] = points
    di['year'] = year
    return di

In [76]:
def clean_age(row, year):
    di = {}
    name = row[1].text.lstrip()
    team = row[2].text
    age = row[3].text
    di['name'] = name
    di['team'] = team
    di['age'] = age[0:2]
    di['year'] = year
    return di

In [195]:
def clean_scrape_stages(row, year, stage, clas, last=False):
    di = {}
    rnk = row[0]
    if last==False:
        name = row[1].text.lstrip()
    else:
        name = row[3].text.lstrip()
    #di['key'] = name + '-' + str(year) + '-' + str(stage)
    di['name'] = name
    if len(clas) > 0:
        di['clas'] = clas
    di['clas'] = 'stage'
    di['rnk'] = rnk
    return di

In [180]:
master_df = pd.DataFrame(columns=['name','team','points','ranking','type','year'])

In [78]:
all_riders = []

In [79]:
def get_age():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/youngest-competitors'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 4)
            ls = []
            for chunk in full_list_chunks:
                try:
                    di = clean_age(chunks(td, 4, chunk), year)
                    ls.append(di)
                except IndexError:
                    continue
                sub_df = pd.DataFrame.from_dict(ls)
                master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_age.csv', index=False)
get_age()

In [80]:
def get_teams():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/teams-ranked'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 4)
            ls = []
            for chunk in full_list_chunks:
                try:
                    di = clean_scrape_teams(chunks(td, 4, chunk), year)
                    ls.append(di)
                except IndexError:
                    continue
                sub_df = pd.DataFrame.from_dict(ls)
                master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_teams.csv', index=False)
get_teams()

In [81]:
def get_performance():
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
            url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/gc/startlist/previous-performance'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 9)
            ls = []
            for chunk in full_list_chunks:
                di = clean_scrape_performance(chunks(td, 9, chunk), year)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_performance.csv', index=False)
get_performance()

In [82]:
def get_profile():
    categories = ['top-gc-riders', 'top-sprinters', 'top-time-trial-riders', 'best-classic-riders']
    years = [2017,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
        for category in categories:
            url = f'https://www.procyclingstats.com/race.php?missing=0&filter=Filter&id1=giro-d-italia&id2={year}&id3=gc&id4=startlist&id5={category}'
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            td = soup.find_all('td')
            full_list_chunks = np.arange(0,len(td), 5)
            ls = []
            for chunk in full_list_chunks:
                di = clean_scrape_profile(chunks(td, 5, chunk), category, year)
                ls.append(di)
            sub_df = pd.DataFrame.from_dict(ls)
            master_df_ls.append(sub_df)
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_profile.csv', index=False)
get_profile()
        
        

In [200]:
def get_stages():
    years = [2017]#,2018,2019,2020,2021]
    master_df_ls = []
    for year in years:
        print(year)
        for stage in range(20,22):
            print(stage)
            for clas in ['', '-gc', '-points', '-kom', '-youth', '-teams']:
                print(clas)
                try:
                    url = f'https://www.procyclingstats.com/race/giro-d-italia/{year}/stage-{stage}{clas}'
                    print(url)
                    response = requests.get(url)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    td = soup.find_all('td', class_=None)
                    dictionary = {
                        '':8,
                                '-gc':8,
                                '-points':8,
                                '-kom':8,
                                '-youth':6,
                                '-teams':6
                            }
                    length = dictionary[clas]
                    full_list_chunks = np.arange(0,len(td), length)
                    ls = []
                    for chunk in full_list_chunks:
                        di = clean_scrape_stages(chunks(td, length, chunk), year, stage, clas)
                        ls.append(di)
                    sub_df = pd.DataFrame.from_dict(ls)
                    master_df_ls.append(sub_df)
                except IndexError:
                        if clas != '':
                            url = f"https://www.procyclingstats.com/race/giro-d-italia/{year}{clas.replace('-','/')}"
                            print(url)
                            response = requests.get(url)
                            soup = BeautifulSoup(response.content, 'html.parser')
                            td = soup.find_all('td', class_=None)
                            dictionary = {
                                '-gc':8,
                                '-points':8,
                                '-kom':8,
                                '-youth':6,
                                '-teams':6
                            }
                            length = dictionary[clas]
                            full_list_chunks = np.arange(0,len(td), length)
                            ls = []
                            for chunk in full_list_chunks:
                                di = clean_scrape_stages(chunks(td, length, chunk), year, stage, clas.replace('/','-'), last=True)
                                ls.append(di)
                            sub_df = pd.DataFrame.from_dict(ls)
                            master_df_ls.append(sub_df)
                        else:
                            pass
    X1 = pd.concat(master_df_ls, ignore_index=True)
    X1.to_csv('data_stages.csv', index=False)
get_stages()

2017
20

https://www.procyclingstats.com/race/giro-d-italia/2017/stage-20
-gc
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-20-gc
-points
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-20-points
-kom
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-20-kom
-youth
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-20-youth
-teams
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-20-teams
21

https://www.procyclingstats.com/race/giro-d-italia/2017/stage-21
-gc
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-21-gc
-points
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-21-points
-kom
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-21-kom
-youth
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-21-youth
-teams
https://www.procyclingstats.com/race/giro-d-italia/2017/stage-21-teams


In [153]:
np.arange(1,22,1)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21])

In [83]:
profile = pd.read_csv('data_profile.csv').drop_duplicates()
age = pd.read_csv('data_age.csv').drop_duplicates()
performance = pd.read_csv('data_performance.csv').drop_duplicates()
teams = pd.read_csv('data_teams.csv').drop_duplicates()

In [84]:
age

Unnamed: 0,name,team,age,year
0,Albanese Vincenzo,Bardiani - CSF,20,2017
2,Martínez Daniel Felipe,Wilier Triestina - Selle Italia,21,2017
5,Pedersen Mads,Trek - Segafredo,21,2017
9,De Plus Laurens,Quick-Step Floors,21,2017
14,Schlegel Michal,CCC Sprandi Polkowice,21,2017
...,...,...,...,...
82520,Richeze Maximiliano,UAE Team Emirates,38,2021
82701,Visconti Giovanni,Bardiani-CSF-Faizanè,38,2021
82883,Keisse Iljo,Deceuninck - Quick Step,38,2021
83066,Pozzovivo Domenico,Team Qhubeka ASSOS,38,2021


In [94]:
table2 = pd.merge(profile, age,  how='outer', left_on=['name','year', 'team'], right_on = ['name','year', 'team'])

In [157]:
'-'.replace('-','/')

'/'

In [95]:
#table3 = pd.merge(table2, performance, how='outer', left_on=['name','year'], right_on = ['name','year'])

In [134]:
table4 = pd.merge(table2, teams, how='outer', left_on=['team','year'], right_on = ['team','year']).drop(columns=['ranking', 'team_class'])

In [144]:
table4['year'] = table4['year'].astype('str')
table4['key'] = table4[['name', 'year']].agg('-'.join, axis=1)
table4

Unnamed: 0,name,team,points,type,year,age,team_points,key
0,Quintana Nairo,Movistar Team,1550.0,top-gc-riders,2017,27,3999,Quintana Nairo-2017
1,Quintana Nairo,Movistar Team,34.6,top-sprinters,2017,27,3999,Quintana Nairo-2017
2,Quintana Nairo,Movistar Team,358.0,top-time-trial-riders,2017,27,3999,Quintana Nairo-2017
3,Izagirre Gorka,Movistar Team,267.0,top-gc-riders,2017,29,3999,Izagirre Gorka-2017
4,Izagirre Gorka,Movistar Team,39.3,top-sprinters,2017,29,3999,Izagirre Gorka-2017
...,...,...,...,...,...,...,...,...
2159,Bevin Patrick,Israel Start-Up Nation,8.0,top-sprinters,2021,30,1405,Bevin Patrick-2021
2160,Bevin Patrick,Israel Start-Up Nation,85.0,top-time-trial-riders,2021,30,1405,Bevin Patrick-2021
2161,Martin Dan,Israel Start-Up Nation,66.8,top-sprinters,2021,34,1405,Martin Dan-2021
2162,Martin Dan,Israel Start-Up Nation,38.0,top-time-trial-riders,2021,34,1405,Martin Dan-2021


In [145]:
table5 = table4.pivot_table(values='points', index='key', columns='type', aggfunc='first').reset_index()
table5

type,key,best-classic-riders,top-gc-riders,top-sprinters,top-time-trial-riders
0,Ackermann Pascal-2019,748.0,28.0,1069.4,
1,Affini Edoardo-2020,40.0,100.0,55.0,273.0
2,Affini Edoardo-2021,3.0,5.0,,109.0
3,Albanese Vincenzo-2017,90.0,,246.0,30.0
4,Albanese Vincenzo-2021,36.0,,34.0,
...,...,...,...,...,...
845,van den Berg Lars-2021,,,26.0,
846,van der Hoorn Taco-2021,32.0,15.0,,
847,Černý Josef-2019,15.0,149.0,126.2,188.0
848,Černý Josef-2020,39.0,23.0,,162.0


In [150]:
table6 = pd.merge(table4, table5, how='outer', left_on='key', right_on = 'key').drop(columns=['type', 'points']).drop_duplicates()
table6

Unnamed: 0,name,team,year,age,team_points,key,best-classic-riders,top-gc-riders,top-sprinters,top-time-trial-riders
0,Quintana Nairo,Movistar Team,2017,27,3999,Quintana Nairo-2017,,1550.0,34.6,358.0
3,Izagirre Gorka,Movistar Team,2017,29,3999,Izagirre Gorka-2017,76.0,267.0,39.3,172.0
7,Amador Andrey,Movistar Team,2017,30,3999,Amador Andrey-2017,37.0,180.0,85.0,202.0
11,de la Parte Víctor,Movistar Team,2017,30,3999,de la Parte Víctor-2017,29.0,145.0,,24.0
14,Anacona Winner,Movistar Team,2017,28,3999,Anacona Winner-2017,16.0,123.0,,
...,...,...,...,...,...,...,...,...,...,...
2150,Niv Guy,Israel Start-Up Nation,2021,27,1405,Niv Guy-2021,,30.0,,25.0
2152,De Marchi Alessandro,Israel Start-Up Nation,2021,34,1405,De Marchi Alessandro-2021,110.0,25.0,,47.0
2155,Cimolai Davide,Israel Start-Up Nation,2021,31,1405,Cimolai Davide-2021,19.0,20.0,146.0,
2158,Bevin Patrick,Israel Start-Up Nation,2021,30,1405,Bevin Patrick-2021,,9.0,8.0,85.0


In [None]:
td = soup.find_all('td')

750

In [None]:
int(len(td)/5)

150

In [None]:
full_list_chunks = np.arange(0,len(td), 5)


In [None]:
def chunks(lst, n, start):
    """Yield successive n-sized chunks from lst."""
    for i in range(start, len(lst), n):
        return lst[i:i + n]

In [None]:
def clean_scrape(row):
    di = {}
    name = row[1].text.lstrip()
    team = row[2].text
    points = row[3].text
    ranking = row[4].text
    di['name'] = name
    di['team'] = team
    di['points'] = points
    di['ranking'] = ranking
    di['type'] = categories
    di['year'] = '2021'
    return di

In [None]:
ls = []
for chunk in full_list_chunks:
    di = clean_scrape(chunks(td, 5, chunk))
    ls.append(di)

In [None]:
pd.DataFrame.from_dict(ls)

Unnamed: 0,name,team,points,ranking,type,year
0,Evenepoel Remco,Deceuninck - Quick Step,700,4,top-gc-riders,2021
1,Ulissi Diego,UAE Team Emirates,566,8,top-gc-riders,2021
2,Landa Mikel,Bahrain - Victorious,528,11,top-gc-riders,2021
3,Yates Simon,Team BikeExchange,528,10,top-gc-riders,2021
4,Hindley Jai,Team DSM,516,14,top-gc-riders,2021
...,...,...,...,...,...,...
145,Covi Alessandro,UAE Team Emirates,5,846,top-gc-riders,2021
146,Duchesne Antoine,Groupama - FDJ,5,885,top-gc-riders,2021
147,Petilli Simone,Intermarché - Wanty - Gobert Matériaux,5,871,top-gc-riders,2021
148,Nizzolo Giacomo,Team Qhubeka ASSOS,5,822,top-gc-riders,2021


In [None]:
#row = chunks(td, 5, 0)
row = chunks(td,5,5)
row

[<td>2</td>,
 <td><span class="flag it"></span> <a href="rider/diego-ulissi"><span class="uppercase">Ulissi</span> Diego</a></td>,
 <td>UAE Team Emirates</td>,
 <td>566</td>,
 <td>8</td>]

In [None]:
clean_scrape(row)

{' Ulissi Diego': ['UAE Team Emirates', '566', '8']}

In [None]:
di = {}
#di2 = {}
def clean_scrape(row):
    name = row[1].text
    team = row[2].text
    points = row[3].text
    ranking = row[4].text
    di[name] = [team, points, ranking]
    return di

SyntaxError: 'return' outside function (1095910690.py, line 9)

In [None]:
[di,di2]

[{}, {' Evenepoel Remco': ['Deceuninck - Quick Step', '700', '4']}]

In [None]:
pd.DataFrame.from_dict(di).T

Unnamed: 0,0,1,2
Evenepoel Remco,Deceuninck - Quick Step,700,4
