# Scraping Cycling Stats

In [1]:
import pandas as pd
import requests
import bs4

In [2]:
#pretend to be a browser
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

## get start list:

### scraping 2019 giro-d-italia startlist

In [430]:
resp = requests.get('https://www.procyclingstats.com/race/giro-d-italia/2019/gc/startlist',headers=header).text

In [431]:
# use beautiful soup to parse response
soup = bs4.BeautifulSoup(resp,'lxml')

In [449]:
startlist = pd.DataFrame()

for team in soup.findAll('li',{'class':'team'}):
    tmp_dict = {}
    #team
    team_name = team.find_next('h4').text.split('.')[1].strip()
    tmp_dict['team_name'] = team_name
    #riders
    riders = team.findAll('a',{'class':'rider'})
    tmp_dict['rider_name'] = [x.text for x in riders]
    base_url = 'https://www.procyclingstats.com/'
    tmp_dict['rider_link'] = [base_url + x['href'] for x in riders]
    tmp_df = pd.DataFrame(tmp_dict)
    startlist = startlist.append(tmp_df,ignore_index=True)

In [433]:
startlist.head()

Unnamed: 0,rider_links,rider_name,team_name
0,https://www.procyclingstats.com/rider/mikel-landa,LANDA Mikel,Movistar Team
1,https://www.procyclingstats.com/rider/andrey-a...,AMADOR Andrey,Movistar Team
2,https://www.procyclingstats.com/rider/richard-...,CARAPAZ Richard,Movistar Team
3,https://www.procyclingstats.com/rider/hector-c...,CARRETERO HÃ©ctor,Movistar Team
4,https://www.procyclingstats.com/rider/lluis-gu...,MAS LluÃ­s,Movistar Team


## Get rider race results

### Function scraping rider races

In [437]:
def get_race_results(url):
    resp = requests.get(url,headers=header).text
    soup = bs4.BeautifulSoup(resp,'lxml')
    results = soup.findAll('div',{'class':'results'})[0]
    
    #get rows
    races = []
    for race in results.findAll('ul',{'class':'prres'})[0].findAll('li'):
        race = race.findAll('div')
        #if len(race) <= 9:
        row = [x.text for x in race[:9]]
        races.append(row)
    races = pd.DataFrame(races)
    
    #get header
    results_header = [x.text for x in results.findAll('ul',{'class':'prresHead'})[0].findAll('li')]
    races.columns = results_header
    
    #drop subtitles, only consider results (inc classifications)
    races = races[races.Result != '']
    races = races.iloc[:,:8]
    
    return races


### Iterating through start list for rider stats

In [445]:
data = pd.DataFrame()

years = ['2019','2018','2017']

for i,j in startlist.iterrows():
    
    #for each rider
    rider_name = j['rider_name']
    rider_link = j['rider_link']
    
    #for the last three years:
    for year in years:
        
        url = '{}/{}'.format(rider_link,year)
        
        try:
            tmp_df = get_race_results(url)
        except:
            print(rider_name,year,'failed')
            
        tmp_df['rider_name'] = rider_name
        tmp_df['year'] = year
        
        data = data.append(tmp_df,ignore_index=True)


### Clean further

In [476]:
#remove empty field
data_cols = ['Date', 'Result','Race', 'Distance', 'PointsPCS',
             'PointsUCI', 'rider_name', 'year']
data = data[data_cols]

#remove missing dates i.e. classifications
data = data[data.Date != ''].reset_index(drop=True)

#lower case cols
data.columns = [x.lower() for x in data.columns]

#### Fix dates

In [493]:
#create date
def make_date(x):
    day_month = x.date.split('.')
    day = day_month[0]
    month = day_month[1]
    year = x.year
    date = '{}-{}-{}'.format(year,month,day)
    return pd.to_datetime(date)

In [494]:
data['date'] = data.apply(make_date,1)

In [495]:
data.head()

Unnamed: 0,date,result,race,distance,pointspcs,pointsuci,rider_name,year
0,2019-05-05,DNS,Stage 3 - Cangas del Narcea › Oviedo,119.0,,,LANDA Mikel,2019
1,2019-05-04,2,Stage 2 - Soto de Ribera › Cangas del Narcea,171.1,12.0,5.0,LANDA Mikel,2019
2,2019-05-03,11,Stage 1 - Oviedo › Pola de Lena,179.2,,,LANDA Mikel,2019
3,2019-04-28,7,LiÃ¨ge-Bastogne-LiÃ¨ge (1.UWT),256.0,80.0,150.0,LANDA Mikel,2019
4,2019-04-13,7,Stage 6 - Eibar › Eibar,118.2,4.0,,LANDA Mikel,2019


In [500]:
data.to_pickle('rider_results.pkl')