In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
from pandas import Series, DataFrame
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import timedelta  

In [4]:
# Request headers sent with every page fetch so the scraper looks like a
# desktop Chrome browser issuing an AJAX (XMLHttpRequest) call.
header = dict([
    ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"),
    ("X-Requested-With", "XMLHttpRequest"),
])

In [18]:
# Common URL prefix for race pages; the race slug, year and stage number
# are appended to it when a stage URL is built.
base_url = "https://www.procyclingstats.com/race/"

In [19]:
# URL slugs of the three grand tours to scrape.
tours = ['giro-d-italia', 'tour-de-france', 'vuelta-a-espana']
# Seasons 2010 through 2017 (the upper bound is exclusive), kept as strings
# because they are concatenated straight into URLs.
years = [str(season) for season in range(2010, 2018)]
# Stage numbers 1 through 21, also as strings for URL building.
stages = [str(number) for number in range(1, 22)]

In [20]:
def fix_time(time):
    '''Normalise a column of scraped time strings into datetimes.

    Parameters
    ----------
    time : iterable of str (e.g. a pandas Series or a list)
        Raw cell texts. Only the first whitespace-separated token of each
        cell is used. A token of ``',,'`` is replaced by the (already
        normalised) token of the previous row.

    Returns
    -------
    pandas.DatetimeIndex
        One entry per input cell, parsed with ``%H:%M:%S``. Cells that are
        missing, empty, or do not parse become ``NaT``.
    '''
    # Work on the argument itself (not on a global frame) so the function can
    # be called on any column. Non-string / empty cells become '' -> NaT.
    tokens = []
    for raw in time:
        parts = raw.split() if isinstance(raw, str) else []
        tokens.append(parts[0] if parts else '')

    # Pad short forms so every token has the HH:MM:SS shape.
    for n, t in enumerate(tokens):
        if t == ',,':
            # Repeat the previous row's value; the first row has no previous
            # row, so it is left unparseable (NaT) instead of wrapping around
            # to the last element.
            if n > 0:
                tokens[n] = tokens[n - 1]
        elif len(t) == 4:      # M:SS
            tokens[n] = '00:0' + t
        elif len(t) == 5:      # MM:SS
            tokens[n] = '00:' + t
        elif len(t) == 7:      # H:MM:SS
            tokens[n] = '0' + t

    return pd.to_datetime(tokens, format='%H:%M:%S', errors='coerce')

In [21]:
def race_time(time):
    '''Turn a winner time plus per-rider gaps into finishing times.

    Parameters
    ----------
    time : sequence of datetime-like values (Series, DatetimeIndex or list)
        The first element is used as the reference time; every later element
        is read as a gap whose hour/minute/second fields are added to that
        reference.

    Returns
    -------
    pandas.Series of datetime.time
        One finishing time per input element. When ``time`` is a Series its
        index is preserved, so the result can be assigned back to the frame
        the input column came from; otherwise a default index is used.
        An empty input gives an empty Series.
    '''
    # Keep the caller's index so assignment back to a filtered frame aligns.
    index = time.index if isinstance(time, Series) else None
    # Iterate positionally: label-based time[n] breaks once rows were dropped.
    stamps = list(time)
    if not stamps:
        # .dt is not available on an empty object Series, so return early.
        return Series([], index=index, dtype=object)

    winner = stamps[0]
    finishes = [winner]
    for gap in stamps[1:]:
        offset = timedelta(hours=gap.hour, minutes=gap.minute, seconds=gap.second)
        finishes.append(winner + offset)
    return Series(finishes, index=index).dt.time

In [22]:
# Scrape every stage of every tour and season; the combined result ends up in
# the DataFrame `data`.
frames = []  # one cleaned DataFrame per successfully scraped stage
#loop through years, tours and stages
for year in years:
    for tour in tours:
        for stage in stages:
            try:
                #construct urls
                url = base_url + tour + '/' + year + '/stage-' + stage
                # a timeout keeps one stalled request from hanging the run
                r = requests.get(url, headers=header, timeout=30)
                # fail on HTTP errors instead of parsing an error page
                r.raise_for_status()
                # the first table found on the page is used as the result table
                results = pd.read_html(r.text)
                results = results[0]
                #fix columns
                results.columns = results.columns.str.lower()
                #create matching fields
                results['race'] = tour
                results['year'] = year
                results['stage'] = stage
                #fix times
                results['time'] = fix_time(results['time'])
                #make sure to only include valid results; `~` inverts the
                #boolean mask, and the index is reset so positions and labels
                #agree when race_time's output is assigned back
                results = results[~results['time'].isna()].reset_index(drop=True)
                #add race times
                results['race_time'] = race_time(results['time'])
                results['time'] = results['time'].dt.time
                #collect; concatenated once after the loop
                frames.append(results)
                print('Stage %s of the %s %s collected' %(stage,tour,year))
            except Exception as exc:
                # best-effort scrape: report why the stage failed and move on
                # (KeyboardInterrupt is no longer swallowed)
                print('There was an error collecting Stage %s of the %s %s (%s: %s)'
                      % (stage, tour, year, type(exc).__name__, exc))

#build dataframe (row labels restart per stage, as with repeated append)
data = pd.concat(frames) if frames else pd.DataFrame()

Stage 1 of the giro-d-italia 2010 collected
Stage 2 of the giro-d-italia 2010 collected
Stage 3 of the giro-d-italia 2010 collected
There was an error collecting Stage 4 of the giro-d-italia 2010
Stage 5 of the giro-d-italia 2010 collected
Stage 6 of the giro-d-italia 2010 collected
Stage 7 of the giro-d-italia 2010 collected
Stage 8 of the giro-d-italia 2010 collected
Stage 9 of the giro-d-italia 2010 collected
Stage 10 of the giro-d-italia 2010 collected
Stage 11 of the giro-d-italia 2010 collected
Stage 12 of the giro-d-italia 2010 collected
Stage 13 of the giro-d-italia 2010 collected
Stage 14 of the giro-d-italia 2010 collected
Stage 15 of the giro-d-italia 2010 collected
Stage 16 of the giro-d-italia 2010 collected
Stage 17 of the giro-d-italia 2010 collected
Stage 18 of the giro-d-italia 2010 collected
Stage 19 of the giro-d-italia 2010 collected
Stage 20 of the giro-d-italia 2010 collected
Stage 21 of the giro-d-italia 2010 collected
Stage 1 of the tour-de-france 2010 collected

Stage 10 of the vuelta-a-espana 2012 collected
Stage 11 of the vuelta-a-espana 2012 collected
Stage 12 of the vuelta-a-espana 2012 collected
Stage 13 of the vuelta-a-espana 2012 collected
Stage 14 of the vuelta-a-espana 2012 collected
Stage 15 of the vuelta-a-espana 2012 collected
Stage 16 of the vuelta-a-espana 2012 collected
Stage 17 of the vuelta-a-espana 2012 collected
Stage 18 of the vuelta-a-espana 2012 collected
Stage 19 of the vuelta-a-espana 2012 collected
Stage 20 of the vuelta-a-espana 2012 collected
Stage 21 of the vuelta-a-espana 2012 collected
Stage 1 of the giro-d-italia 2013 collected
Stage 2 of the giro-d-italia 2013 collected
Stage 3 of the giro-d-italia 2013 collected
Stage 4 of the giro-d-italia 2013 collected
Stage 5 of the giro-d-italia 2013 collected
Stage 6 of the giro-d-italia 2013 collected
Stage 7 of the giro-d-italia 2013 collected
Stage 8 of the giro-d-italia 2013 collected
Stage 9 of the giro-d-italia 2013 collected
Stage 10 of the giro-d-italia 2013 colle

Stage 21 of the tour-de-france 2015 collected
There was an error collecting Stage 1 of the vuelta-a-espana 2015
Stage 2 of the vuelta-a-espana 2015 collected
Stage 3 of the vuelta-a-espana 2015 collected
Stage 4 of the vuelta-a-espana 2015 collected
Stage 5 of the vuelta-a-espana 2015 collected
Stage 6 of the vuelta-a-espana 2015 collected
Stage 7 of the vuelta-a-espana 2015 collected
Stage 8 of the vuelta-a-espana 2015 collected
Stage 9 of the vuelta-a-espana 2015 collected
Stage 10 of the vuelta-a-espana 2015 collected
Stage 11 of the vuelta-a-espana 2015 collected
Stage 12 of the vuelta-a-espana 2015 collected
Stage 13 of the vuelta-a-espana 2015 collected
Stage 14 of the vuelta-a-espana 2015 collected
Stage 15 of the vuelta-a-espana 2015 collected
Stage 16 of the vuelta-a-espana 2015 collected
Stage 17 of the vuelta-a-espana 2015 collected
Stage 18 of the vuelta-a-espana 2015 collected
Stage 19 of the vuelta-a-espana 2015 collected
Stage 20 of the vuelta-a-espana 2015 collected
Sta

In [168]:
#data.to_pickle('pcs_stage_results_raw')

In [169]:
#data.to_pickle('pcs_stage_results_sub')

In [None]:
#change saving type

In [5]:
#import pickle

In [42]:
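#data_sub = pickle.load(open('pcs_stage_results_raw','rb'))
#NOTE: this commented-out load is the only place data_sub is assigned; uncomment it (together with the pickle import above) before running the cleaning cells below.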

In [81]:
# remove crashes *: drop rows whose rank text contains a literal asterisk.
# `~` is the boolean inversion operator for masks (unary minus is not), and
# the raw string keeps `\*` a valid regex escape for a literal '*'.
# Rows with a missing rank are kept (na=False -> not matched -> not dropped).
data_sub = data_sub[~data_sub.rnk.str.contains(r'\*', na=False)]

In [133]:
# Only include ranks: keep rows whose rank text is made up of digits.
# Comparing against True also treats missing values as "not a rank".
data_sub = data_sub.loc[data_sub.rnk.str.isdigit().eq(True)]

In [144]:
# Fix formatting: store the rank column as integers.
data_sub['rnk'] = data_sub['rnk'].astype(int)

In [154]:
# Fill NAs: every missing value in every column is replaced by 0.
data_sub = data_sub.fillna(value=0)

In [155]:
# Slice data: keep only the columns used downstream, in this order.
model_data = data_sub[[
    'year', 'race', 'stage',
    'team', 'rider', 'bib', 'age',
    'rnk', 'time', 'race_time', 'pnt',
]]

In [156]:
# Display the sliced frame as this cell's output.
model_data

Unnamed: 0,year,race,stage,team,rider,bib,age,rnk,time,race_time,pnt
0,2010,giro-d-italia,1,Sky Procycling,Wiggins Bradley,171.0,30,1,00:10:18,00:10:18,80.0
1,2010,giro-d-italia,1,BMC Racing Team,Bookwalter Brent,2.0,26,2,00:00:02,00:10:20,50.0
2,2010,giro-d-italia,1,BMC Racing Team,Evans Cadel,1.0,33,3,00:00:02,00:10:20,35.0
3,2010,giro-d-italia,1,Astana,Vinokourov Alexandre,41.0,36,4,00:00:05,00:10:23,25.0
4,2010,giro-d-italia,1,Sky Procycling,Henderson Gregory,177.0,33,5,00:00:05,00:10:23,18.0
5,2010,giro-d-italia,1,Saxo Bank,Porte Richie,219.0,25,6,00:00:05,00:10:23,15.0
6,2010,giro-d-italia,1,Team Garmin - Transitions,Millar David,111.0,33,7,00:00:06,00:10:24,12.0
7,2010,giro-d-italia,1,Saxo Bank,Larsson Gustav,215.0,29,8,00:00:07,00:10:25,10.0
8,2010,giro-d-italia,1,Rabobank ProTeam,van Emden Jos,168.0,25,9,00:00:09,00:10:27,8.0
9,2010,giro-d-italia,1,Team HTC - Columbia,Pinotti Marco,182.0,34,10,00:00:09,00:10:27,6.0
