In [48]:
import warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the session, so any deprecation or chained-assignment warnings that later
# cells might trigger will not be shown.
warnings.filterwarnings('ignore')

In [49]:
import pandas as pd
from pandas import Series, DataFrame
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import timedelta
import numpy as np

In [50]:
# Request headers sent with each page fetch: a desktop Chrome user agent plus
# the marker header that AJAX calls carry, so the scraper looks like a browser.
header = dict([
    ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"),
    ("X-Requested-With", "XMLHttpRequest"),
])

In [51]:
# common prefix of every race page URL on procyclingstats
base_url = "https://www.procyclingstats.com/race/"

In [52]:
# URL slugs of the three grand tours
tours = ['giro-d-italia', 'tour-de-france', 'vuelta-a-espana']
# seasons to collect (2014 through 2017), stored as strings for URL building
years = [str(season) for season in range(2014, 2018)]
# stage numbers 1 through 21, likewise stored as strings
stages = [str(number) for number in range(1, 22)]

In [53]:
def fix_time(time):
    '''Normalise a column of scraped time strings into timestamps.

    Parameters
    ----------
    time : iterable of str
        Raw cells of the scraped ``time`` column (a Series or a list).
        Only the first whitespace-separated token of each cell is used.
        Cells that are not strings (e.g. NaN/None) are treated as missing.

    Returns
    -------
    pandas.DatetimeIndex
        One entry per input cell, parsed with ``%H:%M:%S``. Anything that
        cannot be parsed becomes ``NaT``.

    Notes
    -----
    The previous version ignored its argument and always read the global
    ``results.time``; the values are now taken from ``time`` itself.
    '''
    #remove duplicates: keep only the first token of every cell
    tokens = []
    for raw in time:
        parts = raw.split() if isinstance(raw, str) else []
        tokens.append(parts[0] if parts else None)
    #turn into time:
    for n, t in enumerate(tokens):
        if t is None:
            continue
        if t == ',,':
            # ditto mark: reuse the (already normalised) value of the row
            # above; a ditto on the very first row has nothing to copy from
            tokens[n] = tokens[n-1] if n > 0 else None
        elif len(t) == 4:
            # M:SS -> 00:0M:SS
            tokens[n] = '00:0' + t
        elif len(t) == 5:
            # MM:SS -> 00:MM:SS
            tokens[n] = '00:' + t
        elif len(t) == 7:
            # H:MM:SS -> 0H:MM:SS
            tokens[n] = '0' + t
    return pd.to_datetime(tokens, format='%H:%M:%S', errors='coerce')

In [54]:
def race_time(time):
    '''Turn a winner time followed by gaps into absolute finishing times.

    Parameters
    ----------
    time : sequence of timestamps
        First element is used as the reference (winner) time. Every later
        element is either equal to the reference, or is read as a gap whose
        hour/minute/second components are added to the reference.

    Returns
    -------
    list
        One timestamp per input element; empty list for empty input.

    Notes
    -----
    Elements are accessed by position. The input is copied into a list
    first, so a Series whose index has gaps (e.g. after rows were filtered
    out) no longer raises ``KeyError`` or picks the wrong row.
    '''
    times = list(time)
    if not times:
        return []
    winner = times[0]
    finish_times = [winner]
    for gap in times[1:]:
        if gap == winner:
            # same value as the reference row: finishing time is the reference
            finish_times.append(winner)
        else:
            offset = timedelta(hours=gap.hour, minutes=gap.minute, seconds=gap.second)
            finish_times.append(winner + offset)
    return finish_times

In [55]:
def time_delta(time):
    '''Return the time gap between each rank and the rank directly above it.

    Parameters
    ----------
    time : sequence of timestamps
        Absolute finishing times in rank order.

    Returns
    -------
    list
        One timestamp per input element. Each is the element minus the
        hour/minute/second components of the previous element, so its
        time-of-day part is the gap to the previous rank. The first element
        has nobody ahead of it and therefore always yields a zero gap.
        Empty input yields an empty list.

    Notes
    -----
    The previous version gave the first rank the gap of the second rank
    (counting that gap twice) and failed on a single-row input. Elements are
    now accessed by position, so a Series with a non-contiguous index works.
    '''
    times = list(time)
    time_diff = []
    for n, current in enumerate(times):
        # the leader is compared with itself, which produces a zero gap
        previous = times[n - 1] if n > 0 else current
        offset = timedelta(hours=previous.hour,
                           minutes=previous.minute,
                           seconds=previous.second)
        time_diff.append(current - offset)
    return time_diff

In [56]:
#construct urls
url = base_url + tours[0] +'/'+ years[0] + '/stage-' + stages[0]
# a timeout stops a stalled connection from hanging the kernel, and
# raise_for_status turns an HTTP error page into an explicit exception
r = requests.get(url, headers=header, timeout=30)
r.raise_for_status()
results = pd.read_html(r.text)
# the first table on the page is used as the result list
results = results[0]
#fix columns
results.columns = results.columns.str.lower()
#create matching fields
results['race'] = tours[0]
results['year'] = years[0]
results['stage'] = stages[0]
#fix times
results['time'] = fix_time(results['time'])
#make sure to only include valid results
# notna() is the explicit form of "not isna()"; the index is renumbered so the
# positional lookups done by race_time/time_delta match the row labels, and
# the result is a fresh frame rather than a slice of the scraped one
results = results[results['time'].notna()].reset_index(drop=True)
#add race times
results['race_time'] = race_time(results['time'])
#add time diff
results['time_diff'] = time_delta(results['race_time'])
#drop time
results = results.drop('time', axis=1)
#format times: keep only the time-of-day part
results['race_time'] = results['race_time'].dt.time
results['time_diff'] = results['time_diff'].dt.time

results.head(5)

Unnamed: 0,rnk,bib,rider,age,team,gc,gc-time,race,year,stage,race_time,time_diff
0,1,158,Tuft Svein,37,Orica GreenEDGE,1,+0:00,giro-d-italia,2014,1,00:24:42,00:00:00
1,2,152,Durbridge Luke,23,Orica GreenEDGE,2,+0:00,giro-d-italia,2014,1,00:24:42,00:00:00
2,3,159,Weening Pieter,33,Orica GreenEDGE,3,+0:00,giro-d-italia,2014,1,00:24:42,00:00:00
3,4,156,Meyer Cameron,26,Orica GreenEDGE,4,+0:00,giro-d-italia,2014,1,00:24:42,00:00:00
4,5,155,Matthews Michael,23,Orica GreenEDGE,5,+0:00,giro-d-italia,2014,1,00:24:42,00:00:00


In [57]:
data = pd.DataFrame()
# per-stage frames are gathered here and concatenated once after the loop,
# instead of growing `data` with DataFrame.append on every iteration
stage_frames = []
#loop through years, tours and stages
for year in years:
    for tour in tours:
        for stage in stages:
            try:
                #construct urls
                url = base_url + tour +'/'+ year + '/stage-' + stage
                # timeout: do not let one stalled request block the whole run
                r = requests.get(url, headers=header, timeout=30)
                r.raise_for_status()
                # NOTE: the name `results` is kept at module level on purpose;
                # it is the frame being processed for the current stage
                results = pd.read_html(r.text)
                results = results[0]
                #fix columns
                results.columns = results.columns.str.lower()
                #create matching fields
                results['race'] = tour
                results['year'] = year
                results['stage'] = stage
                #fix times
                results['time'] = fix_time(results['time'])
                #make sure to only include valid results
                # renumber rows so positional lookups in race_time/time_delta
                # line up with the labels after invalid rows were dropped
                results = results[results['time'].notna()].reset_index(drop=True)
                #add race times
                results['race_time'] = race_time(results['time'])
                #add time diff
                results['time_diff'] = time_delta(results['race_time'])
                #drop time
                results = results.drop('time', axis=1)
                #format times
                results['race_time'] = results['race_time'].dt.time
                results['time_diff'] = results['time_diff'].dt.time
                stage_frames.append(results)
                print('Stage %s of the %s %s collected' %(stage,tour,year))
            except Exception as exc:
                # best effort: report the stage and the reason, then carry on.
                # Catching Exception (not a bare except) keeps Ctrl-C / kernel
                # interrupt working during the long scrape.
                print('There was an error collecting Stage %s of the %s %s: %r' %(stage,tour,year,exc))

#build dataframe
if stage_frames:
    data = pd.concat(stage_frames)

Stage 1 of the giro-d-italia 2014 collected
Stage 2 of the giro-d-italia 2014 collected
Stage 3 of the giro-d-italia 2014 collected
Stage 4 of the giro-d-italia 2014 collected
Stage 5 of the giro-d-italia 2014 collected
There was an error collecting Stage 6 of the giro-d-italia 2014
Stage 7 of the giro-d-italia 2014 collected
Stage 8 of the giro-d-italia 2014 collected
There was an error collecting Stage 9 of the giro-d-italia 2014
Stage 10 of the giro-d-italia 2014 collected
Stage 11 of the giro-d-italia 2014 collected
Stage 12 of the giro-d-italia 2014 collected
Stage 13 of the giro-d-italia 2014 collected
Stage 14 of the giro-d-italia 2014 collected
Stage 15 of the giro-d-italia 2014 collected
Stage 16 of the giro-d-italia 2014 collected
Stage 17 of the giro-d-italia 2014 collected
Stage 18 of the giro-d-italia 2014 collected
Stage 19 of the giro-d-italia 2014 collected
Stage 20 of the giro-d-italia 2014 collected
Stage 21 of the giro-d-italia 2014 collected
Stage 1 of the tour-de-f

Stage 11 of the vuelta-a-espana 2016 collected
There was an error collecting Stage 12 of the vuelta-a-espana 2016
Stage 13 of the vuelta-a-espana 2016 collected
Stage 14 of the vuelta-a-espana 2016 collected
Stage 15 of the vuelta-a-espana 2016 collected
Stage 16 of the vuelta-a-espana 2016 collected
Stage 17 of the vuelta-a-espana 2016 collected
Stage 18 of the vuelta-a-espana 2016 collected
Stage 19 of the vuelta-a-espana 2016 collected
Stage 20 of the vuelta-a-espana 2016 collected
Stage 21 of the vuelta-a-espana 2016 collected
Stage 1 of the giro-d-italia 2017 collected
Stage 2 of the giro-d-italia 2017 collected
Stage 3 of the giro-d-italia 2017 collected
Stage 4 of the giro-d-italia 2017 collected
Stage 5 of the giro-d-italia 2017 collected
Stage 6 of the giro-d-italia 2017 collected
Stage 7 of the giro-d-italia 2017 collected
Stage 8 of the giro-d-italia 2017 collected
Stage 9 of the giro-d-italia 2017 collected
Stage 10 of the giro-d-italia 2017 collected
Stage 11 of the giro-d

In [58]:
# write the scraped frame to disk under key 'a'
data.to_hdf('stage_results_20142017_raw.h5', key='a')

In [59]:
# test: read the stored frame back from the HDF5 file
df = pd.read_hdf('stage_results_20142017_raw.h5', key='a')

In [60]:
# show the first ten rows of the reloaded frame
df.head(n=10)

Unnamed: 0,age,avg,bib,gc,gc-time,pnt,race,race_time,rider,rnk,stage,team,time_diff,uci,uci.1,year
0,37,,158,1,+0:00,,giro-d-italia,00:24:42,Tuft Svein,1,1,Orica GreenEDGE,00:00:00,,,2014
1,23,,152,2,+0:00,,giro-d-italia,00:24:42,Durbridge Luke,2,1,Orica GreenEDGE,00:00:00,,,2014
2,33,,159,3,+0:00,,giro-d-italia,00:24:42,Weening Pieter,3,1,Orica GreenEDGE,00:00:00,,,2014
3,26,,156,4,+0:00,,giro-d-italia,00:24:42,Meyer Cameron,4,1,Orica GreenEDGE,00:00:00,,,2014
4,23,,155,5,+0:00,,giro-d-italia,00:24:42,Matthews Michael,5,1,Orica GreenEDGE,00:00:00,,,2014
5,30,,151,6,+0:00,,giro-d-italia,00:24:42,Santaromita Ivan,6,1,Orica GreenEDGE,00:00:00,,,2014
6,25,,148,7,+0:05,,giro-d-italia,00:24:47,Serry Pieter,7,1,Omega Pharma - Quick-Step,00:00:05,,,2014
7,26,,142,8,+0:05,,giro-d-italia,00:24:47,Brambilla Gianluca,8,1,Omega Pharma - Quick-Step,00:00:00,,,2014
8,27,,141,9,+0:05,,giro-d-italia,00:24:47,Uran Rigoberto,9,1,Omega Pharma - Quick-Step,00:00:00,,,2014
9,30,,145,10,+0:05,,giro-d-italia,00:24:47,Pauwels Serge,10,1,Omega Pharma - Quick-Step,00:00:00,,,2014


In [61]:
# remove crashes and cheats: drop rows whose rank contains an asterisk.
# The pattern is a raw string because '\*' is not a valid escape in a normal
# string literal, and '~' is the boolean NOT operator for a mask.
data = data[~data.rnk.str.contains(r'\*', na=False)]

In [62]:
# keep every row whose isdigit() result is not False: ranks made of digits
# pass, and so do rows where isdigit() gives a missing value
data = data.loc[data.rnk.str.isdigit().ne(False)]

In [63]:
# fix formatting: store the rank column as integers
data['rnk'] = data['rnk'].astype(int)

In [64]:
# replace every missing value with 0
data = data.fillna(value=0)

In [65]:
# keep only the columns used for modelling, in this order
model_data = data.loc[:, [
    'year',
    'race',
    'stage',
    'team',
    'rider',
    'bib',
    'age',
    'rnk',
    'pnt',
    'race_time',
    'time_diff',
]]

In [66]:
# show the first ten rows of the modelling frame
model_data.head(n=10)

Unnamed: 0,year,race,stage,team,rider,bib,age,rnk,pnt,race_time,time_diff
0,2014,giro-d-italia,1,Orica GreenEDGE,Tuft Svein,158,37,1,0.0,00:24:42,00:00:00
1,2014,giro-d-italia,1,Orica GreenEDGE,Durbridge Luke,152,23,2,0.0,00:24:42,00:00:00
2,2014,giro-d-italia,1,Orica GreenEDGE,Weening Pieter,159,33,3,0.0,00:24:42,00:00:00
3,2014,giro-d-italia,1,Orica GreenEDGE,Meyer Cameron,156,26,4,0.0,00:24:42,00:00:00
4,2014,giro-d-italia,1,Orica GreenEDGE,Matthews Michael,155,23,5,0.0,00:24:42,00:00:00
5,2014,giro-d-italia,1,Orica GreenEDGE,Santaromita Ivan,151,30,6,0.0,00:24:42,00:00:00
6,2014,giro-d-italia,1,Omega Pharma - Quick-Step,Serry Pieter,148,25,7,0.0,00:24:47,00:00:05
7,2014,giro-d-italia,1,Omega Pharma - Quick-Step,Brambilla Gianluca,142,26,8,0.0,00:24:47,00:00:00
8,2014,giro-d-italia,1,Omega Pharma - Quick-Step,Uran Rigoberto,141,27,9,0.0,00:24:47,00:00:00
9,2014,giro-d-italia,1,Omega Pharma - Quick-Step,Pauwels Serge,145,30,10,0.0,00:24:47,00:00:00


In [67]:
# write the modelling frame to disk under key 'a'
model_data.to_hdf('stage_results_20142017.h5', key='a')

In [68]:
#Feature ideas:
#Has the cyclist ridden in a tour before? Yes / No.

#Did the cyclist ride in the previous tour? Yes / No.

#Does the team matter?