
# Web Scraping


### Rider Profile

* Name
* Age 
* Height 
* Weight 
* Nationality
* Current Team 
* Sprint 
* Climb 
* TT
* One-Day races 
* General-classification 

### Season Statistics DataFrame

* Name 
* Year 
* Points 
* Racedays 
* KMs
* Wins 

### LeaderBoard DataFrame

* Name 
* Year
* Rank (Time_Difference_&_Index)
* Points
* Team 
* Time difference 

### Race Course DataFrame

* Name
* Year
* Course length(km)
* Average speed(kph/mph)
* Course profile
* PCS_point_scale

In [2]:
import requests 
import lxml
import csv
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import datetime as dt
import pprint
import sqlite3
pd.__version__

'1.1.3'

In [3]:
def getsoup(url, timeout=30):
    """Download ``url`` and return the response body parsed with lxml.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single stalled connection would block the
        whole scraping run indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        Document built from ``response.text``. The HTTP status code is not
        checked, so an error page is parsed like any other page.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')

In [4]:
# Notebook display settings: wide columns and up to 100 rows per frame.
pd.options.display.max_colwidth = 255
pd.options.display.max_rows = 100
pd.options.display.min_rows = 100

# Rider_URLs
Rider profile pages contain useful information about each athlete. In order to automate the process of extracting the necessary data for every rider, a list of rider profile web addresses (URLs) needs to be created.  

In [30]:
def get_rider_urls(race_name, year, years_back):
    """Collect rider profile URLs from a race's result pages.

    One result page is fetched per season, starting at ``year`` and going
    back ``years_back`` seasons (both ends included).

    Parameters
    ----------
    race_name : str
        Race identifier as used in the procyclingstats URL path.
    year : int or str
        Most recent season to scrape.
    years_back : int or str
        Number of earlier seasons to scrape in addition to ``year``.

    Returns
    -------
    list of list of str
        One inner list per season, newest season first. Each inner list
        holds the absolute URL built from the first link of every table
        row after the first. A season whose page has no results table
        contributes an empty list.
    """
    # Coerce before doing arithmetic so string inputs work, and keep the
    # loop variable distinct from the ``year`` parameter.
    newest = int(year)
    oldest = newest - int(years_back)

    urls = []

    for season in range(newest, oldest - 1, -1):

        race_url = 'https://www.procyclingstats.com/race/' + str(race_name) + '/' + str(season) + '/result'
        soup = getsoup(race_url)

        race_table_tag = soup.find('table', class_='basic results moblist10')
        if race_table_tag is None:
            # Page without the expected results table: keep an empty slot
            # for this season instead of failing on ``None``.
            urls.append([])
            continue

        season_urls = []

        # The first row is skipped (presumably the table header).
        for row in race_table_tag.find_all('tr')[1:]:
            a_tag = row.find('a')
            if a_tag is None or not a_tag.get('href'):
                # Row without a usable link: nothing to collect.
                continue
            season_urls.append('https://www.procyclingstats.com/' + a_tag['href'])

        urls.append(season_urls)

    return urls
    
    

# Races scraped from 2020 going back 13 further seasons.
races = [
    'gent-wevelgem',
    'strade-bianche',
    'milano-sanremo',
    'ronde-van-vlaanderen',
    'paris-roubaix',
    'omloop-het-nieuwsblad',
    'e3-harelbeke',
]

# Races scraped for the 2021 season only.
races_21 = [
    'gent-wevelgem',
    'strade-bianche',
    'milano-sanremo',
    'ronde-van-vlaanderen',
    'omloop-het-nieuwsblad',
    'e3-harelbeke',
]

# Nesting of the scraped result: race -> season -> rider URL.
url_list = [get_rider_urls(race_slug, 2020, 13) for race_slug in races]
flat_url = [
    rider_url
    for per_race in url_list
    for per_season in per_race
    for rider_url in per_season
]

# NOTE: from here on ``races_21`` holds the scraped URLs, not race names.
races_21 = [get_rider_urls(race_slug, 2021, 0) for race_slug in races_21]
flat_21 = [
    rider_url
    for per_race in races_21
    for per_season in per_race
    for rider_url in per_season
]

flat_url = flat_url + flat_21

# A set drops riders that appear in more than one race or season.
unique_URLs = pd.Series(list(set(flat_url)))

unique_URLs.to_csv(
    r'C:\\Users\\User\\Documents\\DataScience_Projects\RiderProfile_URLs.csv',
    index=False,
    header=True,
)
                                

In [1]:
# Load the saved rider profile URLs as a plain list of strings.
# Iterating a DataFrame yields its column labels rather than its rows, so
# the ``for rider in URLs`` loops need the values of column '0' (the
# column this file also reads when building the season-statistics URLs).
URLs = list(pd.read_csv('RiderProfile_URLs.csv')['0'])

# Rider Profile Data

Data Includes: Height and Weight, Nationality, Age, Name
 


In [21]:
# Scrape name, weight, height and nationality from every rider profile page.
# Each list receives exactly one entry per rider; the string 'NaN' marks a
# value whose position is missing on the page.
name = []
height = []
weight = []
nationality = []


for rider in URLs:

    soup = getsoup(rider)

    # Text of every direct child of the 'main' div; index 2 is used as name.
    main = [i.text for i in soup.find('div', class_='main')]
    name.append(main[2])

    rdr_info_tag = soup.find('div', class_='rdr-info-cont')
    info_list = list(rdr_info_tag)
    rit_element = rdr_info_tag.find_all('span')

    # Children of the second <span> inside the info container.
    span_list = list(rit_element[1])

    # str() turns the bs4 node into an ordinary string. Storing the node
    # itself would keep a reference to the whole parse tree of every page
    # alive for as long as the result lists exist.
    try:
        weight.append(str(span_list[1]))
    except IndexError:
        weight.append('NaN')

    try:
        span_tag = list(span_list[2])
        height.append(str(span_tag[1]))
    except IndexError:
        height.append('NaN')

    try:
        nationality.append(info_list[8].text)
    except IndexError:
        nationality.append('NaN')

In [19]:
# Speciality values read from the first <ul class="basic"> of each profile
# page, followed by assembly and export of the combined profile table.
climber = []
gc = []
tt = []
sprint = []
one_day_races = []

for rider in URLs:

    soup = getsoup(rider)

    ul_children = list(soup.find('ul', class_='basic'))

    # Every second child is read: positions 0, 2, 4, 6 and 8.
    for scores, position in (
        (one_day_races, 0),
        (gc, 2),
        (tt, 4),
        (sprint, 6),
        (climber, 8),
    ):
        scores.append(ul_children[position].text)


Profile = pd.DataFrame(
    {
        'Name': name,
        'Height': height,
        'Weight': weight,
        'Nationality': nationality,
        'Climbing': climber,
        'General_Classification': gc,
        'Time_Trial': tt,
        'Sprint': sprint,
        'One_Day_Races': one_day_races,
    }
)


Profile.to_csv(
    r'C:\\Users\\User\\Documents\\DataScience_Projects\Profile_Data2.csv',
    index=False,
    header=True,
)

# Get Season Statistics Page URLs

In [32]:
# Rider profile URLs from column '0' of the saved CSV, each extended with
# the path of the rider's season-statistics page.
URLs = list(pd.read_csv('RiderProfile_URLs.csv')['0'])
season_urls = [
    profile_url + '/statistics/season-statistics' for profile_url in URLs
]

In [33]:
len(season_urls)

2083

# Season Statistics Table

Season year,
Points,
Racedays,
KMs,
Wins,
Top-10s


In [34]:
# Per-season statistics for every rider, flattened into parallel lists
# (one entry per rider and season) and exported as a single table.
Season = []
Points = []
Racedays = []
KMs_Rode = []
Wins = []
Top_10s = []
Rider_Name = []


for rider in season_urls:

    soup = getsoup(rider)
    table = soup.find('table', class_='basic').find_all('tr')

    # Every child of every row except the first and the last row is read
    # as an integer, then regrouped into chunks of six values.
    stat_list = [
        int(stat.text)
        for season in table[1:-1]
        for stat in season
    ]
    stat_by_season = [
        stat_list[start:start + 6] for start in range(0, len(stat_list), 6)
    ]

    for year in stat_by_season:

        # Chunk positions 0..5 feed these lists in this order.
        for position, column in enumerate(
            (Season, Points, Racedays, KMs_Rode, Wins, Top_10s)
        ):
            column.append(year[position])

        # The rider name is derived from the URL: strip prefix and suffix,
        # then turn the dashes into spaces.
        name = rider.replace('https://www.procyclingstats.com/rider/', '')
        name = name.replace('/statistics/season-statistics', '')
        Rider_Name.append(name.replace('-', ' '))

SeasonStats = pd.DataFrame(
    {
        'Rider_Name': Rider_Name,
        'Season': Season,
        'Points': Points,
        'Racedays': Racedays,
        'KMs_Rode': KMs_Rode,
        'Wins': Wins,
        'Top_10s': Top_10s,
    }
)

SeasonStats.to_csv(
    r'C:\\Users\\User\\Documents\\DataScience_Projects\XG_Boost\SeasonStats.csv',
    index=False,
    header=True,
)

# Get Race_URLs


In [8]:
def get_race_URL(race_name, year, years_back):
    """Append one result-page URL per season to the global ``race_URLs``.

    URLs are generated from ``year`` down to ``year - years_back`` (both
    ends included) and appended, newest first, to the module-level list
    ``race_URLs``, which must exist before the function is called.

    Parameters
    ----------
    race_name : str
        Race identifier as used in the procyclingstats URL path.
    year : int or str
        Most recent season.
    years_back : int or str
        Number of earlier seasons to include in addition to ``year``.

    Returns
    -------
    list of str
        The same module-level ``race_URLs`` list, after the new URLs have
        been appended to it.
    """
    # Coerce before doing arithmetic so string inputs work, and keep the
    # loop variable distinct from the ``year`` parameter.
    newest = int(year)
    oldest = newest - int(years_back)

    for season in range(newest, oldest - 1, -1):
        url = 'https://www.procyclingstats.com/race/' + str(race_name) + '/' + str(season) + '/result/result'
        race_URLs.append(url)

    return race_URLs


# Races for which result-page URLs are generated.
Race_Names = [
    'gent-wevelgem',
    'strade-bianche',
    'milano-sanremo',
    'ronde-van-vlaanderen',
    'paris-roubaix',
    'omloop-het-nieuwsblad',
    'e3-harelbeke',
]


# get_race_URL appends to this list, so it has to exist before the calls.
race_URLs = []


for race in Race_Names:
    get_race_URL(race, 2021, 14)

# The 2021 Paris-Roubaix page is excluded from the list
# (presumably no result was available when this was written - confirm).
race_URLs = [
    result_url
    for result_url in race_URLs
    if result_url != 'https://www.procyclingstats.com/race/paris-roubaix/2021/result/result'
]
# Only the 2021 Ronde van Vlaanderen result page(s).
Flander_21_URLs = [
    result_url
    for result_url in race_URLs
    if result_url == 'https://www.procyclingstats.com/race/ronde-van-vlaanderen/2021/result/result'
]

# Race LeaderBoard Data 


1) Name #done#
2) Rank
3) Team #done#
4) Time difference #done#


In [32]:
# One leaderboard entry per result-table row of every race page, collected
# in parallel lists and exported as a single table.
race_name = []
year = []
age = []
rank = []
team = []
name = []
time = []


for race in list(race_URLs):

    soup = getsoup(race)

    table_element = soup.find(
        'table', class_='basic results moblist10'
    ).find_all('tr')

    # Element 4 of the split URL is the race identifier in the path.
    url_split = race.split('/')

    # The first table row is skipped.
    for i in table_element[1:]:

        cells = i.find_all('td')

        race_name.append(url_split[4])
        year.append(soup.find('span', class_='hideIfMobile').text)
        age.append(cells[3].text)
        rank.append(cells[0].text)
        team.append(cells[4].text)
        # First double-quoted value inside the markup of the row's first link.
        name.append(str(i.find('a')).split('"')[1])
        # Stored as the list of matching <div class="hide"> elements.
        time.append(i.find_all('div', class_="hide"))


LeaderBoard = pd.DataFrame(
    {
        'Race_Name': race_name,
        'Name': name,
        'Season': year,
        'Age': age,
        'Rank': rank,
        'Team_Name': team,
        'Finishing_Time': time,
    }
)

LeaderBoard.to_csv(
    r'C:\\Users\\User\\Documents\\DataScience_Projects\LeaderBoard_Data.csv',
    index=False,
    header=True,
)

# Race Course Data

1) Name
2) Date
3) Course length (km)
4) Average speed (km/h)
5) Course profile
6) PCS_point_scale


In [191]:
# Course information for every race result page: one entry per page in
# each list, exported as a single table.
# ``name`` is rebuilt here, one value per race. Reusing the per-rider list
# left over from the leaderboard scrape would give the 'Race_Name' column a
# different length from the other columns and make the DataFrame
# constructor fail.
name = []
date = []
avg_speed = []
course_profile = []
distance = []
ranking = []


for race in race_URLs[:]:

    soup = getsoup(race)

    info_list_tag = soup.find('ul', class_='infolist')

    info_list = list(info_list_tag)

    # Element 4 of the split URL is the race identifier in the path.
    name.append(race.split('/')[4])

    # Positions of the wanted entries among the children of the info list.
    date.append(info_list[0].text)
    avg_speed.append(info_list[2].text)
    course_profile.append(info_list[8].text)
    distance.append(info_list[14].text)
    ranking.append(info_list[16].text)


Race_Course = pd.DataFrame({'Race_Name': name, 'Date': date, 'Average_Speed': avg_speed,
'Course_Profile': course_profile, 'Distance': distance, 'Ranking': ranking})

Race_Course.to_csv(r'C:\\Users\\User\\Documents\\DataScience_Projects\Race_Course.csv', index = False, header=True)