In [100]:
import csv
import datetime
import io
import os
import re
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

# Scraping

### Path constants

In [87]:
# Output locations, all relative to the notebook's working directory.
# Root folder; also holds rider_names.csv.
BASE_PATH = '../data/pcs-scraping'
# Per-rider season results: <rider_name>/<year>.csv
RESULTS_PATH = '../data/pcs-scraping/results/rider'
# Per-rider ranking table: <rider_name>/pcs_ranking.csv
RANKINGS_PATH = '../data/pcs-scraping/pcs-rankings/rider'
# Per-rider team history: <rider_name>/teams.csv
TEAMS_PATH = '../data/pcs-scraping/teams/rider'
# Race calendars: <year>.csv
CALENDARS_PATH = '../data/pcs-scraping/calendars'
# Per-race startlists and KPI stats: <race>/<year>/startlist.csv, <race>/<year>/stats-kpis.csv
STARTLISTS_PATH = '../data/pcs-scraping/startlists'
# Per-race result tables: <race>/<year>/<table name>.csv
RACERESULTS_PATH = '../data/pcs-scraping/race_results'
# Per-rider profile pictures: <rider_name>/<rider_name>-img.jpeg
IMG_PATH = '../data/pcs-scraping/img/rider'
# Per-rider KPI snapshots: <rider_name>/stats-kpis-<date>.csv
RIDERSTATS_PATH = '../data/pcs-scraping/rider_stats/rider'

### Scraping functions

In [65]:
def get_rider_img(rider_name):
    """Download a rider's profile picture from procyclingstats.com.

    The rider page is fetched, the first ``<img>`` whose ``src`` matches
    ``images.*\\.jpeg`` is downloaded and written to
    ``<IMG_PATH>/<rider_name>/<rider_name>-img.jpeg``.

    Best effort: any failure (HTTP error, no matching image, I/O error) is
    printed as ``<rider_name> --- <error>`` and the function returns normally.

    Parameters
    ----------
    rider_name : str
        Rider slug as used in the site URL (e.g. ``'tadej-pogacar'``).

    Returns
    -------
    None
    """
    try:
        url = f'https://www.procyclingstats.com/rider/{rider_name}'
        res = requests.get(url, timeout=30)
        res.raise_for_status()

        soup = BeautifulSoup(res.content, 'html.parser')
        img_tags = soup.find_all('img', {'src': re.compile(r'images.*\.jpeg')})

        # urljoin instead of os.path.join: URLs always use '/', and a
        # root-relative src ('/images/...') must not discard the host.
        img_url = urljoin('https://www.procyclingstats.com/', img_tags[0]['src'])
        img_res = requests.get(img_url, timeout=30)
        # Without this check an HTML error page would be saved as a .jpeg.
        img_res.raise_for_status()

        # Create the folder only once there is an image to put in it, so
        # failed riders do not leave empty directories behind.
        rider_dir = os.path.join(IMG_PATH, rider_name)
        os.makedirs(rider_dir, exist_ok=True)

        with open(os.path.join(rider_dir, f'{rider_name}-img.jpeg'), 'wb') as file:
            file.write(img_res.content)

    except Exception as e:
        print(rider_name, '---', e)

    finally:
        # Throttle after every attempt, successful or not.
        time.sleep(0.5)

In [88]:
def get_rider_names(n_pages):
    """Scrape the first ``n_pages`` pages of the UCI individual ranking.

    The ranking snapshot date (2022-01-12) is hard-coded in the URL. Pages are
    addressed through an ``offset`` query parameter that advances in steps of
    100, from 0 up to 3000, so at most 31 pages are fetched.

    Parameters
    ----------
    n_pages : int
        Number of ranking pages to fetch, starting at offset 0.

    Returns
    -------
    list of pandas.DataFrame
        The first HTML table of each fetched page, in page order.

    Raises
    ------
    requests.RequestException
        If a page cannot be fetched or returns an HTTP error status.
    """
    ranking_pages = []

    for offset in range(0, 3001, 100)[:n_pages]:

        url = f'https://www.procyclingstats.com/rankings.php?date=2022-01-12&nation=&age=&zage=&page=smallerorequal&team=&offset={offset}&continent=&teamlevel=&filter=Filter&p=me&s=uci-individual'
        res = requests.get(url, timeout=30)
        res.raise_for_status()

        # Wrap the payload in a file-like object: passing literal HTML
        # directly to read_html is deprecated in recent pandas releases.
        tables = pd.read_html(io.BytesIO(res.content))
        ranking_pages.append(tables[0])

        time.sleep(0.5)

    return ranking_pages

def normalize_rider_name(rider_name):
    """Turn a ranking-table rider name into a lowercase, hyphenated slug.

    The last whitespace-separated token is moved to the front and the
    remaining tokens follow in their original order, e.g.
    ``'Pogacar Tadej' -> 'tadej-pogacar'``.

    Only token order and case are changed. Accents and apostrophes are kept,
    and only the last token is moved, so some slugs do not match the site's
    URL (see the manually corrected name lists further down this notebook).

    Parameters
    ----------
    rider_name : str
        Name as it appears in the ranking table.

    Returns
    -------
    str
        The slug; ``''`` if the name contains no tokens.
    """
    # split() without arguments ignores leading, trailing and repeated
    # whitespace, so no empty tokens (and no stray hyphens) are produced.
    tokens = rider_name.lower().split()
    if not tokens:
        return ''

    # Joining a single list avoids the trailing '-' that a one-token name
    # would otherwise get.
    return '-'.join([tokens[-1]] + tokens[:-1])

def clean_pcs_table_results(df_table):
    """Tidy a scraped season-results table in place and return it.

    Drops the columns labelled 'Unnamed: 3' and 'Unnamed: 8' and renames
    'Unnamed: 2' to 'GC'. The frame that was passed in is modified and is
    also the return value.
    """
    for unused_column in ('Unnamed: 3', 'Unnamed: 8'):
        df_table.drop(columns=unused_column, inplace=True)

    gc_label = {'Unnamed: 2': 'GC'}
    df_table.rename(columns=gc_label, inplace=True)

    return df_table

def clean_pcs_table_ranking(df_table):
    """Rename the column labelled 'Unnamed: 0' to 'year', in place.

    The frame that was passed in is modified and returned.
    """
    year_label = {'Unnamed: 0': 'year'}
    df_table.rename(columns=year_label, inplace=True)
    return df_table

def get_rider_stats(rider_name, years):
    """Scrape a rider's per-season results and ranking table.

    For each year the rider page ``/rider/<rider_name>/<year>`` is fetched.
    The code treats the first table on the page as the season results and
    the second as the ranking table. Years that fail are reported and
    skipped.

    NOTE: a later cell of this notebook defines another ``get_rider_stats``
    with a ``(race, year)`` signature, which shadows this one once run.

    Parameters
    ----------
    rider_name : str
        Rider slug as used in the site URL.
    years : iterable
        Seasons to fetch.

    Returns
    -------
    (list of (year, pandas.DataFrame), pandas.DataFrame)
        Non-empty season result tables paired with the year they were
        scraped for, and the ranking table from the first successful year.

    Raises
    ------
    IndexError
        If no year could be scraped.
    """
    pcs_ranking = []
    results = []

    for year in years:

        try:
            url = f'https://www.procyclingstats.com/rider/{rider_name}/{year}'
            res = requests.get(url, timeout=30)
            res.raise_for_status()

            # Wrap the payload in a file-like object: passing literal HTML
            # directly to read_html is deprecated in recent pandas releases.
            tables = pd.read_html(io.BytesIO(res.content))
            ranking_table = clean_pcs_table_ranking(tables[1])
            results_table = clean_pcs_table_results(tables[0])

        except Exception as e:
            print(rider_name, year, '---', e)

        else:
            pcs_ranking.append(ranking_table)
            # Keep the year next to its table. Zipping `years` with the list
            # of tables afterwards shifts the labels after any failed year.
            results.append((year, results_table))

        finally:
            # Throttle after every attempt, successful or not.
            time.sleep(0.5)

    df_results = [(year, table) for year, table in results if not table.empty]

    if not pcs_ranking:
        raise IndexError(f'no ranking table could be scraped for {rider_name!r}')

    return df_results, pcs_ranking[0]

def init_dirs():
    """Create the base folder and the per-rider output folders if missing.

    Covers BASE_PATH, RESULTS_PATH, RANKINGS_PATH, TEAMS_PATH, IMG_PATH and
    RIDERSTATS_PATH; intermediate directories are created as needed.
    """
    required_dirs = (
        BASE_PATH,
        RESULTS_PATH,
        RANKINGS_PATH,
        TEAMS_PATH,
        IMG_PATH,
        RIDERSTATS_PATH,
    )

    for directory in required_dirs:
        if os.path.isdir(directory):
            continue
        os.makedirs(directory)
        

def save_data(rider_name, results, pcs_ranking):
    """Write a rider's season results and ranking table to CSV.

    Each ``(year, table)`` pair in ``results`` goes to
    ``<RESULTS_PATH>/<rider_name>/<year>.csv`` and ``pcs_ranking`` goes to
    ``<RANKINGS_PATH>/<rider_name>/pcs_ranking.csv``. The rider folders are
    created when missing; their parent folders must already exist.

    NOTE: later cells of this notebook redefine ``save_data`` with other
    signatures, which shadow this one once run.
    """
    rider_results_dir = os.path.join(RESULTS_PATH, rider_name)
    rider_rankings_dir = os.path.join(RANKINGS_PATH, rider_name)

    # one folder per rider under each root
    for rider_dir in (rider_results_dir, rider_rankings_dir):
        if not os.path.isdir(rider_dir):
            os.mkdir(rider_dir)

    # save season results
    for season, season_table in results:
        season_table.to_csv(os.path.join(rider_results_dir, f'{season}.csv'), index=False)

    # save pcs_ranking
    pcs_ranking.to_csv(os.path.join(rider_rankings_dir, 'pcs_ranking.csv'), index=False)

### Initialize data directories

In [112]:
# Create the output folders before any scraping cell writes into them.
init_dirs()

### Get Rider Names

In [None]:
# Fetch the first 10 ranking pages; at this point `rider_names` is a list of
# DataFrames (one per page), not yet a list of names.
rider_names = get_rider_names(n_pages=10)

In [None]:
# `rider_names` holds the ranking-page DataFrames returned by
# get_rider_names(); flatten their 'Rider' columns into one list of names.
# (A plain comprehension replaces the side-effect comprehension that echoed a
# list of None as cell output.)
riders = [rider for page in rider_names for rider in page['Rider']]

# NOTE: this rebinds `rider_names` from a list of DataFrames to a list of
# slug strings, so the cell cannot be re-run without first re-running the
# get_rider_names() cell.
rider_names = [normalize_rider_name(x) for x in riders]

In [None]:
# Persist all rider slugs as a single quoted CSV row.
# newline='' is what the csv module requires for files it writes, and the
# explicit UTF-8 encoding keeps accented names intact on every platform.
with open(os.path.join(BASE_PATH, 'rider_names.csv'), 'w', newline='', encoding='utf-8') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(rider_names)

In [49]:
# rider_names.csv holds every slug on one row. Read that row with the csv
# module rather than as a pandas header, where duplicate names would be
# renamed ('name' -> 'name.1') and empty ones replaced by 'Unnamed: N'.
with open(os.path.join(BASE_PATH, 'rider_names.csv'), newline='', encoding='utf-8') as f:
    rider_names = next(csv.reader(f), [])

### Get Race Results and Rankings

- TODO: scrape all available years (including seasons before 2011).

- Riders printed as failures have a generated name that does not match their procyclingstats URL slug; these names must be corrected by hand.

In [None]:
# Requires the get_rider_stats(rider_name, years) and
# save_data(rider_name, results, pcs_ranking) definitions from the
# "Scraping functions" cell; later cells redefine both names.
years = np.arange(2000, 2023)

for rider_name in tqdm.tqdm(rider_names[:200]):

    try:
        results, pcs_ranking = get_rider_stats(rider_name, years)
        save_data(rider_name, results, pcs_ranking)
    except Exception as e:
        # Best effort: report the rider and the reason, then continue.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print(rider_name, '---', e)

In [None]:
# Hand-corrected URL slugs for riders whose generated name failed.
rider_names_corrected_1 = ['jonas-vingegaard-rasmussen', 'mikkel-honore', 'aleksey-lutsenko', 'ben-o-connor',
                         'michael-valgren-andersen', 'biniam-girmay', 'miguel-angel-lopez', 'johan-esteban-chaves',
                         'michal-kwiatkowski', 'jesus-herrada-lopez', 'odd-christian-eiking', 'magnus-cort-nielsen',
                         'daniel-felipe-martinez', 'luis-leon-sanchez', 'tobias-halland-johannessen', 'jose-manuel-diaz-gallego',
                         'juan-ayuso-pesquera']
rider_names_corrected_2 = []

years = np.arange(2000, 2023)

for rider_name in tqdm.tqdm(rider_names_corrected_1):

    try:
        results, pcs_ranking = get_rider_stats(rider_name, years)
        save_data(rider_name, results, pcs_ranking)
    except Exception as e:
        # Best effort: report the rider and the reason, then continue.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print(rider_name, '---', e)

### Get Teams

In [None]:
def get_rider_teams(rider_name):
    """Scrape the team history listed on a rider's page.

    Looks for a ``<ul>`` whose class attribute matches one of a few known
    spellings of the team list, then reads the ``season`` and ``name``
    ``<div>`` of every ``<li>`` inside it.

    Parameters
    ----------
    rider_name : str
        Rider slug as used in the site URL.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``season`` and ``team``; ``None`` (after printing
        a message) when no team list is found on the page.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched or returns an HTTP error status.
    """
    # Throttle before each request.
    time.sleep(0.5)

    url = f'https://www.procyclingstats.com/rider/{rider_name}'
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # An explicit parser avoids bs4 guessing one per installation.
    soup = BeautifulSoup(res.content, 'html.parser')

    possible_classes = ['list rdr-teams moblist moblist', 'list rdr-teams moblist', 'list rdr-teams moblist moblist ']
    candidates = (soup.find('ul', {'class': class_}) for class_ in possible_classes)
    team_list = next((ul for ul in candidates if ul is not None), None)

    if team_list is None:
        print(rider_name, 'No Teams scraped')
        return None

    rows = []
    for item in team_list.find_all('li'):
        season = item.find('div', {'class': 'season'})
        team = item.find('div', {'class': 'name'})
        # Skip entries missing either field so one odd <li> does not
        # discard the rider's whole history.
        if season is None or team is None:
            continue
        rows.append({'season': season.text, 'team': team.text})

    return pd.DataFrame(rows, columns=['season', 'team'])

def save_data(rider_name, teams):
    """Write a rider's team history to ``<TEAMS_PATH>/<rider_name>/teams.csv``.

    The rider folder is created when missing; TEAMS_PATH itself must exist.

    NOTE: this redefines the ``save_data`` used by earlier cells.
    """
    rider_dir = os.path.join(TEAMS_PATH, rider_name)

    # one folder per rider
    if not os.path.isdir(rider_dir):
        os.mkdir(rider_dir)

    target_file = os.path.join(rider_dir, 'teams.csv')
    teams.to_csv(target_file, index=False)

In [None]:
# rider_names.csv holds every slug on one row. Read that row with the csv
# module rather than as a pandas header, where duplicate names would be
# renamed ('name' -> 'name.1') and empty ones replaced by 'Unnamed: N'.
with open(os.path.join(BASE_PATH, 'rider_names.csv'), newline='', encoding='utf-8') as f:
    rider_names = next(csv.reader(f), [])

In [None]:
for rider_name in tqdm.tqdm(rider_names[:200]):

    try:
        teams = get_rider_teams(rider_name)
        # get_rider_teams returns None (and prints a message) when the page
        # has no team list; there is nothing to save in that case.
        if teams is None:
            continue
        save_data(rider_name, teams)
    except Exception as e:
        # Best effort: report the rider and the reason, then continue.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print(rider_name, '---', e)

In [None]:
# `rider_names_corrected` is never defined in this notebook; the
# hand-corrected slugs live in `rider_names_corrected_1`.
for rider_name in tqdm.tqdm(rider_names_corrected_1):

    try:
        teams = get_rider_teams(rider_name)
        # get_rider_teams returns None (and prints a message) when the page
        # has no team list; there is nothing to save in that case.
        if teams is None:
            continue
        save_data(rider_name, teams)
    except Exception as e:
        # Best effort: report the rider and the reason, then continue.
        print(rider_name, '---', e)

- Riders printed as failures have a generated name that does not match their procyclingstats URL slug; these names must be corrected by hand.

### Get race calendar

In [None]:
def get_race_calendar(years):
    """Scrape the race calendar table for each requested year.

    The first HTML table of ``races.php?year=<year>`` is taken and rows
    containing missing values are dropped. Years that fail are reported and
    skipped.

    Parameters
    ----------
    years : iterable
        Seasons to fetch.

    Returns
    -------
    list of (year, pandas.DataFrame)
        Non-empty calendars paired with the year they were scraped for.
    """
    calendar = []

    for year in years:

        try:
            url = f'https://www.procyclingstats.com/races.php?year={year}&circuit=&class=&filter=Filter'
            res = requests.get(url, timeout=30)
            res.raise_for_status()

            # Wrap the payload in a file-like object: passing literal HTML
            # directly to read_html is deprecated in recent pandas releases.
            tables = pd.read_html(io.BytesIO(res.content))

            # Keep the year next to its table. Zipping `years` with the list
            # of tables afterwards shifts the labels after any failed year.
            calendar.append((year, tables[0].dropna()))

        except Exception as e:
            print(year, '---', e)

        finally:
            # Throttle after every attempt, successful or not.
            time.sleep(0.5)

    df_calendar = [(year, cal) for year, cal in calendar if not cal.empty]

    return df_calendar

def save_data(calendar):
    """Write one ``(year, table)`` calendar pair to ``<CALENDARS_PATH>/<year>.csv``.

    CALENDARS_PATH is created when missing (its parent must already exist).

    NOTE: this redefines the ``save_data`` used by earlier cells.
    """
    calendars_dir_missing = not os.path.isdir(CALENDARS_PATH)
    if calendars_dir_missing:
        os.mkdir(CALENDARS_PATH)

    # save calendar data
    season, races = calendar[0], calendar[1]
    races.to_csv(os.path.join(CALENDARS_PATH, f'{season}.csv'), index=False)

In [None]:
years = np.arange(2010, 2023)
calendar = get_race_calendar(years)

# A plain loop instead of a list comprehension run for its side effects,
# which also echoed a list of None as the cell output.
for cal in calendar:
    save_data(cal)

### Get Race startlist + results

In [None]:
def convert_racename_to_url(racename):
    """Build a URL slug from the first entry of a pandas Series of race names.

    The first value is lower-cased and every apostrophe and space is
    replaced by a hyphen, e.g. "Giro d'Italia" -> 'giro-d-italia'.
    """
    slug = racename.values[0].lower()
    for separator in ("'", " "):
        slug = slug.replace(separator, "-")
    return slug
    
def get_race_results(race_url, year):
    """Scrape the final classification tables of a race.

    The first six HTML tables of ``/race/<race_url>/<year>`` are returned.
    The code assumes they appear in the order: last stage, GC, points, KOM,
    youth, teams.

    Parameters
    ----------
    race_url : str
        Race slug as used in the site URL.
    year : str or int
        Edition of the race.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(last_stage, gc, points, kom, youth, teams)``.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched or returns an HTTP error status.
    IndexError
        If the page holds fewer than six tables.
    """
    url = f'https://www.procyclingstats.com/race/{race_url}/{year}'
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # Wrap the payload in a file-like object: passing literal HTML directly
    # to read_html is deprecated in recent pandas releases.
    tables = pd.read_html(io.BytesIO(res.content))

    if len(tables) < 6:
        raise IndexError(
            f'expected at least 6 result tables for {race_url}/{year}, found {len(tables)}'
        )

    last_stage, gc, points, kom, youth, teams = tables[:6]

    # TODO: also scrape individual stages, plus per-stage kom, points,
    # youth and teams results.

    return last_stage, gc, points, kom, youth, teams

def get_startlist(race_url, year):
    """Scrape the alphabetical startlist of a race.

    Takes the first HTML table of the race's startlist page and drops the
    column labelled 'Unnamed: 3'.

    Parameters
    ----------
    race_url : str
        Race slug as used in the site URL.
    year : str or int
        Edition of the race.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched or returns an HTTP error status.
    KeyError
        If the table has no 'Unnamed: 3' column.
    """
    url = f'https://www.procyclingstats.com/race/{race_url}/{year}/gc/startlist/alphabetical-with-filters'
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # Wrap the payload in a file-like object: passing literal HTML directly
    # to read_html is deprecated in recent pandas releases.
    tables = pd.read_html(io.BytesIO(res.content))
    startlist = tables[0].drop(['Unnamed: 3'], axis=1)

    return startlist

def save_data(startlist, PATH, race_url, year):
    """Write a startlist to ``<PATH>/<race_url>/<year>/startlist.csv``.

    The target folder, including missing parents, is created when needed.

    NOTE: this redefines the ``save_data`` used by earlier cells.
    """
    target_dir = os.path.join(PATH, race_url, year)

    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # save data
    target_file = os.path.join(target_dir, 'startlist.csv')
    startlist.to_csv(target_file, index=False)

### Get startlist of one race

In [None]:
year = '2021'
target_race = "Giro d'Italia"

calendar = pd.read_csv(os.path.join(CALENDARS_PATH, f'{year}.csv'))

# regex=False: the race name is literal text, not a pattern (names with
# '(', '.' or '+' would otherwise be misinterpreted).
# NOTE: this is a substring match, so several races may qualify;
# convert_racename_to_url() uses the first one.
matching_races = calendar.loc[calendar['Race'].str.contains(target_race, regex=False), 'Race']
if matching_races.empty:
    raise ValueError(f'no race matching {target_race!r} in the {year} calendar')
race_url = convert_racename_to_url(matching_races)

startlist = get_startlist(race_url, year)
save_data(startlist, STARTLISTS_PATH, race_url, year)

### Get startlist of all races in year

In [None]:
year = '2021'
for race in calendar.Race:

    time.sleep(0.5)
    try:
        # Same slug rule as convert_racename_to_url(): apostrophes and
        # spaces both become hyphens.
        race_url = race.lower().replace("'", "-").replace(' ', '-')
        startlist = get_startlist(race_url, year)
        save_data(startlist, STARTLISTS_PATH, race_url, year)
    except Exception as e:
        # Best effort: report the race and the reason, then continue.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print(race, '---', e)

### Get race results

In [None]:
def save_data(results, PATH, race_url, year):
    """Write race result tables to ``<PATH>/<race_url>/<year>/<name>.csv``.

    ``results`` is paired positionally with the names last_stage, gc, points,
    kom, youth and team; surplus entries on either side are ignored. The
    target folder, including missing parents, is created when needed.

    NOTE: this redefines the ``save_data`` used by earlier cells.
    """
    target_dir = os.path.join(PATH, race_url, year)

    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # save data
    table_names = ['last_stage', 'gc', 'points', 'kom', 'youth', 'team']
    for table, name in zip(results, table_names):
        table.to_csv(os.path.join(target_dir, f'{name}.csv'), index=False)

In [None]:
# Relies on `calendar`, `target_race` and `year` from the single-startlist
# cell, and on the save_data(results, PATH, race_url, year) definition above.
# regex=False: the race name is literal text, not a pattern.
matching_races = calendar.loc[calendar['Race'].str.contains(target_race, regex=False), 'Race']
if matching_races.empty:
    raise ValueError(f'no race matching {target_race!r} in the {year} calendar')
race_url = convert_racename_to_url(matching_races)
results = get_race_results(race_url, year)
save_data(results, RACERESULTS_PATH, race_url, year)

### Get race results of all races in year

In [None]:
year = '2021'
for race in calendar.Race:

    time.sleep(0.5)
    try:
        # Same slug rule as convert_racename_to_url(): apostrophes and
        # spaces both become hyphens.
        race_url = race.lower().replace("'", "-").replace(' ', '-')
        results = get_race_results(race_url, year)
        save_data(results, RACERESULTS_PATH, race_url, year)
    except Exception as e:
        # Best effort: report the race and the reason, then continue.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print(race, '---', e)

### Get rider images

In [68]:
# Download profile pictures for the first 200 riders; get_rider_img prints
# '<rider_name> --- <error>' for every rider it could not process.
for rider_name in rider_names[:200]:
    get_rider_img(rider_name)

jonas-vingegaard --- list index out of range
frølich-honoré-mikkel --- list index out of range
alexey-lutsenko --- list index out of range
ben-o'connor --- list index out of range
michael-valgren --- list index out of range
biniam-girmay-hailu --- list index out of range
ángel-lópez-miguel --- list index out of range
esteban-chaves --- list index out of range
michał-kwiatkowski --- list index out of range
jesús-herrada --- list index out of range
christian-eiking-odd --- list index out of range
magnus-cort --- list index out of range
felipe-martínez-daniel --- list index out of range
león-sánchez-luis --- list index out of range
carlos-rodríguez --- list index out of range
halland-johannessen-tobias --- list index out of range
manuel-díaz-josé --- list index out of range
juan-ayuso --- list index out of range


In [69]:
# Hand-corrected URL slugs for riders whose generated name failed above
# (same list as in the results/rankings section).
rider_names_corrected_1 = ['jonas-vingegaard-rasmussen', 'mikkel-honore', 'aleksey-lutsenko', 'ben-o-connor',
                         'michael-valgren-andersen', 'biniam-girmay', 'miguel-angel-lopez', 'johan-esteban-chaves',
                         'michal-kwiatkowski', 'jesus-herrada-lopez', 'odd-christian-eiking', 'magnus-cort-nielsen',
                         'daniel-felipe-martinez', 'luis-leon-sanchez', 'tobias-halland-johannessen', 'jose-manuel-diaz-gallego',
                         'juan-ayuso-pesquera']

# Retry the image download with the corrected slugs.
for rider_name in rider_names_corrected_1:
    get_rider_img(rider_name)

johan-esteban-chaves --- list index out of range


### Get rider stats

In [143]:
def get_rider_stats(race, year):
    """Scrape the startlist KPI table of a race and store it per race and per rider.

    The first HTML table of ``/race/<race>/<year>/result/startlist/kpis`` is
    written to ``<STARTLISTS_PATH>/<race>/<year>/stats-kpis.csv``. Each row
    is also written on its own to
    ``<RIDERSTATS_PATH>/<rider>/stats-kpis-<today>.csv``, where ``<rider>``
    is the row's ``Rider`` value with its space-separated tokens reversed,
    hyphen-joined and lower-cased.

    NOTE: this shadows the earlier ``get_rider_stats(rider_name, years)``
    defined in the "Scraping functions" section.

    Parameters
    ----------
    race : str
        Race slug as used in the site URL.
    year : str
        Edition of the race (used as a path component).

    Returns
    -------
    pandas.DataFrame or None
        The scraped KPI table, or ``None`` when scraping or saving failed
        (the error is printed as ``<race> --- <error>``).
    """
    # Make sure the race folder exists; exist_ok avoids the check-then-create race.
    race_dir = os.path.join(STARTLISTS_PATH, race, year)
    os.makedirs(race_dir, exist_ok=True)

    try:
        url = f'https://www.procyclingstats.com/race/{race}/{year}/result/startlist/kpis'
        res = requests.get(url, timeout=30)
        res.raise_for_status()

        # Wrap the payload in a file-like object: passing literal HTML
        # directly to read_html is deprecated in recent pandas releases.
        stats = pd.read_html(io.BytesIO(res.content))[0]
        date = datetime.date.today().isoformat()

        stats.to_csv(os.path.join(race_dir, 'stats-kpis.csv'), index=False)

        for _, row in stats.iterrows():

            # NOTE(review): this reverses every token, whereas
            # normalize_rider_name() only moves the last one to the front.
            # The two slugs differ for names with three or more tokens;
            # confirm which is intended.
            rider_name = '-'.join(reversed(row['Rider'].split(' '))).lower()

            # One folder per rider for dated KPI snapshots.
            rider_dir = os.path.join(RIDERSTATS_PATH, rider_name)
            os.makedirs(rider_dir, exist_ok=True)

            row.to_frame().T.to_csv(os.path.join(rider_dir, f'stats-kpis-{date}.csv'), index=False)

        return stats

    except Exception as e:
        print(race, '---', e)
        return None
    
    

In [144]:
# Scrape the startlist KPI stats for one race, using the
# get_rider_stats(race, year) definition from the cell above.
year = '2022'
race = 'milano-sanremo'

rider_stats = get_rider_stats(race, year)