In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import os
import csv
import tqdm

# Scraping

### env variables

In [108]:
BASE_PATH = '../data/pcs-scraping'
RESULTS_PATH = '../data/pcs-scraping/results/rider'
RANKINGS_PATH = '../data/pcs-scraping/pcs-rankings/rider'
TEAMS_PATH = '../data/pcs-scraping/teams/rider'
CALENDARS_PATH = '../data/pcs-scraping/calendars'
STARTLISTS_PATH = '../data/pcs-scraping/startlists'
RACERESULTS_PATH = '../data/pcs-scraping/race_results'

### Scraping functions

In [13]:
def get_rider_names(n_pages):
    
    rider_names = []

    offsets = np.arange(0, 3001, 100)

    for offset in offsets[:n_pages]:

        url = f'https://www.procyclingstats.com/rankings.php?date=2022-01-12&nation=&age=&zage=&page=smallerorequal&team=&offset={offset}&continent=&teamlevel=&filter=Filter&p=me&s=uci-individual'
        res = requests.get(url)

        tables = pd.read_html(res.content)
        rider_names.append(tables[0])

        time.sleep(0.5)
    
    return rider_names

def normalize_rider_name(rider_name):
    
    surname = rider_name.split(" ")[-1].lower()
    name = "-".join(rider_name.split(" ")[:-1]).lower()
    full_name = surname + '-' + name
    
    return full_name

def clean_pcs_table_results(df_table):
    
    df_table.drop('Unnamed: 3', axis=1, inplace=True)
    df_table.drop('Unnamed: 8', axis=1, inplace=True)
    df_table.rename(columns={'Unnamed: 2': 'GC'}, inplace=True)
    
    return df_table

def clean_pcs_table_ranking(df_table):
    
    df_table.rename(columns={'Unnamed: 0': 'year'}, inplace=True)

    return df_table

def get_rider_stats(rider_name, years):

    pcs_ranking = []
    results = []

    for year in years:

        try:
            url = f'https://www.procyclingstats.com/rider/{rider_name}/{year}'
            res = requests.get(url)

            tables = pd.read_html(res.content)
            pcs_ranking.append(clean_pcs_table_ranking(tables[1]))
            results.append(clean_pcs_table_results(tables[0]))

            time.sleep(0.5)
            
        except Exception as e:
            print(e)

    df_results = [(year, x) for year, x in zip(years, results) if not x.empty]
    
    return df_results, pcs_ranking[0]

def init_dirs():

    if not os.path.isdir(BASE_PATH):
        os.makedirs(BASE_PATH)
        
    if not os.path.isdir(RESULTS_PATH):
        os.makedirs(RESULTS_PATH)
        
    if not os.path.isdir(RANKINGS_PATH):
        os.makedirs(RANKINGS_PATH)
        
    if not os.path.isdir(TEAMS_PATH):
        os.makedirs(TEAMS_PATH)

def save_data(rider_name, results, pcs_ranking):
        
    # check if rider already has results data
    if not os.path.isdir(os.path.join(RESULTS_PATH, rider_name)):
        os.mkdir(os.path.join(RESULTS_PATH, rider_name))
    
    # check if rider already has pcs-ranking data
    if not os.path.isdir(os.path.join(RANKINGS_PATH, rider_name)):
        os.mkdir(os.path.join(RANKINGS_PATH, rider_name))
        
    # save season results
    #[x[1].to_csv(f'../data/pcs-scraping/results/rider/{rider_name}/{x[0]}.csv', index=False) for x in results]
    [x[1].to_csv(os.path.join(RESULTS_PATH, rider_name , f'{x[0]}.csv'), index=False) for x in results]
    
    # save pcs_ranking
    #pcs_ranking.to_csv(f'../data/pcs-scraping/pcs-ranking/rider/{rider_name}/pcs_ranking.csv', index=False)
    pcs_ranking.to_csv(os.path.join(RANKINGS_PATH, rider_name, 'pcs_ranking.csv'), index=False)

### Initialize data directories

In [4]:
init_dirs()

### Get Rider Names

In [None]:
rider_names = get_rider_names(n_pages=5)

In [None]:
riders = []
[riders.extend(x['Rider']) for x in rider_names]

rider_names = [normalize_rider_name(x) for x in riders]

In [None]:
with open(os.path.join(BASE_PATH, 'rider_names.csv'), 'w') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(rider_names)

### Get Race Results and Rankings

- NEED TO SCRAP ALL POSSIBLE YEARS (BEFORE 2011) !!!

- RIDER NAMES NOT CORRECT FOR FAILURES

In [14]:
years = np.arange(2011, 2022)

for rider_name in tqdm.tqdm(rider_names[10:100]):

    try:
        results, pcs_ranking = get_rider_stats(rider_name, years)
        save_data(rider_name, results, pcs_ranking)
    except:
        print(rider_name)
        pass

  8%|▊         | 7/90 [01:15<14:56, 10.80s/it]

No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


  9%|▉         | 8/90 [01:18<11:32,  8.44s/it]

No tables found
jonas-vingegaard


 18%|█▊        | 16/90 [02:34<11:34,  9.39s/it]

No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 19%|█▉        | 17/90 [02:37<08:55,  7.34s/it]

No tables found
frølich-honoré-mikkel


 31%|███       | 28/90 [04:29<10:08,  9.81s/it]

No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 32%|███▏      | 29/90 [04:33<08:01,  7.89s/it]

No tables found
alexey-lutsenko
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 33%|███▎      | 30/90 [04:35<06:14,  6.24s/it]

No tables found
ben-o'connor


 54%|█████▍    | 49/90 [07:55<06:49,  9.98s/it]

No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 56%|█████▌    | 50/90 [07:57<05:10,  7.76s/it]

No tables found
michael-valgren


 71%|███████   | 64/90 [10:29<05:05, 11.75s/it]

No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 72%|███████▏  | 65/90 [10:32<03:48,  9.13s/it]

No tables found
No tables found
biniam-girmay-hailu


 89%|████████▉ | 80/90 [13:02<01:45, 10.54s/it]

No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 90%|█████████ | 81/90 [13:05<01:14,  8.28s/it]

No tables found
No tables found
ángel-lópez-miguel
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 91%|█████████ | 82/90 [13:07<00:52,  6.55s/it]

No tables found
esteban-chaves
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 92%|█████████▏| 83/90 [13:10<00:37,  5.29s/it]

No tables found
No tables found
michał-kwiatkowski


 97%|█████████▋| 87/90 [13:49<00:26,  8.92s/it]

No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


 98%|█████████▊| 88/90 [13:51<00:13,  6.97s/it]

No tables found
jesús-herrada


 99%|█████████▉| 89/90 [14:01<00:07,  7.68s/it]

No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found
No tables found


100%|██████████| 90/90 [14:03<00:00,  9.38s/it]

No tables found
christian-eiking-odd





### Get Teams

In [15]:
def get_rider_teams(rider_name):
    
    time.sleep(0.5)
    
    url = f'https://www.procyclingstats.com/rider/{rider_name}'
    res = requests.get(url)
    
    soup = BeautifulSoup(res.content)
    
    possible_classes = ['list rdr-teams moblist moblist', 'list rdr-teams moblist', 'list rdr-teams moblist moblist ']
    ul = list(filter(None, [soup.find('ul', {'class': class_}) for class_ in possible_classes]))
    
    if ul:
        
        season = [x.find('div', {'class': 'season'}).text for x in ul[0].find_all('li')]
        team = [x.find('div', {'class': 'name'}).text for x in ul[0].find_all('li')]
        teams = pd.DataFrame({'season': season, 'team': team})
        
        return teams

    else:
        
        print(rider_name, 'No Teams scraped')

def save_data(rider_name, teams):
        
    # check if rider already has teams data
    if not os.path.isdir(os.path.join(TEAMS_PATH, rider_name)):
        os.mkdir(os.path.join(TEAMS_PATH, rider_name))
        
    # save teams data
    teams.to_csv(os.path.join(TEAMS_PATH, rider_name, 'teams.csv'), index=False)

In [16]:
rider_names = list(pd.read_csv(os.path.join(BASE_PATH, 'rider_names.csv')))

In [17]:
for rider_name in tqdm.tqdm(rider_names[10:100]):
    
    try:
        teams = get_rider_teams(rider_name)
        save_data(rider_name, teams)
    except:
        print(rider_name)

  9%|▉         | 8/90 [00:08<01:31,  1.11s/it]

jonas-vingegaard No Teams scraped
jonas-vingegaard


 19%|█▉        | 17/90 [00:17<01:10,  1.04it/s]

frølich-honoré-mikkel No Teams scraped
frølich-honoré-mikkel


 32%|███▏      | 29/90 [00:31<00:57,  1.06it/s]

alexey-lutsenko No Teams scraped
alexey-lutsenko


 33%|███▎      | 30/90 [00:31<00:51,  1.16it/s]

ben-o'connor No Teams scraped
ben-o'connor


 56%|█████▌    | 50/90 [00:59<00:37,  1.07it/s]

michael-valgren No Teams scraped
michael-valgren


 72%|███████▏  | 65/90 [01:17<00:22,  1.10it/s]

biniam-girmay-hailu No Teams scraped
biniam-girmay-hailu


 90%|█████████ | 81/90 [01:32<00:08,  1.11it/s]

ángel-lópez-miguel No Teams scraped
ángel-lópez-miguel


 91%|█████████ | 82/90 [01:32<00:06,  1.17it/s]

esteban-chaves No Teams scraped
esteban-chaves


 92%|█████████▏| 83/90 [01:33<00:05,  1.20it/s]

michał-kwiatkowski No Teams scraped
michał-kwiatkowski


 98%|█████████▊| 88/90 [01:37<00:01,  1.24it/s]

jesús-herrada No Teams scraped
jesús-herrada


100%|██████████| 90/90 [01:39<00:00,  1.11s/it]

christian-eiking-odd No Teams scraped
christian-eiking-odd





- RIDER NAMES NOT CORRECT ON URL FOR FAILURES

### Get race calendar

In [22]:
def get_race_calendar(years):

    calendar = []

    for year in years:

        try:
            url = f'https://www.procyclingstats.com/races.php?year={year}&circuit=&class=&filter=Filter'
            res = requests.get(url)

            tables = pd.read_html(res.content)
            calendar.append(tables[0].dropna())

            time.sleep(0.5)
            
        except Exception as e:
            print(e)

    df_calendar = [(year, cal) for year, cal in zip(years, calendar) if not cal.empty]
    
    return df_calendar

def save_data(calendar):
        
    if not os.path.isdir(CALENDARS_PATH):
        os.mkdir(CALENDARS_PATH)
        
    # save calendar data
    calendar[1].to_csv(os.path.join(CALENDARS_PATH, f'{calendar[0]}.csv'), index=False)

In [23]:
years = ['2021', '2020']
calendar = get_race_calendar(years)
[save_data(cal) for cal in calendar]

[None, None]

### Get Race startlist + results

In [102]:
def convert_racename_to_url(racename):
    
    race_url = racename.values[0].lower().replace("'", "-").replace(" ", "-")
    
    return race_url
    
def get_race_results(race_url, year):
    
    url = f'https://www.procyclingstats.com/race/{race_url}/{year}'
    res = requests.get(url)

    tables = pd.read_html(res.content)
    last_stage = tables[0]
    gc = tables[1]
    points = tables[2]
    kom = tables[3]
    youth = tables[4]
    teams = tables[5]
    
    # ALSO NEED individual stages, kom, points, youth, teams final + stage results
    
    return last_stage, gc, points, kom, youth, teams

def get_startlist(race_url, year):
    
    url = f'https://www.procyclingstats.com/race/{race_url}/{year}/gc/startlist/alphabetical-with-filters'
    res = requests.get(url)
    
    startlist = pd.read_html(res.content)[0].drop(['Unnamed: 3'], axis=1)
    
    return startlist

def save_data(startlist, PATH, race_url, year):
        
    if not os.path.isdir(os.path.join(PATH, race_url, year)):
        os.makedirs(os.path.join(PATH, race_url, year))
        
    # save data
    startlist.to_csv(os.path.join(PATH, race_url, year, 'startlist.csv'), index=False)

In [82]:
year = '2021'
target_race = "Giro d'Italia"

calendar = pd.read_csv(os.path.join(CALENDARS_PATH, f'{year}.csv'))
race_url = convert_racename_to_url(calendar[calendar['Race'].str.contains(target_race)].Race)

### Get startlist

In [None]:
startlist = get_startlist(race_url, year)
save_data(startlist, STARTLISTS_PATH, race_url, year)

### Get race results

In [109]:
def save_data(results, PATH, race_url, year):
        
    if not os.path.isdir(os.path.join(PATH, race_url, year)):
        os.makedirs(os.path.join(PATH, race_url, year))
        
    # save data
    [res.to_csv(os.path.join(PATH, race_url, year, f'{name}.csv'), index=False) for res, name in zip(results, ['last_stage', 'gc', 'points', 'kom', 'youth', 'team'])]

In [104]:
results = get_race_results(race_url, year)

In [110]:
save_data(results, RACERESULTS_PATH, race_url, year)