In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import os
import csv
import tqdm

# Scraping

### Scraping functions

In [71]:
def get_rider_names(n_pages, date='2022-01-12'):
    """Download the first ``n_pages`` pages of the PCS UCI individual ranking.

    Pages are addressed through an ``offset`` query parameter that advances
    in steps of 100 (0, 100, ..., 3000), so at most 31 pages can be fetched.

    Parameters
    ----------
    n_pages : int
        Number of ranking pages to fetch, starting at offset 0.
    date : str, optional
        Value of the ``date`` query parameter. Defaults to the date the
        notebook was originally written against.

    Returns
    -------
    list
        The first table ``pd.read_html`` finds on each page, in page order.

    Raises
    ------
    requests.HTTPError
        If a page answers with a 4xx/5xx status.
    """
    ranking_tables = []

    for offset in range(0, 3001, 100)[:n_pages]:

        url = f'https://www.procyclingstats.com/rankings.php?date={date}&nation=&age=&zage=&page=smallerorequal&team=&offset={offset}&continent=&teamlevel=&filter=Filter&p=me&s=uci-individual'
        # A timeout keeps a stalled connection from hanging the notebook.
        res = requests.get(url, timeout=30)
        # Fail loudly on an error page instead of parsing it for tables.
        res.raise_for_status()

        ranking_tables.append(pd.read_html(res.content)[0])

        # Pause between consecutive requests.
        time.sleep(0.5)

    return ranking_tables

def normalize_rider_name(rider_name):
    """Turn a ranking-table rider name into a lower-case, hyphenated slug.

    The last whitespace-separated word is moved to the front and the
    remaining words follow in their original order, e.g.
    ``'VAN AERT Wout'`` -> ``'wout-van-aert'``.

    The name is split on any run of whitespace, so doubled, leading or
    trailing blanks never produce empty parts, and a one-word name is
    returned on its own instead of with a dangling hyphen. An empty or
    blank name yields ``''``.

    NOTE(review): characters are only lower-cased; accents, apostrophes
    and the word order of multi-part names are passed through unchanged.
    Whether the result matches the slug the site uses for a given rider
    is not checked here.
    """
    parts = rider_name.lower().split()

    if not parts:
        return ''

    # Last word first, then everything that preceded it.
    return '-'.join([parts[-1], *parts[:-1]])

def clean_pcs_table_results(df_table):
    """Tidy a season-results table in place and hand the same frame back.

    The columns 'Unnamed: 3' and 'Unnamed: 8' are removed (a missing one
    raises ``KeyError``) and 'Unnamed: 2' is relabelled 'GC'.
    """
    for unwanted in ('Unnamed: 3', 'Unnamed: 8'):
        df_table.drop(columns=unwanted, inplace=True)

    df_table.rename({'Unnamed: 2': 'GC'}, axis='columns', inplace=True)

    return df_table

def clean_pcs_table_ranking(df_table):
    """Label the 'Unnamed: 0' column of a ranking table as 'year'.

    The rename happens in place and the same frame is returned.
    """
    relabel = {'Unnamed: 0': 'year'}
    df_table.rename(relabel, axis='columns', inplace=True)
    return df_table

def get_rider_stats(rider_name, years):
    """Scrape a rider's per-season result tables and a PCS ranking table.

    For every entry of ``years`` the page
    ``https://www.procyclingstats.com/rider/{rider_name}/{year}`` is fetched
    and parsed with ``pd.read_html``; the first table is treated as the
    season results and the second as the ranking table.

    Parameters
    ----------
    rider_name : str
        Rider slug as it appears in the URL.
    years : iterable
        Seasons to request.

    Returns
    -------
    tuple
        ``(df_results, ranking)`` where ``df_results`` is a list of
        ``(year, table)`` pairs for every season whose results table was
        scraped and is not empty, and ``ranking`` is the ranking table of
        the first season that could be scraped.

    Raises
    ------
    IndexError
        If no ranking table could be scraped for any of the seasons.
    """
    pcs_ranking = []
    results = []

    for year in years:

        try:
            url = f'https://www.procyclingstats.com/rider/{rider_name}/{year}'
            # A timeout keeps a stalled connection from hanging the loop.
            res = requests.get(url, timeout=30)
            res.raise_for_status()

            tables = pd.read_html(res.content)
            pcs_ranking.append(clean_pcs_table_ranking(tables[1]))
            # Keep the season next to its table: pairing the tables with
            # `years` afterwards would shift every table that follows a
            # failed season onto the wrong year.
            results.append((year, clean_pcs_table_results(tables[0])))

        except Exception:
            # Best effort: a season that cannot be fetched or parsed is
            # skipped so the remaining seasons are still collected.
            pass

        # Pause after every request, successful or not.
        time.sleep(0.5)

    df_results = [(year, table) for year, table in results if not table.empty]

    if not pcs_ranking:
        raise IndexError(f'no PCS ranking table could be scraped for {rider_name!r}')

    return df_results, pcs_ranking[0]


def save_data(rider_name, results, pcs_ranking):
    """Write a rider's season results and PCS ranking to CSV files.

    Files are written below ``../data/pcs-scraping``:
    ``results/rider/<rider_name>/<year>.csv`` for every entry of ``results``
    and ``pcs-ranking/rider/<rider_name>/pcs_ranking.csv`` for the ranking.

    Parameters
    ----------
    rider_name : str
        Used as the name of the rider's folders.
    results : iterable of (year, table)
        Season tables; each table must provide ``to_csv``.
    pcs_ranking : table
        Ranking table; must provide ``to_csv``.

    NOTE: a later cell of this notebook defines another ``save_data`` with a
    different signature; once that cell has run, this version is no longer
    reachable under this name.
    """
    path = '../data/pcs-scraping'
    results_dir = f'{path}/results/rider/{rider_name}'
    ranking_dir = f'{path}/pcs-ranking/rider/{rider_name}'

    # makedirs(exist_ok=True) also creates missing parent folders and does
    # nothing when the rider already has data.
    os.makedirs(results_dir, exist_ok=True)
    os.makedirs(ranking_dir, exist_ok=True)

    # save season results
    for year, season_results in results:
        season_results.to_csv(f'{results_dir}/{year}.csv', index=False)

    # save pcs_ranking
    pcs_ranking.to_csv(f'{ranking_dir}/pcs_ranking.csv', index=False)

### Get Rider Names

In [None]:
# Fetch the first five ranking pages; get_rider_names returns one table per page.
rider_names = get_rider_names(n_pages=5)

In [None]:
# Flatten the 'Rider' column of every ranking table into one list of names.
riders = [rider for table in rider_names for rider in table['Rider']]

# Rebind rider_names from the list of tables to the normalized name of each rider.
rider_names = list(map(normalize_rider_name, riders))

In [None]:
# Write below ../data/pcs-scraping, the folder the rest of the notebook saves
# to and later reads rider_names.csv from (the original './data/...' pointed
# at a different folder).
# newline='' is what the csv module expects for files it writes, and an
# explicit utf-8 encoding keeps non-ASCII rider names independent of the
# platform's default encoding.
with open('../data/pcs-scraping/rider_names.csv', 'w', newline='', encoding='utf-8') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    # All names go into a single quoted row.
    wr.writerow(rider_names)

### Get Race Results and Rankings

- NEED TO SCRAPE ALL POSSIBLE YEARS (BEFORE 2011) !!!

- FOR THE FAILURES LISTED BELOW, THE RIDER NAME USED IN THE URL IS NOT CORRECT

In [75]:
# Seasons to scrape: 2011 through 2021 (np.arange excludes the stop value).
years = np.arange(2011, 2022)

for rider_name in tqdm.tqdm(rider_names[16:100]):

    try:
        results, pcs_ranking = get_rider_stats(rider_name, years)
        save_data(rider_name, results, pcs_ranking)
    except Exception:
        # Report the rider and carry on with the next one. Catching
        # Exception rather than using a bare `except` lets a keyboard /
        # kernel interrupt still stop this long-running loop.
        print(rider_name)

  2%|█                                           | 2/84 [00:18<11:38,  8.52s/it]

jonas-vingegaard


 13%|█████▋                                     | 11/84 [01:49<10:33,  8.68s/it]

frølich-honoré-mikkel


 27%|███████████▊                               | 23/84 [04:17<11:38, 11.44s/it]

alexey-lutsenko


 29%|████████████▎                              | 24/84 [04:23<09:35,  9.59s/it]

ben-o'connor


 52%|██████████████████████▌                    | 44/84 [08:10<06:16,  9.40s/it]

michael-valgren


 70%|██████████████████████████████▏            | 59/84 [10:55<03:39,  8.78s/it]

biniam-ghirmay-hailu


 89%|██████████████████████████████████████▍    | 75/84 [13:53<01:24,  9.40s/it]

ángel-lópez-miguel


 90%|██████████████████████████████████████▉    | 76/84 [13:56<01:01,  7.63s/it]

esteban-chaves


 92%|███████████████████████████████████████▍   | 77/84 [14:00<00:45,  6.47s/it]

michał-kwiatkowski


 98%|█████████████████████████████████████████▉ | 82/84 [14:52<00:17,  8.82s/it]

jesús-herrada


100%|███████████████████████████████████████████| 84/84 [15:10<00:00, 10.84s/it]

christian-eiking-odd





### Get Teams

In [42]:
def get_rider_teams(rider_name):
    """Scrape the season/team list from a rider's PCS page.

    Parameters
    ----------
    rider_name : str
        Rider slug as it appears in the URL.

    Returns
    -------
    pandas.DataFrame or None
        One row per ``<li>`` of the team list with the columns ``season``
        and ``team``. ``None`` (after printing a notice) when no ``<ul>``
        with one of the expected class strings is found.
    """
    # Pause before every request.
    time.sleep(0.5)

    url = f'https://www.procyclingstats.com/rider/{rider_name}'
    # A timeout keeps a stalled connection from hanging the loop.
    res = requests.get(url, timeout=30)

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ by machine.
    soup = BeautifulSoup(res.content, 'html.parser')

    possible_classes = ['list rdr-teams moblist moblist', 'list rdr-teams moblist', 'list rdr-teams moblist moblist ']

    # Take the first class string that matches a <ul>.
    team_list = None
    for class_ in possible_classes:
        team_list = soup.find('ul', {'class': class_})
        if team_list is not None:
            break

    if team_list is None:
        print(rider_name, 'No Teams scraped')
        return None

    seasons = []
    team_names = []
    for entry in team_list.find_all('li'):
        seasons.append(entry.find('div', {'class': 'season'}).text)
        team_names.append(entry.find('div', {'class': 'name'}).text)

    return pd.DataFrame({'season': seasons, 'team': team_names})

def save_data(rider_name, teams):
    """Write a rider's team history to ``teams/rider/<rider_name>/teams.csv``.

    The file is written below ``../data/pcs-scraping``.

    Parameters
    ----------
    rider_name : str
        Used as the name of the rider's folder.
    teams : table or None
        Table providing ``to_csv``. ``None`` - what ``get_rider_teams``
        returns when it finds no team list - is accepted and nothing is
        written for it.

    NOTE: this definition replaces the three-argument ``save_data`` defined
    earlier in the notebook.
    """
    if teams is None:
        # Nothing to save; do not leave an empty folder behind.
        return

    path = '../data/pcs-scraping'
    rider_dir = f'{path}/teams/rider/{rider_name}'

    # makedirs(exist_ok=True) also creates missing parent folders and does
    # nothing when the rider already has teams data.
    os.makedirs(rider_dir, exist_ok=True)

    # save teams data
    teams.to_csv(f'{rider_dir}/teams.csv', index=False)

In [43]:
# Load the rider names back from disk. read_csv takes the first row of the
# file as the header, and list() over a DataFrame yields its column labels,
# so this returns the entries of that first row.
# NOTE(review): assumes the file holds all names in its first row - confirm
# against whatever wrote it.
rider_names = list(pd.read_csv('../data/pcs-scraping/rider_names.csv'))

In [67]:
for rider_name in tqdm.tqdm(rider_names[16:100]):

    try:
        teams = get_rider_teams(rider_name)
        if teams is None:
            # get_rider_teams already reported that nothing was scraped.
            continue
        save_data(rider_name, teams)
    except Exception as exc:
        # Report the failure instead of hiding it, then move on to the next
        # rider. Catching Exception rather than using a bare `except` lets
        # a keyboard / kernel interrupt still stop the loop.
        print(rider_name, 'failed:', repr(exc))

  2%|█                                           | 2/84 [00:01<01:12,  1.13it/s]

jonas-vingegaard No Teams scraped


 13%|█████▋                                     | 11/84 [00:12<01:20,  1.10s/it]

frølich-honoré-mikkel No Teams scraped


 27%|███████████▊                               | 23/84 [00:25<01:09,  1.13s/it]

alexey-lutsenko No Teams scraped


 29%|████████████▎                              | 24/84 [00:26<01:02,  1.04s/it]

ben-o'connor No Teams scraped


 52%|██████████████████████▌                    | 44/84 [00:49<00:42,  1.07s/it]

michael-valgren No Teams scraped


 70%|██████████████████████████████▏            | 59/84 [01:09<00:26,  1.06s/it]

biniam-ghirmay-hailu No Teams scraped


 89%|██████████████████████████████████████▍    | 75/84 [01:26<00:08,  1.02it/s]

ángel-lópez-miguel No Teams scraped


 90%|██████████████████████████████████████▉    | 76/84 [01:27<00:07,  1.06it/s]

esteban-chaves No Teams scraped


 92%|███████████████████████████████████████▍   | 77/84 [01:28<00:06,  1.10it/s]

michał-kwiatkowski No Teams scraped


 98%|█████████████████████████████████████████▉ | 82/84 [01:34<00:02,  1.13s/it]

jesús-herrada No Teams scraped


100%|███████████████████████████████████████████| 84/84 [01:36<00:00,  1.15s/it]

christian-eiking-odd No Teams scraped





- FOR THE FAILURES ABOVE, THE RIDER NAME USED IN THE URL IS NOT CORRECT