In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import os
import csv

# Scraping

### Scraping functions

In [48]:
def get_rider_names(n_pages):
    
    rider_names = []

    offsets = np.arange(0, 3001, 100)

    for offset in offsets[:n_pages]:

        url = f'https://www.procyclingstats.com/rankings.php?date=2022-01-12&nation=&age=&zage=&page=smallerorequal&team=&offset={offset}&continent=&teamlevel=&filter=Filter&p=me&s=uci-individual'
        res = requests.get(url)

        tables = pd.read_html(res.content)
        rider_names.append(tables[0])

        time.sleep(0.5)
    
    return rider_names

def normalize_rider_name(rider_name):
    
    surname = rider_name.split(" ")[-1].lower()
    name = "-".join(rider_name.split(" ")[:-1]).lower()
    full_name = surname + '-' + name
    
    return full_name

def clean_pcs_table_results(df_table):
    
    df_table.drop('Unnamed: 3', axis=1, inplace=True)
    df_table.drop('Unnamed: 8', axis=1, inplace=True)
    df_table.rename(columns={'Unnamed: 2': 'GC'}, inplace=True)
    
    return df_table

def clean_pcs_table_ranking(df_table):
    
    df_table.rename(columns={'Unnamed: 0': 'year'}, inplace=True)

    return df_table

def get_rider_stats(rider_name, years):

    pcs_ranking = []
    results = []

    for year in years:

        try:
            url = f'https://www.procyclingstats.com/rider/{rider_name}/{year}'
            res = requests.get(url)

            tables = pd.read_html(res.content)
            pcs_ranking.append(clean_pcs_table_ranking(tables[1]))
            results.append(clean_pcs_table_results(tables[0]))

            time.sleep(0.5)
            
        except Exception as e:
            pass

    df_results = [(year, x) for year, x in zip(years, results) if not x.empty]
    
    return df_results, pcs_ranking[0]


def save_data(rider_name, results, pcs_ranking):
    
    path = './data/pcs-scraping'
    
    # check if rider already has results data
    if not os.path.isdir(f'{path}/results/rider/{rider_name}'):
        
        os.mkdir(f'{path}/results/rider/{rider_name}')
    
    # check if rider already has pcs-ranking data
    if not os.path.isdir(f'{path}/pcs-ranking/rider/{rider_name}'):
        
        os.mkdir(f'{path}/pcs-ranking/rider/{rider_name}')
        
    # save season results
    [x[1].to_csv(f'./data/pcs-scraping/results/rider/{rider_name}/{x[0]}.csv', index=False) for x in results]
    
    # save pcs_ranking
    pcs_ranking.to_csv(f'./data/pcs-scraping/pcs-ranking/rider/{rider_name}/pcs_ranking.csv', index=False)

### Get Rider Names

In [4]:
rider_names = get_rider_names(n_pages=5)

In [5]:
riders = []
[riders.extend(x['Rider']) for x in rider_names]

rider_names = [normalize_rider_name(x) for x in riders]

In [57]:
with open('./data/pcs-scraping/rider_names.csv', 'w') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(rider_names)

### Get Race Results and Rankings

In [52]:
years = np.arange(2011, 2022)
rider_name = 'julian-alaphilippe'

for rider_name in rider_names:

    results, pcs_ranking = get_rider_stats(rider_name, years)
    save_data(rider_name, results, pcs_ranking)

IndexError: list index out of range

- stopped at Jasper Stuyven

### Get Teams

In [47]:
def get_rider_teams(rider_name):
    
    url = f'https://www.procyclingstats.com/rider/{rider_name}'
    res = requests.get(url)
    
    soup = BeautifulSoup(res.content)
    
    ul = soup.find('ul', {'class': 'list rdr-teams moblist moblist'})
    if ul:
        season = [x.find('div', {'class': 'season'}).text for x in ul.find_all('li')]
        team = [x.find('div', {'class': 'name'}).text for x in ul.find_all('li')]
    else:
        ul = soup.find('ul', {'class': 'list rdr-teams moblist moblist'})
        season = [x.find('div', {'class': 'season'}).text for x in ul.find_all('li')]
        team = [x.find('div', {'class': 'name'}).text for x in ul.find_all('li')]
        
    teams = pd.DataFrame({'season': season, 'team': team})
    
    time.sleep(0.5)

    return teams

def save_data(rider_name, teams):
    
    path = '../data/pcs-scraping'
    
    # check if rider already has teams data
    if not os.path.isdir(f'{path}/teams/rider/{rider_name}'):
        
        os.mkdir(f'{path}/teams/rider/{rider_name}')
        
    # save teams data
    teams.to_csv(f'../data/pcs-scraping/teams/rider/{rider_name}/teams.csv', index=False)

In [37]:
rider_names = list(pd.read_csv('../data/pcs-scraping/rider_names.csv'))

In [52]:
for rider_name in rider_names[:15]:
    
    teams = get_rider_teams(rider_name)
    save_data(rider_name, teams)

AttributeError: 'NoneType' object has no attribute 'find_all'

- stopped at Jasper Philipsen