In [1]:
import csv
import datetime
import os
import re
import time
import unicodedata
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Scraping

### env variables

In [2]:
# load_dotenv() comes from python-dotenv; presumably it fills os.environ from a
# local .env file before the lookups below - confirm the file location.
load_dotenv()

# Output locations for the scraped data. os.environ[...] is used on purpose:
# a missing variable raises KeyError right here.
BASE_PATH = os.environ['BASE_PATH']
RESULTS_PATH = os.environ['RESULTS_PATH']
RANKINGS_PATH = os.environ['RANKINGS_PATH']
SPECIALITY_RANKINGS_PATH = os.environ['SPECIALITY_RANKINGS_PATH']
TEAMS_PATH = os.environ['TEAMS_PATH']
CALENDARS_PATH = os.environ['CALENDARS_PATH']
STARTLISTS_PATH = os.environ['STARTLISTS_PATH']
RACERESULTS_PATH = os.environ['RACERESULTS_PATH']
IMG_PATH = os.environ['IMG_PATH']
RIDERSTATS_PATH = os.environ['RIDERSTATS_PATH']
RACENAMES_PATH = os.environ['RACENAMES_PATH']

### Scraping functions

In [None]:
def get_rider_names(n_pages):
    """Download the first ``n_pages`` pages of the PCS UCI individual ranking.

    Pages are addressed by offset in steps of 100 (0, 100, ..., 3000), so at
    most 31 pages are fetched no matter how large ``n_pages`` is. The ranking
    date in the URL is fixed to 2022-01-12.

    Returns a list holding the first HTML table of every page as a DataFrame.
    """
    page_offsets = range(0, 3001, 100)[:n_pages]
    ranking_pages = []

    for page_offset in page_offsets:
        response = requests.get(
            f'https://www.procyclingstats.com/rankings.php?date=2022-01-12&nation=&age=&zage=&page=smallerorequal&team=&offset={page_offset}&continent=&teamlevel=&filter=Filter&p=me&s=uci-individual'
        )
        ranking_pages.append(pd.read_html(response.content, encoding='utf-8')[0])

        # pause between two page requests
        time.sleep(0.5)

    return ranking_pages

def normalize_rider_name(rider_name):
    """Turn a rider name from a ranking table into a PCS URL slug.

    The last whitespace-separated token is moved to the front (the tables
    presumably list riders as 'SURNAME Firstname'), everything is lower-cased
    and joined with hyphens: 'VAN AERT Wout' -> 'wout-van-aert'.

    The hand-corrected slugs used later in this notebook are plain ASCII
    ('mikkel-honore', 'michal-kwiatkowski', 'ben-o-connor'), so diacritics are
    stripped and apostrophes become hyphens.

    Returns '' for an empty or whitespace-only name.
    """
    # Characters that NFKD cannot decompose have to be mapped by hand:
    # U+0142/U+0141 are the Polish l-with-stroke, U+2019 is the curly apostrophe.
    # NOTE(review): other non-decomposing letters may need adding here.
    replacements = str.maketrans({'\u0142': 'l', '\u0141': 'L', "'": '-', '\u2019': '-'})
    decomposed = unicodedata.normalize('NFKD', rider_name.translate(replacements))
    without_accents = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # split() without argument ignores repeated or surrounding whitespace
    parts = without_accents.lower().split()
    if not parts:
        return ''

    return '-'.join([parts[-1]] + parts[:-1])

def clean_pcs_table_results(df_table):
    """Tidy a rider season-results table in place and return it.

    Drops the columns 'Unnamed: 3' and 'Unnamed: 8' and renames 'Unnamed: 2'
    to 'GC'. Raises KeyError if one of the dropped columns is missing.
    """
    for unused_column in ('Unnamed: 3', 'Unnamed: 8'):
        df_table.drop(columns=unused_column, inplace=True)

    df_table.rename(columns={'Unnamed: 2': 'GC'}, inplace=True)
    return df_table

def clean_pcs_table_ranking(df_table):
    """Rename the column 'Unnamed: 0' to 'year' in place and return the table."""
    column_labels = {'Unnamed: 0': 'year'}
    df_table.rename(columns=column_labels, inplace=True)
    return df_table

def get_rider_results_rankings(rider_name, years):
    """Scrape the season results and the PCS ranking table of one rider.

    For every year the rider page is downloaded; its first table is the
    season result list and its second table the ranking overview. A year
    that fails (request, parsing or cleaning) is printed and skipped.

    Returns:
        (results, ranking): ``results`` is a list of ``(year, DataFrame)``
        for every scraped year with a non-empty table; ``ranking`` is the
        ranking table of the first successfully scraped year.

    Raises:
        IndexError: if no ranking table could be scraped for any year.
    """
    pcs_ranking = []
    results = []

    for year in years:

        try:
            url = f'https://www.procyclingstats.com/rider/{rider_name}/{year}'
            res = requests.get(url)

            tables = pd.read_html(res.content, encoding='utf-8')
            pcs_ranking.append(clean_pcs_table_ranking(tables[1]))
            # Store the year together with its table. Zipping `years` with the
            # collected tables afterwards shifts every season that follows a
            # failed year onto the wrong year.
            results.append((year, clean_pcs_table_results(tables[0])))

            time.sleep(0.3)

        except Exception as e:
            print(e)

    if not pcs_ranking:
        raise IndexError(f'no PCS ranking table could be scraped for {rider_name}')

    df_results = [(year, table) for year, table in results if not table.empty]

    return df_results, pcs_ranking[0]

def init_dirs():
    """Create every output directory configured through the environment.

    Missing parent folders are created as well and existing directories are
    left untouched, so the function can be called repeatedly. The calendar,
    startlist and speciality-ranking folders are included so that later cells
    do not depend on creating them on the fly.
    """
    data_dirs = (
        BASE_PATH,
        RESULTS_PATH,
        RANKINGS_PATH,
        SPECIALITY_RANKINGS_PATH,
        TEAMS_PATH,
        CALENDARS_PATH,
        STARTLISTS_PATH,
        RACERESULTS_PATH,
        IMG_PATH,
        RIDERSTATS_PATH,
        RACENAMES_PATH,
    )

    for data_dir in data_dirs:
        os.makedirs(data_dir, exist_ok=True)
        

def save_data(rider_name, results, pcs_ranking):
    """Write the scraped season results and PCS ranking of one rider to CSV.

    ``results`` is an iterable of ``(year, DataFrame)`` entries; each one is
    saved as ``<RESULTS_PATH>/<rider_name>/<year>.csv``. ``pcs_ranking`` is
    saved as ``<RANKINGS_PATH>/<rider_name>/pcs_ranking.csv``.

    NOTE: later cells of this notebook define other functions that are also
    called ``save_data`` and replace this one once they are run.
    """
    results_dir = os.path.join(RESULTS_PATH, rider_name)
    rankings_dir = os.path.join(RANKINGS_PATH, rider_name)

    # create the per-rider folders the first time a rider is saved
    for rider_dir in (results_dir, rankings_dir):
        if not os.path.isdir(rider_dir):
            os.mkdir(rider_dir)

    # one file per season
    for entry in results:
        entry[1].to_csv(os.path.join(results_dir, f'{entry[0]}.csv'), encoding='utf-8', index=False)

    pcs_ranking.to_csv(os.path.join(rankings_dir, 'pcs_ranking.csv'), encoding='utf-8', index=False)

# Initialize data directories

In [None]:
# Create the configured output folders before anything is written to disk.
init_dirs()

# Get Rider Names

In [None]:
# 31 pages = offsets 0..3000, i.e. every page get_rider_names can address.
# At this point rider_names is a list of ranking-page DataFrames.
rider_names = get_rider_names(n_pages=31)

In [None]:
# collect the 'Rider' column of every ranking page in one flat list
riders = [rider for ranking_page in rider_names for rider in ranking_page['Rider']]

# NOTE: rider_names is rebound from a list of DataFrames to a list of URL
# slugs, so this cell needs the download cell to be re-run before it is run again
rider_names = [normalize_rider_name(rider) for rider in riders]

In [None]:
# All slugs are written as ONE csv row; the cells that read the file back take
# that row as the header of the table.
# newline='' is what the csv module asks for (otherwise extra blank lines
# appear on Windows) and the explicit utf-8 keeps non-ASCII names readable by
# pd.read_csv regardless of the platform's default encoding.
with open(os.path.join(BASE_PATH, 'rider_names.csv'), 'w', newline='', encoding='utf-8') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(rider_names)

In [None]:
# The file holds a single row, so pandas reads the slugs as column labels and
# list(<DataFrame>) returns exactly those labels.
# NOTE(review): pandas may rename duplicated labels - confirm slugs are unique.
rider_names = list(pd.read_csv(os.path.join(BASE_PATH, 'rider_names.csv')))

# Get Race Results and Rankings

- NEED TO SCRAPE ALL POSSIBLE YEARS (BEFORE 2011) !!!

- FAILURES ARE CAUSED BY RIDER NAMES THAT ARE NOT CORRECT IN THE URL

In [None]:
# seasons 2000 .. 2022
years = np.arange(2000, 2023)

# only the first 200 riders of the list are scraped
for rider_name in tqdm.tqdm(rider_names[:200]):

    try:
        results, pcs_ranking = get_rider_results_rankings(rider_name, years)
        save_data(rider_name, results, pcs_ranking)
    except Exception as e:
        # Report the rider AND the reason, then carry on with the next one.
        # `except Exception` (not a bare `except:`) lets Ctrl-C / kernel
        # interrupt stop the long-running loop.
        print(rider_name, '---', e)

In [None]:
# URL slugs written by hand for riders whose generated slug did not work
rider_names_corrected_1 = ['jonas-vingegaard-rasmussen', 'mikkel-honore', 'aleksey-lutsenko', 'ben-o-connor',
                         'michael-valgren-andersen', 'biniam-girmay', 'miguel-angel-lopez', 'johan-esteban-chaves',
                         'michal-kwiatkowski', 'jesus-herrada-lopez', 'odd-christian-eiking', 'magnus-cort-nielsen',
                         'daniel-felipe-martinez', 'luis-leon-sanchez', 'tobias-halland-johannessen', 'jose-manuel-diaz-gallego',
                         'juan-ayuso-pesquera']
rider_names_corrected_2 = []

years = np.arange(2000, 2023)

for rider_name in tqdm.tqdm(rider_names_corrected_1):

    try:
        results, pcs_ranking = get_rider_results_rankings(rider_name, years)
        save_data(rider_name, results, pcs_ranking)
    except Exception as e:
        # Report the rider and the reason; `except Exception` keeps the loop
        # interruptible, which a bare `except:` does not.
        print(rider_name, '---', e)

# Get Teams

In [None]:
def get_rider_teams(rider_name):
    """Scrape the season -> team history from a rider's PCS page.

    Waits 0.3 s, downloads the rider page and looks for the team list under
    each of the known class strings. Returns a DataFrame with the columns
    'season' and 'team', or prints a message and returns None when no team
    list is found.
    """
    time.sleep(0.3)

    response = requests.get(f'https://www.procyclingstats.com/rider/{rider_name}')
    soup = BeautifulSoup(response.content)

    # the team list is looked up under several spellings of its class attribute
    team_list = None
    for class_ in ('list rdr-teams moblist moblist', 'list rdr-teams moblist', 'list rdr-teams moblist moblist '):
        candidate = soup.find('ul', {'class': class_})
        if candidate:
            team_list = candidate
            break

    if team_list is None:
        print(rider_name, 'No Teams scraped')
        return None

    entries = team_list.find_all('li')
    seasons = [entry.find('div', {'class': 'season'}).text for entry in entries]
    team_names = [entry.find('div', {'class': 'name'}).text for entry in entries]

    return pd.DataFrame({'season': seasons, 'team': team_names})

def save_data(rider_name, teams):
    """Write a rider's team history to ``<TEAMS_PATH>/<rider_name>/teams.csv``.

    ``teams`` is the DataFrame returned by get_rider_teams. That function
    returns None when no team list was found; in that case nothing is written
    and no empty rider folder is left behind.

    NOTE: this replaces the earlier ``save_data`` defined in this notebook.
    """
    if teams is None:
        return

    rider_dir = os.path.join(TEAMS_PATH, rider_name)
    # also creates TEAMS_PATH itself when it does not exist yet
    os.makedirs(rider_dir, exist_ok=True)

    teams.to_csv(os.path.join(rider_dir, 'teams.csv'), index=False, encoding='utf-8')

In [None]:
# Reload the slugs from disk: the file holds one row, which pandas reads as
# the column labels, and list(<DataFrame>) returns those labels.
rider_names = list(pd.read_csv(os.path.join(BASE_PATH, 'rider_names.csv')))

In [None]:
# only the first 200 riders of the list are scraped
for rider_name in tqdm.tqdm(rider_names[:200]):

    try:
        teams = get_rider_teams(rider_name)
        save_data(rider_name, teams)
    except Exception as e:
        # show why the rider failed; `except Exception` keeps the loop
        # interruptible, which a bare `except:` does not
        print(rider_name, '---', e)

In [None]:
# second pass over the hand-corrected slugs
for rider_name in tqdm.tqdm(rider_names_corrected_1):

    try:
        teams = get_rider_teams(rider_name)
        save_data(rider_name, teams)
    except Exception as e:
        # show why the rider failed; `except Exception` keeps the loop
        # interruptible, which a bare `except:` does not
        print(rider_name, '---', e)

# Get race calendar

In [None]:
def get_race_calendar(years):
    """Scrape the PCS race calendar of every year in ``years``.

    The first table of each calendar page is taken and rows containing
    missing values are dropped. A year that fails is printed and skipped.

    Returns a list of ``(year, DataFrame)`` for every year that was scraped
    and whose table is not empty.
    """
    calendar = []

    for year in years:

        try:
            url = f'https://www.procyclingstats.com/races.php?year={year}&circuit=&class=&filter=Filter'
            res = requests.get(url)

            tables = pd.read_html(res.content, encoding='utf-8')
            # Keep the year with its table: zipping `years` with the collected
            # tables afterwards mislabels every year that follows a failure.
            calendar.append((year, tables[0].dropna()))

            time.sleep(0.5)

        except Exception as e:
            print(e)

    df_calendar = [(year, cal) for year, cal in calendar if not cal.empty]

    return df_calendar

def save_data(calendar):
    """Write one ``(year, DataFrame)`` calendar entry to ``<CALENDARS_PATH>/<year>.csv``.

    CALENDARS_PATH is created when missing (its parent has to exist).

    NOTE: this replaces the earlier ``save_data`` functions of this notebook.
    """
    calendar_dir_exists = os.path.isdir(CALENDARS_PATH)
    if not calendar_dir_exists:
        os.mkdir(CALENDARS_PATH)

    # calendar[0] is the year, calendar[1] the table of races
    target_file = os.path.join(CALENDARS_PATH, f'{calendar[0]}.csv')
    calendar[1].to_csv(target_file, index=False, encoding='utf-8')

In [None]:
# seasons 2000 .. 2022
years = np.arange(2000, 2023)
calendar = get_race_calendar(years)

# A plain loop instead of a list comprehension: the comprehension was only run
# for its side effect and rendered a meaningless [None, None, ...] cell output.
for cal in calendar:
    save_data(cal)

In [None]:
# Load one saved calendar back from disk; `year` is a string because it is
# used to build the file name.
year = '2021'
calendar = pd.read_csv(os.path.join(CALENDARS_PATH, f'{year}.csv'))

# Get Racename urls

In [56]:
def get_racename_urls(year, circuit, save=False):
    """Collect the URL slugs of all races of one circuit and year.

    The PCS race list is downloaded for ``year`` and ``circuit`` (a key of
    the ``circuits`` mapping below). Every link whose href matches 'race/.*'
    and whose text appears in the 'Race' column of the first table is kept;
    its slug is the second '/'-separated part of the href.

    With ``save=True`` two files are written to
    ``<RACENAMES_PATH>/<year>/<circuit>/``:
      * racename_urls.csv     - the distinct slugs as one csv row
      * racename_mappings.csv - race name -> slug, one line per kept link

    Returns the list of ALL link tags that matched 'race/.*'.
    NOTE(review): the unfiltered tags are returned, not the kept slugs - the
    visible callers ignore the value; confirm before relying on it.

    Raises KeyError for an unknown ``circuit``.
    """
    # value of the `circuit` query parameter for every supported circuit
    circuits = {'uci_wt': 1,
            'uci_world_championships': 2,
            'men_junior': 15,
            'women_elite': 16,
            'women_junior': 17,
            'woment_wt': 24,
            'europe_tour': 13,
            'africa_tour': 11,
            'asia_tour': 12,
            'oceania_tour': 14,
            'america_tour': 18,
            'uci_pro_series': 26,
            'olympic_games': 3,
            'nations_cup': 21}

    url = f'https://www.procyclingstats.com/races.php?year={year}&circuit={circuits[circuit]}&class=&filter=Filter'
    res = requests.get(url)

    table = pd.read_html(res.content, encoding='utf-8')[0]

    soup = BeautifulSoup(res.content)

    racename_urls = soup.find_all('a', href=re.compile('race/.*'))

    # build the lookup once instead of converting the column for every link
    listed_races = set(table.Race)

    racename_urls_keep = []
    mappings_racename = []
    for race_name in racename_urls:
        if race_name.text in listed_races:
            racename_urls_keep.append(race_name['href'].split('/')[1])
            mappings_racename.append(race_name.text)

    df_mappings = pd.DataFrame({'RaceName': mappings_racename,
                                'RaceURL': racename_urls_keep})

    # de-duplicate but keep page order so the written file is reproducible
    racename_urls_keep = list(dict.fromkeys(racename_urls_keep))

    if save:

        out_dir = os.path.join(RACENAMES_PATH, str(year), str(circuit))
        os.makedirs(out_dir, exist_ok=True)

        # newline='' is required by the csv module; explicit utf-8 matches
        # what pd.read_csv expects when the file is read back
        with open(os.path.join(out_dir, 'racename_urls.csv'), 'w', newline='', encoding='utf-8') as f:
            wr = csv.writer(f, quoting=csv.QUOTE_ALL)
            wr.writerow(racename_urls_keep)

        df_mappings.to_csv(os.path.join(out_dir, 'racename_mappings.csv'), encoding='utf-8', index=False)

    return racename_urls
    
def get_race_results(race_url, year):
    """Download the result tables of one race edition.

    When the page holds at least six tables, the first six are returned as
    the tuple ``(last_stage, gc, points, kom, youth, teams)``. Otherwise only
    the first table is returned as a single DataFrame.
    """
    response = requests.get(f'https://www.procyclingstats.com/race/{race_url}/{year}')
    tables = pd.read_html(response.content, encoding='utf-8')

    # ALSO NEED individual stages, kom, points, youth, teams final + stage results

    if len(tables) >= 6:
        last_stage, gc, points, kom, youth, teams = tables[:6]
        return last_stage, gc, points, kom, youth, teams

    # fewer tables than classifications: only the first table is usable
    return tables[0]

def get_startlist(PATH, circuit, race_url, year, save=False):
    """Download the alphabetical startlist of one race edition.

    The '/gc/' variant of the startlist URL is tried first; if anything goes
    wrong the '/result/' variant is used and its errors propagate. The column
    'Unnamed: 3' is dropped from the table.

    With ``save=True`` the table is written to
    ``<PATH>/<circuit>/<race_url>/<year>/startlist.csv``.

    Returns the startlist DataFrame.
    """
    def _download(section):
        # `section` is the URL part that differs between the two variants
        page = requests.get(
            f'https://www.procyclingstats.com/race/{race_url}/{year}/{section}/startlist/alphabetical-with-filters'
        )
        return pd.read_html(page.content, encoding='utf-8')[0].drop(['Unnamed: 3'], axis=1)

    try:
        startlist = _download('gc')
    except Exception:
        startlist = _download('result')

    if save:
        target_dir = os.path.join(PATH, circuit, race_url, year)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        startlist.to_csv(os.path.join(target_dir, 'startlist.csv'), index=False, encoding='utf-8')

    return startlist

### Get racename urls

In [57]:
# Circuit name -> id. The same mapping is defined inside get_racename_urls,
# where the id is put into the `circuit` query parameter; keep both in sync.
# The keys are also used as folder names below the data directories.
# NOTE(review): 'woment_wt' looks like a typo for 'women_wt'; renaming it
# would also change the folder names - confirm before touching it.
circuits = {'uci_wt': 1,
            'uci_world_championships': 2,
            'men_junior': 15,
            'women_elite': 16,
            'women_junior': 17,
            'woment_wt': 24,
            'europe_tour': 13,
            'africa_tour': 11,
            'asia_tour': 12,
            'oceania_tour': 14,
            'america_tour': 18,
            'uci_pro_series': 26,
            'olympic_games': 3,
            'nations_cup': 21}

In [59]:
# seasons 2000 .. 2022
years = np.arange(2000, 2023)

# One request per circuit and year; the files are written by save=True, the
# return value is not used afterwards.
# NOTE(review): unlike the other scraping loops there is no pause between
# requests and no error handling here - one failing page stops the cell.
for circuit in circuits.keys():
    for year in years:
        racename_url = get_racename_urls(year, circuit, save=True)

### Get startlist of all races in year and given circuit

In [None]:
# `year` is a string because it is used as a path component.
year = '2022'
circuit = 'men_junior'
# The url file holds a single row, which pandas reads as the column labels;
# list(<DataFrame>) returns those labels, i.e. the race slugs.
racename_urls = list(pd.read_csv(os.path.join(RACENAMES_PATH, year, circuit, 'racename_urls.csv')))

In [None]:
# `year` comes from the cell above
for circuit in tqdm.tqdm(circuits.keys()):

    try:
        racename_urls = list(pd.read_csv(os.path.join(RACENAMES_PATH, year, circuit, 'racename_urls.csv')))
    except Exception as e:
        # No readable url list for this circuit/year: report it and go on.
        # `race_url` is deliberately not printed here - it is undefined (or a
        # leftover of the previous circuit) when reading the file fails.
        print(circuit, e)
        continue

    for race_url in tqdm.tqdm(racename_urls):

        time.sleep(0.2)
        try:
            startlist = get_startlist(STARTLISTS_PATH, circuit, race_url, year, save=True)
        except Exception as e:
            # `except Exception` keeps the loop interruptible
            print(race_url, '---', e)

### Get GC race results of all races in year and given circuit

In [None]:
def save_data(results, PATH, circuit, race_url, year):
    """Write race results below ``<PATH>/<circuit>/<race_url>/<year>/``.

    A single DataFrame is saved as 'race_results.csv'. Any other iterable is
    treated as the classification tables in the order last_stage, gc, points,
    kom, youth, team and saved as one csv file per table.

    NOTE: this replaces the earlier ``save_data`` functions of this notebook.
    """
    out_dir = os.path.join(PATH, circuit, race_url, year)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    if isinstance(results, pd.DataFrame):
        results.to_csv(os.path.join(out_dir, 'race_results.csv'), index=False, encoding='utf-8')
        return

    classification_names = ['last_stage', 'gc', 'points', 'kom', 'youth', 'team']
    for table, name in zip(results, classification_names):
        table.to_csv(os.path.join(out_dir, f'{name}.csv'), index=False, encoding='utf-8')


In [None]:
year = '2022'

for circuit in tqdm.tqdm(circuits.keys()):

    try:
        racename_urls = list(pd.read_csv(os.path.join(RACENAMES_PATH, year, circuit, 'racename_urls.csv')))
    except Exception as e:
        # No readable url list for this circuit/year: report it and go on.
        # `race_url` is not printed here because it is undefined or stale
        # when reading the file fails.
        print(circuit, e)
        continue

    for race_url in tqdm.tqdm(racename_urls):

        time.sleep(0.2)
        try:
            results = get_race_results(race_url, year)
            save_data(results, RACERESULTS_PATH, circuit, race_url, year)
        except Exception as e:
            # `except Exception` keeps the loop interruptible
            print(race_url, '---', e)

### Get all stage results of a given race and year

In [15]:
def getStageResults(circuit, race_url, year, save=False):
    """Download the result table of every stage of one race edition.

    The number of stages is the number of rows of the first table on the
    '.../gc/stages/winners' page. If that lookup raises ValueError
    (presumably pandas finding no table on the page - TODO confirm), the
    function returns None.

    With ``save=True`` each stage is written to
    ``<RACERESULTS_PATH>/<circuit>/<race_url>/<year>/stage_<n>.csv``; the
    folder is created when it does not exist yet.

    Returns the list of stage DataFrames, stage 1 first, or None.
    """
    # get number of stages in race
    try:
        url = f'https://www.procyclingstats.com/race/{race_url}/{year}/gc/stages/winners'
        res = requests.get(url)
        tables = pd.read_html(res.content, encoding='utf-8')
        n_stages = len(tables[0])
    except ValueError:
        return None

    # get individual stage results
    stage_result = []
    for n in range(1, n_stages+1):

        url = f'https://www.procyclingstats.com/race/{race_url}/{year}/stage-{n}'
        res = requests.get(url)
        tables = pd.read_html(res.content, encoding='utf-8')
        stage_result.append(tables[0])
        time.sleep(0.1)

    if save:
        out_dir = os.path.join(RACERESULTS_PATH, circuit, race_url, year)
        # do not rely on another cell having created the race folder
        os.makedirs(out_dir, exist_ok=True)

        for stage, stage_table in enumerate(stage_result, start=1):
            stage_table.to_csv(os.path.join(out_dir, f'stage_{stage}.csv'), index=False, encoding='utf-8')

    return stage_result

In [None]:
# Parameters for a single-race example; all three are used as URL and path
# components, hence `year` is a string.
circuit = 'uci_wt'
year = '2022'
race_url = 'paris-nice'

In [None]:
# Scrape and save every stage of the example race chosen above.
stages = getStageResults(circuit, race_url, year, save=True)

In [None]:
year = '2022'

for circuit in tqdm.tqdm(circuits.keys()):

    try:
        racename_urls = list(pd.read_csv(os.path.join(RACENAMES_PATH, year, circuit, 'racename_urls.csv')))
    except Exception as e:
        # No readable url list for this circuit/year: report it and go on.
        # `race_url` is not printed here because it would be a leftover of an
        # earlier cell or circuit when reading the file fails.
        print(circuit, e)
        continue

    for race_url in tqdm.tqdm(racename_urls):

        time.sleep(0.2)
        try:
            stages = getStageResults(circuit, race_url, year, save=True)
        except Exception as e:
            # `except Exception` keeps the loop interruptible
            print(race_url, '---', e)

# Get rider images

In [None]:
def get_rider_img(rider_name):
    """Download a rider's portrait to ``<IMG_PATH>/<rider_name>/<rider_name>-img.jpeg``.

    The rider page is searched for <img> tags whose src matches
    'images.*\\.jpeg'; the first match is downloaded. Every failure (request,
    no matching image, writing the file) is printed and swallowed, so the
    function never raises for a single rider. The rider folder is only
    created once an image has been downloaded.
    """
    try:
        url = f'https://www.procyclingstats.com/rider/{rider_name}'
        res = requests.get(url)

        soup = BeautifulSoup(res.content)
        img_path = soup.find_all('img', {'src': re.compile(r'images.*\.jpeg')})

        # URLs are not file paths: os.path.join would drop the host for a src
        # starting with '/' and use backslashes on Windows. urljoin handles
        # relative and absolute src values alike.
        url = urljoin('https://www.procyclingstats.com/', img_path[0]['src'])
        res = requests.get(url)

        rider_dir = os.path.join(IMG_PATH, rider_name)
        os.makedirs(rider_dir, exist_ok=True)

        with open(os.path.join(rider_dir, f'{rider_name}-img.jpeg'), 'wb') as file:
            file.write(res.content)

        time.sleep(0.1)

    except Exception as e:
        print(rider_name, '---', e)

In [None]:
# Download portraits for the first 200 riders; get_rider_img reports failures
# itself, so no error handling is needed here.
for rider_name in rider_names[:200]:
    get_rider_img(rider_name)

In [None]:
# Hand-written URL slugs; same list as in the results section, repeated here
# so that this cell can be run on its own.
rider_names_corrected_1 = ['jonas-vingegaard-rasmussen', 'mikkel-honore', 'aleksey-lutsenko', 'ben-o-connor',
                         'michael-valgren-andersen', 'biniam-girmay', 'miguel-angel-lopez', 'johan-esteban-chaves',
                         'michal-kwiatkowski', 'jesus-herrada-lopez', 'odd-christian-eiking', 'magnus-cort-nielsen',
                         'daniel-felipe-martinez', 'luis-leon-sanchez', 'tobias-halland-johannessen', 'jose-manuel-diaz-gallego',
                         'juan-ayuso-pesquera']

for rider_name in rider_names_corrected_1:
    get_rider_img(rider_name)

### Get rider performance stats

In [None]:
def get_rider_stats(race, year):
    """Scrape the startlist KPI table of a race and store it per race and per rider.

    Written files:
      * ``<STARTLISTS_PATH>/<race>/<year>/stats-kpis.csv`` - the whole table
      * ``<RIDERSTATS_PATH>/<rider slug>/stats-kpis-<today>.csv`` - one file
        per rider holding that rider's row

    The rider slug is built with normalize_rider_name so that the folder
    names agree with the slugs used for the other per-rider data (a plain
    token reversal gives a different slug for names of more than two words).

    Errors are printed and swallowed.

    Returns the KPI DataFrame, or None when scraping or saving failed.
    """
    # the race folder is created even when the download fails afterwards
    race_dir = os.path.join(STARTLISTS_PATH, race, year)
    os.makedirs(race_dir, exist_ok=True)

    try:
        url = f'https://www.procyclingstats.com/race/{race}/{year}/result/startlist/kpis'
        res = requests.get(url)

        stats = pd.read_html(res.content, encoding='utf-8')[0]
        # today's date as YYYY-MM-DD, used in the per-rider file name
        date = datetime.date.today().isoformat()

        stats.to_csv(os.path.join(race_dir, 'stats-kpis.csv'), index=False, encoding='utf-8')

        for _, rider_row in stats.iterrows():

            rider_name = normalize_rider_name(rider_row.Rider)

            rider_dir = os.path.join(RIDERSTATS_PATH, rider_name)
            os.makedirs(rider_dir, exist_ok=True)

            # one-row table: the Series becomes a column, .T turns it into a row
            rider_row.to_frame().T.to_csv(os.path.join(rider_dir, f'stats-kpis-{date}.csv'), index=False, encoding='utf-8')

    except Exception as e:
        print(race, '---', e)
        return None

    return stats
    
    

In [None]:
# Both values are used as URL and path components, hence `year` is a string.
year = '2022'
race = 'milano-sanremo'

# The scraped KPI data is written to disk by get_rider_stats itself.
rider_stats = get_rider_stats(race, year)

# Get Rankings by speciality

In [3]:
def get_itt_rankings(date):
    """Download the first page (offset 0) of the PCS time-trial ranking.

    ``date`` is inserted verbatim into the ``date`` query parameter; the
    caller in this notebook passes it as 'YYYY-MM-DD'.

    Returns the first HTML table of the page as a DataFrame.
    """
    response = requests.get(
        f'https://www.procyclingstats.com/rankings.php?date={date}&nation=&age=&zage=&page=smallerorequal&team=&offset=0&filter=Filter&p=me&s=time-trial'
    )
    ranking_tables = pd.read_html(response.content, encoding='utf-8')

    return ranking_tables[0]

In [12]:
year = '2022'
# today's date as YYYY-MM-DD; used for the ranking request and as folder name
date = datetime.datetime.now().strftime('%Y-%m-%d')

itt_rankings = get_itt_rankings(date)

In [13]:
# one folder per year and download date; nothing happens if it already exists
os.makedirs(os.path.join(SPECIALITY_RANKINGS_PATH, 'ITT', year, date), exist_ok=True)

In [14]:
# NOTE(review): unlike every other export in this notebook, index=False is not
# passed here, so the DataFrame index is written as the first csv column -
# confirm that this is intended.
itt_rankings.to_csv(os.path.join(SPECIALITY_RANKINGS_PATH, 'ITT', year, date, 'itt.csv'), encoding='utf-8')