In [6]:
#always run this part

import os
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse
from IPython.display import clear_output

In [7]:
######base functions

#download source code for any url
user_agent = {'User-agent': 'Mozilla/5.0'}

def download_soup(url, timeout=30):
    """Fetch ``url`` and return its body parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on a stalled connection, which
        would freeze a long scraping run.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the ``lxml`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure or when ``timeout`` is exceeded.
    """
    response = requests.get(url, headers=user_agent, timeout=timeout)
    # HTTP error statuses are deliberately not raised: create_df_calendar
    # inspects the parsed page for the "Could not find race" message.
    return BeautifulSoup(response.text, 'lxml')

def sleep(t):
    """Pause for a random duration between ``t`` and ``2*t`` seconds.

    The chosen duration (rounded to two decimals) is printed before pausing.
    """
    jitter = t * random.random()
    delay = t + jitter
    print('downloading and parsing done, sleeping for',round(delay,2),'sec')
    time.sleep(delay)

In [8]:
#####main functions

# Circuits scraped when building the calendar csv files. Each number is the
# value sent as the "circuit" query parameter of the races listing page.
race_circuits_men = (1, 2, 11, 12, 13, 14, 18)
# Human-readable label for every circuit id listed above
# (America Tour is circuit 18, matching race_circuits_men).
circuits_code = {1:"WT", 2:"WC", 11:"Africa Tour", 12:"Asia Tour", 13:"Europe Tour", 14:"Oceania Tour", 18:"America Tour"}
# For a quick test run, use a shorter selection, e.g. race_circuits_men = (1, 2)

In [9]:
# for race soups

def get_name(race_soup):
    """Return the page title of ``race_soup`` with the " | Results" suffix removed."""
    title_tag = race_soup.find('title')
    return title_tag.text.replace(" | Results", "")

def get_cat(soup):
    """Extract the race category from the ``div.entry race`` header of a page.

    The category is the text after " (" in a header string whose first and
    last characters are dropped before splitting. The string is looked up at
    child position 3 of the header's second child; if that lookup raises
    ``IndexError`` (position missing, or no " (" in the text), position 2 is
    tried instead.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed race page.

    Returns
    -------
    str
        The category text, or " - " when it cannot be extracted.
    """
    inter_soup = soup.find('div', {'class':'entry race'})
    try:
        header_parts = inter_soup.contents[1].contents
        try:
            return header_parts[3][1:-1].split(" (")[1]
        except IndexError:
            # Some pages carry the "name (category)" text one position earlier.
            return header_parts[2][1:-1].split(" (")[1]
    except Exception:
        # Best-effort scraping: any layout surprise yields the placeholder.
        # Only Exception is caught so Ctrl-C / kernel interrupts still propagate.
        return " - "

def get_stages_url(race_soup):
    """List the stage links offered by the ``div.ESNav stages`` selector.

    Returns the ``value`` of the first option matching "prologue" (if any),
    followed by the ``value`` of every option matching "stage", in page order.
    """
    nav = race_soup.find('div', {'class':'ESNav stages'})
    prologue_option = nav.find('option', value = re.compile('prologue'))
    stage_options = nav.find_all('option', value = re.compile('stage'))

    # The prologue, when present, always comes first.
    urls = [prologue_option['value']] if prologue_option else []
    for option in stage_options:
        urls.append(option['value'])
    return urls

# for stage soups

def get_start_finish_type_length(stage_soup):
    """Extract start town, finish town, stage type and length from a page.

    All values are read from the page's ``div.entry race`` block. Each of the
    three lookups is best-effort and falls back to a placeholder on failure.

    Parameters
    ----------
    stage_soup : BeautifulSoup
        Parsed stage (or one-day race) page.

    Returns
    -------
    tuple
        ``(start, finish, stage_type, length)`` where

        * ``start`` / ``finish`` are the two sides of the "›" separator in the
          ``span.red`` text ("Start" / "Finish" when unavailable),
        * ``stage_type`` is "TTT", "ITT", "PRL", "RR", or "-" when the
          ``span.blue`` label is unavailable,
        * ``length`` is the distance as a float, or 0 when unavailable.
    """
    inter_soup = stage_soup.find('div', {'class':'entry race'})

    # start and finish
    try:
        sflist = inter_soup.find("span", class_='red').text.replace("\xa0", "").split("›")
        start = sflist[0].strip()
        finish = sflist[1].strip()
    except Exception:
        start = "Start"
        finish = "Finish"

    # stage type
    try:
        label = inter_soup.find("span", class_='blue').text
        if "(TTT)" in label:
            stage_type = "TTT"
        elif "(ITT)" in label:
            stage_type = "ITT"
        elif "Prologue" in label:
            # Membership test: str.find() returns -1 (truthy) when absent.
            stage_type = "PRL"
        else:
            stage_type = "RR"
    except Exception:
        stage_type = "-"

    # stage length km
    try:
        # Drops the first character and the last two before converting;
        # presumably the surrounding parenthesis/unit characters - TODO confirm.
        length = float(inter_soup.find("span", class_='red distance').text[1:-2])
    except Exception:
        length = 0

    return start, finish, stage_type, length

In [12]:
def _extract_date(soup):
    """Return the date found in the page's ``div.res-right`` block as a ``datetime.date``."""
    date_soup = soup.find('div', {'class' : 'res-right'})
    # The date is the first text node that is a direct child of the div.
    temp_dt = date_soup.find(text=True, recursive=False)
    return parse(temp_dt).date()


def create_df_calendar(year_start, year_end, race_circuits):
    """Scrape the race calendar and return it as two DataFrames.

    For every year in ``[year_start, year_end]`` and every circuit in
    ``race_circuits`` the circuit listing is downloaded, then each linked race
    page, then (for stage races) each stage page. Progress is printed as the
    scrape advances, and both tables are written to ``data/race_infos3.csv``
    and ``data/stage_infos3.csv`` after each completed year.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to scrape (both inclusive).
    race_circuits : iterable of int
        Circuit ids sent as the ``circuit`` query parameter.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``race_df`` with columns Race_Name, Category, Number_stages, and
        ``stage_df`` with columns Stage_Name, Date, Stage_Type, Start, Finish,
        Race_ID, Stage#, url, Length. ``Race_ID`` is the row index of the
        owning race in ``race_df``; one-day races get a single row with
        ``Stage#`` 0. ``Length`` is the value returned by
        ``get_start_finish_type_length`` (labelled as km in the source page
        scraping comments).
    """
    base_url = 'https://www.procyclingstats.com/'
    race_columns = ["Race_Name", "Category", "Number_stages"]
    stage_columns = ["Stage_Name", "Date", "Stage_Type", "Start", "Finish",
                     "Race_ID", "Stage#", "url", "Length"]

    # Rows are collected as plain dicts and turned into frames only when
    # needed; growing a DataFrame with concat on every row is quadratic.
    race_records = []
    stage_records = []
    race_df = pd.DataFrame(race_records, columns=race_columns)
    stage_df = pd.DataFrame(stage_records, columns=stage_columns)

    print("STARTING CALENDAR")
    # Make sure the yearly checkpoint can be written before scraping for hours.
    os.makedirs("data", exist_ok=True)

    #get data for many years
    for year in range(year_start, year_end+1):

        #a year has many race circuits
        for circuit in race_circuits:
            circuit_url = base_url + "races.php?year=" + str(year) + "&circuit=" + str(circuit) + "&ApplyFilter=Filter"
            circuit_soup = download_soup(circuit_url)
            content = circuit_soup.find('div', {'class' : 'content'})
            race_table = content.find('table') if content is not None else None
            if race_table is None:
                print("No race table found, skipping", circuit_url)
                continue

            #a circuit has many races
            for race_tmp in race_table.find_all('a', href = re.compile('race/')):
                clear_output(wait=True)
                # Show the five most recent races, indexed as in the final frame.
                first_shown = max(0, len(race_records) - 5)
                print(pd.DataFrame(race_records[first_shown:], columns=race_columns,
                                   index=range(first_shown, len(race_records))))
                # sleep(0.4) can be called here to throttle requests.
                path = race_tmp['href']
                race_url = base_url + path
                if race_url.endswith("overview"):
                    race_url = race_url[:-len("overview")]
                print(race_url)

                race_soup = download_soup(race_url)

                #check if page exists (and looks like a race page), else skip this race
                entry = race_soup.find('div', {"class":"entry"})
                if entry is None or entry.text == "Could not find race":
                    continue

                race_name = get_name(race_soup)
                race_cat = get_cat(race_soup)

                is_stage_race = bool(race_soup.find("div", class_="ESNav stages"))
                if is_stage_race:
                    stage_urls = get_stages_url(race_soup)
                    nb_stages = len(stage_urls)
                else:
                    stage_urls = []
                    nb_stages = 1

                # The new row's index is simply its position; looking it up by
                # name would hit an earlier race that shares the same title.
                race_index = len(race_records)
                race_records.append({"Race_Name" : race_name, "Category" : race_cat,
                                     "Number_stages" : nb_stages})

                if is_stage_race:
                    print("IS A STAGE RACE")
                    # Stage numbers follow list order starting at 1 (a prologue,
                    # when present, therefore takes number 1).
                    for stage_rank, stage_path in enumerate(stage_urls, start=1):
                        stage_url = base_url + stage_path
                        stage_soup = download_soup(stage_url)
                        print(stage_url)

                        stage_name = get_name(stage_soup)
                        start_str, finish_str, stage_type, stage_length = get_start_finish_type_length(stage_soup)
                        stage_date = _extract_date(stage_soup)

                        stage_records.append({"Stage_Name" : stage_name, "Date" : stage_date,
                                              "Stage_Type" : stage_type, "Start" : start_str,
                                              "Finish" : finish_str, "Race_ID" : race_index,
                                              "Stage#" : stage_rank, "url" : stage_url,
                                              "Length" : stage_length})

                #if one day race, the race page itself describes the course
                else:
                    print("IS A ONE DAY RACE")
                    odr_date = _extract_date(race_soup)
                    odr_start, odr_finish, odr_type, odr_length = get_start_finish_type_length(race_soup)

                    # Store the absolute URL, as is done for stages.
                    stage_records.append({"Stage_Name" : race_name, "Date" : odr_date,
                                          "Stage_Type" : odr_type, "Start" : odr_start,
                                          "Finish" : odr_finish, "Race_ID" : race_index,
                                          "Stage#" : 0, "url" : race_url,
                                          "Length" : odr_length})

        # Save every year
        race_df = pd.DataFrame(race_records, columns=race_columns)
        stage_df = pd.DataFrame(stage_records, columns=stage_columns)
        race_df.to_csv("data/race_infos3.csv")
        stage_df.to_csv("data/stage_infos3.csv")

    return race_df, stage_df

In [13]:
# Scrape the 2014-2018 calendar for the selected circuits and keep both
# frames in the kernel instead of only echoing them as cell output.
race_df, stage_df = create_df_calendar(2014, 2018, race_circuits_men)

                                              Race_Name Category Number_stages
3227        National Championships Argentina - ITT 2018       NC             1
3228    National Championships Argentina U23 - ITT 2018       NC             1
3229  National Championships Argentina - Road Race 2018       NC             1
3230  National Championships Argentina U23 - Road Ra...       NC             1
3231                         Joe Martin Stage Race 2018      2.2             8
https://www.procyclingstats.com/race/tour-of-the-gila/2018
IS A STAGE RACE
https://www.procyclingstats.com/race/tour-of-the-gila/2018/stage-1
https://www.procyclingstats.com/race/tour-of-the-gila/2018/stage-2
https://www.procyclingstats.com/race/tour-of-the-gila/2018/stage-3
https://www.procyclingstats.com/race/tour-of-the-gila/2018/stage-4


KeyboardInterrupt: 