In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import time
from collections import defaultdict
import requests

In [2]:
# Headers for all the data we will be scraping in this notebook
headers = ['Year','TRK','Track','Category','Session','Date','Track_Condition','Track_Temp','Air_Temp',
           'Humidity','Position','Points','Rider_Number','Rider_Name','Nationality','Team_Name',
           'Bike','Avg_Speed','Time']
headers2 = ['Track', "Trk Length", 'Left_Corners', 'Right_Corners', 'track_width', 'length of longest straight', 'MotoGP_avg_speed', 'GP_distance', 'Moto2_distance', 'Moto3_distance'] 


#had to remove 2002 - 2004
years = ['2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']

base_url = 'http://www.motogp.com/en/Results+Statistics/'

In [3]:
def soup_stuff(url):
    """Returns a BeautifulSoup object for the provided url"""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup

In [4]:
def get_date(soup):
    """ Returns the date of the race, or 'n/a' if 
        information does not exist in the provided soup """
    find = soup.find(class_='padbot5')
    if find is None:
        r = 'n/a'
    else:
        r = ','.join(find.text.replace(',',' ').split()[-3:])
    return r

In [5]:
def get_tr_con(soup):
    """ Returns the track condition during a race, or 'n/a' if 
        information does not exist in the provided soup """
    find = soup.find(class_='sprite_weather track_condition')
    if find is None:
        r = 'n/a'
    else:
        r = find.findNext().text.split()[2]
    return r

In [6]:
def get_tr_tmp(soup):
    """ Returns the track temperature during a race, or 'n/a' if 
        information does not exist in the provided soup """
    find = soup.find(class_='sprite_weather ground')
    if find is None:
        r = 'n/a'
    else:
        r = find.findNext().text.split()[1]
    return r

def get_air_tmp(soup):
    """ Returns the air temperature during a race, or 'n/a' if 
        information does not exist in the provided soup """
    find = soup.find(class_='sprite_weather air')
    if find is None:
        r = 'n/a'
    else:
        r = find.findNext().text.split()[1]
    return r

def get_humidity(soup):
    """ Returns the track humidity during a race, or 'n/a' if 
        information does not exist in the provided soup """
    find = soup.find(class_='sprite_weather humidity')
    if find is None:
        r = 'n/a'
    else:
        r = find.findNext().text.split()[1]
    return r

def get_all_races(soup):
    """ Returns all the races that took place in a particular season
        for which the soup was passed in """
    find = soup.find(id='event')
    if find is None:
        r = []
    else:
        r = find.find_all('option')
    return r

def get_all_cats(soup):
    """ Returns all the different categories (MotoGP, Moto2, etc.)
        that took place at a particular track in the provided soup """
    find = soup.find(id='category')
    if find is None:
        r = []
    else:
        r = find.find_all('option')
    return r

def get_race_sessions(soup):
    """ Returns all the different race sessions (RACE, RACE2, etc.)
        that took place at a particular track in the provided soup """
    find = soup.find(id='session')
    r = []
    if find is None:
        return r
    else:
        r2 = find.find_all('option')
        for s in r2:
            if s.text.find('RACE') > -1:
                r.append(s.text.replace('E',''))
        return r

In [7]:
def get_all_stats(soup, year, trk, track, cat, ssn):
    
    if soup.find('tbody') is None:
        return [dict(zip(headers, [year, trk, track, cat, ssn]+['n/a']*(len(headers)-3)))]
    else:
        riders = soup.find('tbody').find_all('a')
        stats_to_return = []

        # raceday stats
        date = get_date(soup)
        tr_con = get_tr_con(soup)
        tr_tmp = get_tr_tmp(soup)
        air_tmp = get_air_tmp(soup)
        humid = get_humidity(soup)
        
        # rider stats
        for r in riders:
            pos = r.findPrevious().findPrevious().findPrevious().findPrevious().text
            if pos=='':
                pos='crash'
            else:
                pos=int(pos)    
            points = r.findPrevious().findPrevious().findPrevious().text
            if points=='':
                points=0
            else:
                points=float(points)
            r_num = r.findPrevious().findPrevious().text
            if r_num != '':
                r_num = int(r_num)
            r_nam = r.text
            r_nat = r.findNext().text
            team = r.findNext().findNext().text
            bike = r.findNext().findNext().findNext().text
            avgspd = r.findNext().findNext().findNext().findNext().text
            time = r.findNext().findNext().findNext().findNext().findNext().text

            stats_dict = dict(zip(headers, [year, trk, track, cat, ssn, date, tr_con, tr_tmp, air_tmp,
                                            humid, pos, points, r_num, r_nam, r_nat, team,
                                            bike, avgspd, time]))
            stats_to_return.append(stats_dict)

        return stats_to_return

In [13]:
# loop through all parameters

for yr in reversed(years):
    data_list = []
    soup_yr = soup_stuff(base_url + yr)
    races = get_all_races(soup_yr)
    print(yr)
    
    for rc in races:
        TRK = rc['value']
        Track = rc['title']
        print(TRK, end=", ")
        url_rc = base_url +yr +'/' +TRK +'/'
        soup_rc = soup_stuff(url_rc)
        categories = get_all_cats(soup_rc)
        
        for cat in categories:
            CAT = cat.text
            url_c = base_url +yr +'/' +TRK +'/' + CAT + '/'
            soup_c = soup_stuff(url_c)
            sessions = get_race_sessions(soup_c)
            
            for ssn in sessions:
                SSN = ssn
                url_ssn = base_url +yr +'/' +TRK +'/' + CAT + '/' + SSN + '/Classification'
                soup_ssn = soup_stuff(url_ssn)
                data_list.extend(get_all_stats(soup_ssn, yr, TRK, Track, CAT, SSN))
                time.sleep(1+np.random.random())
    
    df = pd.DataFrame(data_list, columns=headers)
    fn = '/Archive/' + yr + '_data.csv'
    df.to_csv(fn)
    print(fn)
    time.sleep(1+np.random.random())

print('>> Scraping complete!')

2018
QAT, ARG, AME, SPA, FRA, ITA, CAT, NED, GER, CZE, AUT, GBR, RSM, ARA, THA, JPN, AUS, MAL, VAL, 

FileNotFoundError: [Errno 2] No such file or directory: '/Archive/2018_data.csv'

In [None]:
# first, get all tracks from 2002-2018
track_list = []
GPs_list = []
track_names = []

for yr in reversed(years):
    soup_yr = soup_stuff(base_url + yr)
    races = get_all_races(soup_yr)
    print('')
    print(yr, end = " - ")
    
    for rc in races:
        TRK = rc['value']
        Track = rc['title']
        print(TRK, end=", ")
        track_list.append(TRK)
        GPs_list.append(Track.split(' - ')[0])
        track_names.append(Track.split(' - ')[1])
        
    time.sleep(1+np.random.random())

In [None]:
# extract the unique ones 
combined_list = []
for index, item in enumerate(track_list):
    combined_list.append(item+' - '+track_names[index])
combined_track_set = set(combined_list)

In [None]:
combined_track_set = combined_track_set.remove("JPN - Suzuka Circuit")
combined_track_set = combined_track_set.remove("RSA - Phakisa Freeway")
combined_track_set = combined_track_set.remove("ITA - Autodromo Internazionale del Mugello':'Italy")


In [None]:
combined_track_set.remove('AUS - Phillip Island')
combined_track_set.remove('AUT - Red Bull Ring')
combined_track_set.remove('RIO - Nelson Piquet Circuit')
combined_track_set.remove('THA - Chang International Circuit')
combined_track_set.remove('PAC - Twin Ring Motegi')

In [None]:
#Manually inserting tracks that lack a URL 

track_url_dict = {'AME - Circuit Of The Americas':'Americas',
                  'ARA - MotorLand Aragón':'Aragon',
                  'ARG - Termas de Río Hondo':'Argentina',
                  'AUS - Phillip Island':'Australia',
                  'AUT - Red Bull Ring – Spielberg':'Austria',
                  'CAT - Circuit de Barcelona-Catalunya':'Catalunya',
                  'CHN - Shanghai Circuit':0,
                  'CZE - Automotodrom Brno':'Czech+Republic',
                  'FRA - Le Mans':'France',
                  'GBR - Donington Park Circuit':0,
                  'GBR - Silverstone Circuit':'Great+Britain',
                  'GER - Sachsenring':'Germany',
                  'INP - Indianapolis Motor Speedway':0,
                  'ITA - Autodromo del Mugello':'Italy',
                  'JPN - Twin Ring Motegi':'Japan',
                  'MAL - Sepang International Circuit':'Malaysia',
                  'NED - TT Circuit Assen':'Netherlands',
                  'THA - Chang International Circuit' : 'Thailand',
                  'POR - Estoril Circuit':0,
                  'QAT - Losail International Circuit':'Qatar',
                  'RIO - Nelson Piquet Circuit': 0,
                  'RSM - Misano World Circuit Marco Simoncelli':'San+Marino',
                  'SPA - Circuito de Jerez':'Spain',
                  'TUR - Istanbul Circuit':0,
                  'USA - Mazda Raceway Laguna Seca':0,
                  'VAL - Circuit Ricardo Tormo':'Valencia'    
}

In [None]:
# functions to get basic track info
def get_GP_info(track_url_str):
    """
    Returns a list with track length, number of left corners, number of right corners,
    track width, and length of longest straight. For any unavailable values, it returns
    'n/a' instead of a float or int.
    """
    url = 'http://www.motogp.com/en/event/' + track_url_str + '#info-track'
    soupy = soup_stuff(url)
    attributes = soupy.find(id='circuit_numbers').find_all(class_='circuit_number_content')
    strs = []
    list_data = []
    
    for s in range(len(attributes)):
        strs.append(attributes[s].text)

    if float(strs[0].split()[0])==0:
        list_data.append('n/a')
    else:
        list_data.append(float(strs[0].split()[0]))

    if strs[1]=='':
        list_data.append('n/a')
    else:
        list_data.append(int(strs[1]))
    
    if strs[2]=='':
        list_data.append('n/a')
    else:
        list_data.append(int(strs[2]))
    
    if len(strs[3].split())==1:
        list_data.append('n/a')
    else:
        list_data.append(float(strs[3].split()[0]))
    
    if len(strs[4].split())==1:
        list_data.append('n/a')
    else:
        list_data.append(float(strs[4].split()[0]))

    return list_data

def get_GP_info_additional(track_url_str):
    """
    Returns MotoGP average speed, MotoGP distance, Moto2 distance,
    and Moto3 distance for the particular track. If data does not exist,
    it returns 'n/a' in place of a float or int.
    """
    url = 'http://www.motogp.com/en/event/' + track_url_str + '#info-track'
    soupy = soup_stuff(url)
    
    # MotoGP average speed
    avg_speed_str = soupy.find(class_='c-statistics__speed-item').text
    if avg_speed_str == '-' or avg_speed_str == '':
        avg_speed = 'n/a'
    else:
        avg_speed = float(avg_speed_str)
    
    attributes = soupy.find(class_='c-event__row-item col-xs-12 col-lg-7 col-lg-pull-5').find_all(class_='c-laps__item')
    
    # MotoGP distance
    GP_dist = attributes[1].text.split()[0]
    GP_dist = float(GP_dist)
    if GP_dist==0: GP_dist='n/a'
        
     # Moto2 distance
    m2_dist = attributes[4].text.split()[0]
    m2_dist = float(m2_dist)    
    if m2_dist==0: m2_dist='n/a'
        
     # Moto3 distance
    m3_dist = attributes[7].text.split()[0]
    m3_dist = float(m3_dist)   
    if m3_dist==0: m3_dist='n/a'
        
    return [avg_speed, GP_dist, m2_dist, m3_dist]
    

In [None]:
# make a list of dictionaries for track information
headers_2 = ['GP','track_length_km','l_corners','r_corners',
           'width_m','straight_m','GP_avg_speed','gp_dist',
           'm2_dist','m3_dist']

track_data = []
for track in combined_track_set:
    if track_url_dict[track] != 0:
        print('//', end='')
        l_GP, L_c, R_c, wid, strt = get_GP_info(track_url_dict[track])
        GP_avg_spd, gp_d, m2_d, m3_d = get_GP_info_additional(track_url_dict[track])
        track_dict = dict(zip(headers2, [track,l_GP,L_c,R_c,wid,strt,GP_avg_spd,gp_d,m2_d,m3_d]))
        track_data.append(track_dict)
        time.sleep(1+np.random.random())
print('Complete!')

In [None]:
# url = 'http://www.motogp.com/en/event/Qatar'
# soupy = soup_stuff(url)
# attributes = soupy.find(class_='c-event__row-item col-xs-12 col-lg-7 col-lg-pull-5').find_all(class_='c-laps__item')
# GP_dist = float(attributes[1].text.split()[0])
# if GP_dist==0: GP_dist='n/a'
# print(GP_dist)        

In [None]:
# manually add in the info for tracks which have a 0 in the track_url_dict
# information is from archived PDFs like the one at this following link
# http://resources.motogp.com/files/results/2006/CHN/circuit+information.pdf?v1_96143780


dict_austria = {'Track' : 'AUT - Red Bull Ring', 'MotoGP_avg_speed' : 182.8, 'GP_distance' : 120.9, 
                'Left_Corners' : 3.0, 'Moto2_distance' : 108.0, ' Moto3_distance' : 99.3, 'Right_Corners' : 7.0, 
                'length of longest straight' : 626.0, 'Trk Length':4.3, 'track_width':13.0}

dict_shanghai = {'Track' : 'CHN - Shanghai Circuit', 'MotoGP_avg_speed' : 'n/a', 'GP_distance' : 'n/a', 
                'Left_Corners' : 7.0, 'Moto2_distance' : 'n/a', ' Moto3_distance' : 'n/a', 'Right_Corners' : 7.0, 
                'length of longest straight' : 1202.0, 'Trk Length':5.281, 'track_width':14.0}

dict_donington = {'Track' : 'GBR - Donington Park Circuit', 'MotoGP_avg_speed' : 'n/a', 'GP_distance' : 'n/a', 
                'Left_Corners' : 4.0, 'Moto2_distance' : 'n/a', ' Moto3_distance' : 'n/a', 'Right_Corners' : 7.0, 
                'length of longest straight' : 564.0, 'Trk Length':4.023, 'track_width':10.0}

dict_indianapolis = {'Track' : 'INP - Indianapolis Motor Speedway', 'MotoGP_avg_speed' : 'n/a', 'GP_distance' : 'n/a', 
                'Left_Corners' : 10.0, 'Moto2_distance' : 'n/a', ' Moto3_distance' : 'n/a', 'Right_Corners' : 6.0, 
                'length of longest straight' : 644.0, 'Trk Length':4.216, 'track_width':16.0}

dict_estoril = {'Track' : 'POR - Estoril Circuitt', 'MotoGP_avg_speed' : 'n/a', 'GP_distance' : 'n/a', 
                'Left_Corners' : 4.0, 'Moto2_distance' : 'n/a', ' Moto3_distance' : 'n/a', 'Right_Corners' : 9.0, 
                'length of longest straight' : 986.0, 'Trk Length':4.182, 'track_width':14.0}

dict_istanbul = {'Track' : 'TUR - Istanbul Circuit', 'MotoGP_avg_speed' : 'n/a', 'GP_distance' : 'n/a', 
                'Left_Corners' : 8.0, 'Moto2_distance' : 'n/a', ' Moto3_distance' : 'n/a', 'Right_Corners' : 6.0, 
                'length of longest straight' : 720.0, 'Trk Length':5.340, 'track_width':21.0}

dict_laguna = {'Track' : 'USA - Mazda Raceway Laguna Seca', 'MotoGP_avg_speed' : 'n/a', 'GP_distance' : 'n/a', 
                'Left_Corners' : 7.0, 'Moto2_distance' : 'n/a', ' Moto3_distance' : 'n/a', 'Right_Corners' : 4.0, 
                'length of longest straight' : 966.0, 'Trk Length':3.610, 'track_width':15.0}

dict_australia = {'Track' : 'AUS - Phillip Island', 'MotoGP_avg_speed' : 176.3, 'GP_distance' : 120.1, 
                'Left_Corners' : 7.0, 'Moto2_distance' : 111.2, ' Moto3_distance' : 102.3, 'Right_Corners' : 5.0, 
                'length of longest straight' : 900.0, 'Trk Length':4.4, 'track_width':13.0}

dict_brazil = {'Track' : 'RIO - Autódromo Internacional Nelson Piquet', 'MotoGP_avg_speed' : 'n/a', 'GP_distance' : 'n/a', 
                'Left_Corners' : 6.0 , 'Moto2_distance' : 'n/a', ' Moto3_distance' : 'n/a', 'Right_Corners' : 6.0, 
                'length of longest straight' : 'n/a', 'Trk Length': 5.031, 'track_width':'n/a'}

dict_Thai = {'Track' : 'THA - Chang International Circuit', 'MotoGP_avg_speed' : 177.9, 'GP_distance' : 118.4, 
                #'Left_Corners' : 5.0, 'Moto2_distance' : 109.3, ' Moto3_distance' : 100.2, 'Right_Corners' : 7.0, 
                #'length of longest straight' : 1000.0, 'Trk Length':4.6, 'track_width':12.0}

dict_motegi = {'Track' : 'JPN - Twin Ring Motegi', 'MotoGP_avg_speed' : 162.2, 'GP_distance' : 115.2, 
                'Left_Corners' : 6.0, 'Moto2_distance' : 105.6, ' Moto3_distance' : 96.0, 'Right_Corners' : 8.0, 
                'length of longest straight' : 762.0, 'Trk Length':4.8, 'track_width':15.0}

dist_italy = {'Track' : 'ITA - Autodromo del Mugello', 'MotoGP_avg_speed' : 174.2, 'GP_distance' : 120.6, 
                'Left_Corners' : 6.0, 'Moto2_distance' : 110.1, ' Moto3_distance' : 104.9, 'Right_Corners' : 9, 
                'length of longest straight' : 1141.0, 'Trk Length':5.2, 'track_width':14.0}



track_data.append(dict_shanghai)
track_data.append(dict_donington)
track_data.append(dict_indianapolis)
track_data.append(dict_estoril)
track_data.append(dict_istanbul)
track_data.append(dict_laguna)
track_data.append(dict_australia)
track_data.append(dict_brazil)
track_data.append(dict_Thai)
track_data.append(dict_motegi)
track_data.append(dict_austria)
track_data.append(dict_italy)


In [None]:
#create dataframe from new data
df_tracks = pd.DataFrame(track_data, columns=headers2)
df_tracks

In [None]:
fn = 'Racetrack_data.csv'
df_tracks.to_csv(fn)
print(fn)

In [None]:
track_data


In [None]:
track_df_new =pd.DataFrame(track_data)

In [None]:
track_df_new.drop_duplicates("Track")