In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
import time
from collections import defaultdict
import requests
import os

In [2]:
# Column names used for everything scraped in this notebook.

# One record per rider per race session (results tables).
headers = [
    'Year', 'TRK', 'Track', 'Category', 'Session', 'Date',
    'Track_Condition', 'Track_Temp', 'Air_Temp', 'Humidity',
    'Position', 'Points', 'Rider_Number', 'Rider_Name', 'Nationality',
    'Team_Name', 'Bike', 'Avg_Speed', 'Time',
]

# One record per circuit (track information table).
headers2 = [
    'Track',
    "Trk Length",
    'Left_Corners',
    'Right_Corners',
    'track_width',
    'length of longest straight',
    'MotoGP_avg_speed',
    'GP_distance',
    'Moto2_distance',
    'Moto3_distance',
]

# Seasons to scrape (original note: had to remove 2002 - 2004).
years = ['2019']

# Every results URL is built by appending to this prefix.
base_url = 'http://www.motogp.com/en/Results+Statistics/'

In [3]:
def soup_stuff(url, timeout=30):
    """Download ``url`` and return the page parsed as a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``; number of seconds to wait for
        the server before giving up. Defaults to 30.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'lxml' parser.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection errors, and timeouts
        once ``timeout`` is exceeded).

    Notes
    -----
    The HTTP status code is not checked; whatever body the server returns is
    parsed.
    """
    # Without a timeout requests can wait forever on a stalled connection,
    # which would freeze the surrounding scraping loops.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')

In [4]:
def get_date(soup):
    """ Race date taken from the first element with class 'padbot5'.

        Commas in the element text are treated as spaces and the last three
        whitespace-separated tokens are joined with ','. Returns 'n/a' when
        the soup has no such element. """
    heading = soup.find(class_='padbot5')
    if heading is None:
        return 'n/a'
    tokens = heading.text.replace(',', ' ').split()
    return ','.join(tokens[-3:])

In [5]:
def get_tr_con(soup):
    """ Track condition during a race.

        Finds the element with class 'sprite_weather track_condition' and
        returns the third whitespace-separated token of the text of the
        element that follows it. Returns 'n/a' when the marker element is
        not in the provided soup. """
    marker = soup.find(class_='sprite_weather track_condition')
    if marker is None:
        return 'n/a'
    words = marker.findNext().text.split()
    return words[2]

In [6]:
def get_tr_tmp(soup):
    """ Track temperature during a race, as a float.

        Reads the second whitespace-separated token of the element following
        the 'sprite_weather ground' marker and strips any 'º' characters from
        its ends. Returns 'n/a' when the marker is not in the provided soup. """
    marker = soup.find(class_='sprite_weather ground')
    if marker is None:
        return 'n/a'
    reading = marker.findNext().text.split()[1]
    return float(reading.strip('º'))

def get_air_tmp(soup):
    """ Air temperature during a race, as a float.

        Reads the second whitespace-separated token of the element following
        the 'sprite_weather air' marker and strips any 'º' characters from
        its ends. Returns 'n/a' when the marker is not in the provided soup. """
    marker = soup.find(class_='sprite_weather air')
    if marker is None:
        return 'n/a'
    reading = marker.findNext().text.split()[1]
    return float(reading.strip('º'))

def get_humidity(soup):
    """ Humidity during a race, as an int.

        Reads the second whitespace-separated token of the element following
        the 'sprite_weather humidity' marker and removes every '%' from it.
        Returns 'n/a' when the marker is not in the provided soup. """
    marker = soup.find(class_='sprite_weather humidity')
    if marker is None:
        return 'n/a'
    reading = marker.findNext().text.split()[1]
    return int(re.sub('[%]', '', reading))

def get_all_races(soup):
    """ All 'option' elements inside the element with id 'event', i.e. the
        races listed for the season whose soup was passed in. An empty list
        is returned when that element is missing. """
    event_select = soup.find(id='event')
    return [] if event_select is None else event_select.find_all('option')

def get_all_cats(soup):
    """ All 'option' elements inside the element with id 'category', i.e.
        the categories (MotoGP, Moto2, etc.) listed for a track in the
        provided soup. An empty list is returned when that element is
        missing. """
    category_select = soup.find(id='category')
    return [] if category_select is None else category_select.find_all('option')

def get_race_sessions(soup):
    """ Race sessions offered in the element with id 'session'.

        Only options whose text contains 'RACE' are kept, and every 'E' is
        removed from the kept text (so 'RACE' becomes 'RAC' and 'RACE2'
        becomes 'RAC2'). Returns an empty list when the element is missing. """
    session_select = soup.find(id='session')
    if session_select is None:
        return []
    return [
        option.text.replace('E', '')
        for option in session_select.find_all('option')
        if 'RACE' in option.text
    ]

In [7]:
def get_all_stats(soup, year, trk, track, cat, ssn):
    """Build one result record per rider from a session classification page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed classification page.
    year, trk, track, cat, ssn
        Identifiers copied unchanged into the first five columns of
        ``headers`` for every record.

    Returns
    -------
    list of dict
        Records keyed by ``headers``. If the page has no ``tbody`` element a
        single placeholder record is returned in which every column after
        the five identifiers is 'n/a'.

    Notes
    -----
    Rider fields are read relative to each ``a`` element inside the table
    body: position, points and rider number are the 4th, 3rd and 2nd
    preceding elements; nationality, team, bike, average speed and time are
    the 1st to 5th following elements. An empty position becomes 'crash',
    empty points become 0, and an empty rider number is kept as ''.
    """
    results_body = soup.find('tbody')
    if results_body is None:
        identifiers = [year, trk, track, cat, ssn]
        # Pad exactly the columns that are left after the identifiers, so the
        # placeholder row does not depend on zip() discarding extra values.
        padding = ['n/a'] * (len(headers) - len(identifiers))
        return [dict(zip(headers, identifiers + padding))]

    # raceday stats (identical for every rider of the session)
    date = get_date(soup)
    tr_con = get_tr_con(soup)
    tr_tmp = get_tr_tmp(soup)
    air_tmp = get_air_tmp(soup)
    humid = get_humidity(soup)

    stats_to_return = []

    # rider stats
    for r in results_body.find_all('a'):
        # Walk backwards from the rider link once, keeping each step.
        num_cell = r.findPrevious().findPrevious()
        points_cell = num_cell.findPrevious()
        pos_cell = points_cell.findPrevious()

        pos = pos_cell.text
        pos = 'crash' if pos == '' else int(pos)

        points = points_cell.text
        points = 0 if points == '' else float(points)

        r_num = num_cell.text
        if r_num != '':
            r_num = int(r_num)

        r_nam = r.text

        # Walk forwards from the rider link once, keeping each step.
        nat_cell = r.findNext()
        team_cell = nat_cell.findNext()
        bike_cell = team_cell.findNext()
        avgspd_cell = bike_cell.findNext()
        time_cell = avgspd_cell.findNext()

        # Named race_time so the imported ``time`` module is not shadowed.
        race_time = time_cell.text

        stats_to_return.append(dict(zip(headers, [
            year, trk, track, cat, ssn, date, tr_con, tr_tmp, air_tmp,
            humid, pos, points, r_num, r_nam, nat_cell.text, team_cell.text,
            bike_cell.text, avgspd_cell.text, race_time,
        ])))

    return stats_to_return

In [9]:
# loop through all parameters: season -> race -> category -> race session.
# One CSV file is written per season.

# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before the (slow) scraping starts rather than failing at the
# very end of a season.
os.makedirs('./Archive/', exist_ok=True)

for yr in reversed(years):
    data_list = []  # records of the current season only
    soup_yr = soup_stuff(base_url + yr)
    races = get_all_races(soup_yr)
    print(yr)

    for rc in races:
        TRK = rc['value']
        Track = rc['title']
        print(TRK, end=", ")
        url_rc = base_url + yr + '/' + TRK + '/'
        soup_rc = soup_stuff(url_rc)
        categories = get_all_cats(soup_rc)

        for cat in categories:
            CAT = cat.text
            url_c = base_url + yr + '/' + TRK + '/' + CAT + '/'
            soup_c = soup_stuff(url_c)
            sessions = get_race_sessions(soup_c)

            for ssn in sessions:
                SSN = ssn
                url_ssn = base_url + yr + '/' + TRK + '/' + CAT + '/' + SSN + '/Classification'
                soup_ssn = soup_stuff(url_ssn)
                data_list.extend(get_all_stats(soup_ssn, yr, TRK, Track, CAT, SSN))
                # randomised 1-2 s pause between classification requests
                time.sleep(1 + np.random.random())

    df = pd.DataFrame(data_list, columns=headers)
    fn = './Archive/' + yr + '_data.csv'
    df.to_csv(fn)
    print(fn)
    # longer pause before starting the next season
    time.sleep(25)

print('>> Scraping complete!')

2019
QAT, ARG, AME, SPA, FRA, ITA, CAT, NED, GER, CZE, AUT, GBR, RSM, ARA, THA, JPN, AUS, MAL, VAL, ./Archive/2019_data.csv
>> Scraping complete!


In [10]:
# first, get all tracks from 2019
# Three parallel lists, one entry per race:
#   track_list  - the option's 'value' (short code printed below)
#   GPs_list    - the part of the option's 'title' before ' - '
#   track_names - the part of the option's 'title' after the first ' - '
track_list, GPs_list, track_names = [], [], []

for yr in reversed(years):
    soup_yr = soup_stuff(base_url + yr)
    races = get_all_races(soup_yr)
    print('')
    print(yr, end = " - ")

    for rc in races:
        TRK = rc['value']
        Track = rc['title']
        print(TRK, end=", ")
        track_list.append(TRK)
        title_parts = Track.split(' - ')
        GPs_list.append(title_parts[0])
        track_names.append(title_parts[1])

    # randomised 1-2 s pause between seasons
    time.sleep(1+np.random.random())


2019 - QAT, ARG, AME, SPA, FRA, ITA, CAT, NED, GER, CZE, AUT, GBR, RSM, ARA, THA, JPN, AUS, MAL, VAL, 

In [11]:
# 'CODE - circuit name' label for every race, then de-duplicated as a set.
combined_list = [
    track_list[position] + ' - ' + track_names[position]
    for position in range(len(track_list))
]
combined_track_set = set(combined_list)

In [12]:
def get_GP_info(track_url_str):
    """
    Scrape the circuit figures from a MotoGP event page.

    The page 'http://www.motogp.com/en/event/<track_url_str>#info-track' is
    fetched and the 'circuit_number_content' entries inside the element with
    id 'circuit_numbers' are read in order.

    Returns a five-item list: track length, number of left corners, number
    of right corners, track width, and length of longest straight. Lengths
    are floats and corner counts are ints; a value is 'n/a' instead when the
    track length is 0, a corner count is an empty string, or the width /
    straight text consists of a single token.
    """
    page = soup_stuff('http://www.motogp.com/en/event/' + track_url_str + '#info-track')
    cells = page.find(id='circuit_numbers').find_all(class_='circuit_number_content')
    texts = [cell.text for cell in cells]

    # track length: a leading 0 means "not available"
    track_length = float(texts[0].split()[0])
    info = ['n/a' if track_length == 0 else track_length]

    # left and right corner counts: blank text means "not available"
    for position in (1, 2):
        count_text = texts[position]
        info.append('n/a' if count_text == '' else int(count_text))

    # track width and longest straight: a single token means "not available"
    for position in (3, 4):
        pieces = texts[position].split()
        info.append('n/a' if len(pieces) == 1 else float(pieces[0]))

    return info

def get_GP_info_additional(track_url_str):
    """
    Scrape speed and race-distance figures from a MotoGP event page.

    The page 'http://www.motogp.com/en/event/<track_url_str>#info-track' is
    fetched. Returns a four-item list: MotoGP average speed, MotoGP
    distance, Moto2 distance, and Moto3 distance. Each is a float, or 'n/a'
    when the speed text is '-' or empty, or when a distance is 0.
    """
    page = soup_stuff('http://www.motogp.com/en/event/' + track_url_str + '#info-track')

    # MotoGP average speed
    speed_text = page.find(class_='c-statistics__speed-item').text
    avg_speed = 'n/a' if speed_text in ('-', '') else float(speed_text)

    lap_items = page.find(
        class_='c-event__row-item col-xs-12 col-lg-7 col-lg-pull-5'
    ).find_all(class_='c-laps__item')

    # Items 1, 4 and 7 hold the MotoGP, Moto2 and Moto3 distances.
    distances = []
    for position in (1, 4, 7):
        distance = float(lap_items[position].text.split()[0])
        distances.append('n/a' if distance == 0 else distance)

    return [avg_speed] + distances
    

In [14]:
# Maps a 'CODE - circuit name' label to the last path segment of the
# circuit's event page URL ('http://www.motogp.com/en/event/<segment>').
# A value of 0 marks a circuit that is skipped when track info is scraped.
track_url_dict = {
    'AME - Circuit Of The Americas': 'Americas',
    'ARA - MotorLand Aragón': 'Aragon',
    'ARG - Termas de Río Hondo': 'Argentina',
    'AUS - Phillip Island': 'Australia',
    'AUT - Red Bull Ring – Spielberg': 'Austria',
    'CAT - Circuit de Barcelona-Catalunya': 'Catalunya',
    'CHN - Shanghai Circuit': 0,
    'CZE - Automotodrom Brno': 'Czech+Republic',
    'FRA - Le Mans': 'France',
    'GBR - Donington Park Circuit': 0,
    'GBR - Silverstone Circuit': 'Great+Britain',
    'GER - Sachsenring': 'Germany',
    'INP - Indianapolis Motor Speedway': 0,
    'ITA - Autodromo del Mugello': 'Italy',
    'JPN - Twin Ring Motegi': 'Japan',
    'MAL - Sepang International Circuit': 'Malaysia',
    'NED - TT Circuit Assen': 'Netherlands',
    'THA - Chang International Circuit': 'Thailand',
    'POR - Estoril Circuit': 0,
    'QAT - Losail International Circuit': 'Qatar',
    'RIO - Nelson Piquet Circuit': 0,
    'RSM - Misano World Circuit Marco Simoncelli': 'San+Marino',
    'SPA - Circuito de Jerez': 'Spain',
    'TUR - Istanbul Circuit': 0,
    'USA - Mazda Raceway Laguna Seca': 0,
    'VAL - Circuit Ricardo Tormo': 'Valencia',
}



In [16]:
# Hand-entered record for Mugello, added to the scraped track records.
# The keys must match the headers2 column names exactly: a key that differs
# (even by a leading space) is left empty for this row once the DataFrame is
# built with columns=headers2.
dict_italy = {
    'Track': 'ITA - Autodromo Internazionale del Mugello',
    'Trk Length': 5.2,
    'Left_Corners': 6.0,
    'Right_Corners': 9.0,
    'track_width': 14.0,
    'length of longest straight': 1141.0,
    'MotoGP_avg_speed': 174.1,
    'GP_distance': 120.6,
    'Moto2_distance': 110.1,
    'Moto3_distance': 104.9,
}

# NOTE: the track-scraping cell re-creates track_data (track_data = []), so
# this append only takes effect when it runs after that cell.
track_data.append(dict_italy)

In [29]:
#combined_track_set.remove('ITA - Autodromo Internazionale del Mugello')
#combined_track_set.remove('AUS - Phillip Island')
#combined_track_set.remove('AUT - Red Bull Ring')
#combined_track_set.remove('RIO - Nelson Piquet Circuit')
#combined_track_set.remove('THA - Chang International Circuit')
#combined_track_set.remove('PAC - Twin Ring Motegi')
#combined_track_set.remove('JPN - Suzuka Circuit')
#combined_track_set.remove('RSA - Phakisa Freeway')
#combined_track_set.remove('ITA - Autodromo Internazionale del Mugello')

In [30]:
# make a list of dictionaries for track information
headers_2 = ['GP','track_length_km','l_corners','r_corners',
           'width_m','straight_m','GP_avg_speed','gp_dist',
           'm2_dist','m3_dist']

track_data = []
for track in combined_track_set:
    if track_url_dict[track] != 0:
        print('//', end='')
        l_GP, L_c, R_c, wid, strt = get_GP_info(track_url_dict[track])
        GP_avg_spd, gp_d, m2_d, m3_d = get_GP_info_additional(track_url_dict[track])
        track_dict = dict(zip(headers2, [track,l_GP,L_c,R_c,wid,strt,GP_avg_spd,gp_d,m2_d,m3_d]))
        track_data.append(track_dict)
        time.sleep(1+np.random.random())
print('Complete!')

//////////////////////////////Complete!


In [31]:
# Turn the list of track records into a DataFrame whose columns are exactly
# the headers2 names, in that order.
df_tracks = pd.DataFrame(track_data, columns=headers2)

In [32]:
df_tracks

Unnamed: 0,Track,Trk Length,Left_Corners,Right_Corners,track_width,length of longest straight,MotoGP_avg_speed,GP_distance,Moto2_distance,Moto3_distance
0,VAL - Circuit Ricardo Tormo,4.0,9,5,12.0,876.0,139.8,108.1,100.1,92.1
1,GER - Sachsenring,3.7,10,3,12.0,700.0,160.6,110.1,102.8,99.1
2,ARA - MotorLand Aragón,5.1,10,7,15.0,968.0,167.0,116.8,106.6,96.5
3,CAT - Circuit de Barcelona-Catalunya,4.6,6,8,12.0,1047.0,164.4,111.0,101.8,97.2
4,ARG - Termas de Río Hondo,4.8,5,9,16.0,1076.0,172.7,120.2,110.5,100.9
5,GBR - Silverstone Circuit,5.9,8,10,15.0,770.0,,118.0,106.2,100.3
6,SPA - Circuito de Jerez,4.4,6,8,11.0,607.0,161.2,110.6,101.7,97.3
7,NED - TT Circuit Assen,4.5,6,12,14.0,487.0,173.1,118.1,109.0,99.9
8,FRA - Le Mans,4.2,5,9,13.0,674.0,161.8,113.0,104.6,92.1
9,MAL - Sepang International Circuit,5.5,5,10,16.0,920.0,164.0,110.9,99.8,94.2
