# Web scrape the JSON files from the simresults webpage

In [1]:
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import random

In [2]:
# Root address of the results site; every relative link scraped from a page is appended to it
base_url = 'http://174.173.35.207:8772'
# Landing page of the results listing for each server that is scraped (servers 0, 1, 3 and 4)
url_server_list = [base_url + '/results?server=' + str(server_id)
                   for server_id in (0, 1, 3, 4)]

In [3]:
def grab_final_page_num(url):
    '''
    Looks up the number of the last page of a paginated results listing

    Parameters:
        url(str): the initial page of the webpage

    Returns:
        The page number (int) displayed as the current page on the last page of the listing
    '''

    first_page = BeautifulSoup(requests.get(url).text, 'lxml')
    # The last 'page-link' anchor on the first page is taken to point at the final page
    pagination_links = first_page.find_all('a', class_='page-link')
    last_page_href = pagination_links[-1]['href']
    # The href is relative, so prefix it with the site root before requesting it
    last_page = BeautifulSoup(requests.get(base_url + last_page_href).text, 'lxml')
    # On that page the navigation bar marks the current page; its text is the final page number
    current_page_item = last_page.find('li', class_='page-item disabled pagination-current-page')
    return int(current_page_item.text.strip())


In [4]:
def grab_server_num(url):
    '''
    Obtains the server number of the webpage

    The number is read from the ``server`` query parameter, so multi-digit
    server numbers and urls that carry further parameters (for example
    ``?server=3&page=2``) are handled. If the url has no ``server`` parameter,
    the run of digits at the very end of the url is used instead.

    Parameters:
        url(str): the initial page of the webpage

    Returns:
        A number that indicates which server it is

    Raises:
        ValueError: if no server number can be found in the url
    '''
    # Imported locally so the function does not depend on extra file-level imports
    import re
    from urllib.parse import parse_qs, urlparse

    query = parse_qs(urlparse(url).query)
    if 'server' in query:
        # parse_qs returns a list per key; use the last occurrence if it is repeated
        return int(query['server'][-1])
    trailing_digits = re.search(r'(\d+)$', url)
    if trailing_digits is None:
        raise ValueError('no server number found in url: ' + repr(url))
    return int(trailing_digits.group(1))

In [5]:
def grab_json_url(page_num, server_num):
    '''
    Scrapes one page of a server's results listing and collects the urls of the race result pages on it.

    Parameters:
        page_num(int): the page of the results listing to scrape
        server_num(int): the server whose results listing is scraped

    Returns:
        A list of webpage urls that are races

    '''

    # Build the listing url from base_url so the host is defined in one place only
    listing_url = base_url + '/results?page=' + str(page_num) + '&server=' + str(server_num)
    response = requests.get(listing_url)
    listing = BeautifulSoup(response.text, 'lxml')
    all_rows = listing.find_all('tr', class_='row-link')
    # Keep only rows whose second cell reads 'Race'; each row's 'data-href' is a relative link
    result_urls = [(base_url + row['data-href']) for row in all_rows
                   if row.find_all('td')[1].text.strip() == 'Race']
    return result_urls

In [6]:
def grab_json_info(url):
    '''
    Scrapes a race result page for the link of its json download

    Parameters:
        url(str): the url with the json download file

    Returns:
        A string for the download url of the json file, exactly as it appears in the link's href
    '''

    page = BeautifulSoup(requests.get(url).text, 'lxml')
    # The download button is the primary small button inside the right-floated container
    button_container = page.find('div', class_='float-right')
    download_button = button_container.find('a', class_='btn btn-primary btn-sm')
    return download_button['href']

# Gather the data from JSON files and organize it into ChampionshipID

In [7]:
import numpy as np
import pandas as pd

In [10]:
def create_key(key_list):
    '''
    Appends one new 10-character key (digits and upper-case letters) to a list of keys,
    redrawing until the key is not already present in the list

    Parameters:
        key_list(list): a list of unique alphanumeric strings; it is modified in place

    Returns:
        the same list object, now holding the additional key
    '''

    alphabet = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def draw():
        # One candidate key: ten independent random picks from the alphabet
        return ''.join(random.choice(alphabet) for _ in range(10))

    candidate = draw()
    # Keep drawing until the candidate does not clash with an existing key
    while candidate in key_list:
        candidate = draw()
    key_list.append(candidate)
    return key_list

In [11]:
def create_master_key_list(num_of_keys):
    '''
    Builds a list holding the requested number of unique keys

    Parameters:
        num_of_keys(int): the number of unique keys to be created for the list

    Returns:
        a list of num_of_keys unique alphanumeric strings, in the order they were generated

    '''
    keys = []
    for _unused in range(num_of_keys):
        # create_key appends one new key and hands back the same list
        keys = create_key(keys)
    return keys

In [16]:
# For each race, create a dataframe of its results tagged with the ChampionshipID, Date, Time, TrackName, Rank and RaceID
def create_dataframe(url, master_key_list, race_num):
    '''
    From a json url, creates a dataframe with one row per entry of the json 'Result' list.

    The columns come from the 'Result' entries (per the original notes: BallastKG, BestLap, CarId,
    CarModel, DriverGuid, DriverName, Restrictor, TotalTime, HasPenalty, PenaltyTime, LapPenalty,
    Disqualified, ClassID, GridPosition) plus the columns added here: ChampionshipID, Date, Time,
    TrackName, Rank and RaceID.

    Parameters:
        url(str): a json url that leads to the race information
        master_key_list(list): the list of unique race keys
        race_num(int): index into master_key_list of the key to use as this race's RaceID

    Returns:
        a dataframe consisting of the columns specified
    '''

    # Close the response as soon as the body is read instead of leaving the connection open
    with urlopen(url) as response:
        load_json = json.loads(response.read())
    df = pd.DataFrame.from_dict(load_json['Result'])
    df['ChampionshipID'] = load_json['ChampionshipID']
    # Parse the race timestamp once and split it into its date and time-of-day parts
    race_start = pd.to_datetime(load_json['Date'])
    df['Date'] = race_start.date()
    df['Time'] = race_start.time()
    df['TrackName'] = load_json['TrackName']
    # BestLap / TotalTime are treated as millisecond counts and shown as a time of day;
    # only the time-of-day part is kept, so a value of 24 hours or more wraps around
    df['BestLap'] = pd.to_datetime(df['BestLap'], unit='ms').dt.time
    df['TotalTime'] = pd.to_datetime(df['TotalTime'], unit='ms').dt.time
    # Rank is the 1-based position of each row in the 'Result' list
    df['Rank'] = range(1, len(df) + 1)
    df['RaceID'] = master_key_list[race_num]
    return df

In [127]:
def create_lap_df(url, master_key_list, race_num):
    '''
    From a json url, creates a dataframe with one row per entry of the json 'Laps' list.

    LapTime is converted to a time of day, a RaceID column is added and the Timestamp column is removed.

    Parameters:
        url(str): a json url that leads to the race information
        master_key_list(list): the list of unique race keys
        race_num(int): index into master_key_list of the key to use as this race's RaceID

    Returns:
        a dataframe of the laps of the race, tagged with the race's RaceID
    '''
    # Close the response as soon as the body is read instead of leaving the connection open
    with urlopen(url) as response:
        load_json = json.loads(response.read())
    df = pd.DataFrame.from_dict(load_json['Laps'])
    # LapTime is treated as a millisecond count; only the time-of-day part is kept
    df['LapTime'] = pd.to_datetime(df['LapTime'], unit='ms').dt.time
    df['RaceID'] = master_key_list[race_num]
    df = df.drop(columns=['Timestamp'])
    return df

In [13]:
def combine_dataframes(num_df, list_df):
    '''
    Combines the first num_df dataframes of a list into one large dataframe

    The frames are stacked row-wise in list order and keep their original index
    values. As before, the first frame is always included, even when num_df is
    0 or 1. If num_df exceeds the length of the list, all frames are combined.

    Parameters:
        num_df(int): the number of dataframes to be merged
        list_df(list): a list of dataframes to be merged

    Returns:
        a single dataframe consisting of all the information; an empty
        dataframe when list_df is empty
    '''

    frames = list_df[:max(num_df, 1)]
    if not frames:
        # Nothing to combine (e.g. no races were found): return an empty frame instead of failing
        return pd.DataFrame()
    if len(frames) == 1:
        return frames[0]
    # A single concat avoids re-copying the accumulated frame once per input frame
    return pd.concat(frames)

In [133]:
def complete_dataframe(url):
    '''
    Runs the whole pipeline for one server: scrape the race pages, download each race's json
    and assemble the results and the laps into two dataframes

    Parameters:
        url(str): the webpage that will be scraped to gather the JSON url for data about the races

    Returns:
        a tuple (results dataframe, laps dataframe) for the server behind the url; both frames
        use the same RaceID for the same race
    '''
    last_page_num = grab_final_page_num(url)
    server_num = grab_server_num(url)

    # Collect the race result pages from every page of the listing.
    # NOTE(review): pages are requested as 0 .. last_page_num - 1 — confirm that the site's
    # 'page' query parameter is zero-based, otherwise the final page is skipped.
    race_page_urls = []
    for page in range(last_page_num):
        race_page_urls += grab_json_url(page, server_num)

    # Resolve each race page to the absolute url of its json download
    json_urls = []
    for race_page in race_page_urls:
        json_urls.append(base_url + grab_json_info(race_page))

    # One unique key per race, shared by the results and the laps of that race
    num_races = len(json_urls)
    race_keys = create_master_key_list(num_races)

    # Results: one dataframe per race, then stacked into a single frame
    result_frames = [create_dataframe(json_url, race_keys, position)
                     for position, json_url in enumerate(json_urls)]
    final_df = combine_dataframes(num_races, result_frames)

    # Laps: same procedure, reusing the same keys
    lap_frames = [create_lap_df(json_url, race_keys, position)
                  for position, json_url in enumerate(json_urls)]
    lap_df = combine_dataframes(num_races, lap_frames)

    return (final_df, lap_df)

In [137]:
# Scrape every server exactly once. complete_dataframe() generates fresh random RaceIDs on
# each call, so the results and laps of a server must come from the same call for their
# RaceID values to match (and a single call also halves the number of downloads).
server_0_results, server_0_laps = (
    frame.reset_index(drop=True) for frame in complete_dataframe(url_server_list[0]))
server_1_results, server_1_laps = (
    frame.reset_index(drop=True) for frame in complete_dataframe(url_server_list[1]))
# url_server_list[2] and [3] are the listings of servers 3 and 4
server_3_results, server_3_laps = (
    frame.reset_index(drop=True) for frame in complete_dataframe(url_server_list[2]))
server_4_results, server_4_laps = (
    frame.reset_index(drop=True) for frame in complete_dataframe(url_server_list[3]))

# Export each dataframe into its respective csv and json file

In [None]:
# Write every results / laps frame to its own csv file, without the index column
_csv_exports = (
    (server_0_results, 'server_0_results.csv'),
    (server_0_laps, 'server_0_laps.csv'),
    (server_1_results, 'server_1_results.csv'),
    (server_1_laps, 'server_1_laps.csv'),
    (server_3_results, 'server_3_results.csv'),
    (server_3_laps, 'server_3_laps.csv'),
    (server_4_results, 'server_4_results.csv'),
    (server_4_laps, 'server_4_laps.csv'),
)
for _frame, _csv_name in _csv_exports:
    _frame.to_csv(_csv_name, index=False)