In [1]:
#Import Dependencies
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as bs

In [2]:
#Look at race results URL 
url = 'https://nsga-results.fusesport.com/drawindividual.asp?id=341961&seasonid=485'

In [3]:
#Test out Pandas read_htl - it works! 
tables = pd.read_html(url)
tables

[  Unnamed: 0            Name State             S            T1             B  \
 0        1st    REMBAC, Ross    AZ  00:06:29.000  00:04:58.000  00:35:22.000   
 1        2nd  GALLARDO, Paul    NM  00:08:07.000  00:03:44.000  00:38:18.000   
 2        3rd      KASSA, JOE    NM  00:09:39.000  00:04:28.000  00:38:55.000   
 3        4th      HOBBS, Bob    KY  00:11:35.000  00:04:06.000  00:39:53.000   
 4        5th    WYATT, James    TX  00:07:23.000  00:05:32.000  00:42:55.000   
 5        6th   DEMERLY, Mike    IN  00:12:39.000  00:06:35.000  00:43:18.000   
 6        7th   MASON, Andrew    NM  00:13:14.000  00:06:04.000  00:48:00.000   
 7          0      LINK, Dave    TX             -             -             -   
 
              T2             R          Time  
 0  00:00:47.000  00:23:43.000  01:11:19.000  
 1  00:01:09.000  00:20:24.000  01:11:42.000  
 2  00:00:52.000  00:24:36.000  01:18:30.000  
 3  00:00:46.000  00:24:30.000  01:20:50.000  
 4  00:01:21.000  00:27:36.000  01

In [4]:
#Launch Splinter
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [5]:
#Use main site URL as starting point
url2 = 'https://nsga-results.fusesport.com/competitions.asp?compID=341959&id=485'

#Visit the browser using splinter since using pd.to_html on the main page doesn't bring back the necessary information
browser.visit(url2)

In [6]:
#we want the HTML from the page our browser is currently on
html = browser.html


#soupify (parse) the html so we can read it
soup = bs(html, 'html.parser')

In [7]:
#since sex and ages aren't located within the tables but are given in the main table's text, 
#we have to set these variables ourselves
sexes = []
ages = []
links = []

#Since the initial table is all the male results - assign male as the sex variable
sex = "Male"

#Find all table element tags within the main page 
table_h = soup.find_all('table')

#For each table in the list of tables...
for table in table_h: 
    #Get the table rows
    rows = table.find_all('tr')
    #For each row in the given table...
    for row in rows:
        #Bring back each cell
        cells = row.find_all('td')
        #The age is the text within the cell 
        age = cells[0].text
        #The link to the results is in href
        link = cells[1].a['href']
        
        #append to the lists
        sexes.append(sex)
        ages.append(age)
        links.append(link)

    #after the first table, switch to sex = Female since the second table is all Female results     
    sex = 'Female'
        

In [8]:
#create a dataframe with the data we extracted
link_data = pd.DataFrame()
link_data['sex'] = sexes
link_data['age'] = ages
#have to create the full link URLs 
link_data['link'] = ['https://nsga-results.fusesport.com/'+ link for link in links]

In [9]:
link_data.head()

Unnamed: 0,sex,age,link
0,Male,50-54,https://nsga-results.fusesport.com/drawindivid...
1,Male,55-59,https://nsga-results.fusesport.com/drawindivid...
2,Male,60-64,https://nsga-results.fusesport.com/drawindivid...
3,Male,65-69,https://nsga-results.fusesport.com/drawindivid...
4,Male,70-74,https://nsga-results.fusesport.com/drawindivid...


In [16]:
# create an empty df
df = pd.DataFrame()

#for row in link_data...
for index,row in link_data.iterrows(): 
    try:
        #bring back the table for the row given the link 
        table = pd.read_html(row.link)
        #from that table, bring back as dataframe 
        temp = table[0]
        #add in a column with the gender
        temp['sex'] = row.sex
        #add in a column with the age range
        temp['age'] = row.age
        #add to the empty df
        df = df.append(temp)
    except Exception as e: 
        print(e)

In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,State,S,T1,B,T2,R,Time,sex,age
0,1st,"REMBAC, Ross",AZ,00:06:29.000,00:04:58.000,00:35:22.000,00:00:47.000,00:23:43.000,01:11:19.000,Male,50-54
1,2nd,"GALLARDO, Paul",NM,00:08:07.000,00:03:44.000,00:38:18.000,00:01:09.000,00:20:24.000,01:11:42.000,Male,50-54
2,3rd,"KASSA, JOE",NM,00:09:39.000,00:04:28.000,00:38:55.000,00:00:52.000,00:24:36.000,01:18:30.000,Male,50-54
3,4th,"HOBBS, Bob",KY,00:11:35.000,00:04:06.000,00:39:53.000,00:00:46.000,00:24:30.000,01:20:50.000,Male,50-54
4,5th,"WYATT, James",TX,00:07:23.000,00:05:32.000,00:42:55.000,00:01:21.000,00:27:36.000,01:24:47.000,Male,50-54


In [18]:
#Look at the time values
df.sort_values(by = "Time")

Unnamed: 0.1,Unnamed: 0,Name,State,S,T1,B,T2,R,Time,sex,age
20,0,"WILLS, Chris",TX,-,-,-,-,-,-,Female,50-54
9,0,"BICH, Howard",SD,-,-,-,-,-,-,Male,75-79
8,0,"BALTHROP, Edward",AL,-,-,-,-,-,-,Male,75-79
32,0,"ASHLEY, Matthew",MO,-,-,-,-,-,-,Male,55-59
33,0,"BRADLEY, Tim",NY,-,-,-,-,-,-,Male,55-59
...,...,...,...,...,...,...,...,...,...,...,...
12,,"MIKA, DARRYL",OH,00:03:44.000,-,-,-,-,DNF,Male,65-69
1,,"THOMPSON, Brad",TX,00:20:33.000,00:12:16.000,-,-,-,DNF,Male,85-89
16,,"CHOKEL, Naira",NH,00:11:02.000,-,-,-,-,DNF,Female,55-59
19,,"TITTLE, Kathleen",FL,00:10:41.000,00:09:24.000,00:09:09.000,-,-,DNF,Female,65-69


In [19]:
#We need to remove 23 data rows
df.Time.value_counts()

-               17
DNF              6
02:26:03.000     2
01:35:10.000     2
01:33:43.000     2
                ..
02:07:13.000     1
01:37:48.000     1
01:11:19.000     1
01:45:20.000     1
02:34:23.000     1
Name: Time, Length: 194, dtype: int64

In [20]:
#Drop the index for the rows where the Time entry equals DNF

new = df.drop(df[df.Time == 'DNF'].index)

In [21]:
new

Unnamed: 0.1,Unnamed: 0,Name,State,S,T1,B,T2,R,Time,sex,age
0,1st,"REMBAC, Ross",AZ,00:06:29.000,00:04:58.000,00:35:22.000,00:00:47.000,00:23:43.000,01:11:19.000,Male,50-54
2,3rd,"KASSA, JOE",NM,00:09:39.000,00:04:28.000,00:38:55.000,00:00:52.000,00:24:36.000,01:18:30.000,Male,50-54
3,4th,"HOBBS, Bob",KY,00:11:35.000,00:04:06.000,00:39:53.000,00:00:46.000,00:24:30.000,01:20:50.000,Male,50-54
4,5th,"WYATT, James",TX,00:07:23.000,00:05:32.000,00:42:55.000,00:01:21.000,00:27:36.000,01:24:47.000,Male,50-54
5,6th,"DEMERLY, Mike",IN,00:12:39.000,00:06:35.000,00:43:18.000,00:00:55.000,00:28:54.000,01:32:21.000,Male,50-54
...,...,...,...,...,...,...,...,...,...,...,...
3,3rd,"FOUTS, Raechel",CO,00:15:13.000,00:08:38.000,01:14:24.000,00:01:34.000,00:46:14.000,02:26:03.000,Female,75-79
4,5th,"SCHNEIDER, Alice",OH,00:21:56.000,00:08:07.000,01:14:17.000,00:02:08.000,00:41:50.000,02:28:18.000,Female,75-79
5,6th,"PLEIN, Linda",NM,00:14:04.000,00:14:41.000,01:13:19.000,00:04:08.000,00:59:22.000,02:45:34.000,Female,75-79
6,7th,"LILLEHEI, Patsy",MN,00:23:24.000,00:12:53.000,01:14:08.000,00:01:51.000,01:02:25.000,02:54:41.000,Female,75-79


### Figuring Out the Links

In [None]:
#pull the anchor tags from the main page
anchors = soup.find_all('a')
anchors

In [None]:
#test out the first link
anchors[0]['href']

In [None]:
#bring back all the href links on the page
link_list = []
for anchor in anchors: 
    links = anchor['href']
    link_list.append(links)

print(link_list)

In [None]:
#grab a link and take a look - it isn't complete
link_list[2]

In [None]:
#build the entire link and return a list of full links
data_url_list = ['https://nsga-results.fusesport.com/'+ link for link in link_list]

In [None]:
data_url_list

In [None]:
#for each link in the list - get the table on the page and then append it to a list 
tables = []
for url in data_url_list: 
    #since the first two and the last two aren't results pages, the url generated won't work so we need a try/except
    try:
        table = pd.read_html(url)
        tables.append(table[0])

    except Exception as e: 
        print(e)

In [None]:
tables

In [None]:
len(tables[0])

In [None]:
tables[0][0]