# Set up shell environment
Before we start, let's make sure our environment is set up to run all the code we will need.

In [None]:
# conda create -n spiders-env python=3.11 -y
# conda activate spiders-env

!conda install pip -y
!pip install -r ../requirements.txt

# Import libraries and programs

Now that we're operating in Python, install all the libraries etc called on in the code

In [21]:
import os
import json
import requests

import pandas as pd
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy import Selector

from tqdm.notebook import tqdm
from pprint import pprint as print

In [362]:
# Setting program-level variables
driver = webdriver.Chrome()
year_url_root = "https://www.roadtonationals.com/api/women/finalresults/"
years = [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015] # These are the years that we are interested in evaluating

In [280]:
# Setting up a fetch page function with a retry function and error handling
def fetch_page(url, retries=3):
    for i in range(retries):
        try:
            driver.get(url)
            return driver.page_source
        except Exception as e:
            # print(f"Error fetching page: {url}, retrying...")
            # print(e)
            pass
    return None


# Step 1: Get the team data from the season results landing page

In [287]:
# TODO - Perhaps wrap this into a method that takes a year?

# For every year, access the website and save the data to a json file
for year in years:
    year_url = year_url_root + str(year)

    payload = {}
    headers = {
        'Cookie': 'PHPSESSID=c48eb24102c0c45390a5d64809741f95'
    }

    response = requests.request("GET", year_url, headers=headers, data=payload)

    # Save the data to a json file
    with open(f'../Data/Raw/teams/{year}_teams.json', 'w') as f:
        # pure text
        f.write(response.text)

In [323]:
# Read the json files into a dataframe

# Create an empty dataframe
teams_data_df = pd.DataFrame()

# For every year, load the data from the json file and append to the dataframe
for year in years:
    filename = f'../Data/Raw/teams/{year}_teams.json'

    # Read the json file into a temporary df
    temp_df = pd.read_json(filename)
    temp_df['year'] = year

    # Append the temporary df to the main df
    teams_data_df = pd.concat([teams_data_df, temp_df])


teams_data_df = teams_data_df.reset_index(drop=True)
teams_df = pd.json_normalize(teams_data_df['data']).reset_index(drop=True)
teams_df['year'] = teams_data_df['year']

In [349]:
teams_df

Unnamed: 0,team_name,team_id,year,team_url
0,LSU,34,2024,https://www.roadtonationals.com/api/women/dash...
1,California,15,2024,https://www.roadtonationals.com/api/women/dash...
2,Utah,69,2024,https://www.roadtonationals.com/api/women/dash...
3,Florida,22,2024,https://www.roadtonationals.com/api/women/dash...
4,Stanford,61,2024,https://www.roadtonationals.com/api/women/dash...
...,...,...,...,...
84,Utica,164,2024,https://www.roadtonationals.com/api/women/dash...
85,Simpson,155,2024,https://www.roadtonationals.com/api/women/dash...
86,Hamline,26,2024,https://www.roadtonationals.com/api/women/dash...
87,Seattle Pacific,57,2020,https://www.roadtonationals.com/api/women/dash...


In [325]:
# Drop the columns that we are not interested in
teams_df = teams_df.drop(columns=['rank', 'ncaa_final', 'nqs', 'regionals', 'rqs', 'division_id', 'average_score', 'high_score', 'ncaa'])

In [326]:
# Preview the df
teams_df.head()

Unnamed: 0,team_name,team_id,year
0,LSU,34,2024
1,California,15,2024
2,Utah,69,2024
3,Florida,22,2024
4,Stanford,61,2024


In [350]:
# Remove duplicates - ie. if team_id & team_name are identical, retain years as a list

teams_df = teams_df.drop_duplicates(subset=['team_id', 'team_name']).reset_index(drop=True)

In [330]:
# Preview the df
teams_df.head()

Unnamed: 0,team_name,team_id,year
0,LSU,34,2024
1,California,15,2024
2,Utah,69,2024
3,Florida,22,2024
4,Stanford,61,2024


In [352]:
# Determine the link to access the team's dashboard
base_team_url = 'https://www.roadtonationals.com/api/women/dashboard'

# Add the team links to the team_url column
teams_df['team_url'] = teams_df.apply(lambda x: f'{base_team_url}/{str(x["year"])}/{str(x["team_id"])}', axis=1)

In [355]:
# Preview the df - this looks good to work with now
teams_df.head()

Unnamed: 0,team_name,team_id,year,team_url
0,LSU,34,2024,https://www.roadtonationals.com/api/women/dash...
1,California,15,2024,https://www.roadtonationals.com/api/women/dash...
2,Utah,69,2024,https://www.roadtonationals.com/api/women/dash...
3,Florida,22,2024,https://www.roadtonationals.com/api/women/dash...
4,Stanford,61,2024,https://www.roadtonationals.com/api/women/dash...


Now we go to each of the links in the teams df and scrape the data for the meets

In [357]:
#Create a list of desired urls using two list comprehensions
base_team_url = 'https://www.roadtonationals.com/api/women/dashboard'

# Create a list of all team dashboards across all years and teams 
# NB: Some of these will be inactive, but we will filter these out later
meet_urls = [f'{base_team_url}/{str(year)}/{str(team_id)}' for year in years for team_id in teams_df['team_id']]


In [363]:
# Get the meet info for every team in every year
def get_the_meet_info(url):

    # If we are able to fetch the page without timing out
    if fetch_page(url):   
        payload = {}
        headers = {
                'Cookie': 'PHPSESSID=c48eb24102c0c45390a5d64809741f95'
                }

        response = requests.request("GET", url, headers=headers, data=payload)

        # Save the data to a json file
        with open(f'../Data/Raw/meets/{year}_{team}_meets.json', 'w') as f:
            # pure text
            f.write(response.text)
    else:
        pass



In [365]:
for url in tqdm(meet_urls):
    get_the_meet_info(url)

  0%|          | 0/890 [00:00<?, ?it/s]

ConnectTimeout: HTTPSConnectionPool(host='www.roadtonationals.com', port=443): Max retries exceeded with url: /api/women/dashboard/2024/59 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x13147d4f0>, 'Connection to www.roadtonationals.com timed out. (connect timeout=None)'))

In [284]:
# Read the json files into a dataframe

# Create an empty dataframe
meets_data_df = pd.DataFrame()

# For every year, load the data from the json file and append to the dataframe
for year in years:
    for team in team_ids:
        filename = f'../Data/Raw/meets/{year}_{team}_meets.json'

    # Read the json file into a temporary df
    temp_df = pd.read_json(filename)
    temp_df['year'] = year
    temp_df['team_id'] = team

    # Append the temporary df to the main df
    meets_data_df = pd.concat([meets_data_df, temp_df])


meets_data_df = meets_data_df.reset_index(drop=True)
meets_df = pd.json_normalize(meets_data_df['data']).reset_index(drop=True)
meets_df['year'] = meets_data_df['year']
meets_df['team_id'] = meets_data_df['team_id']

ValueError: Expected object or value

In [132]:
# PART 1: Go to the url, wait until everything on the page loads

def get_url_and_wait_for_elements_to_load(url, css_selector):
    try:
        driver.get(url)
        print("*****************************")
        element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)) # (Source: https://selenium-python.readthedocs.io/waits.html )
        )
        print(element)
    except:
        print("oh no it didn't work")
        pass # ?? Trying to make it so that the program doesn't crash if the element isn't found
    

get_url_and_wait_for_elements_to_load(url, 'div.rt-table > div.rt-tbody')
response = Selector(text=driver.page_source)

print(response)

'*****************************'
<selenium.webdriver.remote.webelement.WebElement (session="07009a743688c6649c21bd2153854f3e", element="f.ED1D65A324CCA1A2641FC4D0D7ED1778.d.F2F0A140D0AB90353F48C2CDCCFC3BC5.e.14")>
<Selector query=None data='<html class="whatinput-types-initial"...'>


In [133]:
# PART 2: Set up the dataframes we exported in the last session as variables

teams_df = pd.read_csv('../Data/Raw/teams.csv')
meets_df = pd.read_csv('../Data/Raw/meets.csv')

#print(teams_df)
for meet in meets_df['link'][0:20]:
    print(meet)

'https://roadtonationals.com/results/schedule/meet/29998'
'https://roadtonationals.com/results/schedule/meet/29057'
'https://roadtonationals.com/results/schedule/meet/29134'
'https://roadtonationals.com/results/schedule/meet/29224'
'https://roadtonationals.com/results/schedule/meet/29291'
'https://roadtonationals.com/results/schedule/meet/29423'
'https://roadtonationals.com/results/schedule/meet/29515'
'https://roadtonationals.com/results/schedule/meet/29616'
'https://roadtonationals.com/results/schedule/meet/29671'
'https://roadtonationals.com/results/schedule/meet/29810'
'https://roadtonationals.com/results/schedule/meet/29840'
'https://roadtonationals.com/results/schedule/meet/29871'
'https://roadtonationals.com/results/schedule/meet/29949'
'https://roadtonationals.com/results/schedule/meet/30143'
'https://roadtonationals.com/results/schedule/meet/30171'
'https://roadtonationals.com/results/schedule/meet/30206'
'https://roadtonationals.com/results/schedule/meet/30226'
'https://roadt

In [134]:
css_selector = 'div.rt-tbody'
get_url_and_wait_for_elements_to_load(meet_link, css_selector)
response = Selector(text=driver.page_source)  
meet_results_table = response.css(css_selector)
meet_results_table_rows = meet_results_table.css('div.rt-tr-group')
teams_button_clicker = driver.find_element(By.CSS_SELECTOR, '#teambtn')
row_count = 0
meet_host = ''

if response.css('p:nth-child(4)').get():
    meet_host = response.css('p:nth-child(4)::text').get()
else:
    meet_host = 'NaN'

meet_hosts.append(meet_host)


'*****************************'
<selenium.webdriver.remote.webelement.WebElement (session="07009a743688c6649c21bd2153854f3e", element="f.ED1D65A324CCA1A2641FC4D0D7ED1778.d.419B255BDF322B9765ED112CE6060F9E.e.15")>


In [135]:
get_url_and_wait_for_elements_to_load(meet_link, css_selector)
response = Selector(text=driver.page_source)

meet_results_table = response.css(css_selector)
meet_results_table.css('div.rt-tr-group::text').getall()

meet_results_table_rows.css('div:nth-child(4)::text').getall()
len(meet_results_table_rows)

'*****************************'
<selenium.webdriver.remote.webelement.WebElement (session="07009a743688c6649c21bd2153854f3e", element="f.ED1D65A324CCA1A2641FC4D0D7ED1778.d.6F69F191DAF84E72334C4147AB188957.e.17")>


2

In [136]:
def get_team_score_info(url):

    #Load the page and set the selectors
    get_url_and_wait_for_elements_to_load(url, css_selector)
    response = Selector(text=driver.page_source)  
    meet_results_table_rows = meet_results_table.css('div.rt-tr-group')

    # Determind the meet id
    meet_id = url.split('/')[-1]
    meet_ids.append(meet_id)
    
    # Find out if there is a host for the meet, and if so add to the meet_hosts list
    meet_host = ''
    
    if response.css('p:nth-child(4)').get():
        meet_host = response.css('p:nth-child(4)::text').get()
    else:
        meet_host = 'NaN'

    meet_hosts.append(meet_host)
    
    # Find out how many teams are competing in the meet
    team_count = len(meet_results_table_rows)

    # Add the meet id to the list of team_meet_ids list for each team
    for team_meet_id in range(0, team_count):
        team_meet_id = meet_id
        team_meet_ids.append(team_meet_id)
    
    # Get the hrefs for each team
    team_hrefs = meet_results_table_rows.css('div > div > a::attr(href)').getall()
    # Splitting out the team_id and adding them to the team_ids list
    for team_href in team_hrefs:
        team_id = team_href.split('/')[-1]
        team_ids.append(team_id)

    # Get the scores for each event and the total meet score (this generates a list of lists)
    current_meet_team_vt_scores = meet_results_table_rows.css('div:nth-child(4)::text').getall()
    current_meet_team_ub_scores = meet_results_table_rows.css('div:nth-child(5)::text').getall()
    current_meet_team_bb_scores = meet_results_table_rows.css('div:nth-child(6)::text').getall()
    current_meet_team_fx_scores = meet_results_table_rows.css('div:nth-child(7)::text').getall()
    current_meet_team_meet_scores = meet_results_table_rows.css('div:nth-child(8) > strong::text').getall()

    # Iterating over the lists generated above and adding them to the appropriate (variable) list
    for score in current_meet_team_vt_scores:
        team_vt_scores.append(score)
    
    for score in current_meet_team_ub_scores:
        team_ub_scores.append(score)
    
    for score in current_meet_team_bb_scores:
        team_bb_scores.append(score)
    
    for score in current_meet_team_fx_scores:
        team_fx_scores.append(score)
    
    for score in current_meet_team_meet_scores:
        team_meet_scores.append(score)
    
    
    return team_ids, team_meet_ids, team_vt_scores, team_ub_scores, team_bb_scores, team_fx_scores, team_meet_scores, meet_hosts
    


In [None]:
def get_gymnast_score_info(url):

    #Load the page and set the selectors
    get_url_and_wait_for_elements_to_load(url, css_selector)
    response = Selector(text=driver.page_source)  
    meet_results_table_rows = response.css('div.rt-tr-group')
    
    # Determind the meet id
    meet_id = url.split('/')[-1]
    meet_ids.append(meet_id)

    # Find out how many teams are competing in the meet
    team_count = len(meet_results_table_rows)

    #Click the "Teams" button
    driver.find_element(By.CSS_SELECTOR, '#teambtn').click()
    gymnast_results_table_rows = response.css('div.rt-tr-group')

    for i in range(0, team_count): # Looping through the teams
        # Click on the Team Name
        team_clicker_selector = "#team" + str(i)
        team_clicker = driver.find_element(By.CSS_SELECTOR, team_clicker_selector)
        team_clicker.click()

        # Get the gymnast metadata
        gymnast_hrefs = gymnast_results_table_rows.css('a::attr(href)').getall()
        gymnast_names = gymnast_results_table_rows.css('a::text').getall()
        
        for href in gymnast_hrefs:
            gymnast_id = href.split('/')[-1]
            gymnast_ids.append(gymnast_id)
            gymnast_team_id = href.split('/')[-2]
            gymnast_team_ids.append(gymnast_team_id)
        
        for name in gymnast_names:
            gymnast_names.append(name)

        # Get the gymnast scores
        gymnast_vt_scores = gymnast_results_table_rows.css('div:nth-child(3)::text').getall()
        gymnast_ub_scores = gymnast_results_table_rows.css('div:nth-child(4)::text').getall()
        gymnast_bb_scores = gymnast_results_table_rows.css('div:nth-child(5)::text').getall()
        gymnast_fx_scores = gymnast_results_table_rows.css('div:nth-child(6)::text').getall()
        gymnast_aa_scores = gymnast_results_table_rows.css('div:nth-child(7)::text').getall()



        

    meet_hosts.append(meet_host)
    
    # Find out how many teams are competing in the meet
    team_count = len(meet_results_table_rows)

    # Add the meet id to the list of team_meet_ids list for each team
    for team_meet_id in range(0, team_count):
        team_meet_id = meet_id
        team_meet_ids.append(team_meet_id)
    
    # Get the hrefs for each team
    team_hrefs = meet_results_table_rows.css('div > div > a::attr(href)').getall()
    # Splitting out the team_id and adding them to the team_ids list
    for team_href in team_hrefs:
        team_id = team_href.split('/')[-1]
        team_ids.append(team_id)

    # Get the scores for each event and the total meet score (this generates a list of lists)
    current_meet_team_vt_scores = meet_results_table_rows.css('div:nth-child(4)::text').getall()
    current_meet_team_ub_scores = meet_results_table_rows.css('div:nth-child(5)::text').getall()
    current_meet_team_bb_scores = meet_results_table_rows.css('div:nth-child(6)::text').getall()
    current_meet_team_fx_scores = meet_results_table_rows.css('div:nth-child(7)::text').getall()
    current_meet_team_meet_scores = meet_results_table_rows.css('div:nth-child(8) > strong::text').getall()

    # Iterating over the lists generated above and adding them to the appropriate (variable) list
    for score in current_meet_team_vt_scores:
        team_vt_scores.append(score)
    
    for score in current_meet_team_ub_scores:
        team_ub_scores.append(score)
    
    for score in current_meet_team_bb_scores:
        team_bb_scores.append(score)
    
    for score in current_meet_team_fx_scores:
        team_fx_scores.append(score)
    
    for score in current_meet_team_meet_scores:
        team_meet_scores.append(score)
    
    
    return team_ids, team_meet_ids, team_vt_scores, team_ub_scores, team_bb_scores, team_fx_scores, team_meet_scores, meet_hosts
    


In [137]:
# 5 Go to each of the meet's links and scrape the score information

# dfs we are adding to: meets_df, team_scores_df (meet_id, team_id), gymnast_scores_df (team_id, meet_id)

# Setting up the variables we will be using
meet_links = meets_df['link']
meet_ids = []
team_ids = []
team_meet_ids = []
team_vt_scores = []
team_ub_scores = []
team_bb_scores = []
team_fx_scores = []
team_meet_scores = []
gymnast_ids = []
gymnast_names = []
gymnast_team_ids = []
gymnast_meet_ids = []
gymnast_vt_scores = []
gymnast_ub_scores = []
gymnast_bb_scores = []
gymnast_fx_scores = []
gymnast_aa_scores = []
meet_hosts = []

meet_hosts

[]

In [121]:
meet_links[0:20]

0     https://roadtonationals.com/results/schedule/m...
1     https://roadtonationals.com/results/schedule/m...
2     https://roadtonationals.com/results/schedule/m...
3     https://roadtonationals.com/results/schedule/m...
4     https://roadtonationals.com/results/schedule/m...
5     https://roadtonationals.com/results/schedule/m...
6     https://roadtonationals.com/results/schedule/m...
7     https://roadtonationals.com/results/schedule/m...
8     https://roadtonationals.com/results/schedule/m...
9     https://roadtonationals.com/results/schedule/m...
10    https://roadtonationals.com/results/schedule/m...
11    https://roadtonationals.com/results/schedule/m...
12    https://roadtonationals.com/results/schedule/m...
13    https://roadtonationals.com/results/schedule/m...
14    https://roadtonationals.com/results/schedule/m...
15    https://roadtonationals.com/results/schedule/m...
16    https://roadtonationals.com/results/schedule/m...
17    https://roadtonationals.com/results/schedu

In [141]:
subset_meet_links = meet_links[0:100]

def get_all_the_team_results_from_all_the_meets(url):
    for meet_link in tqdm(meet_links):
        #print(meet_link)
        get_team_score_info(meet_link)
    return team_ids, team_meet_ids, team_vt_scores, team_ub_scores, team_bb_scores, team_fx_scores, team_meet_scores, meet_hosts

get_all_the_team_results_from_all_the_meets(url)

print(len(team_ids))
print(len(team_meet_ids))
print(len(team_vt_scores))
print(len(team_ub_scores))
print(len(team_bb_scores))
print(len(team_fx_scores))
print(len(team_meet_scores))


team_results_df = pd.DataFrame({'team_id': team_ids, 'meet_id': team_meet_ids, 'vt_score': team_vt_scores, 'ub_score': team_ub_scores, 'bb_score': team_bb_scores, 'fx_score': team_fx_scores, 'meet_score': team_meet_scores})

  0%|          | 0/7866 [00:00<?, ?it/s]

'*****************************'
<selenium.webdriver.remote.webelement.WebElement (session="07009a743688c6649c21bd2153854f3e", element="f.ED1D65A324CCA1A2641FC4D0D7ED1778.d.09560E5813B5EEF4D2C0C11BF5215C51.e.378")>
'*****************************'
<selenium.webdriver.remote.webelement.WebElement (session="07009a743688c6649c21bd2153854f3e", element="f.ED1D65A324CCA1A2641FC4D0D7ED1778.d.547CEA642249738285557D9C5FD2BEF7.e.379")>
'*****************************'
<selenium.webdriver.remote.webelement.WebElement (session="07009a743688c6649c21bd2153854f3e", element="f.ED1D65A324CCA1A2641FC4D0D7ED1778.d.D271AECA735777AE3187CFF47AFF5022.e.380")>
'*****************************'
<selenium.webdriver.remote.webelement.WebElement (session="07009a743688c6649c21bd2153854f3e", element="f.ED1D65A324CCA1A2641FC4D0D7ED1778.d.938F012127A510F302255E3454A385D4.e.381")>
'*****************************'
<selenium.webdriver.remote.webelement.WebElement (session="07009a743688c6649c21bd2153854f3e", element="f.ED1D65A

WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=126.0.6478.183)
Stacktrace:
0   chromedriver                        0x00000001063f2078 chromedriver + 5169272
1   chromedriver                        0x00000001063e9f4a chromedriver + 5136202
2   chromedriver                        0x0000000105f6636c chromedriver + 402284
3   chromedriver                        0x0000000105f4bd26 chromedriver + 294182
4   chromedriver                        0x0000000105f4bc13 chromedriver + 293907
5   chromedriver                        0x0000000105f68782 chromedriver + 411522
6   chromedriver                        0x0000000105ff592b chromedriver + 989483
7   chromedriver                        0x0000000105fd6853 chromedriver + 862291
8   chromedriver                        0x0000000105fa65c6 chromedriver + 665030
9   chromedriver                        0x0000000105fa6e4e chromedriver + 667214
10  chromedriver                        0x00000001063b4c90 chromedriver + 4918416
11  chromedriver                        0x00000001063b9c8d chromedriver + 4938893
12  chromedriver                        0x00000001063ba365 chromedriver + 4940645
13  chromedriver                        0x0000000106395d74 chromedriver + 4791668
14  chromedriver                        0x00000001063ba659 chromedriver + 4941401
15  chromedriver                        0x0000000106387544 chromedriver + 4732228
16  chromedriver                        0x00000001063da828 chromedriver + 5072936
17  chromedriver                        0x00000001063da9e7 chromedriver + 5073383
18  chromedriver                        0x00000001063e9afe chromedriver + 5135102
19  libsystem_pthread.dylib             0x00007ff805ef51d3 _pthread_start + 125
20  libsystem_pthread.dylib             0x00007ff805ef0bd3 thread_start + 15


In [140]:
team_results_df

Unnamed: 0,team_id,meet_id,vt_score,ub_score,bb_score,fx_score,meet_score
0,15,29998,49.3750,49.5250,49.5500,49.4250,197.8750
1,71,29998,49.0000,49.0000,48.8500,49.2000,196.0500
2,15,29057,49.3750,49.5250,49.5500,49.4250,197.8750
3,71,29057,49.0000,49.0000,48.8500,49.2000,196.0500
4,15,29134,49.3750,49.5250,49.5500,49.4250,197.8750
...,...,...,...,...,...,...,...
395,71,24734,49.0000,49.0000,48.8500,49.2000,196.0500
396,15,24807,49.3750,49.5250,49.5500,49.4250,197.8750
397,71,24807,49.0000,49.0000,48.8500,49.2000,196.0500
398,15,24836,49.3750,49.5250,49.5500,49.4250,197.8750


## Scraping using hidden APIS

In [None]:
# # Replace with the actual API endpoint you discovered
# years = [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]

# api_url = 'https://www.roadtonationals.com/api/women/finalresults/2024'

# # Include necessary headers, cookies, or auth tokens
# headers = {}

# response = requests.get(api_url, headers=headers)

# if response.status_code == 200:
#     data = response.json()
#     print(data)
# else:
#     print(f"Failed to retrieve data: {response.status_code}")