In [1]:
import os
import json
import requests
import datetime

import numpy as np
import pandas as pd 

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy import Selector

from tqdm.notebook import tqdm
from pprint import pprint as print

In [3]:
# Read the teams data json files into a dataframe

# Create an empty dataframe
years = [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015] # These are the years that we are interested in evaluating
teams_data_df = pd.DataFrame()

# For every year, load the data from the json file and append to the dataframe
for year in years:
    filename = f'../Data/Raw/teams/{year}_teams.json'

    # Read the json file into a temporary df
    temp_df = pd.read_json(filename)
    temp_df['year'] = year

    # Append the temporary df to the main df
    teams_data_df = pd.concat([teams_data_df, temp_df])


teams_data_df = teams_data_df.reset_index(drop=True)
teams_df = pd.json_normalize(teams_data_df['data']).reset_index(drop=True)
teams_df['year'] = teams_data_df['year']

In [4]:
teams_df

Unnamed: 0,rank,team_name,team_id,ncaa_final,ncaa,nqs,regionals,rqs,division_id,average_score,high_score,year
0,1,LSU,34,198.225,198.113,396.465,198.250,198.215,1,197.908,198.475,2024
1,2,California,15,197.850,197.713,396.455,198.275,198.180,1,197.833,198.550,2024
2,3,Utah,69,197.800,197.938,395.470,197.575,197.895,1,197.704,198.300,2024
3,4,Florida,22,197.438,197.875,396.230,198.325,197.905,1,197.670,198.225,2024
4,5,Stanford,61,,197.075,394.620,197.575,197.045,1,196.563,197.975,2024
...,...,...,...,...,...,...,...,...,...,...,...,...
803,78,Hamline,26,,,,,184.345,3,182.275,187.100,2015
804,79,UW-Eau Claire,77,,,,,184.155,3,169.692,187.850,2015
805,80,UW-Oshkosh,79,,,,,183.930,3,155.261,188.325,2015
806,81,Gustavus Adolphus,25,,,,,178.820,3,153.094,180.200,2015


In [5]:
# Drop the columns that we are not interested in
teams_df = teams_df.drop(columns=['rank', 'ncaa_final', 'nqs', 'regionals', 'rqs', 'division_id', 'average_score', 'high_score', 'ncaa'])

In [6]:
# Preview the df
teams_df.head()

Unnamed: 0,team_name,team_id,year
0,LSU,34,2024
1,California,15,2024
2,Utah,69,2024
3,Florida,22,2024
4,Stanford,61,2024


In [7]:
# Determine the link to access the team's dashboard
base_team_url = 'https://www.roadtonationals.com/api/women/dashboard'

# Add the team links to the team_url column
teams_df['team_url'] = teams_df.apply(lambda x: f'{base_team_url}/{str(x["year"])}/{str(x["team_id"])}', axis=1)

In [8]:
# Preview the df - this looks good to work with now
teams_df.head()

Unnamed: 0,team_name,team_id,year,team_url
0,LSU,34,2024,https://www.roadtonationals.com/api/women/dash...
1,California,15,2024,https://www.roadtonationals.com/api/women/dash...
2,Utah,69,2024,https://www.roadtonationals.com/api/women/dash...
3,Florida,22,2024,https://www.roadtonationals.com/api/women/dash...
4,Stanford,61,2024,https://www.roadtonationals.com/api/women/dash...


In [50]:
# Read the json files into a meets dataframe

# Create an empty dataframe
team_ids = teams_df['team_id'].tolist()
meets_data_df = pd.DataFrame()

with open(filename) as data_file:    
    data = json.load(data_file)  


# For every year, load the data from the json file and append to the dataframe
for year in years:
    for team in team_ids:
        filename = f'../Data/Raw/meets/{year}_{team}_meets.json'

        with open(filename) as data_file:    
            data = json.load(data_file) 

            # Read the json file into a temporary df
            temp_df = pd.json_normalize(data, 'meets')
            temp_df['year'] = year
            temp_df['team_id'] = team

            # Append the temporary df to the main df
            meets_data_df = pd.concat([meets_data_df, temp_df])


meets_data_df = meets_data_df.reset_index(drop=True)

In [51]:
# Preview the df
meets_data_df.sort_values(by='meet_id', ascending=False).head()


Unnamed: 0,team_id,team_name,meet_id,meet_date,team_score,home,opponent,meet_desc,linked_id,jas,year
9668,69,Utah,30231,"Sat, Apr-20-2024",197.8,A,"California, Florida, LSU",NCAA Championships Finals,6392,,2024
4308,69,Utah,30231,"Sat, Apr-20-2024",197.8,A,"California, Florida, LSU",NCAA Championships Finals,6392,,2024
48,69,Utah,30231,"Sat, Apr-20-2024",197.8,A,"California, Florida, LSU",NCAA Championships Finals,6392,,2024
7558,69,Utah,30231,"Sat, Apr-20-2024",197.8,A,"California, Florida, LSU",NCAA Championships Finals,6392,,2024
2311,69,Utah,30231,"Sat, Apr-20-2024",197.8,A,"California, Florida, LSU",NCAA Championships Finals,6392,,2024


In [52]:
meets_df['meet_url'] = meets_df['meet_id'].apply(lambda x: f"{results_url_root}{str(x)}")
meets_df.set_index('meet_url').head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meets_df['meet_url'] = meets_df['meet_id'].apply(lambda x: f"{results_url_root}{str(x)}")


Unnamed: 0_level_0,team_id,team_name,meet_id,meet_date,team_score,home,opponent,meet_desc,linked_id,jas,year,team_opponent,all_teams
meet_url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
https://www.roadtonationals.com/api/women/meetresults/28977,34,LSU,28977,"Fri, Jan-05-2024",196.975,H,Ohio State,,5986,,2024,"[LSU, Ohio State]","(LSU, Ohio State)"
https://www.roadtonationals.com/api/women/meetresults/29040,34,LSU,29040,"Sat, Jan-13-2024",197.15,A,"Oklahoma, UCLA, Utah",Sprouts Farmers Market Collegiate Quad,6011,,2024,"[LSU, Oklahoma, UCLA, Utah]","(LSU, Oklahoma, UCLA, Utah)"
https://www.roadtonationals.com/api/women/meetresults/29098,34,LSU,29098,"Fri, Jan-19-2024",198.125,H,Kentucky,,6030,,2024,"[LSU, Kentucky]","(Kentucky, LSU)"
https://www.roadtonationals.com/api/women/meetresults/29215,34,LSU,29215,"Fri, Jan-26-2024",197.225,A,Missouri,,6078,,2024,"[LSU, Missouri]","(LSU, Missouri)"
https://www.roadtonationals.com/api/women/meetresults/29303,34,LSU,29303,"Fri, Feb-02-2024",198.475,H,Arkansas,,6111,,2024,"[LSU, Arkansas]","(Arkansas, LSU)"


In [55]:
# Create a new column that stores the team name and the opponent names as a sorted list
meets_data_df['all_teams'] = meets_data_df.apply(lambda x: [x['team_name']] + x['opponent'].split(', '), axis=1)

meets_data_df['all_teams'] = meets_data_df['all_teams'].apply(lambda x: sorted(x))

meets_data_df['all_teams'] = meets_data_df['all_teams'].apply(tuple)

meets_data_df

# Drop duplicates (when all_teams and meet_date column are identical, they are duplicates)
meets_df = meets_data_df.drop_duplicates(subset=['all_teams', 'meet_date'])

len(meets_df)


3614

In [96]:
# Read the json files into a results dataframe
meet_ids = meets_df['meet_id'].tolist()

# Create an empty dataframe
team_results_data_df = pd.DataFrame()
gymnasts_data_df = pd.DataFrame()

# with open(filename) as data_file:    
#     data = json.load(data_file)  


# For every meet, load the data from the results json file and append to the dataframe
for meet_id in meet_ids:
    filename = f'../Data/Raw/results/{meet_id}_results.json'

    if os.path.exists(filename):
        if os.path.getsize(filename) == 0:
            print(f"File {filename} is empty.")
            continue

        try:
            with open(filename) as data_file:
                data = json.load(data_file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from file {filename}: {e}")
            continue

        # Read the json file into temporary dataframes
        temp_team_df = pd.json_normalize(data, 'teams')

        # Normalising the scores data
        scores_data = data['scores']

        # Flatten the nested structure
        # Since 'scores' is a list of lists, we need to flatten it first
        flattened_scores = [item for sublist in scores_data for item in sublist]

        # Create DataFrame
        temp_gymnast_df = pd.json_normalize(flattened_scores)
        temp_gymnast_df['meet_id'] = meet_id

        # Append the temporary dataframes to the main dataframes
        team_results_data_df = pd.concat([team_results_data_df, temp_team_df])
        gymnasts_data_df = pd.concat([gymnasts_data_df, temp_gymnast_df])
    
    else:
        print(f"File {filename} does not exist.")
        continue

# Reset index for the final dataframes
team_results_data_df = team_results_data_df.reset_index(drop=True)
gymnasts_data_df = gymnasts_data_df.reset_index(drop=True)

('Error decoding JSON from file ../Data/Raw/results/27977_results.json: '
 'Expecting value: line 1 column 1 (char 0)')
('Error decoding JSON from file ../Data/Raw/results/26843_results.json: '
 'Expecting value: line 1 column 1 (char 0)')
('Error decoding JSON from file ../Data/Raw/results/24822_results.json: '
 'Expecting value: line 1 column 1 (char 0)')
('Error decoding JSON from file ../Data/Raw/results/21326_results.json: '
 'Expecting value: line 1 column 1 (char 0)')
('Error decoding JSON from file ../Data/Raw/results/20001_results.json: '
 'Expecting value: line 1 column 1 (char 0)')
('Error decoding JSON from file ../Data/Raw/results/19660_results.json: '
 'Expecting value: line 1 column 1 (char 0)')
('Error decoding JSON from file ../Data/Raw/results/20016_results.json: '
 'Expecting value: line 1 column 1 (char 0)')


In [97]:
# Preview the dataframes
team_results_data_df

# This one looks ok!

Unnamed: 0,mid,tid,tname,vault,bars,beam,floor,tscore,year,home,lead
0,28977,34,LSU,49.3750,49.3750,48.7000,49.5250,196.9750,2024,H,0.000
1,28978,46,Ohio State,49.3000,49.1250,49.0500,49.3000,196.7750,2024,A,0.200
2,29039,47,Oklahoma,49.4500,49.4500,49.5250,49.4750,197.9000,2024,A,0.000
3,29040,34,LSU,49.2250,49.6500,48.7500,49.5250,197.1500,2024,A,0.750
4,29042,66,UCLA,49.4000,49.2500,49.2500,49.2000,197.1000,2024,A,0.800
...,...,...,...,...,...,...,...,...,...,...,...
10262,18077,25,Gustavus Adolphus,45.5250,40.1750,43.2500,43.9000,172.8500,2015,A,10.250
10263,18258,76,Winona State,46.9000,46.2750,46.3750,46.4000,185.9500,2015,A,0.000
10264,18805,26,Hamline,46.7750,45.5500,46.3500,46.3250,185.0000,2015,H,0.950
10265,18800,26,Hamline,44.5000,44.0000,42.7500,44.2250,175.4750,2015,A,0.000


In [98]:
# Preview the dataframes
gymnasts_data_df
# This one needs rationalization

Unnamed: 0,gid,first_name,last_name,vault,bars,beam,floor,all_around,team_name,team_id,yr,vt_url,ub_url,bb_url,fx_url,meet_id
0,30950,Sierra,Ballard,,,9.2000,9.9000,,LSU,34,2024,,,,,28977
1,30952,Haleigh,Bryant,9.9500,9.8750,9.9250,9.9250,39.6750,LSU,34,2024,,,,,28977
2,31947,Ashley,Cowan,,9.8000,,,,LSU,34,2024,,,,,28977
3,32453,Amari,Drayton,9.9250,,,9.9250,,LSU,34,2024,,,,,28977
4,30953,Olivia,Dunne,,,,9.8750,,LSU,34,2024,,,,,28977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113695,23535,Jamie,Niedermayer,8.8000,,8.5250,8.1750,,Gustavus Adolphus,25,2015,,,,,18073
113696,23536,Jessica,Niedermayer,,7.5750,8.5500,,,Gustavus Adolphus,25,2015,,,,,18073
113697,23537,Hilary,Sabourin,,,,8.8000,,Gustavus Adolphus,25,2015,,,,,18073
113698,23538,Kate,Schneider,8.8250,,,,,Gustavus Adolphus,25,2015,,,,,18073


In [82]:
# Rationalize the gymnasts dataframe
pd.json_normalize(gymnasts_data_df['vault']).reset_index(drop=True)

KeyError: 'vault'