In [1]:
!pip install -r ../requirements.txt



In [2]:
!pip install --ugrade iphython-sql
!pip install --upgrade SQLAlchemy


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: --ugrade


In [3]:
!pip show SQLAlchemy

Name: SQLAlchemy
Version: 2.0.31
Summary: Database Abstraction Library
Home-page: https://www.sqlalchemy.org
Author: Mike Bayer
Author-email: mike_mp@zzzcomputing.com
License: MIT
Location: /Users/jessicahyne/opt/miniconda3/envs/gymternet/lib/python3.12/site-packages
Requires: greenlet, typing-extensions
Required-by: ipython-sql, jupysql


In [4]:
import os
import json
import requests
import sqlite3
from tqdm.notebook import tqdm, trange
tqdm.pandas()
import numpy as np
import pandas as pd 
from sqlalchemy import create_engine

%load_ext sql
%config SqlMagic.autocommit=True

from pprint import pprint

# 1 Clean up `teams_df` and create `Teams.db`

What I want in this table in my database is:

| **team_id** | **team_name** | **team_url**                           |
|-------------|---------------|----------------------------------------|
| 1           | 'Auburn'      | "https://www.roadtonationals/team/645" | 


etc.

The `team_id` column should act as the primary key.


In [5]:
# Make a database to keep all my lovely tables in
%sql sqlite:///../data/clean/gymternet.db --alias gymternet
engine = create_engine('sqlite:///../data/clean/gymternet.db')

In [6]:
teams_df = pd.read_pickle('../data/raw/dirty_dfs/teams_df.pkl')
meets_df = pd.read_pickle('../data/raw/dirty_dfs/meets_df.pkl')
gymnasts_data_df = pd.read_pickle('../data/raw/dirty_dfs/gymnasts_data_df.pkl')
team_results_data_df = pd.read_pickle('../data/raw/dirty_dfs/team_results_data_df.pkl')

# 1 Clean up `teams_df` and create `Teams.db`

What I want in this table in my database is:

| **team_id** | **team_name** | **team_url**                           |
|-------------|---------------|----------------------------------------|
| 1           | 'Auburn'      | "https://www.roadtonationals/team/645" | 


etc.

The `team_id` column should act as the primary key.


In [7]:
# Preview the DataFrame
#teams_df.dtypes
teams_df.head()

Unnamed: 0,team_name,team_id,year,team_url
0,LSU,34,2024,https://www.roadtonationals.com/api/women/dash...
1,California,15,2024,https://www.roadtonationals.com/api/women/dash...
2,Utah,69,2024,https://www.roadtonationals.com/api/women/dash...
3,Florida,22,2024,https://www.roadtonationals.com/api/women/dash...
4,Stanford,61,2024,https://www.roadtonationals.com/api/women/dash...


This DataFrame is already pretty tidy, but the links are pointing towards the API, which is not what we want, and we still have an irrelevant column (`year`)

In [8]:
base_team_url = 'https://roadtonationals.com/results/teams/dashboard'
teams_df['team_url'] = teams_df.apply(lambda x: f'{str(base_team_url)}/{str(x["year"])}/{str(x["team_id"])}', axis=1)

In [9]:
# Preview the df
teams_df.head()

Unnamed: 0,team_name,team_id,year,team_url
0,LSU,34,2024,https://roadtonationals.com/results/teams/dash...
1,California,15,2024,https://roadtonationals.com/results/teams/dash...
2,Utah,69,2024,https://roadtonationals.com/results/teams/dash...
3,Florida,22,2024,https://roadtonationals.com/results/teams/dash...
4,Stanford,61,2024,https://roadtonationals.com/results/teams/dash...


In [10]:
# Drop the year column
teams_df.drop(columns='year', inplace=True)
teams_df.head()

Unnamed: 0,team_name,team_id,team_url
0,LSU,34,https://roadtonationals.com/results/teams/dash...
1,California,15,https://roadtonationals.com/results/teams/dash...
2,Utah,69,https://roadtonationals.com/results/teams/dash...
3,Florida,22,https://roadtonationals.com/results/teams/dash...
4,Stanford,61,https://roadtonationals.com/results/teams/dash...


In [11]:
teams_df.to_sql('teams', con=engine, if_exists = 'append', chunksize = 1000)

89

In [12]:
meets_df.head()

Unnamed: 0,team_id,team_name,meet_id,meet_date,team_score,home,opponent,meet_desc,linked_id,jas,year,meet_url,all_teams
0,34,LSU,28977,"Fri, Jan-05-2024",196.975,H,Ohio State,,5986,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Ohio State)"
1,34,LSU,29040,"Sat, Jan-13-2024",197.15,A,"Oklahoma, UCLA, Utah",Sprouts Farmers Market Collegiate Quad,6011,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Oklahoma, UCLA, Utah)"
2,34,LSU,29098,"Fri, Jan-19-2024",198.125,H,Kentucky,,6030,,2024,https://www.roadtonationals.com/api/women/meet...,"(Kentucky, LSU)"
3,34,LSU,29215,"Fri, Jan-26-2024",197.225,A,Missouri,,6078,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Missouri)"
4,34,LSU,29303,"Fri, Feb-02-2024",198.475,H,Arkansas,,6111,,2024,https://www.roadtonationals.com/api/women/meet...,"(Arkansas, LSU)"


# 1 Clean up `meets_df` and create `meets.db`

What I want in this table in my database is:

| **meet_id** | **year**      | **team_id** | **date**   |
|-------------|---------------|-------------|------------|
| 117897      | 2015          | 1           | 2015-01-09 |


etc.

The unique combination of `meet_id` and `team_id` columns should act as the primary key, the `team_id` column is a foreign key, connecting to the `teams` table.

This DataFrame is a bit more of a mess, so will require a bit more cleaning before I send to the Gymternet Database.

In [13]:
meets_df = pd.read_pickle('../data/raw/dirty_dfs/meets_df.pkl')
# Preview the meets_df
meets_df.head()

Unnamed: 0,team_id,team_name,meet_id,meet_date,team_score,home,opponent,meet_desc,linked_id,jas,year,meet_url,all_teams
0,34,LSU,28977,"Fri, Jan-05-2024",196.975,H,Ohio State,,5986,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Ohio State)"
1,34,LSU,29040,"Sat, Jan-13-2024",197.15,A,"Oklahoma, UCLA, Utah",Sprouts Farmers Market Collegiate Quad,6011,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Oklahoma, UCLA, Utah)"
2,34,LSU,29098,"Fri, Jan-19-2024",198.125,H,Kentucky,,6030,,2024,https://www.roadtonationals.com/api/women/meet...,"(Kentucky, LSU)"
3,34,LSU,29215,"Fri, Jan-26-2024",197.225,A,Missouri,,6078,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Missouri)"
4,34,LSU,29303,"Fri, Feb-02-2024",198.475,H,Arkansas,,6111,,2024,https://www.roadtonationals.com/api/women/meet...,"(Arkansas, LSU)"


In [14]:
# Drop the irrelevant columns: team_name, team_id, team_score, opponent, linked_id, jas)
meets_df.drop(columns=['team_name', 'team_id', 'home', 'meet_desc', 'team_score', 'opponent', 'linked_id', 'jas'], inplace=True)

# Preview the meets_df
meets_df.head()

Unnamed: 0,meet_id,meet_date,year,meet_url,all_teams
0,28977,"Fri, Jan-05-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Ohio State)"
1,29040,"Sat, Jan-13-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Oklahoma, UCLA, Utah)"
2,29098,"Fri, Jan-19-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(Kentucky, LSU)"
3,29215,"Fri, Jan-26-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Missouri)"
4,29303,"Fri, Feb-02-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(Arkansas, LSU)"


In [15]:
# Tidy up the data types
meets_df['meet_id'] = meets_df['meet_id'].astype(int)

# Change the format of the meet_date column to be compatible with datetime
meets_df['meet_date'] = meets_df['meet_date'].apply(lambda x: x.split(' ')[1])

# Change the "Jan", "Feb", etc. to numbers
months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
meets_df['meet_date'] = meets_df['meet_date'].apply(lambda x: f'{x.split("-")[2]}-{months[x.split("-")[0]]}-{x.split("-")[1]}')

# Change the datatype of the meet_date column to datetime
#meets_df['meet_date'] = pd.to_datetime(meets_df['meet_date'])

# Tidy up the meet_url column
base_meet_url = 'https://roadtonationals.com/results/schedule/meet/'
meets_df['meet_url'] = meets_df.apply(lambda x: f'{str(base_meet_url)}{str(x["meet_id"])}', axis=1)

# Preview the meets_df
meets_df.dtypes


meet_id       int64
meet_date    object
year          int64
meet_url     object
all_teams    object
dtype: object

In [16]:
# Melt the all_teams column so that each of the teams in the list has its own row
meets_df = meets_df.explode('all_teams').reset_index(drop=True)

# Preview the meets_df
meets_df.head()

Unnamed: 0,meet_id,meet_date,year,meet_url,all_teams
0,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,LSU
1,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,Ohio State
2,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,LSU
3,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,Oklahoma
4,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,UCLA


In [17]:
# Merge the meets_df with the teams_df to get the team_id
meets_df = meets_df.merge(teams_df[['team_id', 'team_name']], left_on='all_teams', right_on='team_name', how='left')

# Drop the team_name column
meets_df.drop(columns=['team_name', 'all_teams'], inplace=True)

# Rename the team_id column
meets_df.rename(columns={'team_id': 'team_id'}, inplace=True)

# Preview the meets_df
meets_df.head()


Unnamed: 0,meet_id,meet_date,year,meet_url,team_id
0,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,34.0
1,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,46.0
2,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,34.0
3,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,47.0
4,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,66.0


In [18]:
# Find the NA values in the team_id column
meets_df[meets_df['team_id'].isna()]

Unnamed: 0,meet_id,meet_date,year,meet_url,team_id
2073,27977,2023-01-20,2023,https://roadtonationals.com/results/schedule/m...,
3131,26843,2022-01-21,2022,https://roadtonationals.com/results/schedule/m...,
5813,24822,2019-04-12,2019,https://roadtonationals.com/results/schedule/m...,
7426,21326,2017-04-14,2017,https://roadtonationals.com/results/schedule/m...,
9073,20001,2016-02-12,2016,https://roadtonationals.com/results/schedule/m...,
9256,19660,2016-03-19,2016,https://roadtonationals.com/results/schedule/m...,
9266,20016,2016-03-19,2016,https://roadtonationals.com/results/schedule/m...,


Looking at the dataframe, I noticed there are some NA values in the team_id column (indicating a meet with only one team participating). This seemed curious (as typically a competition requires at least two parties), and upon exploring the data, it seems there is a coding error on the website or perhaps there are meets listed that did not end up taking place due to cancellations, but were still listed as 'intended meets'. Furthermore, some teams are listed as participants for (semi) finals that did not actually make it through to that round of competition. 

As such, I can feel safe dropping dropping those meet_ids from the df.

Incidentally, these are the same meet_ids that weren't able to be retrieved in the scraping process, so that solves a mystery there.

In [19]:
meets_df[meets_df['meet_id'].isin([27977, 26843, 24822, 21326, 20001, 19660, 20016])]

# Drop any row that has a meet_id that of a row with a missing team_id
meets_df = meets_df[~meets_df['meet_id'].isin([27977, 26843, 24822, 21326, 20001, 19660, 20016])]

# Make sure there's no more NAs in the team_id column
meets_df[meets_df['team_id'].isna()]

Unnamed: 0,meet_id,meet_date,year,meet_url,team_id


In [20]:
# Change the team_id column to an integer
meets_df['team_id'] = meets_df['team_id'].astype(int)

# Preview the meets_df
meets_df.head()

Unnamed: 0,meet_id,meet_date,year,meet_url,team_id
0,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,34
1,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,46
2,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,34
3,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,47
4,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,66


In [21]:
# Make sure the combination of meet_id and team_id is unique
meets_df[meets_df.duplicated(subset=['meet_id', 'team_id'], keep=False)]

# Drop the duplicate rows
meets_df.drop_duplicates(subset=['meet_id', 'team_id'], inplace=True)

# Make sure the combination of meet_id and team_id is unique
meets_df[meets_df.duplicated(subset=['meet_id', 'team_id'], keep=False)]

Unnamed: 0,meet_id,meet_date,year,meet_url,team_id


In [24]:
%%sql --alias gymternet

DROP TABLE IF EXISTS meets;

CREATE TABLE meets (
    meet_id INTEGER,
    team_id INTEGER,
    year INTEGER,
    meet_date DATE,
    meet_url TEXT,
    PRIMARY KEY (meet_id, team_id),
    FOREIGN KEY (team_id) REFERENCES teams (team_id)
);


In [25]:
%%sql --alias gymternet
PRAGMA table_info(meets);

cid,name,type,notnull,dflt_value,pk
0,meet_id,INTEGER,0,,1
1,team_id,INTEGER,0,,2
2,year,INTEGER,0,,0
3,meet_date,DATE,0,,0
4,meet_url,TEXT,0,,0


In [26]:
meets_df.to_sql('meets', con=engine, if_exists='append', index=False)

10273