# 1 Setting up the environment

## 1.1 Requirements & packages

Make sure the environment is configured as per the README file, the requirements are installed, and the relevant libraries, packages and modules are imported.


In [None]:
# Install the project requirements.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install -r ../requirements.txt

In [4]:
import os
import json
import requests
import sqlite3
from tqdm.notebook import tqdm, trange
tqdm.pandas()
import numpy as np
import pandas as pd 
from sqlalchemy import create_engine

%load_ext sql
%config SqlMagic.autocommit=True

from pprint import pprint

## 1.2 Import the data and set up the Database

In [None]:
# Make a database to keep all my lovely tables in.
# Two handles onto the same SQLite file are created here:
#   - the `%sql` connection (aliased `gymternet`), used by the `%%sql` cells that define the tables
#   - the SQLAlchemy `engine`, used by `DataFrame.to_sql` to load the cleaned data
%sql sqlite:///../data/clean/gymternet.db --alias gymternet
engine = create_engine('sqlite:///../data/clean/gymternet.db')

In [148]:
# Read the dataframes created during the scraping sessions into this notebook.
# Unpickling can execute arbitrary code, so only load pickles produced by this project's own scraping step.
teams_df = pd.read_pickle('../data/raw/dirty_dfs/teams_df.pkl')
meets_df = pd.read_pickle('../data/raw/dirty_dfs/meets_df.pkl')
gymnasts_data_df = pd.read_pickle('../data/raw/dirty_dfs/gymnasts_data_df.pkl')
# NOTE(review): this loads the same pickle as gymnasts_data_df above. That file holds per-meet
# scores alongside the gymnast details, so it looks intentional - confirm there is no separate
# gymnast results pickle that should be read here instead.
gymnast_results_data_df = pd.read_pickle('../data/raw/dirty_dfs/gymnasts_data_df.pkl')
team_results_data_df = pd.read_pickle('../data/raw/dirty_dfs/team_results_data_df.pkl')

# 1 Clean up `teams_df` and create the `teams` table

What I want in this table in my database is:

| **team_id** | **team_name** | **team_url**                           |
|-------------|---------------|----------------------------------------|
| 1           | 'Auburn'      | "https://www.roadtonationals/team/645" | 


etc.

The `team_id` column should act as the primary key.


# 1 Clean up `teams_df` and create the `teams` table

What I want in this table in my database is:

| **team_id** | **team_name** | **team_url**                           |
|-------------|---------------|----------------------------------------|
| 1           | 'Auburn'      | "https://www.roadtonationals/team/645" | 


etc.

The `team_id` column should act as the primary key.


In [149]:
# Preview the first rows of the raw teams DataFrame
# (uncomment the next line to inspect the column dtypes)
#teams_df.dtypes
teams_df.head()

Unnamed: 0,team_name,team_id,year,team_url
0,LSU,34,2024,https://www.roadtonationals.com/api/women/dash...
1,California,15,2024,https://www.roadtonationals.com/api/women/dash...
2,Utah,69,2024,https://www.roadtonationals.com/api/women/dash...
3,Florida,22,2024,https://www.roadtonationals.com/api/women/dash...
4,Stanford,61,2024,https://www.roadtonationals.com/api/women/dash...


## 1.1 Cleaning `teams_df`

This DataFrame is already pretty tidy, but the links are pointing towards the API, which is not what we want, and we still have an irrelevant column (`year`)

In [150]:
# Replace the API links with the public dashboard page for each team.
# URL shape: <base>/<year>/<team_id>
base_team_url = 'https://roadtonationals.com/results/teams/dashboard'
teams_df['team_url'] = (
    base_team_url
    + '/' + teams_df['year'].astype(str)
    + '/' + teams_df['team_id'].astype(str)
)

In [151]:
# Preview the df to confirm team_url now holds the public dashboard links
teams_df.head()

Unnamed: 0,team_name,team_id,year,team_url
0,LSU,34,2024,https://roadtonationals.com/results/teams/dash...
1,California,15,2024,https://roadtonationals.com/results/teams/dash...
2,Utah,69,2024,https://roadtonationals.com/results/teams/dash...
3,Florida,22,2024,https://roadtonationals.com/results/teams/dash...
4,Stanford,61,2024,https://roadtonationals.com/results/teams/dash...


In [152]:
# Drop the year column - it was only needed to build team_url.
# The drop is in place, so re-running this cell raises a KeyError once `year` is gone.
teams_df.drop(columns='year', inplace=True)
teams_df.head()

Unnamed: 0,team_name,team_id,team_url
0,LSU,34,https://roadtonationals.com/results/teams/dash...
1,California,15,https://roadtonationals.com/results/teams/dash...
2,Utah,69,https://roadtonationals.com/results/teams/dash...
3,Florida,22,https://roadtonationals.com/results/teams/dash...
4,Stanford,61,https://roadtonationals.com/results/teams/dash...


## 1.2 Creating a `teams` table in `gymternet.db`

I want to be fairly specific about the data types going into the database, so for me it's important to set up my table structure rather than letting it be determined by what's in the DataFrame.

In [153]:
%%sql --alias gymternet

DROP TABLE IF EXISTS teams;

-- One row per team; team_id is the primary key that the other tables reference.
CREATE TABLE teams (
    team_id TINYINT PRIMARY KEY,
    team_name TINYTEXT NOT NULL,
    team_url VARCHAR(128)
);

In [154]:
# Load the cleaned teams into the `teams` table.
# `if_exists='append'` inserts into an existing table instead of replacing it; the DataFrame index is not written.
teams_df.to_sql('teams', con=engine, if_exists='append', index=False)

89

# 2 Clean up `meets_df` and create the `meets` table

What I want in this table in my database is:

| **meet_id** | **year**      | **team_id** | **date**   |
|-------------|---------------|-------------|------------|
| 117897      | 2015          | 1           | 2015-01-09 |


etc.

The unique combination of `meet_id` and `team_id` columns should act as the primary key, the `team_id` column is a foreign key, connecting to the `teams` table.

This DataFrame is a bit more of a mess, so will require a bit more cleaning before I send to the Gymternet Database.

## 2.1 Cleaning `meets_df`

In [155]:
# Preview the raw meets_df: one row per meet on a team's schedule, with every participant listed in all_teams
meets_df.head()

Unnamed: 0,team_id,team_name,meet_id,meet_date,team_score,home,opponent,meet_desc,linked_id,jas,year,meet_url,all_teams
0,34,LSU,28977,"Fri, Jan-05-2024",196.975,H,Ohio State,,5986,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Ohio State)"
1,34,LSU,29040,"Sat, Jan-13-2024",197.15,A,"Oklahoma, UCLA, Utah",Sprouts Farmers Market Collegiate Quad,6011,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Oklahoma, UCLA, Utah)"
2,34,LSU,29098,"Fri, Jan-19-2024",198.125,H,Kentucky,,6030,,2024,https://www.roadtonationals.com/api/women/meet...,"(Kentucky, LSU)"
3,34,LSU,29215,"Fri, Jan-26-2024",197.225,A,Missouri,,6078,,2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Missouri)"
4,34,LSU,29303,"Fri, Feb-02-2024",198.475,H,Arkansas,,6111,,2024,https://www.roadtonationals.com/api/women/meet...,"(Arkansas, LSU)"


In [156]:
# Drop the columns that are not needed for the meets table:
# team_name, team_id, home, meet_desc, team_score, opponent, linked_id, jas.
# team_id is dropped here because it is re-derived later, per participant, by matching all_teams against teams_df.
meets_df.drop(columns=['team_name', 'team_id', 'home', 'meet_desc', 'team_score', 'opponent', 'linked_id', 'jas'], inplace=True)

# Preview the meets_df
meets_df.head()

Unnamed: 0,meet_id,meet_date,year,meet_url,all_teams
0,28977,"Fri, Jan-05-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Ohio State)"
1,29040,"Sat, Jan-13-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Oklahoma, UCLA, Utah)"
2,29098,"Fri, Jan-19-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(Kentucky, LSU)"
3,29215,"Fri, Jan-26-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(LSU, Missouri)"
4,29303,"Fri, Feb-02-2024",2024,https://www.roadtonationals.com/api/women/meet...,"(Arkansas, LSU)"


Now that I have the relevant columns, I have some fairly messy datatypes in this DataFrame.

To have clean data to import into my database, I want the following changes:
- `meet_id` should be an integer type
- `meet_date` should be compatible with SQL's DATE type
- `meet_url` should be rewritten as the public URLs (rather than the API endpoint URLs)

In [157]:
# Month abbreviation -> zero-padded month number.
# The date stays a plain 'YYYY-MM-DD' string so that no time component sneaks in
# when the column is imported into SQL as DATE.
months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}


def _to_iso_date(raw_date):
    """Convert a scraped date such as 'Fri, Jan-05-2024' into '2024-01-05'."""
    # Discard the weekday prefix, then split 'Jan-05-2024' into month / day / year
    parts = raw_date.split(' ')[1].split('-')
    return f'{parts[2]}-{months[parts[0]]}-{parts[1]}'


# meet_id as a proper integer
meets_df['meet_id'] = meets_df['meet_id'].astype(int)

# meet_date as an ISO-formatted string
meets_df['meet_date'] = meets_df['meet_date'].apply(_to_iso_date)

# meet_url pointing at the public meet page rather than the API
base_meet_url = 'https://roadtonationals.com/results/schedule/meet/'
meets_df['meet_url'] = base_meet_url + meets_df['meet_id'].astype(str)

# Check the resulting column types
meets_df.dtypes


meet_id       int64
meet_date    object
year          int64
meet_url     object
all_teams    object
dtype: object

Now that everything is fairly tidy, I am ready to unmerge the information in the `all_teams` column, so that each row only has one team name.

In [158]:
# Explode the all_teams column so that each of the teams in the list has its own row.
# The other columns are repeated for every participant; reset_index(drop=True) replaces the
# repeated index labels that explode leaves behind with a fresh 0..n-1 index.
meets_df = meets_df.explode('all_teams').reset_index(drop=True)

# Preview the meets_df
meets_df.head()

Unnamed: 0,meet_id,meet_date,year,meet_url,all_teams
0,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,LSU
1,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,Ohio State
2,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,LSU
3,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,Oklahoma
4,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,UCLA


I have set `team_id` as the primary key of the `teams` table in the database, so it is more appropriate to include the `team_id` value in this table as a foreign key than `team_name`. To get it, I have to merge the two DataFrames on the team name, looking up the `team_id` for the value that is currently in `all_teams` (which is currently not a particularly descriptive column name, but never fear, it will be deleted soon and no longer be a source of confusion!)

In [159]:
# Look up each participant's team_id by name.
# A left join keeps every meets row; names with no match in teams_df end up with a NaN team_id.
team_lookup = teams_df[['team_id', 'team_name']]
meets_df = (
    meets_df
    .merge(team_lookup, left_on='all_teams', right_on='team_name', how='left')
    .drop(columns=['team_name', 'all_teams'])  # only team_id is needed from here on
)

# Preview the meets_df
meets_df.head()


Unnamed: 0,meet_id,meet_date,year,meet_url,team_id
0,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,34.0
1,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,46.0
2,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,34.0
3,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,47.0
4,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,66.0


At this stage, trying to convert the `team_id` column to integer threw an error, complaining about some NaN values in the column.

In [160]:
# Find the rows whose team name had no match in teams_df (team_id is NaN after the left merge)
meets_df[meets_df['team_id'].isna()]

Unnamed: 0,meet_id,meet_date,year,meet_url,team_id
2073,27977,2023-01-20,2023,https://roadtonationals.com/results/schedule/m...,
3131,26843,2022-01-21,2022,https://roadtonationals.com/results/schedule/m...,
5813,24822,2019-04-12,2019,https://roadtonationals.com/results/schedule/m...,
7426,21326,2017-04-14,2017,https://roadtonationals.com/results/schedule/m...,
9073,20001,2016-02-12,2016,https://roadtonationals.com/results/schedule/m...,
9256,19660,2016-03-19,2016,https://roadtonationals.com/results/schedule/m...,
9266,20016,2016-03-19,2016,https://roadtonationals.com/results/schedule/m...,


This seemed curious, and upon exploring the data, it seems there is a coding error on the website or perhaps there are meets listed that did not end up taking place due to cancellations, but were still listed as 'intended meets'. Furthermore, some teams are listed as participants for (semi) finals that did not actually make it through to that round of competition. 

As such, I can feel safe dropping those `meet_ids` from the `meets_df`.

Incidentally, these are the same meet_ids that weren't able to be retrieved in the scraping process, so that solves a mystery there.

In [161]:
# Collect the meet_ids of every row whose team could not be matched to a team_id.
# Deriving the ids from the data (instead of hard-coding them) keeps this cell correct
# if the scrape is refreshed, and makes it a no-op when re-run.
unmatched_meet_ids = meets_df.loc[meets_df['team_id'].isna(), 'meet_id'].unique()

# Drop every row belonging to one of those meets, not just the unmatched row itself.
# .copy() makes the result an independent DataFrame, so later column assignments
# do not trigger a SettingWithCopyWarning.
meets_df = meets_df[~meets_df['meet_id'].isin(unmatched_meet_ids)].copy()

# Make sure there's no more NAs in the team_id column (should display an empty frame)
meets_df[meets_df['team_id'].isna()]

Unnamed: 0,meet_id,meet_date,year,meet_url,team_id


And I can now retype the `team_id` column.

In [162]:
# With the NaN rows gone, team_id can be cast from float to a proper integer
meets_df = meets_df.assign(team_id=meets_df['team_id'].astype(int))

# Preview the meets_df
meets_df.head()

Unnamed: 0,meet_id,meet_date,year,meet_url,team_id
0,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,34
1,28977,2024-01-05,2024,https://roadtonationals.com/results/schedule/m...,46
2,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,34
3,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,47
4,29040,2024-01-13,2024,https://roadtonationals.com/results/schedule/m...,66


I also found that there were some duplicated combinations of `team_id` and `meet_id`, suggesting that some teams competed in the same meet twice. 

As this is not how gymnastics meets are run (ie. teams only ever compete once per meet), I feel safe deleting these duplicates so that I have a nice clean primary key combination for my database.

In [163]:
# (meet_id, team_id) is the intended composite primary key, so each pair may appear only once
meet_team_key = ['meet_id', 'team_id']

# Keep the first occurrence of each pair and discard the repeats
meets_df.drop_duplicates(subset=meet_team_key, inplace=True)

# Confirm no duplicated pairs remain (should display an empty frame)
meets_df[meets_df.duplicated(subset=meet_team_key, keep=False)]

Unnamed: 0,meet_id,meet_date,year,meet_url,team_id


## 2.2 Creating a `meets` table for the `gymternet` database. 

As with the `teams` table, my preference is for setting the structure of the table before importing the data.

In [164]:
%%sql --alias gymternet

DROP TABLE IF EXISTS meets;

-- One row per (meet, participating team); the pair forms the composite primary key.
-- NOTE(review): SMALLINT would be too narrow for ids above 32767 on engines that
-- enforce integer widths - consider INT, as used for gymnast_id; confirm.
CREATE TABLE meets (
    meet_id SMALLINT NOT NULL,
    team_id TINYINT NOT NULL,
    year SMALLINT NOT NULL,
    meet_date DATE,
    meet_url VARCHAR(128),
    PRIMARY KEY (meet_id, team_id),
    FOREIGN KEY (team_id) REFERENCES teams (team_id)
);


In [165]:
# Import the data from meets_df into the `meets` table.
# `if_exists='append'` inserts into the table defined above rather than replacing it.
meets_df.to_sql('meets', con=engine, if_exists='append', index=False)

10273

# 3 Clean up `team_results_data_df` and create `team_results` table

What I want in this table in my database is:

| **meet_id** | **team_id** | **vt_score** | **ub_score** | **bb_score** | **fx_score** | **team_score** |
|-------------|-------------|--------------|--------------|--------------|--------------|----------------|
| 117897      | 68          | 48.89725     | 48.89725     | 48.89725     | 48.89725     | 195.589        |


The unique combination of `meet_id` and `team_id` columns should act as the primary key, and both should also be foreign keys, connecting to the `meets` table and `teams` table respectively.

## 3.1 Cleaning `team_results_data_df`

In [166]:
# Reload the raw team results from the pickle so this section starts from the uncleaned frame
team_results_data_df = pd.read_pickle('../data/raw/dirty_dfs/team_results_data_df.pkl')

# Preview the team_results_data_df
team_results_data_df.head()

Unnamed: 0,mid,tid,tname,vault,bars,beam,floor,tscore,year,home,lead
0,28977,34,LSU,49.375,49.375,48.7,49.525,196.975,2024,H,0.0
1,28978,46,Ohio State,49.3,49.125,49.05,49.3,196.775,2024,A,0.2
2,29039,47,Oklahoma,49.45,49.45,49.525,49.475,197.9,2024,A,0.0
3,29040,34,LSU,49.225,49.65,48.75,49.525,197.15,2024,A,0.75
4,29042,66,UCLA,49.4,49.25,49.25,49.2,197.1,2024,A,0.8


This DataFrame is in better shape than I remember, so it should be a fairly straightforward exercise of removing unwanted columns, making sure our types are correct, changing column names and ensuring we don't have any duplicated rows.

### 3.1.1 Remove unwanted columns

In [167]:
# Delete the tname, year, home and lead columns from the team_results_data_df;
# only the ids (mid, tid) and the score columns are kept.
team_results_data_df.drop(columns=['tname', 'year', 'home', 'lead'], inplace=True)

# Preview the team_results_data_df
team_results_data_df.head()

Unnamed: 0,mid,tid,vault,bars,beam,floor,tscore
0,28977,34,49.375,49.375,48.7,49.525,196.975
1,28978,46,49.3,49.125,49.05,49.3,196.775
2,29039,47,49.45,49.45,49.525,49.475,197.9
3,29040,34,49.225,49.65,48.75,49.525,197.15
4,29042,66,49.4,49.25,49.25,49.2,197.1


### 3.1.2 Retype incorrectly typed columns

In [168]:
# Check the column dtypes before retyping
team_results_data_df.dtypes

mid       object
tid       object
vault     object
bars      object
beam      object
floor     object
tscore    object
dtype: object

In [169]:
# Discard rows with any missing value before casting
team_results_data_df.dropna(inplace=True)

# Identifier columns become integers
for id_col in ('mid', 'tid'):
    team_results_data_df[id_col] = team_results_data_df[id_col].astype(int)

# Score columns become floats, rounded to four decimal places
for score_col in ('vault', 'bars', 'beam', 'floor', 'tscore'):
    team_results_data_df[score_col] = team_results_data_df[score_col].astype(float).round(4)

# Confirm the new column types
team_results_data_df.dtypes

mid         int64
tid         int64
vault     float64
bars      float64
beam      float64
floor     float64
tscore    float64
dtype: object

### 3.1.3 Renaming columns

In [170]:
# Map the scraped column names onto the names used in the database table
team_results_column_names = {
    'mid': 'meet_id',
    'tid': 'team_id',
    'vault': 'vt_score',
    'bars': 'ub_score',
    'beam': 'bb_score',
    'floor': 'fx_score',
    'tscore': 'team_score',
}
team_results_data_df.rename(columns=team_results_column_names, inplace=True)

# Preview the renamed frame - looks lovely :,)
team_results_data_df.head()

Unnamed: 0,meet_id,team_id,vt_score,ub_score,bb_score,fx_score,team_score
0,28977,34,49.375,49.375,48.7,49.525,196.975
1,28978,46,49.3,49.125,49.05,49.3,196.775
2,29039,47,49.45,49.45,49.525,49.475,197.9
3,29040,34,49.225,49.65,48.75,49.525,197.15
4,29042,66,49.4,49.25,49.25,49.2,197.1


### 3.1.4 Checking for, and removing, duplicates

In [171]:
# (meet_id, team_id) is the intended composite primary key, so each pair may appear only once
result_key = ['meet_id', 'team_id']

# Keep the first occurrence of each pair and discard the repeats
team_results_data_df.drop_duplicates(subset=result_key, inplace=True)

# Confirm no duplicated pairs remain (should display an empty frame)
team_results_data_df[team_results_data_df.duplicated(subset=result_key, keep=False)]

Unnamed: 0,meet_id,team_id,vt_score,ub_score,bb_score,fx_score,team_score


## 3.2 Creating a `team_results` table for the `gymternet` database. 

You know the drill - I'll set up the table structure before importing the data from the DataFrame.

In [172]:
%%sql --alias gymternet

DROP TABLE IF EXISTS team_results;

-- One row per team per meet, holding the four event scores and the team total.
CREATE TABLE team_results (
    meet_id SMALLINT NOT NULL,
    team_id TINYINT NOT NULL,
    vt_score DECIMAL(6,4),
    ub_score DECIMAL(6,4),
    bb_score DECIMAL(6,4),
    fx_score DECIMAL(6,4),
    team_score DECIMAL(7,4) NOT NULL,
    PRIMARY KEY (meet_id, team_id),
    FOREIGN KEY (team_id) REFERENCES teams (team_id),
    -- A foreign key must reference a PRIMARY KEY or UNIQUE column set. meet_id alone is not
    -- unique in meets (its primary key is the pair), so the whole pair is referenced here.
    FOREIGN KEY (meet_id, team_id) REFERENCES meets (meet_id, team_id)
);

In [173]:
# Import the data from team_results_data_df into the `team_results` table.
# `if_exists='append'` inserts into the table defined above rather than replacing it.
team_results_data_df.to_sql('team_results', con=engine, if_exists='append', index=False)

10082

# 4 Clean up `gymnasts_data_df` and create `gymnasts` table

What I want in this table in my database is:

| **gymnast_id** | **first_name** | **last_name** | **team_id** |
|----------------|----------------|---------------|-------------|
| 22437          | "Julia"        | "Sebben"      | 1           | 

etc.

The `gymnast_id` column should act as the primary key, and the `team_id` column should be a foreign key, connecting to the `teams` table.

## 4.1 Cleaning `gymnasts_data_df`

In [188]:
# Reload the raw gymnasts data from the pickle so this section starts from the uncleaned frame
gymnasts_data_df = pd.read_pickle('../data/raw/dirty_dfs/gymnasts_data_df.pkl')
# TODO: Remove the above line once we're finished cleaning the data
# Preview the gymnasts_data_df
gymnasts_data_df.head()

Unnamed: 0,gid,first_name,last_name,vault,bars,beam,floor,all_around,team_name,team_id,yr,vt_url,ub_url,bb_url,fx_url,meet_id
0,30950,Sierra,Ballard,,,9.2,9.9,,LSU,34,2024,,,,,28977
1,30952,Haleigh,Bryant,9.95,9.875,9.925,9.925,39.675,LSU,34,2024,,,,,28977
2,31947,Ashley,Cowan,,9.8,,,,LSU,34,2024,,,,,28977
3,32453,Amari,Drayton,9.925,,,9.925,,LSU,34,2024,,,,,28977
4,30953,Olivia,Dunne,,,,9.875,,LSU,34,2024,,,,,28977


Again, this should be fairly simple to clean:
1. Remove unwanted columns
2. Rename columns
3. Remove duplicates
4. Retype columns as necessary

### 4.1.1 Remove unwanted columns

In [189]:
# Remove unwanted columns from the gymnasts_data_df: the scores, video URLs, season and meet
# details are dropped, leaving only gid, first_name, last_name and team_id.
gymnasts_data_df.drop(columns=['vault', 'bars', 'beam', 'floor', 'all_around', 'team_name', 'yr', 'vt_url', 'ub_url', 'bb_url', 'fx_url', 'meet_id'], inplace=True)

# Preview the gymnasts_data_df
gymnasts_data_df.head()

Unnamed: 0,gid,first_name,last_name,team_id
0,30950,Sierra,Ballard,34
1,30952,Haleigh,Bryant,34
2,31947,Ashley,Cowan,34
3,32453,Amari,Drayton,34
4,30953,Olivia,Dunne,34


### 4.1.2 Rename columns

In [190]:
# Rename gid to gymnast_id to match the column name used in the database table
gymnasts_data_df.rename(columns={'gid': 'gymnast_id'}, inplace=True)

# Preview the gymnasts_data_df
gymnasts_data_df.head()

Unnamed: 0,gymnast_id,first_name,last_name,team_id
0,30950,Sierra,Ballard,34
1,30952,Haleigh,Bryant,34
2,31947,Ashley,Cowan,34
3,32453,Amari,Drayton,34
4,30953,Olivia,Dunne,34


### 4.1.3 Remove duplicates

In [191]:
# Remove duplicates (by gymnast_id): the raw frame has one row per gymnast per meet,
# but the gymnasts table needs one row per gymnast. The first row seen for each id is kept.
# NOTE(review): a gymnast who appears under more than one team_id keeps only the first
# team encountered - confirm that is the desired behaviour.
gymnasts_data_df.drop_duplicates(subset=['gymnast_id'], inplace=True)

# Check how many unique gymnasts remain (rows, columns)
gymnasts_data_df.shape

(4825, 4)

### 4.1.4 Retype columns

In [192]:
# Check the data types before retyping
gymnasts_data_df.dtypes

gymnast_id    object
first_name    object
last_name     object
team_id       object
dtype: object

In [193]:
# Check for any rows containing an NA value in any column
gymnasts_data_df[gymnasts_data_df.isna().any(axis=1)]

# None, woohoo!

Unnamed: 0,gymnast_id,first_name,last_name,team_id


In [196]:
# Both identifier columns were read in as objects; cast each one to int
for id_col in ('gymnast_id', 'team_id'):
    gymnasts_data_df[id_col] = gymnasts_data_df[id_col].astype(int)

# Verify the data types
gymnasts_data_df.dtypes

gymnast_id     int64
first_name    object
last_name     object
team_id        int64
dtype: object

## 4.2 Creating a `gymnasts` table for the `gymternet` database. 

You know the drill - I'll set up the table structure before importing the data from the DataFrame.

In [197]:
%%sql --alias gymternet

DROP TABLE IF EXISTS gymnasts;

-- One row per gymnast; team_id links each gymnast to a row in teams.
CREATE TABLE gymnasts (
    gymnast_id INT NOT NULL PRIMARY KEY,
    team_id TINYINT NOT NULL,
    first_name TINYTEXT NOT NULL,
    last_name TINYTEXT NOT NULL,
    FOREIGN KEY (team_id) REFERENCES teams (team_id)
);

In [198]:
# Import the data from gymnasts_data_df into the `gymnasts` table.
# `if_exists='append'` inserts into the table defined above rather than replacing it.
gymnasts_data_df.to_sql('gymnasts', con=engine, if_exists='append', index=False)

4825

# 5 Clean up `gymnast_results_data_df` and create `gymnast_results` table

What I want in this table in my database is:

| **meet_id** | **gymnast_id** | **vt_score** | **ub_score** | **bb_score** | **fx_score** | **aa_score** |
|-------------|----------------|--------------|--------------|--------------|--------------|--------------|
| 17903       | 22437          | 9.4750       | NULL         | 9.575        | NULL         | NULL         |
| 17903       | 22435          | 9.6250       | 9.7000       | 9.675        | 9.7000       | 38.7000      |

etc.

The combination of the `gymnast_id` and `meet_id` columns should act as the primary key, and the `meet_id` column should be a foreign key, connecting to the `meets` table.

## 5.1 Cleaning `gymnast_results_data_df`