### Import Dependencies

In [1]:
import pandas as pd
from sqlalchemy import create_engine

### Extract CSVs into DataFrames 

In [2]:
athletes_file = "Resources/athlete_events.csv"
athletes_df = pd.read_csv(athletes_file)
athletes_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [3]:
cost_per_file = "Resources/Cost_per_ event_and_cost_ per_athlete_in_the_Olympics.csv"
cost_per_df = pd.read_csv(cost_per_file)
cost_per_df.head()

Unnamed: 0,Games,Year,Country,Type,"Cost per event, mio. USD","Cost per athlete, mio. USD","event,"
0,Tokyo,1964,Japan,Summer,1.7,0.1,
1,Munich,1972,Germany,Summer,5.2,0.1,
2,Montreal,1976,Canada,Summer,30.8,1.0,
3,Moscow,1980,Soviet Union,Summer,31.2,1.2,
4,Los Angeles,1984,United States,Summer,3.3,0.1,


In [4]:
total_cost_file = "Resources/Costs_ of_ the_ Olympic_ Games.csv"
total_cost_df = pd.read_csv(total_cost_file)
total_cost_df.head()

Unnamed: 0,Games,Year,Country,Type,Events,Athletes,"Cost, Billion USD"
0,Rome,1960,Italy,Summer,150,5338,
1,Tokyo,1964,Japan,Summer,163,5152,0.282
2,Mexico City,1968,Mexico,Summer,172,5516,n/a**
3,Munich,1972,Germany,Summer,195,7234,1.009
4,Montreal,1976,Canada,Summer,198,6048,6.093


### Transform athletes DataFrames

In [5]:
# Create a filtered dataframe from specific columns

new_athletes_df = athletes_df[["Name", "City", "Games", "Year", "Season", "Sport", "Medal"]]
new_athletes_df.copy()

# Rename the column headers
new_athletes_df = new_athletes_df.rename(columns={"Season": "Type"})
new_athletes_df["Id"] = new_athletes_df["Year"].astype(str) + " " + new_athletes_df["City"]

# Clean the data by dropping duplicates and setting the index
new_athletes_df.drop_duplicates("Id", inplace=True)
new_athletes_df.set_index("Id", inplace=True)

new_athletes_df.head()

Unnamed: 0_level_0,Name,City,Games,Year,Type,Sport,Medal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1992 Barcelona,A Dijiang,Barcelona,1992 Summer,1992,Summer,Basketball,
2012 London,A Lamusi,London,2012 Summer,2012,Summer,Judo,
1920 Antwerpen,Gunnar Nielsen Aaby,Antwerpen,1920 Summer,1920,Summer,Football,
1900 Paris,Edgar Lindenau Aabye,Paris,1900 Summer,1900,Summer,Tug-Of-War,Gold
1988 Calgary,Christine Jacoba Aaftink,Calgary,1988 Winter,1988,Winter,Speed Skating,


In [6]:
# do we want to drop rows with empty columns?

### Transform cost_per DataFrames

In [7]:
# Create a filtered dataframe from specific columns
cost_per_cols = ["Games", "Country", "Year", "Type", "Cost per event, mio. USD", "Cost per athlete, mio. USD"]
new_cost_per_df= cost_per_df[cost_per_cols].copy()

# Rename the column headers
new_cost_per_df = new_cost_per_df.rename(columns={"Games": "City"})
new_cost_per_df["Id"] = new_cost_per_df["Year"].astype(str) + " " + new_cost_per_df["City"]

# Clean the data by dropping duplicates and setting the index
new_cost_per_df.drop_duplicates("Id", inplace=True)
new_cost_per_df.set_index("Id", inplace=True)

new_cost_per_df.head()

Unnamed: 0_level_0,City,Country,Year,Type,"Cost per event, mio. USD","Cost per athlete, mio. USD"
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1964 Tokyo,Tokyo,Japan,1964,Summer,1.7,0.1
1972 Munich,Munich,Germany,1972,Summer,5.2,0.1
1976 Montreal,Montreal,Canada,1976,Summer,30.8,1.0
1980 Moscow,Moscow,Soviet Union,1980,Summer,31.2,1.2
1984 Los Angeles,Los Angeles,United States,1984,Summer,3.3,0.1


### Transform total_cost DataFrames

In [8]:
# Create a filtered dataframe from specific columns
total_cost_cols = ["Games", "Country", "Year", "Type", "Events", "Athletes", "Cost, Billion USD"]
new_total_cost_df= total_cost_df[total_cost_cols].copy()

# Rename the column headers
new_total_cost_df = total_cost_df.rename(columns={"Games": "City"})
new_total_cost_df["Id"] = new_total_cost_df["Year"].astype(str) + " " + new_total_cost_df["City"]

# Clean the data by dropping duplicates and setting the index
new_total_cost_df["Cost, Billion USD"].replace({"n/a**": "NaN"}, inplace=True)

new_total_cost_df.drop_duplicates("Id", inplace=True)
new_total_cost_df.set_index("Id", inplace=True)


new_total_cost_df.head()

Unnamed: 0_level_0,City,Year,Country,Type,Events,Athletes,"Cost, Billion USD"
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1960 Rome,Rome,1960,Italy,Summer,150,5338,
1964 Tokyo,Tokyo,1964,Japan,Summer,163,5152,0.282
1968 Mexico City,Mexico City,1968,Mexico,Summer,172,5516,
1972 Munich,Munich,1972,Germany,Summer,195,7234,1.009
1976 Montreal,Montreal,1976,Canada,Summer,198,6048,6.093
