In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [26]:
# Load the two input tables from the local data directory
athletes = pd.read_csv("data/athletes.csv")
counts = pd.read_csv("data/counts.csv")

# `olympics` is the table that the following cells extend and that is
# written out at the end. It starts as `counts` without the 'Rank' column,
# with rows ordered by 'NOC' and then 'Year'.
olympics = counts.drop(columns=['Rank']).sort_values(by=['NOC', 'Year'])

In [27]:
# Feature 1: '#Athletes' = number of rows in `athletes` per (NOC, Year).
# NOTE(review): size() counts rows, so if `athletes` holds one row per
# athlete-event entry this is the number of entries rather than the number
# of distinct athletes -- confirm against the data.
# The left join keeps every row of `olympics`; (NOC, Year) pairs that have
# no rows in `athletes` end up with NaN in the new column.
olympics = pd.merge(
    olympics,
    athletes.groupby(['NOC', 'Year']).size().rename('#Athletes').reset_index(),
    how='left',
    on=['NOC', 'Year'],
)

In [28]:
# Feature 2: '#Events' = number of distinct 'Event' values in `athletes`
# per (NOC, Year).
# The left join keeps every row of `olympics`; (NOC, Year) pairs that have
# no rows in `athletes` end up with NaN in the new column.
olympics = pd.merge(
    olympics,
    athletes.groupby(['NOC', 'Year'])['Event'].nunique().rename('#Events').reset_index(),
    how='left',
    on=['NOC', 'Year'],
)

In [29]:
# Feature 3: Dummy variable for host country
# The years in which each selected NOC hosted the Summer Olympics,
# manually constructed from the data summerOly_host.csv.
#
# NOTE(review): the FRA entry previously read [1900, 1924, 1968, 1992].
# 1968 (Grenoble) and 1992 (Albertville) were Winter Games; the Summer Games
# of those years were held in Mexico City and Barcelona. Paris 2024 was
# missing. Verify the corrected list against summerOly_host.csv.
host_years = {
    'USA': [1904, 1932, 1984, 1996, 2028],
    'CHN': [2008],
    'JPN': [1964, 2020],
    'AUS': [1956, 2000],
    'FRA': [1900, 1924, 2024],
}

# Flatten the mapping into a set of (NOC, Year) pairs so that each row needs
# a single membership test, and so that an NOC absent from `host_years` is
# simply "not a host" instead of raising KeyError.
host_pairs = {
    (noc, year)
    for noc, years in host_years.items()
    for year in years
}

# Create the column 'IsHost': 1 if the NOC hosted the Games that year, else 0
olympics['IsHost'] = [
    int((noc, year) in host_pairs)
    for noc, year in zip(olympics['NOC'], olympics['Year'])
]

In [30]:
# Write the assembled feature table to disk, omitting the row index
olympics.to_csv(path_or_buf="data/olympics.csv", index=False)