In [36]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display, HTML
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [37]:
def categorize_age(age):
    if (age<18):
        return 1
    elif (age<20):
        return 2
    elif (age<22):
        return 3
    elif (age<24):
        return 4
    elif (age<26):
        return 5
    elif (age<30):
        return 6
    elif (age<34):
        return 7
    elif (age<38):
        return 8
    elif (age<42):
        return 9
    elif (age<46):
        return 10
    else:
        return 11

def categorize_gender(gender):
    if gender == 'M':
        return 0
    else:
        return 1

In [38]:
num_bins = 10
label_names = list(range(1, num_bins + 1))

athlete_df = pd.read_csv('../data/athlete_data/athlete_events.csv')
summer_olympics_df = athlete_df.loc[athlete_df.Season == 'Summer']

noc_id_df = pd.read_csv('../data/athlete_data/NOC_Conversion.csv')

summer_olympics_df = pd.merge(summer_olympics_df, noc_id_df, on='NOC', suffixes=['','_y'])

# Filter out only events from 2016
olympics_2016_events = athlete_df.loc[athlete_df.Year == 2016]['Event'].unique()
summer_olympics_df = summer_olympics_df.loc[summer_olympics_df.Event.isin(olympics_2016_events)]

# Filter Events that have not been represented in at least 5 olympic games to eliminate newer events
summer_olympics_df['olympics_count'] = summer_olympics_df.groupby(['Event'])['Year'].transform('nunique')
summer_olympics_df = summer_olympics_df.loc[summer_olympics_df.olympics_count >= 5]

# Add columns
# add num competitors
summer_olympics_df['Competitor Count'] = summer_olympics_df.groupby(['Event', 'Year'])['ID'].transform('nunique')

# convert winning medal to whether athlete placed
summer_olympics_df['Placed'] = np.where(summer_olympics_df.Medal.notna(), 1, 0)

# add column for previously placed
previous_placed_df = summer_olympics_df.loc[summer_olympics_df.Placed == 1]
previous_placed_df = pd.merge(summer_olympics_df, previous_placed_df, on='ID', how='left', suffixes=('', '_y'))
previous_placed_df = previous_placed_df.loc[(previous_placed_df.Name_y.notna()) &
                                             (previous_placed_df.Year > previous_placed_df.Year_y)]
previous_placed_df['Previously Placed'] = 1

summer_olympics_df = pd.merge(summer_olympics_df, previous_placed_df, on=['ID', 'Year', 'Event'], how='left', suffixes=('', '_y'))

summer_olympics_df['Country Medals in Event'] = summer_olympics_df.groupby(['NOCID', 'Sport'])['Placed'].transform('sum')

summer_olympics_df = summer_olympics_df[['ID', 'Year', 'Age', 'Height', 'Weight', 'NOCID', 'Event', 'Previously Placed', 'Sex', 'Placed']]
summer_olympics_df = summer_olympics_df.drop_duplicates()
summer_olympics_df = summer_olympics_df.drop('ID', axis = 1)
summer_olympics_df['Age'] = summer_olympics_df['Age'].apply(categorize_age)
summer_olympics_df['Sex'] = summer_olympics_df['Sex'].apply(categorize_gender)
summer_olympics_df['Height'] = pd.cut(summer_olympics_df['Height'], bins = num_bins, labels = label_names, right=False, include_lowest=True)
summer_olympics_df['Weight'] = pd.cut(summer_olympics_df['Weight'], bins = num_bins, labels = label_names, right=False, include_lowest=True)
summer_olympics_df['Previously Placed'] = summer_olympics_df['Previously Placed'].fillna(0)

In [39]:
event_list = []

In [40]:
def calculate_variable_weight(Event):

    result_list = []

    summer_event_df = summer_olympics_df.loc[summer_olympics_df.Event == Event].copy()
    result_list.append(Event)

    event_df = summer_event_df.drop('Event', axis=1)
    event_df = event_df.drop('Year', axis=1)
    event_df = event_df.drop('Sex', axis=1)
    event_df.dropna(inplace=True)

    feature_names = event_df.columns

    X = event_df.iloc[:, :-1].values
    y = event_df.iloc[:, -1].values
    # labelencoder_X = LabelEncoder()
    # X[:,3] = labelencoder_X.fit_transform(X[:,3])



    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    clf.score(X_test, y_test)

    rf = RandomForestClassifier(n_estimators=200)
    rf = rf.fit(X_train, y_train)

    result_list.append(rf.score(X_test, y_test))
    result_list.append(rf.feature_importances_[0])
    result_list.append(rf.feature_importances_[1])
    result_list.append(rf.feature_importances_[2])
    result_list.append(rf.feature_importances_[3])
    result_list.append(rf.feature_importances_[4])

    return result_list

# import multiprocessing
#
# PROCESSES = 2
# with multiprocessing.Pool(PROCESSES) as pool:
#     pool.map(calculate_variable_weight,summer_olympics_df['Event'].unique()[0:10])

In [41]:
event_list.clear()

for event in summer_olympics_df['Event'].unique():
    print(event)
    try:
        event_list.append(calculate_variable_weight(event))
    except:
        print('Error occurred. Skipping')

# i = 0
# event_list.clear()
#
# for event in summer_olympics_df['Event'].unique():
#     print(event)
#     calculateVariableWeight(event)
#
#     i += 1
#     if i > 10:
#         break


Basketball Men's Basketball
Judo Men's Extra-Lightweight
Boxing Men's Middleweight
Swimming Women's 200 metres Freestyle
Swimming Women's 4 x 200 metres Freestyle Relay
Hockey Men's Hockey
Judo Men's Middleweight
Basketball Women's Basketball
Wrestling Men's Heavyweight, Greco-Roman
Archery Men's Individual
Swimming Women's 200 metres Backstroke
Boxing Men's Light-Heavyweight
Triathlon Men's Olympic Distance
Football Women's Football
Rhythmic Gymnastics Women's Individual
Athletics Women's 10,000 metres
Badminton Men's Singles
Football Men's Football
Rhythmic Gymnastics Women's Group
Wrestling Men's Light-Heavyweight, Freestyle
Gymnastics Women's Individual All-Around
Gymnastics Women's Team All-Around
Gymnastics Women's Horse Vault
Gymnastics Women's Uneven Bars
Gymnastics Women's Balance Beam
Athletics Men's Hammer Throw
Volleyball Men's Volleyball
Athletics Women's Shot Put
Athletics Women's 400 metres
Swimming Women's 100 metres Butterfly
Swimming Women's 4 x 100 metres Medley Rela

In [42]:
weight_variable_df = pd.DataFrame(event_list, columns=['Event', 'Fit', 'Age', 'Height', 'Weight', 'Country', 'Previously Placed'])
display(weight_variable_df)
weight_variable_df.to_csv('../data/results/variable_weight.csv', index=False)

Unnamed: 0,Event,Fit,Age,Height,Weight,Country,Previously Placed
0,Basketball Men's Basketball,0.872458,0.109819,0.072301,0.048216,0.740779,0.028884
1,Judo Men's Extra-Lightweight,0.819444,0.163160,0.052844,0.050919,0.608196,0.124881
2,Boxing Men's Middleweight,0.776471,0.251891,0.077406,0.011183,0.635506,0.024014
3,Swimming Women's 200 metres Freestyle,0.934579,0.231366,0.151308,0.085466,0.468306,0.063553
4,Swimming Women's 4 x 200 metres Freestyle Relay,0.863636,0.128944,0.068577,0.046523,0.695167,0.060790
...,...,...,...,...,...,...,...
267,"Athletics Men's 3,000 metres Steeplechase",0.905405,0.200492,0.082794,0.046875,0.608709,0.061130
268,Taekwondo Men's Featherweight,0.789474,0.126564,0.078420,0.023045,0.588683,0.183288
269,"Equestrianism Mixed Three-Day Event, Team",0.785714,0.256217,0.109741,0.049595,0.525667,0.058781
270,Athletics Men's 4 x 400 metres Relay,0.827684,0.154967,0.082529,0.038355,0.688959,0.035191


In [43]:
results_list = []

In [51]:
def predict_event_winner(Event):

    summer_event_df = summer_olympics_df.loc[summer_olympics_df.Event == Event].copy()

    event_df = summer_event_df.drop('Event', axis=1)
    event_df = event_df.drop('Sex', axis=1)
    event_df = event_df.loc[event_df.Year < 2016]
    event_df = event_df.drop('Year', axis = 1)
    event_df.dropna(inplace=True)

    feature_names = event_df.columns


    X = event_df.iloc[:, :-1].values
    y = event_df.iloc[:, -1].values
    labelencoder_X = LabelEncoder()
    X[:,3] = labelencoder_X.fit_transform(X[:,3])


    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)


    model = Sequential()

    model.add(Dense(50, activation='relu', input_dim=X_train.shape[1]))

    model.add(Dense(40, activation='relu'))
    model.add(Dense(40, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(20, activation='relu'))


    model.add(Dense(y_train.shape[1],activation = 'softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


    X_train=np.asarray(X_train).astype(np.int)
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)

    model.fit(X_train,
              y_train,
              epochs=10000,
              shuffle=True,
              callbacks=[callback],
              verbose=0)



    X_test = np.asarray(X_test).astype(np.int)


    summer_2016_df = summer_event_df.loc[summer_olympics_df.Year == 2016].dropna()
    for index, row in summer_2016_df.iterrows():
        competitor_array = np.asarray([[row['Age'], row['Height'], row['Weight'], row['NOCID'], row['Previously Placed']]]).astype(int)
        competitor = competitor_array.tolist()[0]
        competitor.append(Event)
        competitor.append(model.predict(competitor_array)[0][0])
        competitor.append(row['Placed'])
        results_list.append(competitor)

    # summer_2016_df['Predicted Placing'] = summer_2016_df.apply(lambda x: model.predict_classes(summer_event_df[['Age', 'Height', 'Weight', 'NOCID', 'Competitor Count', 'Previously Placed']]))

In [52]:
results_list.clear()

for event in summer_olympics_df['Event'].unique():
    print(event)
    predict_event_winner(event)

Basketball Men's Basketball
Judo Men's Extra-Lightweight
Boxing Men's Middleweight
Swimming Women's 200 metres Freestyle
Swimming Women's 4 x 200 metres Freestyle Relay
Hockey Men's Hockey
Judo Men's Middleweight
Basketball Women's Basketball
Wrestling Men's Heavyweight, Greco-Roman
Archery Men's Individual
Swimming Women's 200 metres Backstroke
Boxing Men's Light-Heavyweight
Triathlon Men's Olympic Distance
Football Women's Football
Rhythmic Gymnastics Women's Individual
Athletics Women's 10,000 metres
Badminton Men's Singles
Football Men's Football
Rhythmic Gymnastics Women's Group
Wrestling Men's Light-Heavyweight, Freestyle
Gymnastics Women's Individual All-Around
Gymnastics Women's Team All-Around
Gymnastics Women's Horse Vault
Gymnastics Women's Uneven Bars
Gymnastics Women's Balance Beam
Athletics Men's Hammer Throw
Volleyball Men's Volleyball
Athletics Women's Shot Put
Athletics Women's 400 metres
Swimming Women's 100 metres Butterfly
Swimming Women's 4 x 100 metres Medley Rela

In [55]:
print(results_list)

results_df = pd.DataFrame(results_list, columns=['Age', 'Height', 'Weight', 'NCOID', 'Previously Placed', 'Event', 'Prediction', 'Placed'])
results_df.to_csv('../data/results/placement_predictions.csv', index=False)

results_df['pred_rank'] = results_df.groupby('Event').Prediction.rank('max', ascending=False)
results_df.loc[(results_df.pred_rank < 3) & (results_df.Placed == 1)]

[[4, 8, 4, 73, 0, "Basketball Men's Basketball", 0.98874, 0], [4, 7, 4, 73, 0, "Basketball Men's Basketball", 0.9912674, 0], [6, 7, 5, 73, 0, "Basketball Men's Basketball", 0.9922375, 0], [5, 10, 5, 73, 0, "Basketball Men's Basketball", 0.97871953, 0], [5, 7, 4, 73, 0, "Basketball Men's Basketball", 0.9932359, 0], [4, 9, 4, 73, 0, "Basketball Men's Basketball", 0.9856302, 0], [6, 9, 5, 73, 0, "Basketball Men's Basketball", 0.9875207, 0], [4, 8, 4, 73, 0, "Basketball Men's Basketball", 0.98874, 0], [3, 6, 3, 73, 0, "Basketball Men's Basketball", 0.9943151, 0], [6, 8, 5, 73, 0, "Basketball Men's Basketball", 0.99009526, 0], [3, 10, 4, 73, 0, "Basketball Men's Basketball", 0.97732985, 0], [3, 8, 5, 73, 0, "Basketball Men's Basketball", 0.9817172, 0], [6, 8, 5, 5, 0, "Basketball Men's Basketball", 0.9879903, 0], [6, 7, 4, 5, 0, "Basketball Men's Basketball", 0.9381754, 0], [8, 8, 5, 5, 0, "Basketball Men's Basketball", 0.99515826, 0], [6, 7, 4, 5, 0, "Basketball Men's Basketball", 0.938175

Unnamed: 0,Age,Height,Weight,NCOID,Previously Placed,Event,Prediction,Placed,pred_rank
190,3,6,3,18,0,Boxing Men's Middleweight,0.999824,1,2.0
1517,6,6,4,4,1,"Wrestling Men's Light-Heavyweight, Freestyle",0.9996,1,1.0
1594,3,2,1,73,0,Gymnastics Women's Team All-Around,0.999917,1,2.0
3175,6,6,3,3,1,"Shooting Men's Small-Bore Rifle, Three Positio...",0.999981,1,1.0
3633,7,5,3,2,0,Weightlifting Women's Heavyweight,0.999915,1,1.0
3895,6,5,3,11,1,Diving Men's Synchronized Platform,0.999999,1,1.0
3943,4,5,3,11,0,Diving Men's Synchronized Springboard,0.939703,1,2.0
4151,3,5,2,73,0,Synchronized Swimming Women's Team,0.986732,1,2.0
5817,7,5,2,5,0,Sailing Women's Windsurfer,1.0,1,1.0
6127,4,4,3,168,0,Weightlifting Women's Middleweight,0.99997,1,1.0
