In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import numpy as np
from pathlib import Path

In [2]:
olympic_csv = Path('../Model Data/Olympic Medal Training.csv')

In [3]:
#  Import and read the Olympic CSV.
olympic_model_df = pd.read_csv(olympic_csv, encoding="utf-8", low_memory = True)
olympic_model_df.head()

Unnamed: 0.1,Unnamed: 0,athlete_id,name,sex,born,height,weight,country,edition,edition_id,sport,event,pos,medal,isTeamSport
0,0,65649,Ivanka Bonova,Female,4-Apr-49,166.0,55.0,Bulgaria,1976 Summer Olympics,19,Athletics,"4 × 400 metres Relay, Women",5 h2 r1/2,0,True
1,1,65649,Ivanka Bonova,Female,4-Apr-49,166.0,55.0,Bulgaria,1980 Summer Olympics,20,Athletics,"4 × 400 metres Relay, Women",AC r2/2,0,True
2,2,112510,Nataliya Uryadova,Female,15-Mar-77,184.0,70.0,Russian Federation,2008 Summer Olympics,53,Beach Volleyball,"Beach Volleyball, Women",19,0,True
3,3,114973,Essa Ismail Rashed,Male,14-Dec-86,165.0,55.0,Qatar,2008 Summer Olympics,53,Athletics,"10,000 metres, Men",20,0,False
4,4,30359,Péter Boros,Male,12-Jan-08,,,Hungary,1932 Summer Olympics,10,Artistic Gymnastics,"Individual All-Around, Men",19,0,False


## Prepare Preprocessed data to be scaled and trained

In [4]:
olympic_model_clean = olympic_model_df.drop(["Unnamed: 0", "athlete_id", "name", "edition", "edition_id", "born"], axis=1)
olympic_model_clean

Unnamed: 0,sex,height,weight,country,sport,event,pos,medal,isTeamSport
0,Female,166.0,55,Bulgaria,Athletics,"4 × 400 metres Relay, Women",5 h2 r1/2,0,True
1,Female,166.0,55,Bulgaria,Athletics,"4 × 400 metres Relay, Women",AC r2/2,0,True
2,Female,184.0,70,Russian Federation,Beach Volleyball,"Beach Volleyball, Women",19,0,True
3,Male,165.0,55,Qatar,Athletics,"10,000 metres, Men",20,0,False
4,Male,,,Hungary,Artistic Gymnastics,"Individual All-Around, Men",19,0,False
...,...,...,...,...,...,...,...,...,...
316822,Male,188.0,79,Nigeria,Athletics,"4 × 400 metres Relay, Men",5,0,True
316823,Male,188.0,79,Nigeria,Athletics,"400 metres, Men",5 h2 r3/4,0,False
316824,Male,188.0,79,Nigeria,Athletics,"4 × 400 metres Relay, Men",AC h2 r2/3,0,True
316825,Male,188.0,79,Nigeria,Athletics,"400 metres, Men",7 h4 r2/4,0,False


In [5]:
# Looking at columns unique count
olympic_model_clean.nunique()

sex               2
height           95
weight          576
country         707
sport           112
event           964
pos            2282
medal             2
isTeamSport       2
dtype: int64

In [14]:
# finding top25
unique_df = pd.DataFrame(olympic_model_clean['pos'].value_counts())
unique_df.head(25)



Unnamed: 0_level_0,count
pos,Unnamed: 1_level_1
5,17744
1,16509
9,16042
2,15917
3,15848
4,13741
7,11856
6,11652
8,10322
DNS,8516


In [7]:
# creating list to put remaining list into "Other"
pos_list = []
for list in unique_df.index[25:]:
    pos_list.append(list)

In [16]:
# Confirming list is created without top 25
len(pos_list)

2257

In [18]:
olympic_model_clean['pos']

0          5 h2 r1/2
1            AC r2/2
2                 19
3                 20
4                 19
             ...    
316822             5
316823     5 h2 r3/4
316824    AC h2 r2/3
316825     7 h4 r2/4
316826             1
Name: pos, Length: 316827, dtype: object

In [19]:
# Replace in dataframe
for app in pos_list:
    olympic_model_clean['pos'] = olympic_model_clean['pos'].replace(app, "other")

# Check to make sure replacement was successful
olympic_model_clean['pos'].value_counts()

pos
other    98818
5        17744
1        16509
9        16042
2        15917
3        15848
4        13741
7        11856
6        11652
8        10322
DNS       8516
17        8291
DNF       8104
11        7872
10        7732
AC        7602
12        6692
13        6070
14        4594
15        4366
16        4053
19        3189
18        3071
20        2868
33        2743
21        2615
Name: count, dtype: int64

In [20]:
olympic_model_clean

Unnamed: 0,sex,height,weight,country,sport,event,pos,medal,isTeamSport
0,Female,166.0,55,Bulgaria,Athletics,"4 × 400 metres Relay, Women",other,0,True
1,Female,166.0,55,Bulgaria,Athletics,"4 × 400 metres Relay, Women",other,0,True
2,Female,184.0,70,Russian Federation,Beach Volleyball,"Beach Volleyball, Women",19,0,True
3,Male,165.0,55,Qatar,Athletics,"10,000 metres, Men",20,0,False
4,Male,,,Hungary,Artistic Gymnastics,"Individual All-Around, Men",19,0,False
...,...,...,...,...,...,...,...,...,...
316822,Male,188.0,79,Nigeria,Athletics,"4 × 400 metres Relay, Men",5,0,True
316823,Male,188.0,79,Nigeria,Athletics,"400 metres, Men",other,0,False
316824,Male,188.0,79,Nigeria,Athletics,"4 × 400 metres Relay, Men",other,0,True
316825,Male,188.0,79,Nigeria,Athletics,"400 metres, Men",other,0,False


In [25]:
#removing other
olympic_model_clean = olympic_model_clean[olympic_model_clean['pos']!= "other"]
olympic_model_clean

Unnamed: 0,sex,height,weight,country,sport,event,pos,medal,isTeamSport
2,Female,184.0,70,Russian Federation,Beach Volleyball,"Beach Volleyball, Women",19,0,True
3,Male,165.0,55,Qatar,Athletics,"10,000 metres, Men",20,0,False
4,Male,,,Hungary,Artistic Gymnastics,"Individual All-Around, Men",19,0,False
5,Male,,,Hungary,Artistic Gymnastics,"Team All-Around, Men",4,0,True
6,Male,,,Hungary,Artistic Gymnastics,"Floor Exercise, Men",19,0,False
...,...,...,...,...,...,...,...,...,...
316817,Male,175.0,79,Hungary,Wrestling,"Middleweight, Freestyle, Men",AC,0,False
316818,Male,,,Hungary,Artistic Gymnastics,"Team All-Around, Men",2,1,True
316820,Male,187.0,84,East Germany,Athletics,"4 × 100 metres Relay, Men",2,1,True
316822,Male,188.0,79,Nigeria,Athletics,"4 × 400 metres Relay, Men",5,0,True


In [26]:
# Making a copy
olympic_model_clean_DF = olympic_model_clean#.copy()

In [27]:
#clean data and get_dummies
olympic_model_clean_DF = pd.get_dummies(olympic_model_clean_DF, dtype='int')
olympic_model_clean_DF

Unnamed: 0,height,medal,isTeamSport,sex_Female,sex_Male,weight_100,"weight_100, 104",weight_100-104,weight_100-105,weight_100-106,...,pos_33,pos_4,pos_5,pos_6,pos_7,pos_8,pos_9,pos_AC,pos_DNF,pos_DNS
2,184.0,0,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,165.0,0,False,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,0,False,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,,0,True,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,,0,False,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316817,175.0,0,False,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
316818,,1,True,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
316820,187.0,1,True,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
316822,188.0,0,True,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [28]:
# Split our preprocessed data into our features and target arrays
y = olympic_model_clean_DF['medal']
X = olympic_model_clean_DF.drop(columns='medal')

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [29]:
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)