In [1]:
## Import Libraries
import pandas as pd
import pandas as np
# Import libraries
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
## Read the data
data = pd.read_csv('./data/clean_athlete.csv')
data.head()

Unnamed: 0,ID,Name,Age,Sex,Height,Weight,Year,Team,NOC,region,Games,Season,City,Sport,Event,Medal
0,1,A Dijiang,24,M,180,80,1992,China,CHN,China,1992 Summer,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,23,M,170,60,2012,China,CHN,China,2012 Summer,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,24,M,175,70,1920,Denmark,DEN,Denmark,1920 Summer,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,34,M,175,70,1900,Denmark/Sweden,DEN,Denmark,1900 Summer,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,21,F,185,82,1988,Netherlands,NED,Netherlands,1988 Winter,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [3]:
## Dropping irrevlant columns
data = data[['Age', 'Sex', 'Height', 'Weight', 'region', 'City', 'Sport', 'Medal']]

In [4]:
data

Unnamed: 0,Age,Sex,Height,Weight,region,City,Sport,Medal
0,24,M,180,80,China,Barcelona,Basketball,
1,23,M,170,60,China,London,Judo,
2,24,M,175,70,Denmark,Antwerpen,Football,
3,34,M,175,70,Denmark,Paris,Tug-Of-War,Gold
4,21,F,185,82,Netherlands,Calgary,Speed Skating,
...,...,...,...,...,...,...,...,...
263381,29,M,179,89,Poland,Innsbruck,Luge,
263382,27,M,176,59,Poland,Sochi,Ski Jumping,
263383,27,M,176,59,Poland,Sochi,Ski Jumping,
263384,30,M,185,96,Poland,Nagano,Bobsleigh,


In [5]:
# simple check for nulls
data.isna().sum()

Age       0
Sex       0
Height    0
Weight    0
region    0
City      0
Sport     0
Medal     0
dtype: int64

In [6]:
data['Sex'].unique()

array(['M', 'F'], dtype=object)

In [7]:
data['Sport'].unique()

array(['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Speed Skating',
       'Cross Country Skiing', 'Athletics', 'Ice Hockey', 'Swimming',
       'Badminton', 'Sailing', 'Biathlon', 'Gymnastics',
       'Art Competitions', 'Alpine Skiing', 'Handball', 'Weightlifting',
       'Wrestling', 'Luge', 'Water Polo', 'Hockey', 'Rowing', 'Bobsleigh',
       'Fencing', 'Equestrianism', 'Shooting', 'Boxing', 'Taekwondo',
       'Cycling', 'Diving', 'Canoeing', 'Tennis', 'Modern Pentathlon',
       'Figure Skating', 'Golf', 'Softball', 'Archery', 'Volleyball',
       'Synchronized Swimming', 'Table Tennis', 'Nordic Combined',
       'Baseball', 'Rhythmic Gymnastics', 'Freestyle Skiing',
       'Rugby Sevens', 'Trampolining', 'Beach Volleyball', 'Triathlon',
       'Ski Jumping', 'Curling', 'Snowboarding', 'Rugby',
       'Short Track Speed Skating', 'Skeleton', 'Lacrosse', 'Polo',
       'Cricket', 'Racquets', 'Military Ski Patrol', 'Croquet',
       'Jeu De Paume', 'Roque', 'Alpinism', 'Motorb

In [8]:
data['Sport'].unique()

array(['Basketball', 'Judo', 'Football', 'Tug-Of-War', 'Speed Skating',
       'Cross Country Skiing', 'Athletics', 'Ice Hockey', 'Swimming',
       'Badminton', 'Sailing', 'Biathlon', 'Gymnastics',
       'Art Competitions', 'Alpine Skiing', 'Handball', 'Weightlifting',
       'Wrestling', 'Luge', 'Water Polo', 'Hockey', 'Rowing', 'Bobsleigh',
       'Fencing', 'Equestrianism', 'Shooting', 'Boxing', 'Taekwondo',
       'Cycling', 'Diving', 'Canoeing', 'Tennis', 'Modern Pentathlon',
       'Figure Skating', 'Golf', 'Softball', 'Archery', 'Volleyball',
       'Synchronized Swimming', 'Table Tennis', 'Nordic Combined',
       'Baseball', 'Rhythmic Gymnastics', 'Freestyle Skiing',
       'Rugby Sevens', 'Trampolining', 'Beach Volleyball', 'Triathlon',
       'Ski Jumping', 'Curling', 'Snowboarding', 'Rugby',
       'Short Track Speed Skating', 'Skeleton', 'Lacrosse', 'Polo',
       'Cricket', 'Racquets', 'Military Ski Patrol', 'Croquet',
       'Jeu De Paume', 'Roque', 'Alpinism', 'Motorb

In [11]:
data['region'].unique()

array(['China', 'Denmark', 'Netherlands', 'USA', 'Finland', 'Norway',
       'Romania', 'Estonia', 'France', 'Morocco', 'Spain', 'Egypt',
       'Bulgaria', 'Italy', 'Chad', 'Azerbaijan', 'Sudan', 'Russia',
       'Argentina', 'Cuba', 'Belarus', 'Greece', 'Cameroon', 'Turkey',
       'Chile', 'Mexico', 'Nicaragua', 'Hungary', 'Nigeria', 'Algeria',
       'Kuwait', 'Bahrain', 'Pakistan', 'Iraq', 'Syria', 'Lebanon',
       'Qatar', 'Malaysia', 'Iran', 'Canada', 'Ireland', 'Australia',
       'South Africa', 'Eritrea', 'Tanzania', 'Jordan', 'Tunisia',
       'Libya', 'Belgium', 'Djibouti', 'Palestine', 'Comoros',
       'Kazakhstan', 'Brunei', 'Saudi Arabia', 'Ethiopia',
       'United Arab Emirates', 'Yemen', 'Indonesia', 'Philippines',
       'None', 'Uzbekistan', 'Kyrgyzstan', 'Tajikistan', 'Japan',
       'Switzerland', 'Brazil', 'Germany', 'Monaco', 'Israel', 'Uruguay',
       'Sweden', 'Virgin Islands, US', 'Sri Lanka', 'Armenia',
       'Ivory Coast', 'Kenya', 'Benin', 'Ukraine', '

In [12]:
data['City'].unique()

array(['Barcelona', 'London', 'Antwerpen', 'Paris', 'Calgary',
       'Albertville', 'Lillehammer', 'Los Angeles', 'Salt Lake City',
       'Helsinki', 'Lake Placid', 'Sydney', 'Atlanta', 'Stockholm',
       'Sochi', 'Nagano', 'Turin', 'Beijing', 'Rio de Janeiro', 'Athina',
       'Squaw Valley', 'Innsbruck', 'Sarajevo', 'Mexico City', 'Munich',
       'Seoul', 'Berlin', 'Oslo', "Cortina d'Ampezzo", 'Roma',
       'Amsterdam', 'Montreal', 'Melbourne', 'Moskva', 'Tokyo',
       'Vancouver', 'Grenoble', 'Sapporo', 'Chamonix', 'St. Louis',
       'Sankt Moritz', 'Garmisch-Partenkirchen'], dtype=object)

In [9]:
# set aside and save unseen data set
data_unseen = data.sample(n=10000)
data = data.drop(data_unseen.index)
print(f'Data for model: {data.shape},\nData for unseen predictions: {data_unseen.shape}')
data_unseen.to_csv('./data/olympics_unseen.csv', index=False)

Data for model: (253386, 8),
Data for unseen predictions: (10000, 8)


In [13]:
# data.columns!='Medal'
X = data.loc[: , data.columns!='Medal']
y = data.loc[: , data.columns =='Medal']

In [14]:
# split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
# encoding 
# get the categorical and numeric column names
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(num_cols, '\n', cat_cols)

['Age', 'Height', 'Weight'] 
 ['Sex', 'region', 'City', 'Sport']


In [20]:
 X_train.select_dtypes(exclude=['object']).columns.tolist()

['Age', 'Height', 'Weight']

In [21]:
# pipeline for numerical columns
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
num_pipe

In [22]:
# pipeline for categorical columns
cat_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N/A'),
    OneHotEncoder(handle_unknown='ignore', sparse=False)
)
cat_pipe

In [23]:
# combine both the pipelines
full_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])
full_pipe

In [24]:
model = ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini',
                                      max_depth=None, max_features='sqrt', max_leaf_nodes=None, max_samples=None,
                                      min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                                      oob_score=False, random_state=4282, verbose=0, warm_start=False)

In [25]:
# build the model
et_olympics = make_pipeline(full_pipe, model)
et_olympics

In [27]:
%%time
# train the model
et_olympics.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Wall time: 1min 42s


In [28]:
# make predictions on the test set
y_pred = et_olympics.predict(X_test)
y_pred

array(['None', 'None', 'None', ..., 'None', 'None', 'None'], dtype=object)

In [29]:
# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy*100:,.1f}%')

Accuracy: 86.5%


In [31]:
# Export the pipeline to a file
import joblib
joblib.dump(et_olympics, 'sk_olympics.pkl')

['sk_olympics.pkl']

In [None]:
# Saving as a Pickle file
import pickle
pickle.dump(et_olympics,open('olympics.pkl','wb'))
##model=pickle.load(open('olympics.pkl','rb'))

### Predciting against unseen Data

In [32]:
new_dat = pd.read_csv('./data/olympics_unseen.csv')
new_dat

Unnamed: 0,Age,Sex,Height,Weight,region,City,Sport,Medal
0,27,M,163,55,Germany,Montreal,Gymnastics,
1,34,M,179,102,Canada,Sochi,Bobsleigh,
2,17,F,180,70,Germany,Mexico City,Swimming,
3,18,M,188,85,Peru,Moskva,Athletics,
4,29,M,181,75,France,Mexico City,Equestrianism,
...,...,...,...,...,...,...,...,...
9995,28,F,171,60,Australia,Sydney,Hockey,Gold
9996,25,M,181,74,China,Los Angeles,Swimming,
9997,24,M,186,74,Czech Republic,Rio de Janeiro,Canoeing,
9998,27,F,184,94,Russia,Atlanta,Athletics,


In [33]:
model = joblib.load('sk_olympics.pkl')
model

In [34]:
pred = model.predict(new_dat)
pred

array(['None', 'None', 'None', ..., 'None', 'None', 'None'], dtype=object)

In [35]:
accuracy = accuracy_score(new_dat.Medal, pred)
print(f'Accuracy: {accuracy*100:,.1f}%')

Accuracy: 86.8%
