### Import Libraries

In [60]:
# Standard Python Library
import numpy as np
import pandas as pd

# Scikit Learn library
from sklearn.impute import SimpleImputer

# Visualization Library
import matplotlib as plt
import seaborn as sns

### Data Loading

In [61]:
# Read the raw athlete records into a DataFrame and report its (rows, columns)
df = pd.read_csv(filepath_or_buffer='athlete.csv')
print(df.shape)

(199462, 14)


Name           0
Sex            0
Age          519
Height      9783
Weight     10401
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     172120
dtype: int64

### Exploratory Data Analysis (EDA)

In [64]:
# Show every row that is an exact repeat of an earlier row
df.loc[df.duplicated()]
# Result: empty frame, so there are no duplicated rows

Unnamed: 0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal


In [66]:
# Count the missing values in each column
df.isnull().sum()

Name           0
Sex            0
Age          519
Height      9783
Weight     10401
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     172120
dtype: int64

In [71]:
# Summary (count / unique / top / freq) of the object-dtype columns
df.describe(include='object')

Unnamed: 0,Name,Sex,Team,NOC,Games,Season,City,Sport,Event,Medal
count,199462,199462,199462,199462,199462,199462,199462,199462,199462,27342
unique,98432,2,461,219,28,2,27,51,486,3
top,Andreas Wecker,M,United States,USA,2000 Summer,Summer,Sydney,Athletics,Ice Hockey Men's Ice Hockey,Bronze
freq,32,132236,11464,11959,13820,157818,13820,26723,3667,9333


In [72]:
# Summary statistics of the numeric columns, transposed to one row per column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,198943.0,24.957712,5.460792,11.0,21.0,24.0,28.0,71.0
Height,189679.0,175.451716,10.669744,127.0,168.0,175.0,183.0,226.0
Weight,189061.0,70.726538,14.553855,25.0,60.0,70.0,80.0,214.0
Year,199462.0,1993.352949,15.441784,1964.0,1980.0,1996.0,2006.0,2016.0


In [73]:
# Per-column dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199462 entries, 0 to 199461
Data columns (total 14 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Name    199462 non-null  object 
 1   Sex     199462 non-null  object 
 2   Age     198943 non-null  float64
 3   Height  189679 non-null  float64
 4   Weight  189061 non-null  float64
 5   Team    199462 non-null  object 
 6   NOC     199462 non-null  object 
 7   Games   199462 non-null  object 
 8   Year    199462 non-null  int64  
 9   Season  199462 non-null  object 
 10  City    199462 non-null  object 
 11  Sport   199462 non-null  object 
 12  Event   199462 non-null  object 
 13  Medal   27342 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 21.3+ MB


In [92]:
# Rows whose Age is missing
df.loc[df['Age'].isnull()]

Unnamed: 0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
99,Mohamed Abakkar,M,,156.0,48.0,Sudan,SUD,1972 Summer,1972,Summer,Munich,Boxing,Boxing Men's Flyweight,
234,Mohamed Abdel Fatah,M,,170.0,69.0,Sudan,SUD,1972 Summer,1972,Summer,Munich,Football,Football Men's Football,
237,Ibrahim Saad Abdel Galil,M,,176.0,73.0,Sudan,SUD,1972 Summer,1972,Summer,Munich,Athletics,Athletics Men's 200 metres,
238,Ibrahim Saad Abdel Galil,M,,176.0,73.0,Sudan,SUD,1972 Summer,1972,Summer,Munich,Athletics,Athletics Men's 4 x 400 metres Relay,
246,Hwad Ibrahim Abdel Hamid Lumomba,M,,162.0,57.0,Sudan,SUD,1968 Summer,1968,Summer,Mexico City,Boxing,Boxing Men's Featherweight,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195006,Yi Yuong,M,,170.0,62.0,Cambodia,CAM,1964 Summer,1964,Summer,Tokyo,Cycling,Cycling Men's 100 kilometres Team Time Trial,
195068,Franois Yinga,M,,176.0,66.0,Cameroon,CMR,1988 Summer,1988,Summer,Seoul,Wrestling,"Wrestling Men's Featherweight, Freestyle",
195574,Mariana Ysrael,F,,173.0,56.0,Guam,GUM,1988 Summer,1988,Summer,Seoul,Athletics,Athletics Women's Marathon,
196270,Abdel Ali Zahraoui,M,,175.0,68.0,Morocco,MAR,1972 Summer,1972,Summer,Munich,Football,Football Men's Football,


In [93]:
# Rows whose Height is missing
df.loc[df['Height'].isnull()]

Unnamed: 0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
56,Nils Egil Aaness,M,27.0,,,Norway,NOR,1964 Winter,1964,Winter,Innsbruck,Speed Skating,"Speed Skating Men's 1,500 metres",
58,Johan Aantjes,M,26.0,,,Netherlands,NED,1984 Summer,1984,Summer,Los Angeles,Water Polo,Water Polo Men's Water Polo,
61,Willemien Aardenburg,F,22.0,,,Netherlands,NED,1988 Summer,1988,Summer,Seoul,Hockey,Hockey Women's Hockey,Bronze
80,Abdelhak Aatakni,M,24.0,,64.0,Morocco,MAR,2012 Summer,2012,Summer,London,Boxing,Boxing Men's Light-Welterweight,
97,M'Bairo Abakar,M,31.0,,,Chad,CHA,1992 Summer,1992,Summer,Barcelona,Judo,Judo Men's Half-Middleweight,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199310,Vladimir Anatolyevich Zuyev,M,35.0,,,Belarus,BLR,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Mixed Two Person Keelboat,
199321,Irina Lvovna Zuykova,F,34.0,,,Unified Team,EUN,1992 Summer,1992,Summer,Barcelona,Equestrianism,"Equestrianism Mixed Dressage, Individual",
199322,Irina Lvovna Zuykova,F,34.0,,,Unified Team,EUN,1992 Summer,1992,Summer,Barcelona,Equestrianism,"Equestrianism Mixed Dressage, Team",
199333,Denis vegelj,M,20.0,,,Slovenia,SLO,1992 Summer,1992,Summer,Barcelona,Rowing,Rowing Men's Coxless Pairs,Bronze


In [94]:
# Rows whose Weight is missing
df.loc[df['Weight'].isnull()]

Unnamed: 0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
52,Ragnhild Margrethe Aamodt,F,27.0,163.0,,Norway,NOR,2008 Summer,2008,Summer,Beijing,Handball,Handball Women's Handball,Gold
56,Nils Egil Aaness,M,27.0,,,Norway,NOR,1964 Winter,1964,Winter,Innsbruck,Speed Skating,"Speed Skating Men's 1,500 metres",
58,Johan Aantjes,M,26.0,,,Netherlands,NED,1984 Summer,1984,Summer,Los Angeles,Water Polo,Water Polo Men's Water Polo,
61,Willemien Aardenburg,F,22.0,,,Netherlands,NED,1988 Summer,1988,Summer,Seoul,Hockey,Hockey Women's Hockey,Bronze
97,M'Bairo Abakar,M,31.0,,,Chad,CHA,1992 Summer,1992,Summer,Barcelona,Judo,Judo Men's Half-Middleweight,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199310,Vladimir Anatolyevich Zuyev,M,35.0,,,Belarus,BLR,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Mixed Two Person Keelboat,
199321,Irina Lvovna Zuykova,F,34.0,,,Unified Team,EUN,1992 Summer,1992,Summer,Barcelona,Equestrianism,"Equestrianism Mixed Dressage, Individual",
199322,Irina Lvovna Zuykova,F,34.0,,,Unified Team,EUN,1992 Summer,1992,Summer,Barcelona,Equestrianism,"Equestrianism Mixed Dressage, Team",
199333,Denis vegelj,M,20.0,,,Slovenia,SLO,1992 Summer,1992,Summer,Barcelona,Rowing,Rowing Men's Coxless Pairs,Bronze


In [95]:
# Rows whose Medal is missing
df.loc[df['Medal'].isnull()]

Unnamed: 0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
3,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,"Speed Skating Women's 1,000 metres",
4,Christine Jacoba Aaftink,F,25.0,185.0,82.0,Netherlands,NED,1992 Winter,1992,Winter,Albertville,Speed Skating,Speed Skating Women's 500 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199457,Andrzej ya,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
199458,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
199459,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
199460,Tomasz Ireneusz ya,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,


### Replace Nulls using scikit-learn's SimpleImputer

In [85]:
# Collect the column names by dtype: text (object) vs. numeric (float64 / int64)
obj_col = df.select_dtypes(include='object').columns.tolist()
num_col = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(f'The object columns are: {obj_col} \nThe numeric columns are: {num_col}')

The object columns are: ['Name', 'Sex', 'Team', 'NOC', 'Games', 'Season', 'City', 'Sport', 'Event', 'Medal'] 
The numeric columns are: ['Age', 'Height', 'Weight', 'Year']


In [89]:
# Split df into two frames: every non-object column, and the object columns
df_num = df.select_dtypes(exclude=['object'])
df_obj = df.select_dtypes(include=['object'])

In [119]:
# Mean imputer for the numeric frame: NaN entries are the ones to be replaced
imp_mean = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)
imp_mean.fit(df_num)

In [120]:
# Build the imputed numeric frame.
# Column labels and row index are taken from df_num itself (the frame that is
# transformed) instead of the separately computed num_col list, so the labels
# cannot drift from the data and the original row index is kept for the join.
# The imputer returns a float array, so every column here is float64.
nums = pd.DataFrame(
    data=imp_mean.transform(df_num),
    columns=df_num.columns,
    index=df_num.index,
)
nums.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199462 entries, 0 to 199461
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Age     199462 non-null  float64
 1   Height  199462 non-null  float64
 2   Weight  199462 non-null  float64
 3   Year    199462 non-null  float64
dtypes: float64(4)
memory usage: 6.1 MB


In [121]:
# Constant imputer for the text frame: NaN entries become the string 'None'
# NOTE(review): pandas' read_csv documentation lists 'None' among its default
# NA markers, so these cells may load back as NaN from the exported CSV - confirm.
imp_miss = SimpleImputer(
    strategy='constant',
    fill_value='None',
    missing_values=np.nan,
)
imp_miss.fit(df_obj)

In [122]:
# Build the imputed text frame.
# Column labels and row index are taken from df_obj itself (the frame that is
# transformed) instead of the separately computed obj_col list, so the labels
# cannot drift from the data and the original row index is kept for the join.
obj = pd.DataFrame(
    data=imp_miss.transform(df_obj),
    columns=df_obj.columns,
    index=df_obj.index,
)
obj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199462 entries, 0 to 199461
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Name    199462 non-null  object
 1   Sex     199462 non-null  object
 2   Team    199462 non-null  object
 3   NOC     199462 non-null  object
 4   Games   199462 non-null  object
 5   Season  199462 non-null  object
 6   City    199462 non-null  object
 7   Sport   199462 non-null  object
 8   Event   199462 non-null  object
 9   Medal   199462 non-null  object
dtypes: object(10)
memory usage: 15.2+ MB


### Joining the two cleaned DataFrames

In [142]:
# Put the imputed numeric and text frames side by side, matched on row index;
# the numeric columns come first in the result
data = nums.join(other=obj, how='left')
data

Unnamed: 0,Age,Height,Weight,Year,Name,Sex,Team,NOC,Games,Season,City,Sport,Event,Medal
0,24.0,180.0,80.0,1992.0,A Dijiang,M,China,CHN,1992 Summer,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,23.0,170.0,60.0,2012.0,A Lamusi,M,China,CHN,2012 Summer,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,21.0,185.0,82.0,1988.0,Christine Jacoba Aaftink,F,Netherlands,NED,1988 Winter,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
3,21.0,185.0,82.0,1988.0,Christine Jacoba Aaftink,F,Netherlands,NED,1988 Winter,Winter,Calgary,Speed Skating,"Speed Skating Women's 1,000 metres",
4,25.0,185.0,82.0,1992.0,Christine Jacoba Aaftink,F,Netherlands,NED,1992 Winter,Winter,Albertville,Speed Skating,Speed Skating Women's 500 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199457,29.0,179.0,89.0,1976.0,Andrzej ya,M,Poland-1,POL,1976 Winter,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
199458,27.0,176.0,59.0,2014.0,Piotr ya,M,Poland,POL,2014 Winter,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
199459,27.0,176.0,59.0,2014.0,Piotr ya,M,Poland,POL,2014 Winter,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
199460,30.0,185.0,96.0,1998.0,Tomasz Ireneusz ya,M,Poland,POL,1998 Winter,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,


In [150]:
# Convert Age and Year back to integers.
# Age: round before casting. A bare astype('int') truncates, which would turn
# the mean-imputed value (about 24.96) into 24 instead of 25. Observed ages are
# already whole numbers, so rounding leaves them unchanged.
data['Age'] = data['Age'].round().astype('int')
# Year: it was int64 in the raw data and has no missing values; the imputer
# only changed it to float64, so the cast is lossless and avoids exporting
# values such as 1992.0.
data['Year'] = data['Year'].astype('int')

## Exporting the clean data for SQL modeling

In [151]:
# Write the cleaned table to disk.
# index=False stops pandas from writing the row index as an extra, unnamed
# first column, so the file contains only the 14 data columns.
data.to_csv('clean_athlete.csv', index=False)