In [62]:
#Imports
import pandas as pd
from scipy import stats
import numpy as np
import math

In [124]:
#Load the lifter results (df) and the competition metadata (comps); the first CSV column becomes the index of each frame
df, comps = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../../Data/v5Clean_TypeCOnly.csv", "../../Data/Competitions.csv")
)

## 1. Pre-processing

Let's check out the data

In [22]:
#Preview the loaded frame (the rendering is truncated by pandas: middle rows and columns are elided as "...")
df

Unnamed: 0_level_0,CompID,NAME,DOB,SEX,BWT,SQ1,SQ2,SQ3,BP1,BP2,...,WILKS,SQ1outcome,SQ2outcome,SQ3outcome,BP1outcome,BP2outcome,BP3outcome,DL1outcome,DL2outcome,DL3outcome
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,454,Neville Harris,1977.0,M,73.80,202.5,215.0,,135.0,140.0,...,425.21,True,True,False,True,False,False,True,True,False
1,454,Hung Phan,1992.0,M,87.95,180.0,195.0,205.0,120.0,125.0,...,374.74,True,True,True,True,True,True,True,True,True
2,454,Chris Forte,1987.0,M,73.70,180.0,187.5,190.0,127.5,127.5,...,405.79,True,True,True,False,True,True,True,True,False
3,454,Kenny Lay,1992.0,M,90.80,180.0,190.0,200.0,117.5,122.5,...,357.52,True,True,False,True,False,True,True,False,False
4,454,Garry McPherson,1969.0,M,98.60,170.0,182.5,185.0,130.0,135.0,...,342.78,True,False,True,True,True,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14406,962,Nichola Lovell,1992.0,F,63.20,99.0,104.0,109.0,61.0,65.0,...,342.85,True,True,True,True,False,True,True,True,True
14407,962,Samantha Sutton,1968.0,F,71.40,115.0,120.0,120.0,55.0,60.0,...,309.17,True,False,True,True,False,False,True,True,False
14408,962,Georgina Stevenson,1991.0,F,96.25,100.0,108.0,120.0,50.0,58.0,...,252.78,True,True,True,False,True,False,True,True,True
14409,962,Xinyu Liang,1992.0,F,47.90,90.0,98.0,105.0,47.0,52.0,...,387.45,True,True,True,True,True,False,True,True,True


In [20]:
#Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns - used here to spot implausible values
df.describe()

Unnamed: 0,CompID,DOB,BWT,SQ1,SQ2,SQ3,BP1,BP2,BP3,DL1,DL2,DL3,TOTAL,WILKS
count,14410.0,13861.0,14410.0,13003.0,12965.0,12828.0,14313.0,14274.0,14107.0,13093.0,13017.0,12874.0,14015.0,14015.0
mean,708.024774,1952.814516,82.650244,149.285803,158.656899,164.668569,96.537798,102.50606,105.957929,174.76243,186.978682,194.36632,424.623118,324.889437
std,148.959762,252.296138,20.82372,52.562212,54.853761,56.978821,39.727314,41.176905,41.93306,54.41472,59.61361,57.863009,170.546331,97.900277
min,454.0,15.0,39.3,25.0,11.0,25.0,7.0,26.0,29.0,46.0,52.0,18.0,27.5,32.43
25%,577.0,1980.0,68.2,105.0,115.0,120.0,61.0,65.0,68.0,130.0,140.0,145.0,300.0,287.79
50%,704.0,1989.0,81.1,145.0,155.0,162.0,95.0,100.0,105.0,175.0,185.0,195.0,422.5,339.73
75%,839.0,1994.0,92.75,185.0,197.5,205.0,125.0,132.5,135.0,215.0,230.0,240.0,555.0,386.67
max,962.0,2007.0,195.0,440.0,470.0,1470.0,345.0,345.0,352.5,355.0,2235.0,381.0,1070.0,633.55


It looks like there are some erroneous entries in DOB (e.g. 15) and in some lift columns (e.g. 1470kg squat). Let's fix these first.

In [175]:
#A value whose |z-score| is strictly above this threshold is treated as a candidate data-entry error
Z_THRESHOLD = 5

#Get z-scores (NaNs are omitted when computing each column's statistics)
numeric_cols = df.select_dtypes(include=[np.number]).columns
df_zscores = df[numeric_cols].apply(stats.zscore, nan_policy = 'omit')


def find_eoutliers(frame, zscores, col, threshold = Z_THRESHOLD):
    """Return the rows of `frame` that are extreme in `col`, printing a one-line summary.

    Parameters
    ----------
    frame : DataFrame
        The raw values; must contain `col` and a 'CompID' column.
    zscores : DataFrame
        Z-scores of `frame`'s numeric columns, sharing `frame`'s index.
    col : str
        Name of the column to screen.
    threshold : float
        Rows whose |z-score| in `col` is strictly greater than this are flagged.
        NaN z-scores never exceed the threshold, so missing values are not flagged.

    Returns
    -------
    DataFrame
        A copy of the flagged rows (empty if nothing exceeds the threshold).
        Being a copy, it keeps the values as they were when the screen ran,
        even if `frame` is corrected afterwards.
    """
    flagged = frame.loc[zscores[col].abs() > threshold].copy()
    if flagged.empty:
        print(f"No {col} erroneous outliers")
    else:
        print(f"{col}: Count = {len(flagged)}, Min = {flagged[col].min()}, "
              f"Max = {flagged[col].max()}, Number of Comps = {flagged['CompID'].nunique()}")
    return flagged


#NOTE: the observations below describe this cell's output on the freshly loaded data.
#Once the fix cells further down have been applied, re-running this cell no longer reports the DOB/DL2 entries.

#DOB erroneous outliers
DOB_eoutliers = find_eoutliers(df, df_zscores, 'DOB')
#200+ entries from ~10 competitions. We will assume these were mistakenly entered as Age rather than DOB, and hopefully we can fix them.

#SQ3 erroneous outliers
SQ3_eoutliers = find_eoutliers(df, df_zscores, 'SQ3')
#Clearly Jezza's lifts aren't erroneous. The other one is though. We'll assume it's supposed to be 140.

#DL2 erroneous outliers
DL2_eoutliers = find_eoutliers(df, df_zscores, 'DL2')
#Only one. We'll assume it's supposed to be 235.

No DOB erroneous outliers
SQ3: Count = 3, Min = 457.5,     Max = 477.5, Number of Comps = 3
No DL2 erroneous outliers


In [126]:
#Correct the two implausible lift entries: an SQ3 of 1470 is taken to mean 140, a DL2 of 2235 to mean 235
sq3_typo_rows = df['SQ3'].eq(1470)
dl2_typo_rows = df['DL2'].eq(2235)
df.loc[sq3_typo_rows, 'SQ3'] = 140
df.loc[dl2_typo_rows, 'DL2'] = 235

In [171]:
#Fix DOB: the flagged entries are assumed to hold an age, so DOB = year of competition - age
#(some errors possible (+/-1 year for same person) - doesn't matter)
if not DOB_eoutliers.empty:
    #The last backslash-separated component of a competition's Link starts with a 4-digit year
    #(presumably a Windows-style path ending in a dated name - TODO confirm against Competitions.csv).
    #If a CompID appears more than once in comps, the first Link is used.
    comp_links = comps.loc[~comps.index.duplicated(keep = 'first'), 'Link']
    comp_year_text = comp_links.str.split("\\").str[-1].str[:4]
    #Only the years of competitions that actually have flagged rows are converted to int
    outlier_comp_years = DOB_eoutliers['CompID'].map(comp_year_text).astype(int)
    #Subtract the ages snapshotted in DOB_eoutliers, not the current df values:
    #re-running this cell then rewrites the same birth years instead of turning them back into ages
    df.loc[DOB_eoutliers.index, 'DOB'] = outlier_comp_years - DOB_eoutliers['DOB']

Now we'll deal with missing values. A missing value in a lift column indicates an unattempted lift, so in those columns genuinely missing data cannot be told apart from lifts that were never attempted. We can, however, find missing values for the other variables.

In [182]:
df.isnull().sum()/len(df)*100

CompID         0.000000
NAME           0.000000
DOB            3.809854
SEX            0.000000
BWT            0.000000
SQ1            9.764053
SQ2           10.027759
SQ3           10.978487
BP1            0.673144
BP2            0.943789
BP3            2.102706
DL1            9.139486
DL2            9.666898
DL3           10.659264
TOTAL          2.741152
WILKS          2.741152
SQ1outcome     0.000000
SQ2outcome     0.000000
SQ3outcome     0.000000
BP1outcome     0.000000
BP2outcome     0.000000
BP3outcome     0.000000
DL1outcome     0.000000
DL2outcome     0.000000
DL3outcome     0.000000
dtype: float64

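Luckily, apart from the lift columns, DOB is the only recorded field with missing values (~3.8% missing), and these aren't too important to us (imputing them would be unappealing nonetheless). TOTAL and WILKS are also ~2.7% missing; presumably these rows are lifters who did not register a total rather than data-entry gaps.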