In [36]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_rows = 999

In [3]:
#Load the tidied results table; the first CSV column becomes the row index
df = pd.read_csv(
    "../../Data/Tidy_Full.csv",
    index_col=0,
)

In [4]:
#Inspect data: show the first rows of the raw frame to see which columns exist
df.head()

Unnamed: 0_level_0,CompID,Name,Sex,Class,Division,BestBP,Total,Place,BestSQ,BestDL,...,BP2,BP3,Wilks,SQ1,SQ2,SQ3,DL1,DL2,DL3,DOB
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,Barbara Hamilton,F,56.0,O/50,42.5/43,42.5,1,,,...,,,,,,,,,,
1,1,Nat Zacharko,F,56.0,O/40,60,60.0,1,,,...,,,,,,,,,,
2,1,Dean Jenkinson,M,67.5,Open,125,125.0,1,,,...,,,,,,,,,,
3,1,Aaron McFarlane,M,67.5,Open,105,105.0,2-Jan,,,...,,,,,,,,,,
4,1,David Graham,M,67.5,O/60,52.5,52.5,1,,,...,,,,,,,,,,


### Goals:
- Homogenize column values
- Fix data types
- Mark no-lifts with a minus sign (rather than an `x` suffix)

##### Sex column

In [5]:
#Inspect: frequency of every distinct raw value in the Sex column
df["Sex"].value_counts()

M           15488
F            6969
M-O            63
M-J            43
m              42
F-O            18
F-J            17
M-M1           16
f              16
M - Open       12
M-M3           11
M-SJ           11
F-M1           11
F - M1          7
F - Open        6
M - O           6
M-M2            6
M-Sub           5
W               4
M - M2          3
F - M2          2
M-BL            2
F-SJ            2
F-M3            2
F-M2            1
M - M1          1
M - U18         1
F-M4            1
M - SJ          1
M-Jnr           1
Name: Sex, dtype: int64

In [6]:
#Fix
def fix_sex(df):
    """Normalise the ``Sex`` column to plain ``'M'`` / ``'F'`` codes.

    Some rows carry a division inside the sex field (e.g. ``'M - Open'``,
    ``'F-M1'``).  Spaces are stripped, the value is split at the first ``'-'``,
    and the trailing part is copied into ``Division`` wherever ``Division`` is
    null.  Existing ``Division`` values are never overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Sex`` and ``Division`` columns.  Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so calls can be chained.
    """
    compact = df['Sex'].str.replace(' ', '', regex=False)
    # expand=True keeps df's own index, so the pieces align row by row whatever
    # the index looks like.  reindex guarantees both columns exist even when no
    # value contained a '-' (the split then yields a single column).
    parts = compact.str.split('-', n=1, expand=True).reindex(columns=[0, 1])
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: an in-place call on df.Division does not propagate to df
    # when pandas Copy-on-Write is active.
    df['Division'] = df['Division'].fillna(parts[1])
    # Map the stray spellings after the split so 'm - O' style values are covered too.
    df['Sex'] = parts[0].replace({'m': 'M', 'f': 'F', 'W': 'F'})
    return df

##### Class column

In [8]:
#Inspect
#Possibly bad values: anything that is not a bare number with an optional leading '-' and trailing '+'
df.loc[~df['Class'].fillna('').str.match(r'^-?\d+(\.\d+)?[+]?$'), 'Class'].value_counts()
#Okay values
#df.loc[df['Class'].fillna('').str.match(r'^-?\d+(\.\d+)?[+]?$'), 'Class'].value_counts()
#Bar chart of okay values
#df.loc[df['Class'].fillna('').str.match(r'^-?\d+(\.\d+)?[+]?$'), 'Class'].value_counts().plot(kind='bar')

90kg         12
75kg          9
82.5kg        8
67.5kg        7
100kg         6
125kg         6
60kg          4
90+kg         3
125x          3
125.00 +      2
82.5 - SJ     2
125 +         2
56kg          2
110kg         2
125+ - O      1
67.5 - M1     1
110 - M2      1
90+ - J       1
90 - M3       1
75 - SJ       1
82.5 - J      1
100 - O       1
110 - SJ      1
82.5 - O      1
125 - O       1
56 - M2       1
67.5 - O      1
100 - M2      1
Name: Class, dtype: int64

In [9]:
#Fix
def fix_class(df):
    """Normalise the weight-class column ``Class``.

    Steps, in order:

    1. Strip the ``'kg'`` unit, stray ``'x'`` characters and spaces.
    2. Split values such as ``'82.5-SJ'`` at the first ``'-'``; the trailing
       part is copied into ``Division`` wherever ``Division`` is null.
    3. Turn empty strings into NaN and remap a handful of off-standard class
       labels to the standard ones.
    4. Swap ``Class`` and ``BDW`` for a fixed list of row labels.  Labels that
       are not present in ``df.index`` are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Class``, ``Division`` and ``BDW`` columns.  Modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so calls can be chained.
    """
    cleaned = df['Class']
    for junk in ('kg', 'x', ' '):
        cleaned = cleaned.str.replace(junk, '', regex=False)
    # expand=True keeps df's index; reindex guarantees both columns exist even
    # when no value contained a '-'.
    parts = cleaned.str.split('-', n=1, expand=True).reindex(columns=[0, 1])
    # Assign back rather than fillna/replace(inplace=True) on a selected column,
    # which does not reach df when pandas Copy-on-Write is active.
    df['Division'] = df['Division'].fillna(parts[1])
    #Look up special case comp tables to check for errors
    df['Class'] = (
        parts[0]
        .replace('', np.nan)
        .replace({'125.00+': '125+', '66.9': '67.5', '55.9': '56',
                  '62.5': '67.5', '67.6': '67.5', '75.5': '75'})
    )
    # Rows whose Class and BDW values are exchanged.
    for row in [1338, 1556, 1558, 3446, 3473, 7510, 7528]:
        if row not in df.index:
            continue
        # The right-hand side is evaluated fully before either assignment.
        df.loc[row, 'Class'], df.loc[row, 'BDW'] = df.loc[row, 'BDW'], df.loc[row, 'Class']
    return df

##### DOB column

In [11]:
#Inspect: frequency of every distinct raw value in the DOB column
df["DOB"].value_counts()
#CompIDs of rows whose DOB contains one of the non-year tokens
#set([k[0] for j, k in df[df.DOB.notnull()].iterrows() if any(substring in k[21] for substring in ['PNG', 'NRU', '????', 'DQ', '-', 'NAU', 'NZPF', 'TUV', '???', 'PRC', 'NZ', 'NIU'])])

1993    942
1992    813
1994    811
1995    772
1991    626
       ... 
2020      1
SJ        1
1933      1
52        1
48        1
Name: DOB, Length: 160, dtype: int64

In [12]:
#Fix
def fix_dob(df):
    """Clean the ``DOB`` column.

    Steps, in order:

    1. For a fixed list of ``CompID`` values the ``DOB`` column holds the
       division: move it to ``Division`` and blank ``DOB``.
    2. Replace known non-year tokens, one typo, any string containing five
       consecutive digits and a few mistyped years.
    3. Overwrite ``DOB`` for a few hard-coded row labels.  Labels missing from
       ``df.index`` are skipped.
    4. For another fixed list of ``CompID`` values, replace ``DOB`` with
       ``year - DOB`` (presumably the column holds an age there - TODO confirm).
    5. Blank the values 2014, 2016 and 2020 and one further hard-coded row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``CompID``, ``Division`` and ``DOB`` columns.  Modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so calls can be chained.
    """
    #Division in DOB column
    for comp_id in [103, 109, 113, 114, 141, 142, 144, 146, 168, 177, 184, 214]:
        in_comp = df['CompID'] == comp_id
        df.loc[in_comp, 'Division'] = df.loc[in_comp, 'DOB']
        df.loc[in_comp, 'DOB'] = np.nan

    # Work on the Series and assign it back: replace(inplace=True) on df.DOB
    # does not reach df when pandas Copy-on-Write is active.
    dob = df['DOB']
    #Words in DOB column
    dob = dob.replace(['PNG', 'NRU', '????', 'DQ', '-', 'NAU', 'NZPF', 'TUV', '???', 'PRC', 'NZ', 'NIU'], np.nan)
    dob = dob.replace('`987', 1987)
    #Big numbers
    dob = dob.replace(r'\d{5}', np.nan, regex=True)
    #Small numbers
    dob = dob.replace({'0': np.nan, '1886': 1986, '1680': 1980, '995': 1995})
    df['DOB'] = dob

    # Setting a missing label through .loc would append a new row, so only
    # touch labels that exist.
    for row, year in [(17261, 2018-30), (12972, 2016-35), (12975, 2018-30), (18635, 2018-27)]:
        if row in df.index:
            df.loc[row, 'DOB'] = year

    for comp_id, comp_year in zip([572, 574, 606, 612, 665, 939, 940],
                                  [2015, 2015, 2016, 2016, 2017, 2020, 2020]):
        in_comp = df['CompID'] == comp_id
        # Coerce first so values read from the CSV as strings still subtract;
        # anything non-numeric becomes NaN instead of raising TypeError.
        df.loc[in_comp, 'DOB'] = comp_year - pd.to_numeric(df.loc[in_comp, 'DOB'], errors='coerce')

    #Too close to current year
    df['DOB'] = df['DOB'].replace({2014: np.nan, 2016: np.nan, 2020: np.nan})
    if 12166 in df.index:
        df.loc[12166, 'DOB'] = np.nan
    return df

##### Division column

In [14]:
#Inspect
#df.Division.value_counts()
#Check rows with weird entries: CompIDs of rows whose Division contains a ditto mark (")
#Label-based and vectorised; indexing a row Series by position (k[0], k[4]) is deprecated in pandas
set(df.loc[df['Division'].str.contains('"', regex=False, na=False), 'CompID'].tolist())

{5, 10, 21, 30, 31, 48, 51, 53, 64, 66, 67, 70}

In [15]:
#Fix
def fix_division(df, semantics_path="../../Data/Division_semantics.csv"):
    """Homogenise the ``Division`` column.

    Ditto marks (``"``) are replaced by ``'Snr'`` for ``CompID`` 64 and by
    ``'Open'`` everywhere else.  Every division word is then mapped to its
    semantic label using the semantics CSV: each column header is a label and
    the non-null cells beneath it are the words that map to it.  If a word is
    listed under several headers, the right-most header wins.  The label
    ``'Unknown'`` is finally turned into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``CompID`` and ``Division`` columns.  Modified in place.
    semantics_path : str or path-like, optional
        Location of the semantics CSV.  Defaults to the path the notebook has
        always used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so calls can be chained.
    """
    from_comp_64 = df['CompID'] == 64
    df.loc[from_comp_64, 'Division'] = (
        df.loc[from_comp_64, 'Division'].str.replace('"', 'Snr', regex=False)
    )
    division = df['Division'].str.replace('"', 'Open', regex=False)

    divsemdf = pd.read_csv(semantics_path, engine='python')
    #Map words to semantic meaning
    semmap = {}
    for meaning in divsemdf.columns:
        for word in divsemdf[meaning].dropna():
            semmap[word] = meaning

    #Change words
    # Assign back instead of replace(inplace=True) on df.Division, which does
    # not reach df when pandas Copy-on-Write is active.
    df['Division'] = division.replace(semmap).replace('Unknown', np.nan)
    return df

##### Wilks column

In [32]:
#Inspect: frequency of every distinct raw value in the Wilks column
df.Wilks.value_counts()
#Values that are not a plain number (optional leading '-', optional decimals, optional trailing '+')
#df[~df.Wilks.fillna('').str.match(r'^-?\d+(\.\d+)?[+]?$')].Wilks.value_counts()
#NOTE: str.match anchors at the start of the string, so '\.$' would only match a lone '.'; use str.contains for the suffix test
#df[df.Wilks.str.contains(r'\.$') == True] #Check none end with .
#df[df.Wilks.str.match(r'^\.') == True] #Check none start with .

0          94
406.8       7
383.26      6
412.04      6
387.96      5
           ..
247.811     1
393.12      1
381.18      1
124.55      1
300.335     1
Name: Wilks, Length: 16143, dtype: int64

In [92]:
#Fix
def fix_wilks(df):
    """Clean the ``Wilks`` column.

    Known non-score tokens (``'DQ'``, ``'withdrawn'``, ...) become NaN and a
    decimal comma in string values is turned into a decimal point.  Values
    that are not strings (already-numeric cells, NaN) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Wilks`` column.  Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so calls can be chained.
    """
    not_a_score = ['BMB', '-', 'out', 'x', 'Disq.', 'Disq', '?', 'DQ (fees)', 'withdrawn', 'DQ']
    # Assign back instead of replace(inplace=True) on df.Wilks, which does not
    # reach df when pandas Copy-on-Write is active.
    wilks = df['Wilks'].replace(not_a_score, np.nan)
    # The .str accessor returns NaN for every non-string element and raises on a
    # purely numeric column, so edit string cells only and pass the rest through.
    df['Wilks'] = wilks.map(lambda v: v.replace(',', '.') if isinstance(v, str) else v)
    return df

##### BDW column

In [98]:
#Inspect
#df["BDW"].value_counts()
#Bodyweights that are not a plain number with an optional leading '-' and trailing '+'
df.loc[~df['BDW'].fillna('').str.match(r'^-?\d+(\.\d+)?[+]?$'), 'BDW'].value_counts()

Series([], Name: BDW, dtype: int64)

In [96]:
#Fix
def fix_bdw(df):
    """Clean the bodyweight column ``BDW``.

    In string values a decimal comma becomes a decimal point, a doubled point
    (``'..'``) collapses to one and ``'?'`` is removed; the token ``'BMB'``
    becomes NaN.  Values that are not strings are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``BDW`` column.  Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so calls can be chained.
    """
    def tidy(value):
        # The .str accessor would turn non-string cells into NaN (and raise on a
        # purely numeric column), so pass them through unchanged.
        if not isinstance(value, str):
            return value
        return value.replace(',', '.').replace('..', '.').replace('?', '')

    # Assign back instead of replace(inplace=True) on df.BDW, which does not
    # reach df when pandas Copy-on-Write is active.
    df['BDW'] = df['BDW'].map(tidy).replace('BMB', np.nan)
    return df

##### Place column

In [None]:
#Inspect


In [None]:
#Fix


### Execute

In [97]:
#One function
def fix_all(df):
    """Run every column fixer over ``df``, in a fixed order, and return the result.

    The order is Sex, Class, DOB, Division, Wilks, BDW; each fixer receives the
    frame returned by the previous one.
    """
    pipeline = (fix_sex, fix_class, fix_dob, fix_division, fix_wilks, fix_bdw)
    for fixer in pipeline:
        df = fixer(df)
    return df
#Re-read the raw CSV so the cleaned frame never depends on earlier in-place edits
df = fix_all(pd.read_csv("../../Data/Tidy_Full.csv", index_col = 0))

# Testing area

In [20]:
#opdf = pd.read_csv("../../Data/openpowerlifting-2021-03-31-ab223798.csv", index_col = 0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [71]:
#List the column labels of the cleaned frame
df.columns

Index(['CompID', 'Name', 'Sex', 'Class', 'Division', 'BestBP', 'Total',
       'Place', 'BestSQ', 'BestDL', 'BDW', 'BP1', 'BP2', 'BP3', 'Wilks', 'SQ1',
       'SQ2', 'SQ3', 'DL1', 'DL2', 'DL3', 'DOB'],
      dtype='object')