INTENTION: <br>
Training different models with combinations of features to predict the target (bench press).
-  Fit Linear Regression model to all features
-  Fit Linear Regression model to sex, squat and deadlift features
-  Fit KNN Regressor model to sex, bodyweight, squat and deadlift features
-  Fit Random Forest Regressor model to sex, age, squat and deadlift features
-  Fit Random Forest Regressor model to sex, equipment, squat and deadlift features

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import joblib

In [17]:
# Read the raw powerlifting export and keep everything from the third column
# onward: the first two columns ('playerId' and 'Name') are not used as features.
df = pd.read_csv("powerlifting.csv").iloc[:, 2:]
df.head()

Unnamed: 0,Sex,Equipment,Age,BodyweightKg,BestSquatKg,BestDeadliftKg,BestBenchKg
0,M,Raw,23.0,87.3,205.0,235.0,125.0
1,M,Wraps,23.0,73.48,220.0,260.0,157.5
2,M,Raw,26.0,112.4,142.5,220.0,145.0
3,F,Raw,35.0,59.42,95.0,102.5,60.0
4,F,Raw,26.5,61.4,105.0,127.5,60.0


In [18]:
# Turn the two categorical columns ('Sex' and 'Equipment') into ordinal codes
from sklearn.preprocessing import OrdinalEncoder

# The explicit category order pins the codes:
# F=0, M=1 and Raw=0, Wraps=1, Single-ply=2, Multi-ply=3.
sex_levels = ['F', 'M']
equipment_levels = ['Raw', 'Wraps', 'Single-ply', 'Multi-ply']

columns_to_encode = ['Sex', 'Equipment']
encoder = OrdinalEncoder(categories=[sex_levels, equipment_levels])

# Fit on the two columns and write the numeric codes back over the originals
encoded_values = encoder.fit_transform(df[columns_to_encode])
df[columns_to_encode] = encoded_values

df.head()


Unnamed: 0,Sex,Equipment,Age,BodyweightKg,BestSquatKg,BestDeadliftKg,BestBenchKg
0,1.0,0.0,23.0,87.3,205.0,235.0,125.0
1,1.0,1.0,23.0,73.48,220.0,260.0,157.5
2,1.0,0.0,26.0,112.4,142.5,220.0,145.0
3,0.0,0.0,35.0,59.42,95.0,102.5,60.0
4,0.0,0.0,26.5,61.4,105.0,127.5,60.0


In [19]:
#fixing non-float inputs / incorect format 
import numpy as np
def fixing_format(column, frame=None):
    """Coerce one column to non-negative floats, modifying the frame in place.

    Each value is passed through ``float()`` and its absolute value is kept
    (some entries were recorded as negatives). Values that cannot be
    converted become NaN so that a later ``dropna`` can discard those rows.

    Parameters
    ----------
    column : str
        Label of the column to clean.
    frame : pandas.DataFrame, optional
        Frame to clean. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was modified.
    """
    if frame is None:
        frame = df

    def _abs_float(value):
        # Same conversion rule as before: unparseable input turns into NaN
        try:
            return abs(float(value))
        except (TypeError, ValueError):
            return np.nan

    # One pass over the column instead of a per-row .loc write: faster, correct
    # with duplicated index labels, and leaves a real float column behind
    # rather than an object column holding floats.
    frame[column] = frame[column].map(_abs_float).astype(float)
    return frame
# Clean the bodyweight column and the three lift columns, then discard every
# row that is left with a missing value.
for numeric_column in ('BodyweightKg', 'BestSquatKg', 'BestDeadliftKg', 'BestBenchKg'):
    fixing_format(numeric_column)

df.dropna(inplace=True)

# Absolute values are taken during cleaning because some entries were
# recorded as the negative of the real value.


In [20]:
# Outliers detected during visualisation are removed here.
# The lift columns are selected by name, matching the masks below.
squat, deadlift, bench = df['BestSquatKg'], df['BestDeadliftKg'], df['BestBenchKg']

# Limits for each lift: one (population) standard deviation either side of the mean
squat_upper_lim, squat_lower_lim = np.mean(squat) + np.std(squat), np.mean(squat) - np.std(squat)
deadlift_upper_lim, deadlift_lower_lim = np.mean(deadlift) + np.std(deadlift), np.mean(deadlift) - np.std(deadlift)
bench_upper_lim, bench_lower_lim = np.mean(bench) + np.std(bench), np.mean(bench) - np.std(bench)

# A row is an outlier when the bench press sits beyond one limit while the
# squat or deadlift sits beyond the opposite limit (high bench with a low
# squat/deadlift, or low bench with a high squat/deadlift).
inconsistent_lifts = (
    ((bench >= bench_upper_lim) & (squat <= squat_lower_lim))
    | ((bench <= bench_lower_lim) & (squat >= squat_upper_lim))
    | ((bench >= bench_upper_lim) & (deadlift <= deadlift_lower_lim))
    | ((bench <= bench_lower_lim) & (deadlift >= deadlift_upper_lim))
)
df.drop(df[inconsistent_lifts].index, inplace=True)

# Keep an independent snapshot for the later models; a plain assignment would
# only alias `df`, so any later in-place change would alter it too.
df_clean = df.copy()
df.to_csv('clean_pl_data.csv', index=False)

In [21]:
# The final column (bench press) is the target; every other column is a feature
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [22]:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [26]:
# Fit a linear regression on the scaled training features
Reg_All_Features = LinearRegression()
Reg_All_Features.fit(X_train_scaled, y_train)

# Save the fitted model. Only the regressor is stored, not the scaler, so
# inputs must be scaled the same way before this artifact is used to predict.
joblib.dump(Reg_All_Features, 'Reg_All_Features.joblib')

# Spot-check one lifter: Sex=0 (F), Equipment=0 (Raw), age 19, 53 kg bodyweight,
# 95 kg squat, 110 kg deadlift. The sample is wrapped in a DataFrame carrying the
# training column names because the scaler was fitted on a DataFrame; a bare
# list would drop the feature names.
sample = pd.DataFrame([[0, 0, 19, 53, 95, 110]], columns=X_train.columns)
Reg_All_Features.predict(scaler.transform(sample))



array([45.94063513])

The models are scored in 'evaluating_models.ipynb'. <br>
Now only the sex, squat and deadlift features will be used.

In [28]:
# Start again from the cleaned frame stored earlier and keep only the columns
# needed here: sex, squat and deadlift as features, bench press as the target.
df = df_clean[['Sex', 'BestSquatKg', 'BestDeadliftKg', 'BestBenchKg']]
df.head()

Unnamed: 0,Sex,BestSquatKg,BestDeadliftKg,BestBenchKg
0,1.0,205.0,235.0,125.0
1,1.0,220.0,260.0,157.5
2,1.0,142.5,220.0,145.0
3,0.0,95.0,102.5,60.0
4,0.0,105.0,127.5,60.0


In [29]:
# Bench press (the last column) is the target; the remaining columns are features
y = df['BestBenchKg']
X = df.drop(columns='BestBenchKg')

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)
X_train

Unnamed: 0,Sex,BestSquatKg,BestDeadliftKg
17619,0.0,180.0,187.5
11670,1.0,185.0,202.5
13152,1.0,217.5,240.0
6456,1.0,255.0,270.0
11227,1.0,170.0,210.0
...,...,...,...
9488,0.0,220.0,247.5
7381,1.0,307.5,317.5
17932,1.0,190.0,225.0
7383,1.0,245.0,245.0


In [30]:
# Bundle scaling and regression into one pipeline so both are fitted together
from sklearn.pipeline import Pipeline

pipeline_steps = [
    ('scale', StandardScaler()),
    ('regressor', LinearRegression()),
]
Pipe = Pipeline(pipeline_steps)
Pipe.fit(X_train, y_train)

# Second name for the same fitted pipeline
Reg_Squat_Dead = Pipe

In [31]:
# Write the fitted pipeline (scaler and regressor together) to disk
joblib.dump(value=Pipe, filename='Reg_Squat_Dead.joblib')


['Reg_Squat_Dead.joblib']

Now adding bodyweight feature and using KNN Regression

In [33]:
from sklearn.neighbors import KNeighborsRegressor

# Features: sex, bodyweight, squat and deadlift; target: bench press
df = df_clean[['Sex', 'BodyweightKg', 'BestSquatKg', 'BestDeadliftKg', 'BestBenchKg']]
y = df['BestBenchKg']
X = df.drop(columns='BestBenchKg')

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Scale the features, then fit a KNN regressor with its default settings
knn_steps = [
    ('scale', StandardScaler()),
    ('regressor', KNeighborsRegressor()),
]
Pipe = Pipeline(knn_steps)
Pipe.fit(X_train, y_train)

# Write the fitted pipeline to disk
joblib.dump(Pipe, 'KNN_BW_Squat_Dead.joblib')

['KNN_BW_Squat_Dead.joblib']

Now sex, age, squat and deadlift with Random Forest Regression

In [34]:
from sklearn.ensemble import RandomForestRegressor

# Features: sex, age, squat and deadlift; target: bench press (last column)
df = df_clean
df = df.iloc[:, [0, 2, 4, 5, 6]]
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Scale the features, then fit a random forest. The forest is seeded so the
# saved model is the same on every run; without a seed each fit differs.
# The step is named 'classifier' although it holds a regressor; the name is
# kept so the step keys of the saved pipeline do not change.
Pipe = Pipeline([
    ('scale', StandardScaler()),
    ('classifier', RandomForestRegressor(random_state=10))
])

Pipe.fit(X_train, y_train)

# Write the fitted pipeline to disk
joblib.dump(Pipe, 'Forest_Age_Squat_Dead.joblib')

['Forest_Age_Squat_Dead.joblib']

Now sex, equipment, squat and deadlift with Random Forest Regression

In [35]:
# Features: sex, equipment, squat and deadlift; target: bench press (last column)
df = df_clean
df = df.iloc[:, [0, 1, 4, 5, 6]]
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Scale the features, then fit a random forest. The forest is seeded so the
# saved model is the same on every run; without a seed each fit differs.
# The step is named 'classifier' although it holds a regressor; the name is
# kept so the step keys of the saved pipeline do not change.
Pipe = Pipeline([
    ('scale', StandardScaler()),
    ('classifier', RandomForestRegressor(random_state=10))
])

Pipe.fit(X_train, y_train)

# Write the fitted pipeline to disk
joblib.dump(Pipe, 'Equipment_Squat_Dead.joblib')

['Equipment_Squat_Dead.joblib']