INTENTION: <br>
- Find the R^2 score of every model.
- Find the best parameters for the highest-scoring model.
- Rescore that model with the best parameters.

In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the five saved estimator objects from .joblib files in the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
Reg_All_Features = joblib.load('Reg_All_Features.joblib')
Reg_Squat_Dead = joblib.load('Reg_Squat_Dead.joblib')
KNN_BW_Squat_Dead = joblib.load('KNN_BW_Squat_Dead.joblib')
Forest_Age_Squat_Dead = joblib.load('Forest_Age_Squat_Dead.joblib')
Equipment_Squat_Dead = joblib.load('Equipment_Squat_Dead.joblib')


In [6]:
# Load the dataset from the working directory.
df = pd.read_csv('clean_pl_data.csv')

# The last column (bench) is the target; every other column is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [7]:
# Maps the selection key accepted by scoring() to the loaded estimator it evaluates.
# The trailing comments list the columns scoring() fits each estimator on.
model_dict = {
    '1' : Reg_All_Features,  # every column of X, standardized first
    '2' : Reg_Squat_Dead,  # Sex, BestSquatKg, BestDeadliftKg
    '3' : KNN_BW_Squat_Dead,  # Sex, BodyweightKg, BestSquatKg, BestDeadliftKg
    '4' : Forest_Age_Squat_Dead,  # Sex, Age, BestSquatKg, BestDeadliftKg
    '5' : Equipment_Squat_Dead  # Sex, Equipment, BestSquatKg, BestDeadliftKg
}




def scoring(model_selection, X_train, y_train, X_test, y_test):
    """Refit one of the loaded models on the training split and score it on the test split.

    The estimator is taken from ``model_dict`` and ``fit`` is called on that
    same object, so the entry stored in ``model_dict`` is refit in place.

    Parameters
    ----------
    model_selection : str
        Key into ``model_dict``. ``'1'`` fits on every column after
        standardization; ``'2'`` to ``'5'`` fit on a fixed subset of columns.
    X_train : pandas.DataFrame
        Training features. For keys ``'2'`` to ``'5'`` it is indexed by column
        name, so it must contain the columns of the selected subset.
    y_train : array-like
        Training target.
    X_test : pandas.DataFrame or array-like
        Test features. For keys ``'2'`` to ``'5'`` it is wrapped in a DataFrame
        carrying the six feature names before the subset is selected.
    y_test : array-like
        Test target.

    Returns
    -------
    float
        Whatever ``model.score`` returns for the test split.

    Raises
    ------
    KeyError
        If ``model_selection`` is not a key of ``model_dict``.
    ValueError
        If ``model_selection`` is in ``model_dict`` but no feature set is
        defined for it here (previously this case silently returned ``None``).
    """
    all_features = ['Sex', 'Equipment', 'Age', 'BodyweightKg', 'BestSquatKg', 'BestDeadliftKg']
    # Columns each subset model is trained and scored on.
    feature_subsets = {
        '2': ['Sex', 'BestSquatKg', 'BestDeadliftKg'],
        '3': ['Sex', 'BodyweightKg', 'BestSquatKg', 'BestDeadliftKg'],
        '4': ['Sex', 'Age', 'BestSquatKg', 'BestDeadliftKg'],
        '5': ['Sex', 'Equipment', 'BestSquatKg', 'BestDeadliftKg'],
    }

    model = model_dict[model_selection]

    if model_selection == '1':
        # Local scaler, deliberately separate from any notebook-level one, so
        # calling this function never changes shared preprocessing state.
        feature_scaler = StandardScaler()
        X_train_scaled = feature_scaler.fit_transform(X_train)
        X_test_scaled = feature_scaler.transform(X_test)
        model.fit(X_train_scaled, y_train)
        return model.score(X_test_scaled, y_test)

    if model_selection not in feature_subsets:
        raise ValueError(
            f"No feature set is defined for model_selection={model_selection!r}"
        )

    columns = feature_subsets[model_selection]
    # Attaching the feature names lets X_test be selected by column just like X_train.
    X_test_df = pd.DataFrame(X_test, columns=all_features)
    model.fit(X_train[columns], y_train)
    return model.score(X_test_df[columns], y_test)
    

# Score every model in key order; each call receives its own copies of the split.
scores = []
for model_key in ['1', '2', '3', '4', '5']:
    test_score = scoring(model_key, X_train.copy(), y_train.copy(), X_test.copy(), y_test.copy())
    scores.append(test_score)
print(scores)

[0.8700204564862428, 0.8595701356060687, 0.8482675087326711, 0.8543654438596274, 0.8441590689561489]


The first model scored the highest in this case; the cell below finds the best-scoring model in the general case.

In [8]:
import numpy as np
# np.argmax gives the 0-based position of the top score; model_dict keys are the
# 1-based positions written as strings, hence the +1 and the str().
top_position = np.argmax(scores)
best_model_index = str(top_position + 1)
best_model = model_dict[best_model_index]

In [9]:
#LinearRegression for ALL features Evaluation
from sklearn.model_selection import GridSearchCV

# Candidate settings tried for the best-scoring model.
# NOTE(review): these parameter names belong to LinearRegression; if best_model
# ever resolves to a different estimator type the search is expected to reject
# the names it does not recognise -- confirm before reusing this cell.
# NOTE(review): per the scikit-learn docs, copy_X=False allows the estimator to
# overwrite the array it is given.
param_grid = {
    'copy_X' : [False],
    'tol' : [1e-10,1e-8,1e-7, 1e-5],
    'n_jobs' : [1,2],
    'positive' : [True]
}

# 3-fold cross-validated search over the grid, run on the scaled training split.
grid = GridSearchCV(best_model, param_grid, cv=3)
grid.fit(X_train_scaled, y_train)

# A bare expression in the middle of a cell is not displayed, so the winning
# parameters are printed explicitly.
print(grid.best_params_)

# Score the refit best estimator on the held-out scaled test split.
new_score = grid.score(X_test_scaled, y_test)
print(new_score)

0.8700204564862427


Cross Validation

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Scale inside the pipeline so each fold's scaler is fitted on that fold's
# training rows only. Scaling all of X up front would let the validation rows
# influence the preprocessing, and refitting the notebook-level `scaler` here
# would silently change state that earlier cells rely on.
cv_pipeline = make_pipeline(StandardScaler(), best_model)

# cross_val_score clones the pipeline for every fold, so best_model itself is
# left untouched.
scores_cv = cross_val_score(cv_pipeline, X, y, cv=5) #5 folds
print(scores_cv)
print(np.mean(scores_cv))


[0.86686895 0.87503508 0.86906515 0.86864903 0.87153531]
0.8702307023571434
