In [19]:
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import (
    KFold,
    cross_val_score,
)
from sklearn.metrics import (
    make_scorer, 
    mean_squared_error
)
from salary_prediction.config.config import (
    TRAINED_LINEAR_REGRESSOR,
    TRAINED_RIDGE_REGRESSOR,
    VALIDATION_TRAINING_DATA,
)

Loading the trained model and validation data

In [20]:
def load_trained_model_and_validation_data():
    """Read both persisted regressors and the training split from disk.

    The models are loaded from ``TRAINED_LINEAR_REGRESSOR`` and
    ``TRAINED_RIDGE_REGRESSOR``; the data comes from ``X_train.pkl`` and
    ``y_train.pkl`` inside the ``VALIDATION_TRAINING_DATA`` directory.

    Returns:
        A 4-tuple ``(linear_regression_model, ridge_regression_model,
        X_train, y_train)``, in that order.
    """
    # Load order matches the return order: linear model, ridge model, X, y.
    models = [
        joblib.load(model_path)
        for model_path in (TRAINED_LINEAR_REGRESSOR, TRAINED_RIDGE_REGRESSOR)
    ]
    splits = [
        joblib.load(os.path.join(VALIDATION_TRAINING_DATA, file_name))
        for file_name in ("X_train.pkl", "y_train.pkl")
    ]

    return (*models, *splits)

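Creating a custom scorer based on mean squared error (MSE). Because a lower MSE indicates better model performance, `greater_is_better=False` tells scikit-learn to negate the score, so the values reported below are negative MSE (closer to zero is better).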

In [21]:
# MSE is a loss, so it is registered with greater_is_better=False; the
# resulting scorer reports negated MSE values.
custom_scorer = make_scorer(
    mean_squared_error,
    greater_is_better=False,
)

Explicit k-fold cross-validation setup (5 shuffled folds with a fixed random seed)

In [22]:
# Five shuffled folds; the fixed random_state keeps the fold assignment
# repeatable across runs.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

Cross-validating each model: a copy of the model is refit on four folds and scored on the remaining held-out fold, once per fold

In [23]:
def _fold_label(position):
    """Return the column label for the fold at 1-based ``position`` (e.g. ``"1st fold"``)."""
    # 11, 12 and 13 take "th" even though they end in 1, 2 and 3.
    if 10 <= position % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(position % 10, "th")
    return f"{position}{suffix} fold"


def model_validation(model, X_train, y_train, model_name):
    """Cross-validate ``model`` and return its per-fold scores as a one-row table.

    Scoring uses the module-level ``cv`` splitter and ``custom_scorer``.
    Because ``custom_scorer`` is built with ``greater_is_better=False``, the
    reported values are negated MSE: closer to zero is better.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X_train: Feature matrix used for cross-validation.
        y_train: Target values aligned with ``X_train``.
        model_name: Label stored in the ``"Model"`` column of the result.

    Returns:
        A single-row ``pandas.DataFrame`` whose first column is ``"Model"``
        followed by one column per fold (``"1st fold"``, ``"2nd fold"``, ...).
    """
    mse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=custom_scorer)

    # Derive the labels from the number of scores actually returned so the
    # table stays valid if the splitter's fold count is changed from 5.
    fold_columns = [_fold_label(position) for position in range(1, len(mse_scores) + 1)]
    results = pd.DataFrame(data=[mse_scores], columns=fold_columns)

    results.insert(0, "Model", model_name)

    return results

In [24]:
# Unpack the loader's 4-tuple: the two models first, then the X/y training split.
(
    linear_regression_model,
    ridge_regression_model,
    X_train,
    y_train,
) = load_trained_model_and_validation_data()

Validation results for both Linear and Ridge Regression

In [25]:
# Score each model on the same data; every call yields a one-row results table.
linear_regressor_model_validation = model_validation(
    model=linear_regression_model,
    X_train=X_train,
    y_train=y_train,
    model_name="Linear Regressor",
)
ridge_regressor_model_validation = model_validation(
    model=ridge_regression_model,
    X_train=X_train,
    y_train=y_train,
    model_name="Ridge Regressor",
)

Combining the results

In [26]:
# Stack the per-model rows into one comparison table with a fresh 0..n-1 index.
combined_validation = pd.concat(
    objs=[
        linear_regressor_model_validation,
        ridge_regressor_model_validation,
    ],
    axis=0,
    ignore_index=True,
)
combined_validation

Unnamed: 0,Model,1st fold,2nd fold,3rd fold,4th fold,5th fold
0,Linear Regressor,-13998820.0,-24620270.0,-25566350.0,-51531960.0,-45102590.0
1,Ridge Regressor,-14053190.0,-24662720.0,-25576980.0,-51504970.0,-44967050.0


In [27]:
# NOTE(review): the CSV export below is commented out and points at an absolute,
# machine-specific path; switch to a configurable location before re-enabling.
# combined_validation.to_csv("/Users/josephobukofe/salary_prediction/models/model_validation_report.csv", index=False)