In [57]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge,LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib

In [42]:
# Load the salary data, then separate the target column from the predictors.
salary = pd.read_csv("Salaries.csv")

y = salary["Salary"]
X = salary.drop(columns="Salary")

# Column groups: each group is routed to its own transformer below.
numerical_features = ["Age", "Years of Experience"]
categorical_features = ["Gender", "Education Level", "Job Title"]

# Numeric columns are rescaled with MinMaxScaler; categorical columns are
# one-hot encoded, with categories unseen at fit time ignored (not an error)
# when transforming new data.
numeric_step = ("num", MinMaxScaler(), numerical_features)
categorical_step = (
    "cate",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [43]:
# Pipeline: shared preprocessing -> polynomial feature expansion -> linear regression.
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("poly_deg", PolynomialFeatures()),
    ("Regressor", LinearRegression()),
])

# Search space for GridSearchCV: only the polynomial degree is tuned.
Grid = {"poly_deg__degree": [1, 2, 3]}

# Establish the CV scheme: 2 shuffled folds with a fixed seed.
CrossV = KFold(n_splits=2, shuffle=True, random_state=42)

# Build the grid search (scored by R^2) and fit it on the training split.
Linear_Reg_Model = GridSearchCV(
    estimator=pipe,
    param_grid=Grid,
    cv=CrossV,
    scoring="r2",
    verbose=1,
)
Linear_Reg_Model.fit(X_train, y_train)

# Report the winning hyper-parameters, one per line.
for param_name, param_value in Linear_Reg_Model.best_params_.items():
    print(f"{param_name} : {param_value}\n")

# Score the refit best pipeline on the held-out test split.
linear_test_score = Linear_Reg_Model.score(X_test, y_test)
print(f"Test Set Score = {linear_test_score:0.2f}")

Fitting 2 folds for each of 3 candidates, totalling 6 fits
poly_deg__degree : 1

Test Set Score = 0.85


Next, I will try XGBoost, hoping for a better test score.

In [50]:
# Pipeline: shared preprocessing -> gradient-boosted trees (XGBoost).
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("Regressor", XGBRegressor()),
])

# Search space for GridSearchCV: tree depth and number of boosting rounds.
Grid = {
    "Regressor__max_depth": [2, 3],
    "Regressor__n_estimators": [100, 150, 200, 250, 300],
}

# Establish the CV scheme: 5 shuffled folds with a fixed seed.
CrossV = KFold(n_splits=5, shuffle=True, random_state=42)

# Build the grid search (scored by R^2) and fit it on the training split.
XGB_Model = GridSearchCV(
    estimator=pipe,
    param_grid=Grid,
    cv=CrossV,
    scoring="r2",
    verbose=1,
)
XGB_Model.fit(X_train, y_train)

# Report the winning hyper-parameters, one per line.
for param_name, param_value in XGB_Model.best_params_.items():
    print(f"{param_name} : {param_value}\n")

# Score the refit best pipeline on the held-out test split.
xgb_test_score = XGB_Model.score(X_test, y_test)
print(f"Test Set Score = {xgb_test_score:0.2f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Regressor__max_depth : 3

Regressor__n_estimators : 300

Test Set Score = 0.92


- The XGBoost model achieved a better test R-squared score (0.92 vs. 0.85).
- That is an improvement of 0.07 in R-squared (7 percentage points) over the plain multiple linear regression model.

Trying a Random Forest as well, for the sake of practice.

In [53]:
# Pipeline: shared preprocessing -> random forest regressor.
# The forest is seeded so that the grid-search outcome is reproducible across
# runs (an unseeded forest can pick different "best" parameters each time).
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('Regressor', RandomForestRegressor(criterion='friedman_mse', random_state=42))
    ]
)

# Parameter grid for GridSearchCV: tree depth and forest size (5, 15, ..., 65).
Grid = {
    'Regressor__max_depth': [4, 5, 6],
    'Regressor__n_estimators': list(range(5, 70, 10))
}

# Establish the CV scheme: 5 shuffled folds with a fixed seed.
CrossV = KFold(n_splits=5, shuffle=True, random_state=42)

# Create and train the model.
# The search is stored under its own name, RF_Model. Binding it to XGB_Model
# would overwrite the fitted XGBoost search, and the later
# joblib.dump(XGB_Model, ...) would then persist this random forest instead
# of the XGBoost model.
RF_Model = GridSearchCV(
    estimator=pipe,
    param_grid=Grid,
    cv=CrossV,
    scoring='r2',
    verbose=1
)

RF_Model.fit(X_train, y_train)

# Report the winning hyper-parameters, one per line.
for key, value in RF_Model.best_params_.items():
    print(f"{key} : {value}\n")

# Score the refit best pipeline on the held-out test split.
print(f"Test Set Score = {RF_Model.score(X_test,y_test):0.2f}")

Fitting 5 folds for each of 21 candidates, totalling 105 fits
Regressor__max_depth : 6

Regressor__n_estimators : 25

Test Set Score = 0.87


- The best model on the test set is the XGBoost model, with an R-squared of 0.92.
- Save the model.

In [58]:
# Persist the fitted grid-search object to disk.
# NOTE(review): this saves whatever object is currently bound to XGB_Model;
# confirm no later cell has rebound that name before relying on the file.
joblib.dump(value=XGB_Model, filename="salary_model.joblib")

['salary_model.joblib']