In [53]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVR
# from xgboost import XGBRegressor  # uncomment if xgboost installed

# Read the already encoded/scaled table produced by Steps 1-2 and take a
# quick look at its size and first rows.
PROCESSED_DATA_PATH = "data/processed_data.csv"

df = pd.read_csv(PROCESSED_DATA_PATH)
print(df.shape)
df.head(5)


(995, 9)


Unnamed: 0,SEX,AGE,UNIT,DESIGNATION,PAST EXP,RATINGS,years_experience,SALARY,TOTAL_EXPERIENCE
0,1,-0.714847,5,0,-1.627437,-0.676471,-1.598467,37350.0,-2.244495
1,0,-0.358498,2,0,0.844777,0.728725,1.060036,64750.0,1.351258
2,0,0.3542,2,3,-0.921091,-1.379069,-0.345903,56520.0,-0.818593
3,0,-0.091236,5,0,0.491603,-0.676471,-1.802968,43550.0,-1.169902
4,1,-1.071196,2,3,1.551124,1.431322,1.060036,80580.0,1.764563


In [54]:
# Separate the regression target from the predictors.
TARGET = "SALARY"
SEED = 42

y = df[TARGET]
X = df.drop(columns=[TARGET])

# Hold out 20% of the rows for the final comparison; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
)

# Candidate regressors, keyed by the label used in the result tables.
# Insertion order is the order in which they are trained and reported.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0, random_state=SEED),
    "Lasso": Lasso(alpha=0.001, random_state=SEED, max_iter=10000),
    "RandomForest": RandomForestRegressor(n_estimators=300, random_state=SEED),
    "GradientBoosting": GradientBoostingRegressor(random_state=SEED),
    "SVR": SVR(kernel="rbf", C=100, gamma="scale"),
    # "XGB": XGBRegressor(random_state=42)  # if available
}


In [55]:
def eval_regression(y_true, y_pred):
    """Score a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` where ``rmse`` is the square root of ``mse``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
    )

# One record per model; turned into a comparison table in the next cell.
results = []

for name, model in models.items():
    # Fit on the training split only, then score on the held-out rows.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mae, mse, rmse, r2 = eval_regression(y_test, y_pred)

    results.append(dict(Model=name, MAE=mae, MSE=mse, RMSE=rmse, R2=r2))
    print(f"{name}: R2={r2:.4f}, RMSE={rmse:.2f}, MAE={mae:.2f}")


LinearRegression: R2=0.6762, RMSE=9261.54, MAE=7113.37
Ridge: R2=0.6762, RMSE=9261.43, MAE=7114.02
Lasso: R2=0.6762, RMSE=9261.54, MAE=7113.37
RandomForest: R2=0.9726, RMSE=2691.78, MAE=1905.39
GradientBoosting: R2=0.9835, RMSE=2088.58, MAE=1414.57
SVR: R2=0.2889, RMSE=13724.84, MAE=9880.25


In [56]:
# Rank the models from best to worst hold-out R2; the original row index
# (training order) is kept so each model can be traced back to `results`.
results_table = pd.DataFrame(results)
results_df = results_table.sort_values("R2", ascending=False)
results_df


Unnamed: 0,Model,MAE,MSE,RMSE,R2
4,GradientBoosting,1414.567126,4362178.0,2088.582763,0.983533
3,RandomForest,1905.388526,7245705.0,2691.784706,0.972648
1,Ridge,7114.015266,85774130.0,9261.432507,0.676204
2,Lasso,7113.366785,85776170.0,9261.542523,0.676197
0,LinearRegression,7113.366776,85776170.0,9261.542786,0.676196
5,SVR,9880.246264,188371300.0,13724.842565,0.288902


In [57]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R2 for every candidate, computed on the full table.
# A failure in one model is reported and does not stop the others.
print("Cross-Validation Scores (R² with 5-fold):\n")
for name, model in models.items():
    try:
        scores = cross_val_score(estimator=model, X=X, y=y, scoring="r2", cv=5)
        mean_r2, std_r2 = scores.mean(), scores.std()
        print(f"{name:20} CV R²: mean = {mean_r2:.4f}, std = {std_r2:.4f}")
    except Exception as e:
        print(f"{name:20} Error during CV: {e}")


Cross-Validation Scores (R² with 5-fold):

LinearRegression     CV R²: mean = 0.6550, std = 0.0197
Ridge                CV R²: mean = 0.6550, std = 0.0197
Lasso                CV R²: mean = 0.6550, std = 0.0197
RandomForest         CV R²: mean = 0.9724, std = 0.0024
GradientBoosting     CV R²: mean = 0.9839, std = 0.0030
SVR                  CV R²: mean = 0.2911, std = 0.0345


In [58]:
# results_df is sorted by hold-out R2 (descending), so row 0 is the winner.
best_name = results_df.iloc[0]["Model"]

# Refit an unfitted CLONE on the full data before saving. Calling .fit() on
# models[best_name] directly would overwrite the estimator that was trained on
# the training split only, so any later scoring of `models` on X_test would be
# leaked (the model would already have seen those rows).
best_model = clone(models[best_name]).fit(X, y)

# Make sure the artifact directory exists so joblib.dump cannot fail on a
# fresh checkout.
MODEL_PATH = Path("pkl") / "salary_model.pkl"
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, MODEL_PATH)
print(f"Saved best model: {best_name} -> salary_model.pkl")


Saved best model: GradientBoosting -> salary_model.pkl


In [59]:
import pandas as pd
import joblib

# Reload the persisted estimator from disk rather than reusing the in-memory one.
model = joblib.load("pkl/salary_model.pkl")

# Raw attributes of the employee to score; encoding and scaling are applied
# later by preprocess_single.
employee_record = {
    "SEX": "Female",
    "UNIT": "Sales",
    "DESIGNATION": "Analyst",  # original note: "change always" - presumably edited per prediction
    "AGE": 27,
    "PAST EXP": 4,
    "RATINGS": 4,
    "years_experience": 5.0,
}
new_emp = pd.DataFrame([employee_record])



def preprocess_single(df, artifacts_dir="pkl"):
    """Turn raw employee rows into the feature frame used for prediction.

    Steps, in order:
      1. encode every column that has an entry in ``label_encoders.pkl``;
      2. derive ``TOTAL_EXPERIENCE = years_experience + PAST EXP``;
      3. transform the numeric columns with the object in ``scaler.pkl``;
      4. select and order the columns listed in ``feature_order.pkl``.

    The input frame is left untouched: all work happens on a copy, so the
    function can be called repeatedly on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain every column that has a stored encoder plus
        ``AGE``, ``PAST EXP``, ``RATINGS`` and ``years_experience``.
    artifacts_dir : str or path-like, default "pkl"
        Directory holding ``label_encoders.pkl``, ``scaler.pkl`` and
        ``feature_order.pkl``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns named in ``feature_order.pkl``, in that order.

    Raises
    ------
    KeyError
        If ``df`` lacks a column required by the encoders or the numeric steps.
    FileNotFoundError
        If one of the artifact files is missing from ``artifacts_dir``.
    """
    import pickle
    from pathlib import Path

    artifacts = Path(artifacts_dir)

    def _load(filename):
        # SECURITY: pickle.load can execute arbitrary code. Only point
        # artifacts_dir at files produced by this project's own pipeline.
        with open(artifacts / filename, "rb") as f:
            return pickle.load(f)

    le_dict = _load("label_encoders.pkl")
    scaler = _load("scaler.pkl")
    feature_order = _load("feature_order.pkl")

    raw_numeric = ["AGE", "PAST EXP", "RATINGS", "years_experience"]
    missing = [col for col in [*le_dict, *raw_numeric] if col not in df.columns]
    if missing:
        raise KeyError(f"Input is missing required column(s): {missing}")

    # Work on a copy so the caller's frame keeps its raw values.
    out = df.copy()

    for col, encoder in le_dict.items():
        out[col] = encoder.transform(out[col])

    # Derived feature is built from the raw values, i.e. before scaling.
    out["TOTAL_EXPERIENCE"] = out["years_experience"] + out["PAST EXP"]

    # NOTE(review): this list and its order are assumed to match the columns
    # the scaler was fitted on - confirm against the preprocessing notebook.
    num_cols = ["AGE", "PAST EXP", "RATINGS", "years_experience", "TOTAL_EXPERIENCE"]
    out[num_cols] = scaler.transform(out[num_cols])

    return out[feature_order]


# Encode/scale the raw record, then score it with the reloaded model.
X_new = preprocess_single(new_emp)
predictions = model.predict(X_new)
pred_salary = predictions[0]

print(f" Predicted Salary: ₹{pred_salary:,.2f}")


 Predicted Salary: ₹50,100.94
