In [13]:
# Salary Benchmarking for Competitive Compensation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the provided dataset.
# A single path variable feeds read_excel, so the file that is named is the
# file that is actually read (previously the variable held a different path
# and was never used).
# NOTE(review): this is an absolute, machine-specific location - adjust it
# for the environment the notebook runs in.
input_file_path = "D:/E/Study Materials/Salary Benchmarking/salary_benchmarking_dataset.xlsx"
df = pd.read_excel(input_file_path)

df


Unnamed: 0,Role,Location,Education_Level,Experience_Years,Salary_Range
0,Data Scientist,Remote,High School,6,Medium
1,Product Manager,New York,High School,19,Medium
2,HR Manager,Austin,Bachelor's,14,High
3,Data Scientist,Seattle,High School,10,Low
4,Marketing Specialist,San Francisco,Master's,7,Medium
...,...,...,...,...,...
995,Product Manager,Austin,High School,7,Medium
996,HR Manager,Austin,PhD,5,Medium
997,Data Scientist,New York,Master's,5,High
998,Software Engineer,Remote,PhD,7,High


In [15]:

# Map 'Salary_Range' to numerical values
def map_salary_range(salary):
    """Translate a salary band label into its ordinal code.

    'Low' -> 0, 'Medium' -> 1, 'High' -> 2.

    Raises:
        KeyError: if ``salary`` is not one of the three known labels.
    """
    ordinal_by_label = dict(zip(('Low', 'Medium', 'High'), range(3)))
    return ordinal_by_label[salary]

# Ordinal-encode the salary band; this adds a column to df in place
df['Salary_Range_Numeric'] = df['Salary_Range'].map(map_salary_range)

# One-hot encode the categorical columns (drop_first removes one level per column)
df_encoded = pd.get_dummies(
    df,
    columns=['Role', 'Location', 'Education_Level'],
    drop_first=True,
)

# Features are every remaining column except the two salary columns;
# the target is the ordinal salary code.
# NOTE(review): if the workbook carries a saved index column (e.g. 'Unnamed: 0'),
# it stays in X as a feature here - confirm that is intended.
X = df_encoded.drop(columns=['Salary_Range', 'Salary_Range_Numeric'])
y = df['Salary_Range_Numeric']

# Hold out 20% of the rows for testing; seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded random forest regressor with otherwise default settings
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)



In [16]:
# Score the fitted regressor on the test split
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Headline metrics, two decimals each (one line per metric)
print(
    f"Mean Squared Error: {mse:.2f}\n"
    f"R-squared: {r2:.2f}"
)

# Generate a regression report
def regression_report(y_true, y_pred):
    """Build a plain-text summary of regression error metrics.

    Both inputs are converted to NumPy float arrays before any arithmetic so
    that every metric compares values position by position. Without this, the
    subtraction used for MAE would align on index labels when both inputs are
    pandas Series (and would fail for plain lists), while the scikit-learn
    metrics compare positionally.

    Args:
        y_true: Observed target values (array-like, e.g. list, ndarray, Series).
        y_pred: Predicted target values, same length and order as ``y_true``.

    Returns:
        A multi-line string with MAE, MSE, RMSE and R-squared, each formatted
        to two decimals.
    """
    # Coerce once so all four metrics see the same positional pairing.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    errors = y_true - y_pred
    mae = np.mean(np.abs(errors))
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    report = (
        f"Regression Report:\n"
        f"Mean Absolute Error: {mae:.2f}\n"
        f"Mean Squared Error: {mse:.2f}\n"
        f"Root Mean Squared Error: {rmse:.2f}\n"
        f"R-squared: {r2:.2f}\n"
    )
    return report

# Render the full metric summary for the test split
print(regression_report(y_true=y_test, y_pred=y_pred))


Mean Squared Error: 0.60
R-squared: -0.37
Regression Report:
Mean Absolute Error: 0.61
Mean Squared Error: 0.60
Root Mean Squared Error: 0.78
R-squared: -0.37

