In [1]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
from sklearn.exceptions import ConvergenceWarning
import warnings
import gc
import psutil
import os
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingRegressor

In [2]:
# Read the salaries table from the local datasets folder into a DataFrame.
salaries_csv = './datasets/salaries.csv'
data = pd.read_csv(salaries_csv)

# Quick-look helpers, left disabled; uncomment to inspect the raw frame.
# print(data.head())
# print(data.info())
# print(data.describe())

In [3]:
# Handling missing values (if any): rows containing any NaN are dropped.
data = data.dropna()

# Columns that are one-hot encoded vs. standardised by the preprocessor below.
categorical_features = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]
numerical_features = ['remote_ratio']

# Shared preprocessing step reused by every model pipeline:
#   * 'num' -> StandardScaler on the numerical column(s)
#   * 'cat' -> OneHotEncoder; categories unseen during fit are ignored at
#              transform time instead of raising (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    # Always return a dense array. The one-hot block is sparse, and the
    # downstream PCA step only accepts sparse input on some scikit-learn
    # versions; with error_score=np.nan in the grid search such a failure
    # would only surface as NaN scores.
    sparse_threshold=0,
)

In [4]:
# Build the feature matrix / target vector and hold out 20% of rows for testing.
target_column = 'salary_in_usd'
# The target itself plus the other salary columns and work_year are excluded
# from the features (presumably the salary columns would leak the target - confirm).
excluded_columns = ['salary_in_usd', 'salary', 'salary_currency', 'work_year']

y = data[target_column]
X = data.drop(columns=excluded_columns)

# Fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Seed for the stochastic regressors; same value as the train/test split so
# repeated runs of the notebook produce the same scores.
RANDOM_STATE = 42

# Define the models
# Keys are display names; they must match the keys of `param_grids` below.
models = {
    'Ridge Regression': Ridge(),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE)
}

# Candidate PCA dimensionalities, searched for every model. Defined once so the
# three grids cannot drift apart.
pca_n_components_grid = [5, 10, 15, 25, 50, 60, 75, 80, 100, 150, 250]

# Define the parameter grids
# Parameter names use the `<step>__<param>` form, so they target the 'pca' and
# 'model' steps of the pipeline built in the grid-search cell. Only the PCA
# size is actually searched; model hyper-parameters are pinned to one value
# (GridSearchCV still requires each entry to be a list).
param_grids = {
    'Ridge Regression': {
        'pca__n_components': list(pca_n_components_grid),
        'model__alpha': [10.0],  # Fixed value in a list
        'model__fit_intercept': [True]  # Fixed value in a list
    },
    'Random Forest': {
        'pca__n_components': list(pca_n_components_grid),
        'model__n_estimators': [100],  # Fixed value in a list
        'model__max_depth': [None],  # Fixed value in a list
        'model__min_samples_split': [10],  # Fixed value in a list
        'model__min_samples_leaf': [1]  # Fixed value in a list
    },
    'Gradient Boosting': {
        'pca__n_components': list(pca_n_components_grid),
        'model__n_estimators': [300],  # Fixed value in a list
        'model__learning_rate': [0.1],  # Fixed value in a list
        'model__max_depth': [4],  # Fixed value in a list
        'model__min_samples_split': [10],  # Fixed value in a list
        'model__min_samples_leaf': [4]  # Fixed value in a list
    }
}

In [6]:
# Perform grid search for each model
# Every candidate is the same three-step pipeline (preprocessor -> PCA -> regressor);
# only the final regressor and its parameter grid change between iterations.
# The refit best pipeline of each search is stored in `best_estimators` under the
# model's display name.
best_estimators = {}
for model_name, model in models.items():
    print(f"Performing grid search for {model_name}...")
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        # Seeded: PCA's default 'auto' solver may choose a randomized solver,
        # which would otherwise make components (and CV scores) vary per run.
        ('pca', PCA(random_state=42)),
        ('model', model),
    ])
    # 5-fold CV scored by R2 on all cores. error_score=np.nan means a parameter
    # combination whose fit raises is scored NaN instead of aborting the search
    # (e.g. presumably a pca__n_components value the data cannot support).
    grid_search = GridSearchCV(
        pipeline,
        param_grids[model_name],
        cv=5,
        scoring='r2',
        n_jobs=-1,
        error_score=np.nan,
    )
    # Silence ConvergenceWarning while fitting; the filter is restored on exit.
    # NOTE(review): with n_jobs=-1 the fits may run in worker processes that do
    # not inherit this filter - confirm if warnings still appear.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        grid_search.fit(X_train, y_train)
    best_estimators[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best R2 score for {model_name}: {grid_search.best_score_}")

Performing grid search for Ridge Regression...
Best parameters for Ridge Regression: {'model__alpha': 10.0, 'model__fit_intercept': True, 'pca__n_components': 250}
Best R2 score for Ridge Regression: 0.3257201948911634
Performing grid search for Random Forest...
Best parameters for Random Forest: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 10, 'model__n_estimators': 100, 'pca__n_components': 250}
Best R2 score for Random Forest: 0.3213057343300286
Performing grid search for Gradient Boosting...
Best parameters for Gradient Boosting: {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'model__n_estimators': 300, 'pca__n_components': 250}
Best R2 score for Gradient Boosting: 0.32649778349849035


In [7]:
# Score every tuned pipeline on the held-out test split.
# `results` maps model display name -> {'MSE': ..., 'R2': ...}.
results = {}
for name, fitted_pipeline in best_estimators.items():
    predictions = fitted_pipeline.predict(X_test)
    results[name] = {
        'MSE': mean_squared_error(y_test, predictions),
        'R2': r2_score(y_test, predictions),
    }

# Report one line per model.
for name, scores in results.items():
    print(f"{name}: MSE = {scores['MSE']}, R2 = {scores['R2']}")

Ridge Regression: MSE = 3307123815.078545, R2 = 0.30837390596193404
Random Forest: MSE = 3267224061.83025, R2 = 0.31671822931819316
Gradient Boosting: MSE = 3235964800.156232, R2 = 0.32325554762346553


In [8]:
# Base learners for the stack: short alias -> tuned pipeline from the grid search.
# Each entry is a complete pipeline (preprocessor, PCA, regressor), so the
# stacking model can be fed the raw feature frame directly.
base_model_names = (
    ('ridge', 'Ridge Regression'),
    ('rf', 'Random Forest'),
    ('gbr', 'Gradient Boosting'),
)
estimators = [(alias, best_estimators[display_name]) for alias, display_name in base_model_names]

# Meta-learner: a Ridge regression combines the base learners' predictions.
meta_learner = Ridge(alpha=1.0)
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=meta_learner)

In [9]:
# Fit the stacking model
# Trains on the training split only. The base estimators are the full
# pipelines stored in `best_estimators`, so the raw feature frame is passed
# in and each pipeline applies its own preprocessing and PCA.
# Left as a bare expression so the notebook displays the fitted estimator.
stacking_regressor.fit(X_train, y_train)

In [10]:
# Make predictions
# Predict on the held-out test split with the fitted stacking model.
# Note: this overwrites the `y_pred` left behind by the per-model evaluation loop.
y_pred = stacking_regressor.predict(X_test)

# Evaluate the model
# Only R2 is reported here (the per-model evaluation also reports MSE);
# `r2` likewise replaces the value left by the earlier loop.
r2 = r2_score(y_test, y_pred)
print(f'Stacking Model R2 Score: {r2}')

Stacking Model R2 Score: 0.3250222113882367


In [11]:
# # Visualize the results
# results_df = pd.DataFrame(results).T
# results_df.plot(kind='bar', figsize=(10, 5))
# plt.title('Model Performance After Grid Search')
# plt.ylabel('Score')
# plt.show()

In [12]:
# # Predicting salaries for new data
# new_data = pd.DataFrame({
#     'experience_level': ['SE'],
#     'employment_type': ['FT'],
#     'job_title': ['Data Scientist'],
#     'employee_residence': ['US'],
#     'remote_ratio': [1],
#     'company_location': ['US'],
#     'company_size': ['M']
# })
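# # NOTE: `pipeline` is only the unfitted template passed to GridSearchCV (which fits clones),
# # so predict with a fitted model such as the stacking regressor instead.
# predicted_salary = stacking_regressor.predict(new_data)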
# print(f"Predicted Salary in USD: {predicted_salary[0]}")