In [1]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
from sklearn.exceptions import ConvergenceWarning
import warnings
import gc
import psutil
import os
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [2]:
# Load the dataset
data = pd.read_csv('./datasets/salaries.csv')

# Data exploration
# print(data.head())
# print(data.info())
# print(data.describe())

In [3]:
# Handling missing values (if any)
data = data.dropna()

# Convert categorical features to numerical using OneHotEncoder
categorical_features = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']
numerical_features = ['remote_ratio']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

In [4]:
# Splitting the data into train and test sets
X = data.drop(columns=['salary_in_usd', 'salary', 'salary_currency', 'work_year'])
y = data['salary_in_usd']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
pca__n_components = [250]

# Define advanced models
advanced_models = {
    'XGBoost': xgb.XGBRegressor(),
    'LightGBM': lgb.LGBMRegressor(),
    'CatBoost': CatBoostRegressor(verbose=0)
}

# Define the parameter grids for advanced models
advanced_param_grids = {
    'XGBoost': {
        'pca__n_components': pca__n_components,
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.05],
        'model__max_depth': [3, 4, 5]
    },
    'LightGBM': {
        'pca__n_components': pca__n_components,
        'model__n_estimators': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.05],
        'model__max_depth': [3, 4, 5]
    },
    'CatBoost': {
        'pca__n_components': pca__n_components,
        'model__iterations': [100, 200, 300],
        'model__learning_rate': [0.01, 0.1, 0.05],
        'model__depth': [3, 4, 5]
    }
}


In [6]:
# Perform grid search for each model
best_estimators = {}
for model_name, model in advanced_models.items():
    print(f"Performing grid search for {model_name}...")
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('pca', PCA()), ('model', model)])
    grid_search = GridSearchCV(pipeline, advanced_param_grids[model_name], cv=5, scoring='r2', n_jobs=-1, error_score=np.nan)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        grid_search.fit(X_train, y_train)
    best_estimators[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best R2 score for {model_name}: {grid_search.best_score_}")

Performing grid search for XGBoost...
Best parameters for XGBoost: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 300, 'pca__n_components': 250}
Best R2 score for XGBoost: 0.33101056814193724
Performing grid search for LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.190111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63560
[LightGBM] [Info] Number of data points in the train set: 10581, number of used features: 250
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.169051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63593
[LightGBM] [Info] Start training from score 150011.456951
[LightGBM] [Info] Number of data points in the train set: 10582, number of used features: 250
[LightGBM] [Info] Start training from score 149230.438008
[LightGBM] [Info] Auto-choosing col-wise 

In [7]:
# Evaluate the best estimators on the test set
results = {}
for model_name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {'MSE': mse, 'R2': r2}

# Print the results
for model_name, result in results.items():
    print(f"{model_name}: MSE = {result['MSE']}, R2 = {result['R2']}")

XGBoost: MSE = 3246744601.995817, R2 = 0.32100117206573486
LightGBM: MSE = 3244188796.5912447, R2 = 0.321535645119121
CatBoost: MSE = 3223845794.41123, R2 = 0.3257900220113987


In [8]:
# Define the base models
estimators = [
    ('xgboost', best_estimators['XGBoost']),
    ('lightgbm', best_estimators['LightGBM']),
    ('catboost', best_estimators['CatBoost'])
]

# Define the stacking model
stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(alpha=1.0)
)

In [9]:
# Fit the stacking model
stacking_regressor.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63571
[LightGBM] [Info] Number of data points in the train set: 13227, number of used features: 250
[LightGBM] [Info] Start training from score 149822.647615
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63551
[LightGBM] [Info] Number of data points in the train set: 10581, number of used features: 250
[LightGBM] [Info] Start training from score 150211.496834
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 63557
[LightGBM] [Info] Number of data points in the train set: 10581, number of used features: 250
[LightGBM] 

In [10]:
# Make predictions
y_pred = stacking_regressor.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
print(f'Stacking Model R2 Score: {r2}')

Stacking Model R2 Score: 0.3251304732572534


In [11]:
# # Visualize the results
# results_df = pd.DataFrame(results).T
# results_df.plot(kind='bar', figsize=(10, 5))
# plt.title('Model Performance After Grid Search')
# plt.ylabel('Score')
# plt.show()

In [12]:
# # Predicting salaries for new data
# new_data = pd.DataFrame({
#     'experience_level': ['SE'],
#     'employment_type': ['FT'],
#     'job_title': ['Data Scientist'],
#     'employee_residence': ['US'],
#     'remote_ratio': [1],
#     'company_location': ['US'],
#     'company_size': ['M']
# })
# predicted_salary = pipeline.predict(new_data)
# print(f"Predicted Salary in USD: {predicted_salary[0]}")