In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Show floats with two decimal places in every pandas display below.
pd.set_option("display.float_format", "{:.2f}".format)

### Load in Data

In [2]:
# Read the salary dataset and discard the "Unnamed: 0", "salary" and
# "salary_currency" columns; "salary_in_usd" stays in the frame.
df = pd.read_csv("./data/ds_salaries.csv").drop(
    columns=["Unnamed: 0", "salary", "salary_currency"]
)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,US,50,US,L


### Preprocessing

In [3]:
def preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the ordered categorical columns with integer ranks.

    The encoding is applied to a copy, so the frame passed in is never
    modified and the cell calling this function can be re-run safely.

    Args:
        df (pd.DataFrame): frame containing the columns `experience_level`,
            `employment_type`, `remote_ratio` and `company_size`

    Returns:
        pd.DataFrame: a new frame in which those four columns are recoded;
            values that have no entry in a mapping are passed through unchanged

    Raises:
        KeyError: if one of the four columns is missing from `df`
    """
    # Replace categorical data with ordinal values
    ordinal_codes = {
        'experience_level': {'EN': 1, 'MI': 2, 'SE': 3, 'EX': 4},
        'employment_type': {'FL': 0, 'PT': 1, 'FT': 2, 'CT': 3},
        'remote_ratio': {100: 3, 50: 2, 0: 1},
        'company_size': {'S': 1, 'M': 2, 'L': 3},
    }

    encoded = df.copy()
    for column, codes in ordinal_codes.items():
        # An element-wise lookup (rather than Series.replace) yields an integer
        # column without relying on replace's implicit object->int downcasting,
        # which newer pandas releases deprecate. The `.get(value, value)`
        # fallback keeps replace's "leave unknown values alone" behaviour.
        encoded[column] = encoded[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )

    return encoded

In [4]:
# Recode experience_level, employment_type, remote_ratio and company_size
# as ordinal integers (see preprocessing above).
df = preprocessing(df)

In [5]:
# One-hot encode the columns that are still non-numeric; the output below shows
# the resulting job_title_* and company_location_* indicator columns.
df = pd.get_dummies(df)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,salary_in_usd,remote_ratio,company_size,job_title_3D Computer Vision Researcher,job_title_AI Scientist,job_title_Analytics Engineer,job_title_Applied Data Scientist,...,company_location_PL,company_location_PT,company_location_RO,company_location_RU,company_location_SG,company_location_SI,company_location_TR,company_location_UA,company_location_US,company_location_VN
0,2020,2,2,79833,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020,3,2,260000,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020,3,2,109024,2,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020,2,2,20000,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020,3,2,150000,2,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# Separate the target from the features and rescale every feature column
# with MinMaxScaler (default range [0, 1]).
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split is made downstream.
y = df["salary_in_usd"]
X = MinMaxScaler().fit_transform(df.drop(columns=["salary_in_usd"]))

print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (607, 162), y shape: (607,)


### Get the Best Model

In [7]:
def regression_results(X: pd.DataFrame, y: pd.Series, random_state: int = 10):
    """
    Fit several regressors on one train/test split and compare their test errors.

    Ridge, Lasso, ElasticNet, k-nearest-neighbours and gradient boosting are
    each fitted with default hyper-parameters on 80% of the rows and scored on
    the remaining 20%.

    Args:
        X (pd.DataFrame): input variables
        y (pd.Series): target
        random_state (int): seed used for the train/test split and for the
            gradient boosting model, so repeated calls return the same table.
            Defaults to 10, the seed the split has always used.

    Returns:
        pd.DataFrame: one row per model (indexed by model name), sorted by
            `R_square` from best to worst, with the columns `R_square`,
            `Mean_Squared_Error` and `Mean_Absolute_Error`. Note that
            `Mean_Squared_Error` holds the *root* mean squared error; the
            column name is kept so existing consumers of the table still work.
    """
    # Name and estimator are kept together so the two can never fall out of step.
    regressors = {
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "ElasticNet": ElasticNet(),
        "KNN": KNeighborsRegressor(),
        # Seeded: gradient boosting is stochastic unless random_state is set.
        "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=random_state),
    }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    rows = []
    for regressor in regressors.values():
        y_pred = regressor.fit(X_train, y_train).predict(X_test)
        rows.append({
            "R_square": r2_score(y_test, y_pred),
            # Square root of the MSE, i.e. RMSE, in the units of the target.
            "Mean_Squared_Error": mean_squared_error(y_test, y_pred) ** 0.5,
            "Mean_Absolute_Error": mean_absolute_error(y_test, y_pred),
        })

    res = pd.DataFrame(
        rows,
        index=list(regressors),
        columns=["R_square", "Mean_Squared_Error", "Mean_Absolute_Error"],
    )

    return res.sort_values("R_square", ascending=False)

In [8]:
# Score every candidate regressor on a held-out 20% split; rows come back
# sorted by R_square, best first.
regression_results(X, y)

  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,R_square,Mean_Squared_Error,Mean_Absolute_Error
Lasso,0.43,69248.18,40853.77
Ridge,0.4,70723.32,40610.72
Gradient Boosting Regressor,0.38,72264.62,42369.44
KNN,0.32,75695.05,44840.85
ElasticNet,0.19,82397.1,51744.77
