In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [10]:
df = pd.read_csv("salaries_cyber_model.csv")
df.head()

Unnamed: 0,experience_level,salary,remote_ratio,company_size
0,EN,63000,50,S
1,MI,95000,0,M
2,MI,70000,0,M
3,EX,315000,100,L
4,SE,220000,100,M


In [11]:
df = df.join( pd.get_dummies(df["experience_level"]) )
df.drop("experience_level", axis=1, inplace=True)

df = df.join( pd.get_dummies(df["remote_ratio"], prefix="remote_") )
df.drop("remote_ratio", axis=1, inplace=True)

df = df.join( pd.get_dummies(df["company_size"], prefix="company_size_") )
df.drop("company_size", axis=1, inplace=True)

df.head()

Unnamed: 0,salary,EN,EX,MI,SE,remote__0,remote__50,remote__100,company_size__L,company_size__M,company_size__S
0,63000,1,0,0,0,0,1,0,0,0,1
1,95000,0,0,1,0,1,0,0,0,1,0
2,70000,0,0,1,0,1,0,0,0,1,0
3,315000,0,1,0,0,0,0,1,1,0,0
4,220000,0,0,0,1,0,0,1,0,1,0


# Test and Training Split

In [12]:
y = df.loc[:, "salary"]
y

0       63000
1       95000
2       70000
3      315000
4      220000
        ...  
867    170000
868    135000
869    100000
870    126000
871     66310
Name: salary, Length: 872, dtype: int64

In [13]:
X = df.drop("salary", axis=1)
X.head()

Unnamed: 0,EN,EX,MI,SE,remote__0,remote__50,remote__100,company_size__L,company_size__M,company_size__S
0,1,0,0,0,0,1,0,0,0,1
1,0,0,1,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,0,1,0
3,0,1,0,0,0,0,1,1,0,0
4,0,0,0,1,0,0,1,0,1,0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123, 
)

# Scaling

In [16]:
from sklearn.preprocessing import StandardScaler

In [22]:
y_train = y_train.values.reshape(-1, 1)

In [23]:
scaler = StandardScaler()
y_train = scaler.fit_transform( y_train )
y_train

array([[ 1.10497974e-01],
       [ 3.65799096e-01],
       [ 1.04159618e+00],
       [-8.47322963e-02],
       [ 9.68009391e-01],
       [ 6.66153358e-01],
       [ 6.54448345e-02],
       [ 4.72093589e+00],
       [-7.27181258e-02],
       [ 9.66507619e-01],
       [-9.40741942e-01],
       [-5.65299115e-01],
       [ 1.17759659e+00],
       [-5.65299115e-01],
       [-7.22985102e-01],
       [ 1.07163161e+00],
       [ 1.26836365e+00],
       [-6.85440819e-01],
       [-8.92685260e-01],
       [ 6.54448345e-02],
       [-6.49398308e-01],
       [-9.10706516e-01],
       [-5.35263689e-01],
       [ 6.66153358e-01],
       [-1.03084822e+00],
       [-1.04121044e+00],
       [ 2.75692818e-01],
       [-4.69185751e-01],
       [-8.47623317e-02],
       [ 5.91064792e-01],
       [ 6.66153358e-01],
       [ 3.54094083e-02],
       [-6.85440819e-01],
       [-6.85440819e-01],
       [ 2.15621965e-01],
       [ 8.53874771e-01],
       [ 1.40533400e-01],
       [-6.10352254e-01],
       [-2.3

In [24]:
y_test = scaler.transform( y_test.values.reshape(-1, 1) )

# Test and evaluation split

In [25]:
# Creating evaluation data from 20% of the original data

X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.25
)

# Modeling

## Linear Regression as Baseline

In [53]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X_train, y_train)
reg.score(X_eval, y_eval)

0.25629983319237126

In [54]:
from sklearn.metrics import mean_squared_error 

y_pred = reg.predict(X_eval)
mean_squared_error(y_eval, y_pred, squared=True)

0.5495469056275158

## Polynomial Features

In [63]:
from sklearn.preprocessing import PolynomialFeatures

for degree in range(2, 14):
    # Fit and transform data
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly_features.fit_transform(X_train)

    X_eval_transformed = poly_features.transform(X_eval)

    # Fit model on transformed data
    lin_reg = LinearRegression()
    lin_reg.fit(X_poly, y_train)

    # Evaluate model
    y_pred = lin_reg.predict(X_eval_transformed)
    mse = mean_squared_error(y_eval, y_pred, squared=True)

    print(f"Degree of Polynom: {degree}, MSE: {mse}")

Degree of Polynom: 2, MSE: 0.5230797690476652
Degree of Polynom: 3, MSE: 7.125106268439511e+23
Degree of Polynom: 4, MSE: 1.1839745199603552e+21
Degree of Polynom: 5, MSE: 8.33614699513701e+23
Degree of Polynom: 6, MSE: 9.66173375619291e+23
Degree of Polynom: 7, MSE: 2.2002800458947877e+21
Degree of Polynom: 8, MSE: 4.127987068271465e+23
Degree of Polynom: 9, MSE: 5.597359721861278e+22
Degree of Polynom: 10, MSE: 3.6995732238219204e+23
Degree of Polynom: 11, MSE: 3.426312095858346e+21
Degree of Polynom: 12, MSE: 5.570721129063014e+24
Degree of Polynom: 13, MSE: 1.236805811365592e+24


## Ridge Regression

In [56]:
from sklearn.linear_model import Ridge, RidgeCV

#ridge_reg = Ridge( alpha=0.1, solver="cholesky" )
#ridge_reg.fit(X_train, y_train)
ridge_reg = RidgeCV(alphas=(0.0001, 0.001, 0.01, 0.1, 1, 2, 3)).fit(X_train, y_train)

y_pred = ridge_reg.predict( X_eval )
mean_squared_error(y_eval, y_pred, squared=True)

0.5464907972269446

In [57]:
ridge_reg.alpha_

1.0

In [58]:
ridge_reg.best_score_

-0.7152058747843033

## Lasso Regression

In [59]:
from sklearn.linear_model import LassoCV

reg = LassoCV(cv=5).fit(X_train, y_train)

y_pred = reg.predict( X_eval )
mean_squared_error(y_eval, y_pred, squared=True)

  y = column_or_1d(y, warn=True)


0.5431297492568821

## Deep Learning

In [82]:
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

model = Sequential([
    Dense(64, input_dim=10, activation="relu"),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(1)
])

model.compile(loss="mean_absolute_error", optimizer=Adam(learning_rate=0.001), metrics="RootMeanSquaredError")

history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    validation_data=(X_eval, y_eval),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [67]:
X_train

Unnamed: 0,EN,EX,MI,SE,remote__0,remote__50,remote__100,company_size__L,company_size__M,company_size__S
693,0,1,0,0,0,0,1,1,0,0
265,0,0,0,1,0,0,1,0,1,0
128,0,0,1,0,0,0,1,0,1,0
586,1,0,0,0,0,1,0,0,0,1
29,0,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
826,0,0,0,1,0,1,0,1,0,0
413,0,0,0,1,0,0,1,1,0,0
76,0,0,1,0,0,0,1,1,0,0
408,0,0,0,1,0,0,1,1,0,0


In [80]:
# Entry level, no remote work, large company
scaler.inverse_transform( model.predict([[1, 0, 0, 0, 1, 0, 0, 1, 0, 0]]) )



array([[80799.33]], dtype=float32)

In [81]:
# Entry level, no remote work, medium company
scaler.inverse_transform( model.predict([[1, 0, 0, 0, 1, 0, 0, 0, 1, 0]]) )



array([[63688.516]], dtype=float32)

In [83]:
!pip install -q -U keras-tuner

In [86]:
import keras_tuner as kt

def model_builder(hp):
    model = Sequential([
        Dense(64, input_dim=10, activation="relu"),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1)
    ])

    hp_learning_rate = hp.Choice("learning_rate", values=[0.1, 0.01, 0.001, 0.00001])
    model.compile(loss="mean_absolute_error", optimizer=Adam(learning_rate=0.001), metrics="RootMeanSquaredError")

    return model


tuner = kt.Hyperband(
    model_builder,
    objective=kt.Objective("val_root_mean_squared_error", direction="min"),
    max_epochs=30,
    factor=3,
    directory="hp_tuning",
    project_name="cyber_salary_estimator"
)

tuner.search(
    X_train, 
    y_train, 
    epochs=50,
    validation_data=(X_eval, y_eval),
)

best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
best_learning_rate = best_hps["learning_rate"]

print(f"Best learning rate: {best_learning_rate}")

Trial 90 Complete [00h 00m 02s]
val_root_mean_squared_error: 0.7250751256942749

Best val_root_mean_squared_error So Far: 0.7175182700157166
Total elapsed time: 00h 01m 46s
INFO:tensorflow:Oracle triggered exit
Best learning rate: 0.01


In [90]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=30, validation_data=(X_eval, y_eval))

val_rmse_per_epoch = history.history["val_root_mean_squared_error"]
best_epoch = val_rmse_per_epoch.index(min(val_rmse_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Best epoch: 5


In [91]:
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(X_train, y_train, epochs=best_epoch, validation_data=(X_eval, y_eval))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2900f2e7a00>

In [92]:
eval_result = hypermodel.evaluate(X_test, y_test)



In [94]:
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.5349109172821045, 0.7101514339447021]


In [95]:
hypermodel.save("./models/hypermodel")

INFO:tensorflow:Assets written to: ./models/hypermodel\assets
