In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("salaries_cyber_model.csv")
df.head()

Unnamed: 0,experience_level,salary,remote_ratio,company_size
0,MI,145000,0,M
1,MI,115000,0,M
2,SE,160000,0,M
3,SE,120000,0,M
4,SE,104000,100,M


In [3]:
df = df.join( pd.get_dummies(df["experience_level"]) )
df = df.join( pd.get_dummies(df["company_size"], prefix="company_size_") )


In [4]:
df.head()

Unnamed: 0,experience_level,salary,remote_ratio,company_size,EN,EX,MI,SE,company_size__L,company_size__M,company_size__S
0,MI,145000,0,M,0,0,1,0,0,1,0
1,MI,115000,0,M,0,0,1,0,0,1,0
2,SE,160000,0,M,0,0,0,1,0,1,0
3,SE,120000,0,M,0,0,0,1,0,1,0
4,SE,104000,100,M,0,0,0,1,0,1,0


In [5]:
df.drop("experience_level", axis=1, inplace=True)
df.drop("remote_ratio", axis=1, inplace=True)
df.drop("company_size", axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,salary,EN,EX,MI,SE,company_size__L,company_size__M,company_size__S
0,145000,0,0,1,0,0,1,0
1,115000,0,0,1,0,0,1,0
2,160000,0,0,0,1,0,1,0
3,120000,0,0,0,1,0,1,0
4,104000,0,0,0,1,0,1,0


# Test and Training Split

In [7]:
from sklearn.model_selection import train_test_split

y = df.loc[:, "salary"]
X = df.drop("salary", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123, 
)

# Validation split

In [8]:
# Creating evaluation data from 20% of the original data

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25
)

# Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

y_train = scaler.fit_transform( y_train.values.reshape(-1, 1) )
y_test = scaler.transform( y_test.values.reshape(-1, 1) )
y_val = scaler.transform( y_val.values.reshape(-1, 1) )

y_train

array([[ 6.50904781e-01],
       [ 1.65571593e-01],
       [-7.23317029e-01],
       [ 1.13446019e+00],
       [ 7.37304755e-01],
       [ 2.51223755e+00],
       [ 5.60238141e-01],
       [ 6.98904766e-01],
       [-1.48739424e-01],
       [-8.88650312e-01],
       [-1.14642801e+00],
       [-1.43442793e+00],
       [ 1.07223799e+00],
       [-7.94428119e-01],
       [ 4.58904838e-01],
       [ 7.87793628e-01],
       [-9.18872526e-01],
       [-8.83316981e-01],
       [ 4.32238180e-01],
       [-3.61539360e-01],
       [-1.36509461e+00],
       [ 2.61001530e+00],
       [ 1.65571593e-01],
       [ 6.10015904e-01],
       [ 4.11271860e-02],
       [-3.67761580e-01],
       [-2.78872718e-01],
       [ 1.44238266e-01],
       [-3.12863819e-01],
       [-4.74428215e-01],
       [-1.22061313e-02],
       [ 7.66827309e-02],
       [-1.34553906e+00],
       [ 1.05446021e+00],
       [ 2.01127138e-01],
       [-9.54428070e-01],
       [ 1.70193814e-01],
       [ 4.32238180e-01],
       [ 7.3

# Modeling

## Linear Regression as Baseline

In [10]:
X_train

Unnamed: 0,EN,EX,MI,SE,company_size__L,company_size__M,company_size__S
505,0,0,0,1,0,1,0
404,0,0,0,1,0,1,0
519,0,0,0,1,0,1,0
532,0,0,0,1,0,1,0
690,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...
1079,0,0,0,1,0,1,0
533,0,0,0,1,0,1,0
1032,0,0,0,1,0,1,0
255,0,0,0,1,1,0,0


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error 

reg = LinearRegression().fit(X_train, y_train)

y_pred = reg.predict(X_val)
reg.score(X_val, y_val), mean_squared_error(y_val, y_pred, squared=True)

(0.2595564992699553, 0.8947775459777232)

In [12]:
reg.coef_

array([[-0.9764411 ,  1.09633347, -0.4636781 ,  0.34378573,  0.10465922,
         0.14280637, -0.24746559]])

In [13]:
import pickle

f = open("./models/linear_regression.sav", "wb")
pickle.dump(reg, f)
f.close()

f = open("./models/standard_scalar.sav", "wb")
pickle.dump(scaler, f)
f.close()

In [14]:
y_pred = reg.predict( [[1, 0, 0, 0, 1, 0, 0]] )
salary = scaler.inverse_transform( y_pred )
salary



array([[82742.67538319]])

## Polynomial Features

In [16]:
from sklearn.preprocessing import PolynomialFeatures

for degree in range(2, 14):
    # Fit and transform data
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly_features.fit_transform(X_train)

    X_eval_transformed = poly_features.transform(X_val)

    # Fit model on transformed data
    lin_reg = LinearRegression()
    lin_reg.fit(X_poly, y_train)

    # Evaluate model
    y_pred = lin_reg.predict(X_eval_transformed)
    mse = mean_squared_error(y_val, y_pred, squared=True)

    print(f"Degree of Polynom: {degree}, MSE: {mse}")

Degree of Polynom: 2, MSE: 1.9153213298064725e+24
Degree of Polynom: 3, MSE: 2.275621128108648e+23
Degree of Polynom: 4, MSE: 3.790398082612074e+23
Degree of Polynom: 5, MSE: 2.529575872534353e+23
Degree of Polynom: 6, MSE: 2.051823167507792e+22
Degree of Polynom: 7, MSE: 1.7239496433969805e+22
Degree of Polynom: 8, MSE: 1.028057708491777e+22
Degree of Polynom: 9, MSE: 7.5041840697662144e+22
Degree of Polynom: 10, MSE: 2.4534244431428453e+22
Degree of Polynom: 11, MSE: 4.428496624996814e+22
Degree of Polynom: 12, MSE: 4.9283350402952354e+22
Degree of Polynom: 13, MSE: 4.167370094855184e+22


## Ridge Regression

In [17]:
from sklearn.linear_model import Ridge, RidgeCV

#ridge_reg = Ridge( alpha=0.1, solver="cholesky" )
#ridge_reg.fit(X_train, y_train)
ridge_reg = RidgeCV(alphas=(0.0001, 0.001, 0.01, 0.1, 1, 2, 3)).fit(X_train, y_train)

y_pred = ridge_reg.predict( X_val )
mean_squared_error(y_val, y_pred, squared=True)

0.8958543356682788

In [18]:
ridge_reg.alpha_

1.0

In [19]:
ridge_reg.best_score_

-0.702665004494572

## Lasso Regression

In [20]:
from sklearn.linear_model import LassoCV

reg = LassoCV(cv=5).fit(X_train, y_train)

y_pred = reg.predict( X_val )
mean_squared_error(y_val, y_pred, squared=True)

  y = column_or_1d(y, warn=True)


0.8957741506899999

## Deep Learning

In [21]:
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError

model = Sequential([
    Dense(32, input_dim=7, activation="relu"),
    Dense(16, activation="relu"),
    Dense(1)
])

model.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=0.01))

history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    validation_data=(X_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
# Entry level, large company
scaler.inverse_transform( model.predict([[1, 0, 0, 0, 1, 0, 0]]) )



array([[68944.36]], dtype=float32)

In [23]:
!pip install -q -U keras-tuner

In [24]:
import keras_tuner as kt

def model_builder(hp):
    model = Sequential([
        Dense(32, input_dim=7, activation="relu"),
        Dense(16, activation="relu"),
        Dense(1)
    ])

    hp_learning_rate = hp.Choice("learning_rate", values=[0.1, 0.01, 0.001, 0.00001])
    model.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=hp_learning_rate))

    return model


tuner = kt.Hyperband(
    model_builder,
    objective=kt.Objective("val_loss", direction="min"),
    max_epochs=30,
    factor=3,
    directory="hp_tuning",
    project_name="cyber_salary_estimator"
)

tuner.search(
    X_train, 
    y_train, 
    epochs=50,
    validation_data=(X_val, y_val),
)

best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
best_learning_rate = best_hps["learning_rate"]

print(f"Best learning rate: {best_learning_rate}")

INFO:tensorflow:Reloading Tuner from hp_tuning\cyber_salary_estimator\tuner0.json
INFO:tensorflow:Oracle triggered exit
Best learning rate: 0.01


In [28]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=30, validation_data=(X_val, y_val))

val_rmse_per_epoch = history.history["val_loss"]
best_epoch = val_rmse_per_epoch.index(min(val_rmse_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Best epoch: 11


In [29]:
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(X_train, y_train, epochs=best_epoch, validation_data=(X_val, y_val))

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.callbacks.History at 0x1c018f4a850>

In [30]:
eval_result = hypermodel.evaluate(X_test, y_test)



In [31]:
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: 3.9427740573883057


In [32]:
hypermodel.save("./models/hypermodel")

INFO:tensorflow:Assets written to: ./models/hypermodel\assets


# Example Pipeline

In [33]:
import pickle

f = open("./models/linear_regression.sav", "rb")
model = pickle.load(f)
f.close()

f = open("./models/standard_scalar.sav", "rb")
standard_scaler = pickle.load(f)
f.close()

# Predicting salary for junior professional working at a large size company
y_pred = model.predict( [[1, 0, 0, 0, 1, 0, 0]] )
salary = standard_scaler.inverse_transform( y_pred )
salary



array([[82742.67538319]])