In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [72]:
df = pd.read_csv("salaries_cyber_model.csv")
df.head()

Unnamed: 0,experience_level,salary,remote_ratio,company_size
0,EN,63000,50,S
1,MI,95000,0,M
2,MI,70000,0,M
3,EX,315000,100,L
4,SE,220000,100,M


In [73]:
df = df.join( pd.get_dummies(df["experience_level"]) )
df = df.join( pd.get_dummies(df["company_size"], prefix="company_size_") )


In [74]:
df.head()

Unnamed: 0,experience_level,salary,remote_ratio,company_size,EN,EX,MI,SE,company_size__L,company_size__M,company_size__S
0,EN,63000,50,S,1,0,0,0,0,0,1
1,MI,95000,0,M,0,0,1,0,0,1,0
2,MI,70000,0,M,0,0,1,0,0,1,0
3,EX,315000,100,L,0,1,0,0,1,0,0
4,SE,220000,100,M,0,0,0,1,0,1,0


In [75]:
df.drop("experience_level", axis=1, inplace=True)
df.drop("remote_ratio", axis=1, inplace=True)
df.drop("company_size", axis=1, inplace=True)

In [76]:
df.head()

Unnamed: 0,salary,EN,EX,MI,SE,company_size__L,company_size__M,company_size__S
0,63000,1,0,0,0,0,0,1
1,95000,0,0,1,0,0,1,0
2,70000,0,0,1,0,0,1,0
3,315000,0,1,0,0,1,0,0
4,220000,0,0,0,1,0,1,0


# Test and Training Split

In [77]:
from sklearn.model_selection import train_test_split

y = df.loc[:, "salary"]
X = df.drop("salary", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123, 
)

# Validation split

In [78]:
# Creating evaluation data from 20% of the original data

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25
)

# Scaling

In [79]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

y_train = scaler.fit_transform( y_train.values.reshape(-1, 1) )
y_test = scaler.transform( y_test.values.reshape(-1, 1) )
y_val = scaler.transform( y_val.values.reshape(-1, 1) )

y_train

array([[-0.83126109],
       [-1.05251071],
       [-0.49201168],
       [ 0.02423743],
       [ 0.12748725],
       [ 0.42838673],
       [-0.16751224],
       [-0.44776175],
       [ 0.14223723],
       [-0.72801127],
       [ 1.52873483],
       [-0.68376135],
       [-1.27376033],
       [ 1.01248572],
       [-0.49938667],
       [-0.75751122],
       [ 0.87973595],
       [ 2.4137333 ],
       [ 0.20123712],
       [ 2.78248266],
       [ 0.42838673],
       [ 0.7912361 ],
       [-0.68376135],
       [-0.45661174],
       [ 0.93873585],
       [ 0.58591646],
       [ 0.7912361 ],
       [-0.02001249],
       [-0.66134138],
       [ 0.05373738],
       [-0.56576155],
       [-0.5377366 ],
       [-0.09376237],
       [-0.16751224],
       [ 0.48148664],
       [ 0.20123712],
       [-0.72063628],
       [ 0.0979873 ],
       [-0.16751224],
       [-0.66901137],
       [ 0.56998649],
       [-0.09376237],
       [-0.94926089],
       [ 0.20123712],
       [-0.44776175],
       [-1

# Modeling

## Linear Regression as Baseline

In [80]:
X_train

Unnamed: 0,EN,EX,MI,SE,company_size__L,company_size__M,company_size__S
747,1,0,0,0,1,0,0
857,1,0,0,0,0,0,1
109,0,0,0,1,1,0,0
526,0,0,0,1,1,0,0
65,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...
198,0,0,0,1,0,1,0
537,0,0,1,0,1,0,0
439,0,0,1,0,0,0,1
14,1,0,0,0,1,0,0


In [81]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error 

reg = LinearRegression().fit(X_train, y_train)

y_pred = reg.predict(X_val)
reg.score(X_val, y_val), mean_squared_error(y_val, y_pred, squared=True)

(0.32633875671605717, 0.45498201355913126)

In [97]:
reg.coef_

array([[-0.98157661,  1.24608955, -0.48704373,  0.2225308 ,  0.10616191,
         0.08739461, -0.19355652]])

In [99]:
import pickle

f = open("./models/linear_regression.sav", "wb")
pickle.dump(reg, f)
f.close()

f = open("./models/standard_scalar.sav", "wb")
pickle.dump(scaler, f)
f.close()

In [84]:
y_pred = reg.predict( [[1, 0, 0, 0, 1, 0, 0]] )
salary = scaler.inverse_transform( y_pred )
salary



array([[79424.78706306]])

## Polynomial Features

In [31]:
from sklearn.preprocessing import PolynomialFeatures

for degree in range(2, 14):
    # Fit and transform data
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly_features.fit_transform(X_train)

    X_eval_transformed = poly_features.transform(X_eval)

    # Fit model on transformed data
    lin_reg = LinearRegression()
    lin_reg.fit(X_poly, y_train)

    # Evaluate model
    y_pred = lin_reg.predict(X_eval_transformed)
    mse = mean_squared_error(y_eval, y_pred, squared=True)

    print(f"Degree of Polynom: {degree}, MSE: {mse}")

Degree of Polynom: 2, MSE: 5.943031453024743e+24
Degree of Polynom: 3, MSE: 1.5594982921145765e+25
Degree of Polynom: 4, MSE: 7.084791095222776e+24
Degree of Polynom: 5, MSE: 4.671071151531031e+25
Degree of Polynom: 6, MSE: 3.764742750040559e+24
Degree of Polynom: 7, MSE: 5.06456494484265e+24
Degree of Polynom: 8, MSE: 3.7834808525459363e+24
Degree of Polynom: 9, MSE: 1.0800809237243628e+23
Degree of Polynom: 10, MSE: 4.259187570987126e+25
Degree of Polynom: 11, MSE: 4.6827767354813393e+24
Degree of Polynom: 12, MSE: 1.9120787775552895e+23
Degree of Polynom: 13, MSE: 3.0643497139890384e+24


## Ridge Regression

In [33]:
from sklearn.linear_model import Ridge, RidgeCV

#ridge_reg = Ridge( alpha=0.1, solver="cholesky" )
#ridge_reg.fit(X_train, y_train)
ridge_reg = RidgeCV(alphas=(0.0001, 0.001, 0.01, 0.1, 1, 2, 3)).fit(X_train, y_train)

y_pred = ridge_reg.predict( X_eval )
mean_squared_error(y_eval, y_pred, squared=True)

0.9101521905927887

In [34]:
ridge_reg.alpha_

1.0

In [112]:
ridge_reg.best_score_

-0.6035605016572188

## Lasso Regression

In [35]:
from sklearn.linear_model import LassoCV

reg = LassoCV(cv=5).fit(X_train, y_train)

y_pred = reg.predict( X_eval )
mean_squared_error(y_eval, y_pred, squared=True)

  y = column_or_1d(y, warn=True)


0.9086181604992466

## Deep Learning

In [93]:
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError

model = Sequential([
    Dense(32, input_dim=7, activation="relu"),
    Dense(16, activation="relu"),
    Dense(1)
])

model.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=0.01))

history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    validation_data=(X_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
X_train

Unnamed: 0,EN,EX,MI,SE,company_size__L,company_size__M,company_size__S
533,0,0,1,0,1,0,0
651,0,0,1,0,0,1,0
40,0,0,0,1,0,1,0
768,0,0,1,0,1,0,0
870,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...
453,1,0,0,0,0,0,1
699,0,1,0,0,1,0,0
562,0,0,0,1,1,0,0
63,0,0,0,1,0,1,0


In [42]:
# Entry level, large company
scaler.inverse_transform( model.predict([[1, 0, 0, 0, 1, 0, 0]]) )



array([[66583.055]], dtype=float32)

In [43]:
!pip install -q -U keras-tuner

In [95]:
import keras_tuner as kt

def model_builder(hp):
    model = Sequential([
        Dense(32, input_dim=7, activation="relu"),
        Dense(16, activation="relu"),
        Dense(1)
    ])

    hp_learning_rate = hp.Choice("learning_rate", values=[0.1, 0.01, 0.001, 0.00001])
    model.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=hp_learning_rate))

    return model


tuner = kt.Hyperband(
    model_builder,
    objective=kt.Objective("val_loss", direction="min"),
    max_epochs=30,
    factor=3,
    directory="hp_tuning",
    project_name="cyber_salary_estimator"
)

tuner.search(
    X_train, 
    y_train, 
    epochs=50,
    validation_data=(X_val, y_val),
)

best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
best_learning_rate = best_hps["learning_rate"]

print(f"Best learning rate: {best_learning_rate}")

INFO:tensorflow:Reloading Tuner from hp_tuning\cyber_salary_estimator\tuner0.json
INFO:tensorflow:Oracle triggered exit
Best learning rate: 0.01


In [46]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=30, validation_data=(X_eval, y_eval))

val_rmse_per_epoch = history.history["val_root_mean_squared_error"]
best_epoch = val_rmse_per_epoch.index(min(val_rmse_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Best epoch: 19


In [47]:
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(X_train, y_train, epochs=best_epoch, validation_data=(X_eval, y_eval))

Epoch 1/19
Epoch 2/19
Epoch 3/19
Epoch 4/19
Epoch 5/19
Epoch 6/19
Epoch 7/19
Epoch 8/19
Epoch 9/19
Epoch 10/19
Epoch 11/19
Epoch 12/19
Epoch 13/19
Epoch 14/19
Epoch 15/19
Epoch 16/19
Epoch 17/19
Epoch 18/19
Epoch 19/19


<keras.callbacks.History at 0x1c0c7abe040>

In [122]:
eval_result = hypermodel.evaluate(X_test, y_test)



In [123]:
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.5406957864761353, 0.721299946308136]


In [124]:
hypermodel.save("./models/hypermodel")

INFO:tensorflow:Assets written to: ./models/hypermodel\assets


# Example Pipeline

In [100]:
import pickle

f = open("./models/linear_regression.sav", "rb")
model = pickle.load(f)
f.close()

f = open("./models/standard_scalar.sav", "rb")
standard_scaler = pickle.load(f)
f.close()

# Predicting salary for junior professional working at a large size company
y_pred = model.predict( [[1, 0, 0, 0, 1, 0, 0]] )
salary = standard_scaler.inverse_transform( y_pred )
salary



array([[79424.78706306]])