### Import Modules and Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, KFold, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.neighbors import KNeighborsClassifier
from typing import Optional



In [2]:
df_ger = pd.read_csv("data/preprocessed_survey_results.csv")

## Multi-Layer-Perceptron

#### Divide Dataset into Train and Test Data

In [4]:
x = df_ger[['YearsCode', 'Age', 'OrgSize', 'in-person', 'remote']]

# x = df_reg[['YearsCode', 'OrgSize', 'Age', 'Bachelor’s degree','Doctoral degree', 'Master’s degree', 'Primary school','Professional degree', 'Secondary school', 'University courses','in-person', 'remote']]

# x = df_reg[['YearsCode', 'OrgSize', 'Age', 'Bachelor’s degree','Doctoral degree', 'Master’s degree', 'Primary school','Professional degree', 'Secondary school', 'University courses','in-person', 'remote', 'APL', 'Assembly', 'Bash/Shell', 'C', 'C#','C++', 'COBOL', 'Clojure', 'Crystal', 'Dart', 'Delphi', 'Elixir','Erlang', 'F#', 'Fortran', 'Go', 'Groovy', 'HTML/CSS', 'Haskell','Java', 'JavaScript', 'Julia', 'Kotlin', 'LISP', 'Lua', 'MATLAB','OCaml', 'Objective-C', 'PHP', 'Perl', 'PowerShell', 'Python', 'R','Ruby', 'Rust', 'SAS', 'SQL', 'Scala', 'Solidity', 'Swift','TypeScript', 'VBA']]

# x = df_reg[['YearsCode', 'OrgSize', 'Age', 'Bachelor’s degree','Doctoral degree', 'Master’s degree', 'Primary school','Professional degree', 'Secondary school', 'University courses','in-person', 'remote', 'APL', 'Assembly', 'Bash/Shell', 'C', 'C#','C++', 'COBOL', 'Clojure', 'Crystal', 'Dart', 'Delphi', 'Elixir','Erlang', 'F#', 'Fortran', 'Go', 'Groovy', 'HTML/CSS', 'Haskell','Java', 'JavaScript', 'Julia', 'Kotlin', 'LISP', 'Lua', 'MATLAB','OCaml', 'Objective-C', 'PHP', 'Perl', 'PowerShell', 'Python', 'R','Ruby', 'Rust', 'SAS', 'SQL', 'Scala', 'Solidity', 'Swift','TypeScript', 'VBA', 'Academic researcher', 'Blockchain','Cloud infrastructure engineer', 'Data or business analyst','Data scientist or machine learning specialist','Database administrator', 'Designer', 'DevOps specialist','Developer, QA or test', 'Developer, back-end','Developer, desktop or enterprise applications','Developer, embedded applications or devices', 'Developer, front-end','Developer, full-stack', 'Developer, game or graphics','Developer, mobile', 'Educator', 'Engineer, data','Engineer, site reliability', 'Engineering manager','Marketing or sales professional', 'Other (please specify):','Product manager', 'Project manager', 'Scientist','Security professional', 'Senior Executive (C-Suite, VP, etc.)','Student', 'System administrator']]

y = df_ger['Salary']

# scaler = MinMaxScaler(feature_range=(0,1))
# scaler.fit(x)
# x = pd.DataFrame(scaler.transform(x), index=x.index, columns=x.columns)

# normalizer = Normalizer()
# normalizer.fit(x)
# x = pd.DataFrame(normalizer.transform(x), index=x.index, columns=x.columns)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1000)
folds = KFold(n_splits=10, shuffle=True, random_state=100)

KeyError: "['in-person', 'remote'] not in index"

#### Layer Size Example 1

In [3]:
mlp_regressor = MLPRegressor(hidden_layer_sizes=(50, 20, 3), activation='logistic', learning_rate_init=1, max_iter=5000, solver='adam')

mlp_regressor.fit(x_train, y_train)
prediction_mlp = mlp_regressor.predict(x_test)

r2_scores, mae_scores, mape_scores, rmse_scores, max_scores = calc_scores([mlp_regressor], x_test, y_test)
print("Samples:", prediction_mlp[:5])
print("R2:     ", np.round(np.mean(r2_scores), 3))
print("MAE:    ", np.mean(mae_scores).astype(int))
print("MAPE:   ", np.round(np.mean(mape_scores), 3))
print("RMSE:   ", np.mean(rmse_scores).astype(int))
print("MAX:    ", np.max(max_scores).astype(int))

_, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
axs[0].scatter(x=y_test, y=prediction_mlp)
axs[0].axline((0, 0), slope=1, color="black", linestyle=(0, (5, 5)))
axs[0].axis('square')
axs[0].set(xlabel="actual salary", ylabel="predicted salary")
axs[1].set(xlabel="difference from actual salary", ylabel="count")
axs[1].hist((y_test-prediction_mlp), bins=30)
plt.show()

NameError: name 'x_train' is not defined

#### Layer Size Example 2

In [None]:
mlp_regressor = MLPRegressor(hidden_layer_sizes=(200, 200, 200, 200), activation='relu', alpha=0.05, learning_rate='adaptive', learning_rate_init=0.1, max_iter=5000, momentum=0.01, solver='adam')

mlp_regressor.fit(x_train, y_train)
prediction_mlp = mlp_regressor.predict(x_test)

r2_scores, mae_scores, mape_scores, rmse_scores, max_scores = calc_scores([mlp_regressor], x_test, y_test)
print("Samples:", prediction_mlp[:5])
print("R2:     ", np.round(np.mean(r2_scores), 3))
print("MAE:    ", np.mean(mae_scores).astype(int))
print("MAPE:   ", np.round(np.mean(mape_scores), 3))
print("RMSE:   ", np.mean(rmse_scores).astype(int))
print("MAX:    ", np.max(max_scores).astype(int))

_, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
axs[0].scatter(x=y_test, y=prediction_mlp)
axs[0].axline((0, 0), slope=1, color="black", linestyle=(0, (5, 5)))
axs[0].axis('square')
axs[0].set(xlabel="actual salary", ylabel="predicted salary")
axs[1].set(xlabel="difference from actual salary", ylabel="count")
axs[1].hist((y_test-prediction_mlp), bins=30)
plt.show()

#### Layer Size Example 3

In [None]:
mlp_regressor = MLPRegressor(hidden_layer_sizes=(200,), activation='logistic', alpha=0.05, learning_rate='adaptive', learning_rate_init=0.1, max_iter=5000, momentum=0.01, solver='adam')

mlp_regressor.fit(x_train, y_train)
prediction_mlp = mlp_regressor.predict(x_test)

r2_scores, mae_scores, mape_scores, rmse_scores, max_scores = calc_scores([mlp_regressor], x_test, y_test)
print("Samples:", prediction_mlp[:5])
print("R2:     ", np.round(np.mean(r2_scores), 3))
print("MAE:    ", np.mean(mae_scores).astype(int))
print("MAPE:   ", np.round(np.mean(mape_scores), 3))
print("RMSE:   ", np.mean(rmse_scores).astype(int))
print("MAX:    ", np.max(max_scores).astype(int))

_, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
axs[0].scatter(x=y_test, y=prediction_mlp)
axs[0].axline((0, 0), slope=1, color="black", linestyle=(0, (5, 5)))
axs[0].axis('square')
axs[0].set(xlabel="actual salary", ylabel="predicted salary")
axs[1].set(xlabel="difference from actual salary", ylabel="count")
axs[1].hist((y_test-prediction_mlp), bins=30)
plt.show()

In [None]:
# mlp_regressor = MLPRegressor(random_state=1, max_iter=2000, hidden_layer_sizes=(24, 24, 24))
# mlp_regressor = MLPRegressor(hidden_layer_sizes=(100, ),  activation='tanh', max_iter=5000, learning_rate_init=0.1, verbose=True, random_state=1) # somehow this works
# mlp_regressor = MLPRegressor(hidden_layer_sizes=(150,),  activation='logistic', max_iter=5000, learning_rate_init=0.1, verbose=True, momentum=0.3)
# mlp_regressor = MLPRegressor(hidden_layer_sizes=(100, 100, 100),  activation='tanh', max_iter=5000, learning_rate_init=0.1, verbose=True, random_state=1)
# mlp_regressor = MLPRegressor(hidden_layer_sizes=(24, 24, 24), activation='logistic', solver='sgd', alpha=1, learning_rate='constant', learning_rate_init=0.6, max_iter=40000, momentum=0.3)
# {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (150,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.1, 'max_iter': 5000, 'momentum': 0.1, 'solver': 'adam'}

In [None]:
# mlp_regressor = MLPRegressor()
# param_grid = {
#     'hidden_layer_sizes': [(250,), (225,), (200,), (175,)],
#     'max_iter': [5000],
#     'activation': ['logistic', 'relu'],
#     'solver': ['adam'],
#     'learning_rate_init': [0.1, 0.3, 0.6, 0.9],
#     'alpha': [0.0001, 0.05],
#     'learning_rate': ['adaptive'],
#     'momentum': [0.001, 0.01, 0.1,]
# }
#
# # param_grid = {
# #     'hidden_layer_sizes': [(150,), (100,), (50,), (20,)],
# #     'max_iter': [5000],
# #     'activation': ['tanh', 'logistic'],
# #     'solver': ['adam'],
# #     'learning_rate_init': [0.1],
# #     'alpha': [0.0001, 0.05, 0.1, 0.3],
# #     'learning_rate': ['constant','adaptive'],
# #     'momentum': [0.1, 0.3, 0.5]
# # }
#
# grid = GridSearchCV(mlp_regressor, param_grid, n_jobs= -1, cv=5)
# grid.fit(x_train, y_train)
#
# print(grid.best_params_)

In [None]:
print(mlp_regressor.coefs_)