### Import Modules and Data

In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, KFold, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.neighbors import KNeighborsClassifier
from typing import Optional

In [9]:
# Load the preprocessed survey results; the data-prep cell selects its
# feature columns and the salary target from this frame.
# The path is relative to the directory the notebook kernel runs in.
survey_csv_path = "data/preprocessed_survey_results.csv"
df_ger = pd.read_csv(survey_csv_path)

## Regression

### Data Prep

In [10]:
# Columns used as model inputs and the column the regressors are fitted against.
FEATURE_COLUMNS = ['YearsCode', 'Age', 'OrgSize', 'in-person', 'remote']
TARGET_COLUMN = 'Salary'

# Fail fast with an actionable message when the CSV does not carry the expected
# columns. Without this check the selection below raises a bare KeyError and
# every later cell then fails with a NameError on x_train / x_test.
missing_columns = [
    column for column in FEATURE_COLUMNS + [TARGET_COLUMN]
    if column not in df_ger.columns
]
if missing_columns:
    raise KeyError(
        f"{missing_columns} not in df_ger; available columns: {list(df_ger.columns)}. "
        "Check that the preprocessing step wrote these columns to the CSV."
    )

x = df_ger[FEATURE_COLUMNS]

y = df_ger[TARGET_COLUMN]

# Optional feature scaling, currently disabled. Uncomment exactly one variant to
# rescale x before it is split.
# scaler = MinMaxScaler(feature_range=(0,1))
# scaler.fit(x)
# x = pd.DataFrame(scaler.transform(x), index=x.index, columns=x.columns)

# normalizer = Normalizer()
# normalizer.fit(x)
# x = pd.DataFrame(normalizer.transform(x), index=x.index, columns=x.columns)

# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1000)

# Shuffled 10-fold splitter (seeded) that the cross-validation cell passes as `cv`.
folds = KFold(n_splits=10, shuffle=True, random_state=100)

KeyError: "['in-person', 'remote'] not in index"

### Linear Regression

In [None]:
# Fit a plain linear regression on the training split and predict the held-out rows.
lm = LinearRegression().fit(x_train, y_train)
prediction_linear = lm.predict(x_test)

# NOTE(review): calc_scores is not defined in the cells visible here; it must be
# defined before this cell runs. It is called with a list of fitted estimators
# plus the test split and its result is unpacked into five score collections.
r2_scores, mae_scores, mape_scores, rmse_scores, max_scores = calc_scores([lm], x_test, y_test)

# One summary line per metric: R2 / MAPE rounded to 3 decimals, MAE / RMSE / MAX
# truncated to whole numbers.
for metric_label, metric_value in (
    ('R2:  ', np.round(np.mean(r2_scores), 3)),
    ('MAE: ', np.mean(mae_scores).astype(int)),
    ('MAPE:', np.round(np.mean(mape_scores), 3)),
    ('RMSE:', np.mean(rmse_scores).astype(int)),
    ('MAX: ', np.max(max_scores).astype(int)),
):
    print(metric_label, metric_value)

_, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
scatter_ax, residual_ax = axs

# Left panel: predicted vs. actual salary, with a dashed y = x reference line.
scatter_ax.scatter(x=y_test, y=prediction_linear)
scatter_ax.axline((0, 0), slope=1, color="black", linestyle=(0, (5, 5)))
scatter_ax.axis('square')
scatter_ax.set_xlabel("actual salary")
scatter_ax.set_ylabel("predicted salary")

# Right panel: histogram of (actual - predicted) salary.
residual_ax.set_xlabel("difference from actual salary")
residual_ax.set_ylabel("count")
residual_ax.hist(y_test - prediction_linear, bins=35)
plt.show()

NameError: name 'x_train' is not defined

In [None]:
# Cross-validate a linear regression on the training split and keep the fitted
# estimator from every split of `folds` (return_estimator=True).
lm_cv = LinearRegression()
cv_results = cross_validate(lm_cv, x_train, y_train, cv=folds, return_estimator=True)
estimators = cv_results['estimator']

# Every per-split estimator is scored against the same held-out test split.
# NOTE(review): calc_scores is not defined in the cells visible here; it must be
# defined before this cell runs.
r2_scores, mae_scores, mape_scores, rmse_scores, max_scores = calc_scores(estimators, x_test, y_test)

# Aggregate over the splits: mean for R2 / MAE / MAPE / RMSE, maximum for MAX.
for metric_label, metric_value in (
    ('R2   (cross validated):', np.round(np.mean(r2_scores), 3)),
    ('MAE  (cross validated):', np.mean(mae_scores).astype(int)),
    ('MAPE (cross validated):', np.round(np.mean(mape_scores), 3)),
    ('RMSE (cross validated):', np.mean(rmse_scores).astype(int)),
    ('MAX  (cross validated):', np.max(max_scores).astype(int)),
):
    print(metric_label, metric_value)

# Per-split values, in the order the estimators were returned.
print('\nR2   (splits):', np.round(r2_scores, 3))
print('MAE  (splits):', [round(score) for score in mae_scores])
print('MAPE (splits):', np.round(mape_scores, 3))
print('RMSE (splits):', [round(score) for score in rmse_scores])
print('MAX  (splits):', [round(score) for score in max_scores])

### Random Forest Regressor

In [None]:
# Fit a bootstrapped random forest (seeded for repeatability) on the training
# split and predict the held-out rows.
random_regressor = RandomForestRegressor(bootstrap=True, random_state=1).fit(x_train, y_train)
prediction_rf = random_regressor.predict(x_test)

# NOTE(review): calc_scores is not defined in the cells visible here; it must be
# defined before this cell runs. It is called with a list of fitted estimators
# plus the test split and its result is unpacked into five score collections.
r2_scores, mae_scores, mape_scores, rmse_scores, max_scores = calc_scores([random_regressor], x_test, y_test)

# One summary line per metric: R2 / MAPE rounded to 3 decimals, MAE / RMSE / MAX
# truncated to whole numbers.
for metric_label, metric_value in (
    ('R2:  ', np.round(np.mean(r2_scores), 3)),
    ('MAE: ', np.mean(mae_scores).astype(int)),
    ('MAPE:', np.round(np.mean(mape_scores), 3)),
    ('RMSE:', np.mean(rmse_scores).astype(int)),
    ('MAX: ', np.max(max_scores).astype(int)),
):
    print(metric_label, metric_value)

_, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
scatter_ax, residual_ax = axs

# Left panel: predicted vs. actual salary, with a dashed y = x reference line.
scatter_ax.scatter(x=y_test, y=prediction_rf)
scatter_ax.axline((0, 0), slope=1, color="black", linestyle=(0, (5, 5)))
scatter_ax.axis('square')
scatter_ax.set_xlabel("actual salary")
scatter_ax.set_ylabel("predicted salary")

# Right panel: histogram of (actual - predicted) salary.
residual_ax.set_xlabel("difference from actual salary")
residual_ax.set_ylabel("count")
residual_ax.hist(y_test - prediction_rf, bins=30)
plt.show()

NameError: name 'RandomForestRegressor' is not defined

### Gradient Boosting Regressor

In [None]:
# Fit a gradient boosting regressor (seeded for repeatability) on the training
# split and predict the held-out rows.
gbt_regressor = GradientBoostingRegressor(random_state=1).fit(x_train, y_train)
prediction_gbt = gbt_regressor.predict(x_test)

# NOTE(review): calc_scores is not defined in the cells visible here; it must be
# defined before this cell runs. It is called with a list of fitted estimators
# plus the test split and its result is unpacked into five score collections.
r2_scores, mae_scores, mape_scores, rmse_scores, max_scores = calc_scores([gbt_regressor], x_test, y_test)

# One summary line per metric: R2 / MAPE rounded to 3 decimals, MAE / RMSE / MAX
# truncated to whole numbers.
for metric_label, metric_value in (
    ('R2:  ', np.round(np.mean(r2_scores), 3)),
    ('MAE: ', np.mean(mae_scores).astype(int)),
    ('MAPE:', np.round(np.mean(mape_scores), 3)),
    ('RMSE:', np.mean(rmse_scores).astype(int)),
    ('MAX: ', np.max(max_scores).astype(int)),
):
    print(metric_label, metric_value)

_, axs = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
scatter_ax, residual_ax = axs

# Left panel: predicted vs. actual salary, with a dashed y = x reference line.
scatter_ax.scatter(x=y_test, y=prediction_gbt)
scatter_ax.axline((0, 0), slope=1, color="black", linestyle=(0, (5, 5)))
scatter_ax.axis('square')
scatter_ax.set_xlabel("actual salary")
scatter_ax.set_ylabel("predicted salary")

# Right panel: histogram of (actual - predicted) salary.
residual_ax.set_xlabel("difference from actual salary")
residual_ax.set_ylabel("count")
residual_ax.hist(y_test - prediction_gbt, bins=30)
plt.show()