In [1]:
import numpy as np

DATA_PATH = '../data/processed/02_cleaned_df.pkl'
MODEL_DIR = '../models'

ROLE_COLS  = ['DevType']
TECH_COLS  = ['LanguageHaveWorkedWith',
              'DatabaseHaveWorkedWith',
              'WebframeHaveWorkedWith',
              'MiscTechHaveWorkedWith',
              'ToolsTechHaveWorkedWith']

EXCLUDE_ROLES = ['Other (please specify):',
                 'Student',
                 'Designer',
                 'Educator',
                 'Marketing or sales professional',
                 'Engineering manager',
                 'Senior Executive (C-Suite, VP, etc.)',
                 'Product manager',
                 'Engineer, site reliability']


In [2]:
import numpy as np
import pandas as pd

import pickle
import os
import yaml
import time
import datetime
import copy
import sys
sys.path.append('../scripts')
from preprocessing import one_hot_encode
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LassoCV
from sklearn import linear_model

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.metrics import auc, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.inspection import permutation_importance
from sklearn.ensemble import StackingClassifier


### Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)

    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()

    return quality_scores


In [4]:
def get_train_test_data(job, ohe_tech, ohe_roles):
    # Split to train and test
    role_mask = (ohe_roles[job] == 1)
    role_n = role_mask.sum()

    i_role  = role_mask[role_mask].index.tolist()
    i_other = role_mask[~role_mask].sample(role_n, random_state=0).index.tolist()

    i_role_train,  i_role_test  = train_test_split(i_role,  test_size=0.3, random_state=0)
    i_other_train, i_other_test = train_test_split(i_other, test_size=0.3, random_state=0)

    i_train = i_role_train + i_other_train
    i_test  = i_role_test  + i_other_test

    X_train, y_train = ohe_tech.loc[i_train], ohe_roles[job].loc[i_train]
    X_test,  y_test  = ohe_tech.loc[i_test], ohe_roles[job].loc[i_test]

    return X_train, X_test, y_train, y_test


### Load data and preprocess

In [5]:
# Read data
processed_df = pd.read_pickle(DATA_PATH)


In [6]:
# One hot encode
ohe_df = one_hot_encode(processed_df, ROLE_COLS + TECH_COLS)
ohe_df = ohe_df.drop(EXCLUDE_ROLES, axis=1, level=1)


In [7]:
# Split X & Y
ohe_tech  = ohe_df[TECH_COLS].droplevel(0, axis=1)
ohe_roles = ohe_df[ROLE_COLS].droplevel(0, axis=1)


In [8]:
# Check sums
ohe_roles.sum().sort_values()


Developer, game or graphics                        483
Database administrator                             586
Scientist                                          697
Developer, QA or test                              745
System administrator                              1002
Data or business analyst                          1049
Academic researcher                               1135
Engineer, data                                    1194
Data scientist or machine learning specialist     1543
DevOps specialist                                 1565
Developer, embedded applications or devices       1585
Developer, mobile                                 2990
Developer, desktop or enterprise applications     3239
Developer, front-end                              5433
Developer, back-end                              10818
Developer, full-stack                            11426
dtype: int64

# Train models

### Create template model

In [9]:
rf_clf =  RandomForestClassifier(max_depth=3, n_estimators=5000, random_state=0)

en_clf =  Pipeline([('std_scale', StandardScaler()),
                    ('cv_elastic_net',
                     GridSearchCV(
                        linear_model.LogisticRegression(penalty='elasticnet',
                                                        solver='saga',
                                                        max_iter=1000,
                                                        random_state=0),
                         param_grid={'C':        np.linspace(0.5, 1.5, 10),
                                     'l1_ratio': np.linspace(0,   1,   10)},
                         n_jobs=6))
                    ])

stacked_clf = StackingClassifier(estimators=[('random_forest', rf_clf),
                                             ('elastic_net',   en_clf)],
                                 final_estimator=linear_model.LogisticRegression())


### Train models

In [10]:
data = {}
models = {}
unique_jobs = ohe_roles.columns.to_list()

for job in unique_jobs:
     print(str(datetime.datetime.now()) + ' ... Training model for ' + job)

     # Create and save data
     X_train, X_test, y_train, y_test = get_train_test_data(job, ohe_tech, ohe_roles)
     data[job] = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}

     # Fit and save model
     job_model = copy.deepcopy(stacked_clf)
     job_model.fit(X_train, y_train)
     models[job] = copy.deepcopy(job_model)
    

2023-03-01 22:23:45.684133 ... Training model for Academic researcher
2023-03-01 22:34:02.756157 ... Training model for Data or business analyst
2023-03-01 22:43:58.111395 ... Training model for Data scientist or machine learning specialist
2023-03-01 22:55:54.857749 ... Training model for Database administrator
2023-03-01 23:00:28.200071 ... Training model for DevOps specialist
2023-03-01 23:07:47.434314 ... Training model for Developer, QA or test
2023-03-01 23:14:40.105093 ... Training model for Developer, back-end
2023-03-01 23:22:50.306869 ... Training model for Developer, desktop or enterprise applications
2023-03-01 23:30:17.095980 ... Training model for Developer, embedded applications or devices
2023-03-01 23:38:04.684102 ... Training model for Developer, front-end
2023-03-01 23:45:17.838954 ... Training model for Developer, full-stack
2023-03-01 23:53:36.411614 ... Training model for Developer, game or graphics
2023-03-01 23:59:40.940614 ... Training model for Developer, mobi

### Evaluate models

In [11]:
train_evaluation = {}
test_evaluation  = {}

for job in unique_jobs:
    print(str(datetime.datetime.now()) + ' ... Evaluating ' + job)

    model = models[job]
    train_fit = classification_report(data[job]['y_train'], model.predict(data[job]['X_train']), output_dict=True)
    train_evaluation[job] = train_fit['weighted avg']

    test_fit = classification_report(data[job]['y_test'], model.predict(data[job]['X_test']), output_dict=True)
    test_evaluation[job] = test_fit['weighted avg']

test_evaluation  = pd.DataFrame(test_evaluation).T
train_evaluation = pd.DataFrame(train_evaluation).T


2023-03-02 00:47:38.257455 ... Evaluating Academic researcher
2023-03-02 00:47:41.288751 ... Evaluating Data or business analyst
2023-03-02 00:47:43.177029 ... Evaluating Data scientist or machine learning specialist
2023-03-02 00:47:45.055726 ... Evaluating Database administrator
2023-03-02 00:47:46.492709 ... Evaluating DevOps specialist
2023-03-02 00:47:48.271881 ... Evaluating Developer, QA or test
2023-03-02 00:47:49.657993 ... Evaluating Developer, back-end
2023-03-02 00:47:55.911012 ... Evaluating Developer, desktop or enterprise applications
2023-03-02 00:47:58.479645 ... Evaluating Developer, embedded applications or devices
2023-03-02 00:48:00.170164 ... Evaluating Developer, front-end
2023-03-02 00:48:03.874231 ... Evaluating Developer, full-stack
2023-03-02 00:48:10.332390 ... Evaluating Developer, game or graphics
2023-03-02 00:48:11.428918 ... Evaluating Developer, mobile
2023-03-02 00:48:13.824681 ... Evaluating Engineer, data
2023-03-02 00:48:15.319942 ... Evaluating Sc

In [12]:
train_evaluation.T

Unnamed: 0,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Scientist,System administrator
precision,0.835707,0.830114,0.908447,0.75869,0.779782,0.707515,0.705417,0.726741,0.861132,0.786225,0.751934,0.870182,0.899236,0.767185,0.888171,0.752525
recall,0.835642,0.8297,0.908333,0.758537,0.778539,0.707294,0.705362,0.726731,0.860685,0.785958,0.750563,0.866864,0.89871,0.764671,0.88809,0.751783
f1-score,0.835634,0.829647,0.908327,0.758501,0.778292,0.707216,0.705342,0.726728,0.860642,0.785909,0.750223,0.866565,0.898677,0.764116,0.888085,0.751601
support,1588.0,1468.0,2160.0,820.0,2190.0,1042.0,15144.0,4534.0,2218.0,7606.0,15996.0,676.0,4186.0,1670.0,974.0,1402.0


In [13]:
test_evaluation.T

Unnamed: 0,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Scientist,System administrator
precision,0.825538,0.812812,0.896336,0.74645,0.779645,0.621622,0.711589,0.741268,0.839407,0.782456,0.746891,0.873854,0.877978,0.739619,0.840855,0.694662
recall,0.825513,0.812698,0.896328,0.741477,0.775532,0.618304,0.711337,0.741255,0.839286,0.781595,0.74577,0.872414,0.877369,0.738162,0.840476,0.694352
f1-score,0.82551,0.812681,0.896328,0.740167,0.774703,0.615682,0.711251,0.741252,0.839271,0.781429,0.745481,0.872291,0.87732,0.737763,0.840432,0.694231
support,682.0,630.0,926.0,352.0,940.0,448.0,6492.0,1944.0,952.0,3260.0,6856.0,290.0,1794.0,718.0,420.0,602.0


### Calculate feature importances

In [14]:
models = pickle.load(open('../models/ensemble_models.pkl', 'rb'))


FileNotFoundError: [Errno 2] No such file or directory: '../models/ensemble_models.pkl'

In [15]:
features_imps = {}

for job in unique_jobs:
    print(str(datetime.datetime.now()) + ' ... Calculating feature importances ' + job)

    features_importances = permutation_importance(models[job],
                                                  data[job]['X_train'],
                                                  data[job]['y_train'],
                                                  n_repeats=12,
                                                  random_state=0,
                                                  n_jobs=6)
    features_importances.pop('importances')
    features_importances = pd.DataFrame.from_dict(features_importances)
    features_importances.index = X_train.columns

    features_imps[job] = features_importances.sort_values('importances_mean', ascending=False)

2023-03-02 00:48:31.276423 ... Calculating feature importances Academic researcher
2023-03-02 00:56:35.732747 ... Calculating feature importances Data or business analyst
2023-03-02 01:04:02.195293 ... Calculating feature importances Data scientist or machine learning specialist
2023-03-02 01:12:32.790610 ... Calculating feature importances Database administrator
2023-03-02 01:19:32.764135 ... Calculating feature importances DevOps specialist




2023-03-02 01:35:52.549985 ... Calculating feature importances Developer, QA or test
2023-03-02 01:42:53.497213 ... Calculating feature importances Developer, back-end
2023-03-02 02:17:54.106317 ... Calculating feature importances Developer, desktop or enterprise applications
2023-03-02 02:30:27.547750 ... Calculating feature importances Developer, embedded applications or devices
2023-03-02 02:38:27.598539 ... Calculating feature importances Developer, front-end
2023-03-02 02:56:11.740850 ... Calculating feature importances Developer, full-stack


MemoryError: Unable to allocate 11.8 MiB for an array with shape (15996, 97) and data type float64

In [None]:
[print(job, imp[:10].index.tolist()) for job, imp in features_imps.items()]

### Exporting

In [None]:
with open(os.path.join(MODEL_DIR, 'ensemble_models.pkl'), 'wb') as handle:
    pickle.dump(models, handle)
    

In [None]:
with open(os.path.join(MODEL_DIR, 'ensemble_models_eval.pkl'), 'wb') as handle:
    pickle.dump({'train': train_evaluation, 'test': test_evaluation}, handle)

In [None]:
with open(os.path.join(MODEL_DIR, 'ensemble_models_feature_importances.pkl'), 'wb') as handle:
    pickle.dump(features_imps, handle)