In [1]:
# Path of the pickled DataFrame that this notebook reads as its input
DF_PATH = "../data/processed/2_cleaned_data.pkl"

# MLflow tracking location and experiment name
# NOTE(review): "stackoveflow" looks like a typo of "stackoverflow"; the string selects
# the MLflow experiment, so renaming it would point at a different experiment — confirm before changing.
MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoveflow"

# Directory and file names for the pickles that are written and then logged as run artifacts
LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
# import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression

_________

### Functions

In [3]:
def calculate_performance(ground_truth, predictions, metric_function, sort_values=False):
    """Score each prediction column against the ground-truth column of the same name.

    Parameters
    ----------
    ground_truth : pd.DataFrame
        True labels; must contain every column present in ``predictions``.
    predictions : pd.DataFrame
        Predicted labels, one column per role.
    metric_function : callable
        Called as ``metric_function(truth_column, predicted_column)`` and
        expected to return a number.
    sort_values : bool, default False
        When True, the result is sorted in ascending order of score.

    Returns
    -------
    pd.Series
        Indexed by column name; each value is the metric multiplied by 100
        and rounded to two decimals.
    """
    # One score per role, expressed as a percentage
    scores_by_role = {
        role: round(
            metric_function(ground_truth[role].copy(), predictions[role].copy()) * 100,
            2,
        )
        for role in predictions.columns
    }

    result = pd.Series(scores_by_role.values(), index=scores_by_role.keys())
    return result.sort_values() if sort_values else result

_________

# Initialize

### Create directories

In [4]:
# Make sure the tracking directory and the temporary artifact directory both exist
for directory in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(directory).mkdir(parents=True, exist_ok=True)

### Read data

In [5]:
# Read Data
# NOTE: unpickling can execute arbitrary code, so DF_PATH must point at a trusted file
df = pd.read_pickle(DF_PATH)
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications",...,skills_group_2,skills_group_20,skills_group_21,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,2,0,0
9,0,0,0,0,0,0,0,0,0,1,...,2,0,0,0,0,0,1,0,0,0
10,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,7,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
73263,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,4,0,1,2,2,0
73264,0,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,1,1,1,0
73265,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,4,4,0,0


### Balance classes 

In [6]:
# Check the total samples of roles
# Copy the "DevType" column group so later operations do not touch df itself
roles_df = df["DevType"].copy()
# Column-wise sum: one total per role column
roles_df.sum(axis=0)

Academic researcher                               1506
Blockchain                                         557
Cloud infrastructure engineer                     1956
Data or business analyst                          1254
Data scientist or machine learning specialist     1854
Database administrator                             950
DevOps specialist                                 2329
Developer, QA or test                              986
Developer, back-end                              15444
Developer, desktop or enterprise applications     4230
Developer, embedded applications or devices       1771
Developer, front-end                              7781
Developer, full-stack                            17221
Developer, game or graphics                        786
Developer, mobile                                 3635
Engineer, data                                    1597
Project manager                                   1211
Scientist                                          889
Security p

In [7]:
# Report (rows, columns) of the roles frame, then preview its first rows
print(f"Shape of roles dataframe : {roles_df.shape}")
roles_df.head(3)

Shape of roles dataframe : (41626, 20)


Unnamed: 0,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Project manager,Scientist,Security professional,System administrator
2,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [8]:
# Resample roles: draw the same number of rows for every role
samples_per_class = 1200
resampled_roles = []

for role in roles_df.columns:
    # rows in which this role is flagged
    sub_df = roles_df.loc[roles_df[role] == 1].copy()

    # Too few rows -> upsample (draw with replacement);
    # otherwise downsample (draw without replacement)
    with_replacement = len(sub_df) < samples_per_class
    sub_df = sub_df.sample(samples_per_class, replace=with_replacement, random_state=11)

    resampled_roles.append(sub_df)

In [9]:
# Stack the per-role samples into a single frame (a row may appear more than once)
resampled_roles_df = pd.concat(resampled_roles)
print(f"Shape of resampled roles dataframe : {resampled_roles_df.shape}")
resampled_roles_df.head(3)

Shape of resampled roles dataframe : (24000, 20)


Unnamed: 0,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Project manager,Scientist,Security professional,System administrator
14714,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
38157,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
28458,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Per-role totals after resampling; a sampled row can flag several roles,
# so a total can exceed samples_per_class
resampled_roles_df.sum(axis=0)

Academic researcher                              2494
Blockchain                                       1433
Cloud infrastructure engineer                    2317
Data or business analyst                         2014
Data scientist or machine learning specialist    2477
Database administrator                           1815
DevOps specialist                                2484
Developer, QA or test                            1545
Developer, back-end                              6928
Developer, desktop or enterprise applications    2980
Developer, embedded applications or devices      1889
Developer, front-end                             2759
Developer, full-stack                            6659
Developer, game or graphics                      1481
Developer, mobile                                2094
Engineer, data                                   2072
Project manager                                  1769
Scientist                                        1943
Security professional       

In [11]:
# Construct dfs
# NOTE(review): the reassignment below is commented out, so roles_df is still the
# un-resampled copy of df["DevType"] and roles_df.index is df's own index. The .loc
# selection therefore keeps the same rows, and resampled_roles_df is not applied to
# the modelling data — confirm whether skipping the class balancing is intentional.
# roles_df  = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()
df.head(3)

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications",...,skills_group_2,skills_group_20,skills_group_21,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,2,0,0


### Split

In [12]:
# Split into train / test partitions (fixed seed for reproducibility).
# df has two-level columns; level=0 states explicitly that the whole "DevType"
# column group is removed from the features, instead of relying on an implicit
# partial-key match (presumably the source of the warning this cell used to emit).
features = df.drop("DevType", axis=1, level=0)
targets = df["DevType"]

X_train, X_test, Y_train, Y_test = train_test_split(features,
                                                    targets,
                                                    random_state=11)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1),


____________

# Train models

### Initialize MLflow

In [13]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Activates the experiment (MLflow creates it when it does not exist yet)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
client = MlflowClient()
# Single lookup of the experiment record; its id is used when starting runs.
# (A second, discarded mlflow.get_experiment_by_name call was removed as redundant.)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

### 1. Logistic regression

In [14]:
# Standardize the features, then wrap LogisticRegression in MultiOutputClassifier
# so the pipeline can be fitted against the multi-column target frame
clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression()))

clf.fit(X_train, Y_train)

In [15]:
# Evaluate on training set
# Attach the original row index so the predictions are label-aligned with Y_train
# rather than matching it only by position.
predictions = pd.DataFrame(clf.predict(X_train),
                           index=X_train.index,
                           columns=Y_train.columns)

# One column of per-role scores for each metric, keyed by the metric's function name
train_scores = pd.concat(
    {score.__name__: calculate_performance(Y_train, predictions, score)
     for score in [accuracy_score, precision_score, recall_score, f1_score]},
    axis=1)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Evaluate on test set
# Attach the original row index so the predictions are label-aligned with Y_test
# rather than matching it only by position.
predictions = pd.DataFrame(clf.predict(X_test),
                           index=X_test.index,
                           columns=Y_test.columns)

# One column of per-role scores for each metric, keyed by the metric's function name
test_scores = pd.concat(
    {score.__name__: calculate_performance(Y_test, predictions, score)
     for score in [accuracy_score, precision_score, recall_score, f1_score]},
    axis=1)

# Average of each metric across all roles
mean_test_scores = test_scores.mean()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Averages across roles first, then the per-role table as the cell output
print(mean_test_scores)
test_scores

accuracy_score     93.5885
precision_score    47.7410
recall_score       21.1040
f1_score           27.3780
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,96.74,59.43,17.55,27.1
Blockchain,98.76,46.15,9.45,15.69
Cloud infrastructure engineer,95.42,51.38,11.67,19.02
Data or business analyst,96.87,48.05,11.46,18.5
Data scientist or machine learning specialist,96.44,65.9,43.04,52.07
Database administrator,97.75,0.0,0.0,0.0
DevOps specialist,94.77,64.19,16.21,25.89
"Developer, QA or test",97.45,0.0,0.0,0.0
"Developer, back-end",71.98,67.32,48.66,56.49
"Developer, desktop or enterprise applications",90.26,65.11,16.48,26.31


## Log run

### 1. Prepare

In [18]:
# Data details: everything needed to rebuild the exact train / test partitions
data_details = {
    "data_path": DF_PATH,
    "training_indices": X_train.index.tolist(),
    "test_indices": X_test.index.tolist(),
    # feature columns are two-level; drop the group level and keep the feature names
    "features_names": X_train.columns.droplevel(0).tolist(),
    "targets_names": Y_train.columns.tolist(),
}

# Serialize into the temporary artifact directory
(Path(LOG_PATH) / LOG_DATA_PKL).write_bytes(pickle.dumps(data_details))

In [19]:
# Model: description, printable summary, and the fitted pipeline itself
model = {
    "model_description": "Baseline model: Logistic Regression ",
    "model_details": str(clf),
    "model_object": clf,
}

# Serialize into the temporary artifact directory
(Path(LOG_PATH) / LOG_MODEL_PKL).write_bytes(pickle.dumps(model))

In [20]:
# Performance details: per-role score tables for both partitions
classes_metrics = {
    "train_scores": train_scores,
    "test_scores": test_scores,
}

# Serialize into the temporary artifact directory
(Path(LOG_PATH) / LOG_METRICS_PKL).write_bytes(pickle.dumps(classes_metrics))

### 2. Log

In [21]:
# Start a new run and track 
# The run is named after the model description and is closed when the with-block exits
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles 
    # NOTE(review): log_artifacts uploads the whole LOG_PATH directory, so any leftover
    # files from earlier runs in that folder would be logged too — confirm it is clean.
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    # One MLflow metric per averaged test score, keyed by the metric function name
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)     