In [1]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

In [2]:
# Make sure the folder used as the MLflow tracking store and the scratch folder
# that receives the pickled artifacts both exist (no error if already present).
for directory in ('../models/mlruns', '../models/temp/'):
    Path(directory).mkdir(parents=True, exist_ok=True)

In [3]:
# Load the pickled DataFrame from the processed-data folder.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df=pd.read_pickle('../data/processed/2_cleaned_data.pkl')

In [4]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skills_group_20,skills_group_21,skills_group_22,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,1,0,1,2,0,1
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,1,0,0,...,2,0,0,1,4,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,4,0,0,2,0,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,3,0,2,0,1,1,0,0


In [5]:
# Take an independent copy of the columns grouped under 'DevType' (one column per role),
# then display the column totals, i.e. how many rows are flagged for each role.
roles_df = df['DevType'].copy()
roles_df.sum(axis='index')

Academic researcher                               1875
Data or business analyst                          1987
Data scientist or machine learning specialist     2658
Database administrator                            1935
DevOps specialist                                 3531
Developer, QA or test                             1468
Developer, back-end                              19214
Developer, desktop or enterprise applications     6136
Developer, embedded applications or devices       2470
Developer, front-end                             10920
Developer, full-stack                            21920
Developer, game or graphics                       1054
Developer, mobile                                 5446
Engineer, data                                    2229
Scientist                                         1244
System administrator                              2610
dtype: int64

In [6]:
# Resample roles
# Draw exactly `samples_per_class` rows for every role so all roles end up with
# the same number of samples.
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns:
    # Rows flagged with this role
    role_members = roles_df.loc[roles_df[role_col] == 1]

    # Upsample (draw with replacement) when the role has fewer rows than requested,
    # otherwise downsample (draw without replacement). Same seed in both cases.
    needs_upsampling = len(role_members) < samples_per_class
    resampled_roles.append(
        role_members.sample(samples_per_class, replace=needs_upsampling, random_state=0)
    )

In [7]:
# Construct dfs
# Stack the per-role samples into one frame. Its index holds repeated labels:
# roles with fewer rows than the sample size were drawn with replacement, and a
# row flagged for several roles may be drawn once per role.
roles_df  = pd.concat(resampled_roles)
# Re-select the full rows (features and targets) for the sampled labels;
# repeated labels yield repeated rows.
# NOTE(review): `roles_df` and `df` are rebound in place, so this cell is not
# idempotent -- re-running it selects from the already-resampled `df`, whose
# index then contains duplicates. Reload `df` before re-running.
# NOTE(review): resampling happens before the train/test split, so copies of the
# same row can land on both sides of the split; consider splitting first and
# resampling only the training part.
df = df.loc[roles_df.index].copy()

In [8]:
# Split into train and test parts (train_test_split's default proportions),
# seeded so the split is reproducible. Features are every column group except
# "DevType"; targets are the "DevType" role columns.
# level=0 states explicitly that "DevType" is a label of the top column level.
# NOTE(review): the captured output of this cell looks like a truncated warning
# pointing at this call -- presumably pandas' PerformanceWarning about dropping
# on a non-lexsorted MultiIndex without `level`; confirm it disappears.
X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1, level=0),
                                                    df["DevType"],
                                                    random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1),


In [9]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    """Score every column of `predictions` against the same column of `ground_truth`.

    Parameters
    ----------
    ground_truth : DataFrame holding the true values; must contain every column
        of `predictions`.
    predictions : DataFrame holding the predicted values; its columns define
        which columns are scored and in which order.
    metric_function : callable invoked as ``metric_function(truth, prediction)``
        for each column; its result is multiplied by 100 and rounded to two
        decimals.
    sort_values : when True, the result is sorted in ascending order of score.

    Returns
    -------
    pandas.Series indexed by column name with the score (as a percentage).
    """
    scores_by_column = {
        label: round(metric_function(ground_truth[label].copy(),
                                     predictions[label].copy()) * 100, 2)
        for label in predictions.columns
    }

    scores = pd.Series(list(scores_by_column.values()),
                       index=list(scores_by_column.keys()))

    return scores.sort_values() if sort_values else scores

In [21]:
# Initialize client and experiment
experiment_name = "skills_jobs_stackoverflow"

# Point MLflow at the local run store first; the client below is created
# without an explicit tracking URI.
mlflow.set_tracking_uri('../models/mlruns')
client = MlflowClient()

# Activate the experiment (MLflow creates it when it does not exist yet),
# then fetch its record so the experiment id is available when starting runs.
mlflow.set_experiment(experiment_name)
exp = client.get_experiment_by_name(experiment_name)

2023/09/27 18:43:54 INFO mlflow.tracking.fluent: Experiment with name 'skills_jobs_stackoverflow' does not exist. Creating a new experiment.


In [11]:
# Baseline model: standardise the features, then fit a logistic regression
# for each target column through MultiOutputClassifier.
clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression()))

clf.fit(X_train.values, Y_train.values)

# Predictions for the training rows. Reuse Y_train's index as well as its column
# labels so the frame lines up row-for-row with the ground truth; without an
# explicit index the frame would get a fresh 0..n-1 index and any label-aligned
# comparison with Y_train would be wrong or fail.
predictions = pd.DataFrame(clf.predict(X_train.values),
                           index=Y_train.index,
                           columns=Y_train.columns)

In [12]:
# Evaluate on training set
# One column per metric (named after the metric function), one row per role.
train_scores = pd.concat(
    {
        score.__name__: calculate_quality(Y_train, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

In [13]:
# Average every metric column over all roles and show the result.
mean_train_scores = train_scores.mean(axis=0)
print(mean_train_scores)

accuracy_score     87.431250
precision_score    64.063125
recall_score       31.895625
f1_score           40.456250
dtype: float64


In [14]:
# Evaluate on test set
# Reuse Y_test's index as well as its column labels so predictions line up
# row-for-row with the ground truth instead of carrying a fresh 0..n-1 index.
# Note: this rebinds `predictions` (previously the training-set predictions),
# so the training evaluation must not be re-run after this cell without
# re-running the fit cell first.
predictions = pd.DataFrame(clf.predict(X_test.values),
                           index=Y_test.index,
                           columns=Y_test.columns)

# One column per metric (named after the metric function), one row per role.
test_scores = pd.concat(
    {
        score.__name__: calculate_quality(Y_test, predictions, score)
        for score in [accuracy_score, precision_score, recall_score, f1_score]
    },
    axis=1,
)

In [15]:
# Average every metric column over all roles and show the result.
mean_test_scores = test_scores.mean(axis=0)
print(mean_test_scores)

accuracy_score     87.075000
precision_score    62.051250
recall_score       31.130625
f1_score           39.272500
dtype: float64


In [16]:
# Data details
# Record where the data came from, which rows went to each split, and the
# feature / target column names (feature names without their top column level).
data_details = dict(
    data_path='../data/processed/2_cleaned_data.pkl',
    training_indices=X_train.index.tolist(),
    test_indices=X_test.index.tolist(),
    features_names=X_train.columns.droplevel(0).tolist(),
    targets_names=Y_train.columns.tolist(),
)

# Pickle the record into the scratch folder.
with open(os.path.join('../models/temp/', 'data.pkl'), mode="wb") as output_file:
    pickle.dump(data_details, output_file)

In [17]:
# Model
# Bundle a human-readable description, the pipeline's string representation
# and the fitted pipeline object itself.
model = dict(
    model_description="Baseline model: Logistic Regression ",
    model_details=str(clf),
    model_object=clf,
)

# Pickle the bundle into the scratch folder.
with open(os.path.join('../models/temp/', 'model.pkl'), mode="wb") as output_file:
    pickle.dump(model, output_file)

In [18]:
# Performance details
# Keep the per-role score tables of both splits together.
classes_metrics = dict(
    train_scores=train_scores,
    test_scores=test_scores,
)

# Pickle the tables into the scratch folder.
with open(os.path.join('../models/temp/', 'metrics.pkl'), mode="wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [22]:
# Start a new run and track
# The run is named after the model description and filed under the experiment
# fetched earlier.
with mlflow.start_run(experiment_id=exp.experiment_id,
                      run_name=model["model_description"]):
    # Log pickles: everything currently in the scratch folder
    mlflow.log_artifacts('../models/temp/')

    # Track metrics: one entry per metric, using the mean test-set score
    for metric_name, metric_value in mean_test_scores.items():
        mlflow.log_metric(metric_name, metric_value)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

