In [1]:
# constants
DATA_PATH = "../data/processed/2_cleaned_data.pkl"

ROLE_COLS = ['DevType']
TECH_COLS = ['LanguageHaveWorkedWith',
             'DatabaseHaveWorkedWith',
             'PlatformHaveWorkedWith',
             'WebframeHaveWorkedWith',
             'MiscTechHaveWorkedWith',
             'ToolsTechHaveWorkedWith',
             'NEWCollabToolsHaveWorkedWith']

MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL =  "data.pkl"
LOG_MODEL_PKL =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
import seaborn as sns
sns.set()
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

### _Functions_

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    '''
        Calculates the quality of predictions compared to
        ground truth data using a specified metric function.
        
        Parameters:
        * ground_truth (DataFrame): containing the actual values for each role.
        * predictions (DataFrame): containing the predicted values for each role.
        * metric_function (Function): calculate the quality of predictions against the ground truth.
        * sort_values (Boolean): determines whether the resulting quality scores should be sorted in ascending order.
        
        Returns: the quality scores for each column (role) in the predictions DataFrame.   
    '''
    
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores

## _Initialize_

### _create directories_

In [4]:
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

### _Read data_

In [5]:
data = pd.read_pickle(DATA_PATH) 
data

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skills_group_20,skills_group_21,skills_group_22,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,1,2,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,2,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,4,1,2,0,0
8,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,4,2,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,3,0,0,2,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,3,1,0,0,0
83435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,1,2,2,1,1,0
83436,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,5,1,2,0,2
83437,0,0,0,0,0,0,1,0,0,0,...,0,0,2,0,0,3,1,3,0,2


### _Balance classes_

In [6]:
# Check the total samples of roles
roles_df = data["DevType"].copy()
roles_df.sum(axis=0).sort_values()

Developer, game or graphics                        899
Scientist                                         1046
Developer, QA or test                             1135
Database administrator                            1210
Data or business analyst                          1658
Academic researcher                               1708
Engineer, data                                    1941
System administrator                              2069
Developer, embedded applications or devices       2138
Data scientist or machine learning specialist     2460
DevOps specialist                                 3056
Developer, mobile                                 4751
Developer, desktop or enterprise applications     4845
Developer, front-end                              8932
Developer, back-end                              17084
Developer, full-stack                            20655
dtype: int64

In [7]:
# Resample roles
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns:
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=0)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class, random_state=0) 
    
    resampled_roles.append(sub_df)

In [8]:
# Construct dfs
roles_df  = pd.concat(resampled_roles)
data = data.loc[roles_df.index].copy()

In [9]:
roles_df.sum(axis=0).sort_values()

Developer, game or graphics                      1441
Developer, QA or test                            1514
Database administrator                           1765
Developer, embedded applications or devices      1773
Scientist                                        1910
Data or business analyst                         1965
Engineer, data                                   2046
System administrator                             2110
Developer, mobile                                2155
DevOps specialist                                2170
Academic researcher                              2280
Data scientist or machine learning specialist    2576
Developer, front-end                             2614
Developer, desktop or enterprise applications    2690
Developer, full-stack                            5602
Developer, back-end                              5710
dtype: int64

### _Split data_

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(data.drop("DevType", axis=1), 
                                                    data["DevType"], 
                                                    random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(data.drop("DevType", axis=1),


## _Train models_

### _Initialize MLflow_

In [11]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
exp = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)


2024/08/16 18:58:50 INFO mlflow.tracking.fluent: Experiment with name 'skills_jobs_stackoverflow' does not exist. Creating a new experiment.


### _1. Logistic regression_

In [12]:
clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression()))

clf.fit(X_train.values, Y_train.values)
# 

In [13]:
# Evaluate on training set
predictions =  pd.DataFrame(clf.predict(X_train.values),
                            columns=Y_train.columns)
train_scores = {score.__name__: calculate_quality(Y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)

In [14]:
# Evaluate on test set
predictions =  pd.DataFrame(clf.predict(X_test.values), columns=Y_test.columns)
test_scores = {score.__name__: calculate_quality(Y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

In [15]:
print(mean_test_scores)
test_scores

accuracy_score     88.940625
precision_score    62.217500
recall_score       29.916875
f1_score           38.644375
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,90.06,64.91,30.94,41.9
Data or business analyst,91.12,65.99,26.58,37.9
Data scientist or machine learning specialist,89.96,67.58,49.23,56.96
Database administrator,90.9,56.1,5.2,9.52
DevOps specialist,90.23,65.55,25.66,36.88
"Developer, QA or test",91.69,25.0,0.51,0.99
"Developer, back-end",74.4,62.64,35.72,45.5
"Developer, desktop or enterprise applications",87.52,61.58,18.2,28.09
"Developer, embedded applications or devices",92.67,64.95,33.41,44.13
"Developer, front-end",88.4,67.91,36.76,47.7


### _Log run_

##### 1. Prepare

In [16]:
# Data details
data_details = {"data_path": DATA_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(), 
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    Y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [17]:
# Model
model = {"model_description": "Baseline model: Logistic Regression ",
         "model_details": str(clf),
         "model_object": clf} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [18]:
# Performance details
classes_metrics = {"train_scores": train_scores, 
                   "test_scores":  test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

### _2. Log_

In [19]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles 
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score) 
    