In [71]:
import numpy as np
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
from pathlib import Path
import os 
import pickle

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score


In [70]:
# Local directory handed to mlflow.set_tracking_uri further down.
MLFLOW_TRACKING_URI = '../models/mlruns'
# Scratch directory: the pickles below are written here and the whole
# directory is then uploaded with mlflow.log_artifacts.
LOG_PATH = "../models/temp/"
# File names of the three pickles written into LOG_PATH.
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

# Name of the MLflow experiment the run is recorded under.
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# Pickled DataFrame read with pd.read_pickle; it holds the feature column
# groups and the "DevType" target columns.
DATA_PATH = "../data/processed/clean_data/clean_data.pkl"

# Create log directories

In [6]:
# Make sure both the MLflow store and the scratch directory for pickles
# exist; missing parents are created and existing directories are left alone.
for log_dir in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(log_dir).mkdir(parents=True, exist_ok=True)


# Load data

In [9]:
# Read the cleaned dataset. pd.read_pickle unpickles the file, so DATA_PATH
# must point to a trusted, locally produced artifact.
raw_df = pd.read_pickle(DATA_PATH)
raw_df

Unnamed: 0_level_0,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Assembly,Bash/Shell/PowerShell,C,C#,C++,Dart,Go,HTML/CSS,Haskell,Java,...,skills_group_17,skills_group_18,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
5,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,1,0,1,2
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,2,1,0,1
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,2,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64422,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,1,0,0,1
64428,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,5,0,0,1
64429,0,0,0,0,0,1,0,0,0,0,...,2,0,0,0,2,0,1,0,0,1
64447,0,0,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,2,2


# Balance data

In [11]:
# Check the total samples of roles: copy the "DevType" target columns and
# sum each column to count the rows flagged with that role.
jobs_df = raw_df["DevType"].copy()
jobs_df.sum(axis=0)

Academic researcher                               1430
Data or business analyst                          1410
Data scientist or machine learning specialist     1803
Database administrator                            1027
DevOps specialist                                 1985
Developer, QA or test                             1065
Developer, back-end                              13160
Developer, desktop or enterprise applications     4029
Developer, embedded applications or devices       1544
Developer, front-end                              7300
Developer, full-stack                            13990
Developer, game or graphics                        855
Developer, mobile                                 3633
Engineer, data                                    1344
Scientist                                          775
System administrator                              1311
dtype: int64

In [16]:
# First quartile of the per-role totals.
# NOTE(review): presumably the basis for samples_per_class below — confirm.
jobs_df.sum(axis=0).quantile(0.25)

np.float64(1249.5)

In [79]:
# Resample roles: draw exactly `samples_per_class` rows for every role.
# A role with fewer rows than that is upsampled (drawn with replacement);
# every other role is downsampled (drawn without replacement).
samples_per_class = 1200
resampled_roles = []

for role_col in jobs_df.columns:
    role_mask = jobs_df[role_col] == 1
    sub_df = jobs_df.loc[role_mask].copy()
    sub_df = sub_df.sample(
        n=samples_per_class,
        replace=len(sub_df) < samples_per_class,
        random_state=0,
    )
    resampled_roles.append(sub_df)

In [30]:
# Construct dfs: stack the per-role samples into one target frame, then pull
# the matching rows (features and targets) out of raw_df by index label.
# NOTE(review): this rebinds raw_df to the resampled frame, whose index holds
# repeated labels (see the duplicated rows in the output). Re-running this
# cell without reloading raw_df would likely match each repeated label
# several times and inflate the row count.
jobs_df  = pd.concat(resampled_roles)
raw_df = raw_df.loc[jobs_df.index].copy()
raw_df

Unnamed: 0_level_0,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,LanguageWorkedWith,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Assembly,Bash/Shell/PowerShell,C,C#,C++,Dart,Go,HTML/CSS,Haskell,Java,...,skills_group_17,skills_group_18,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
12737,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2
53331,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,1
47749,0,0,0,0,0,0,0,1,0,1,...,0,0,0,1,3,3,4,0,1,3
38896,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,2,0,0,0,0
38896,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51707,0,0,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,2,0,0,2
33265,0,1,0,0,0,0,0,1,0,1,...,0,0,2,0,2,0,1,0,4,4
33265,0,1,0,0,0,0,0,1,0,1,...,0,0,2,0,2,0,1,0,4,4
24472,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,3,0,0,3


In [32]:
# Per-role totals after resampling. They exceed samples_per_class, presumably
# because a row drawn for one role can also be flagged with other roles.
jobs_df.sum(axis=0)


Academic researcher                              2470
Data or business analyst                         2124
Data scientist or machine learning specialist    2712
Database administrator                           1864
DevOps specialist                                2164
Developer, QA or test                            1617
Developer, back-end                              6826
Developer, desktop or enterprise applications    2960
Developer, embedded applications or devices      1816
Developer, front-end                             3027
Developer, full-stack                            5845
Developer, game or graphics                      1559
Developer, mobile                                2334
Engineer, data                                   2088
Scientist                                        1915
System administrator                             2012
dtype: int64

In [33]:
# Summary statistics of the per-role totals after resampling.
jobs_df.sum(axis=0).describe()

count      16.000000
mean     2708.312500
std      1490.271372
min      1559.000000
25%      1902.250000
50%      2144.000000
75%      2774.000000
max      6826.000000
dtype: float64

# Split data into Train and Test Data

In [35]:


# Hold out a test set: features are every column group except "DevType",
# targets are the "DevType" role columns; random_state pins the shuffle.
# NOTE(review): raw_df was resampled with replacement before this split, so
# identical rows can land in both the train and the test partition — confirm
# this leakage is acceptable for the reported test scores.
X_train, X_test, Y_train, Y_test = train_test_split(raw_df.drop("DevType", axis=1), 
                                                    raw_df["DevType"], 
                                                    random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(raw_df.drop("DevType", axis=1),


# Train the baseline model 

In [None]:
# mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# client = MlflowClient()
# mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
# exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

In [36]:
# Point MLflow at the local tracking directory named by MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [77]:
# Activate the experiment before looking it up: as the log output of this
# cell shows, set_experiment creates it when it does not exist yet, so the
# lookup below has something to find.
client = MlflowClient()
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Keep the experiment record; its experiment_id is used when the run starts.
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

2024/10/26 14:04:28 INFO mlflow.tracking.fluent: Experiment with name 'skills_jobs_stackoverflow' does not exist. Creating a new experiment.


## Logistic regression

In [46]:
# CREATE THE BASELINE MODEL AS A SIMPLE LOGISTIC REGRESSION
# Features are standardized first; MultiOutputClassifier then wraps a
# LogisticRegression built with default hyper-parameters.

baseline_model = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression()))

# Fit on plain arrays (.values), i.e. without the DataFrame column labels.
baseline_model.fit(X_train.values, Y_train.values)
# predictions =  pd.DataFrame(baseline_model.predict(X_train.values),
#                             columns=Y_train.columns)

In [57]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    """Score every predicted column against its ground-truth counterpart.

    Parameters
    ----------
    ground_truth : pandas.DataFrame
        True labels; must contain every column present in ``predictions``.
    predictions : pandas.DataFrame
        Predicted labels; its columns determine which columns are scored.
    metric_function : callable
        Called as ``metric_function(truth_column, predicted_column)`` and
        expected to return a number.
    sort_values : bool, default False
        When true, the result is sorted by score in ascending order.

    Returns
    -------
    pandas.Series
        One entry per column of ``predictions``: the metric multiplied by
        100 and rounded to two decimals.
    """
    def _percentage(column):
        # Copies keep the metric function away from the callers' frames.
        predicted = predictions[column].copy()
        actual = ground_truth[column].copy()
        return round(metric_function(actual, predicted) * 100, 2)

    per_column = {column: _percentage(column) for column in predictions.columns}
    scores = pd.Series(per_column.values(), index=per_column.keys())

    return scores.sort_values() if sort_values else scores

In [59]:
# Evaluate on training set
# Predictions are wrapped in a frame labelled with the target column names so
# calculate_quality can pair each predicted column with its true column.
predictions = pd.DataFrame(
    baseline_model.predict(X_train.values),
    columns=Y_train.columns,
)
# One column per metric (named after the metric function), one row per role.
train_scores = pd.concat(
    {
        metric.__name__: calculate_quality(Y_train, predictions, metric)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,85.33,68.59,31.85,43.5
Data or business analyst,89.55,68.16,29.68,41.36
Data scientist or machine learning specialist,89.62,78.23,57.73,66.43
Database administrator,86.22,55.74,15.16,23.84
DevOps specialist,91.42,70.87,36.02,47.76
"Developer, QA or test",88.73,64.75,2.94,5.62
"Developer, back-end",74.65,64.25,39.54,48.95
"Developer, desktop or enterprise applications",86.46,57.04,10.44,17.65
"Developer, embedded applications or devices",92.34,64.53,29.3,40.3
"Developer, front-end",89.3,63.01,33.04,43.35


In [62]:
# Evaluate on test set
# Predictions are wrapped in a frame labelled with the target column names so
# calculate_quality can pair each predicted column with its true column.
predictions = pd.DataFrame(
    baseline_model.predict(X_test.values),
    columns=Y_test.columns,
)

# One column per metric (named after the metric function), one row per role.
test_scores = pd.concat(
    {
        metric.__name__: calculate_quality(Y_test, predictions, metric)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,85.23,68.11,30.06,41.71
Data or business analyst,89.83,67.82,29.26,40.89
Data scientist or machine learning specialist,89.75,76.84,58.14,66.2
Database administrator,86.44,59.13,16.01,25.2
DevOps specialist,91.48,70.14,35.8,47.41
"Developer, QA or test",88.57,60.47,2.52,4.85
"Developer, back-end",74.45,63.46,38.04,47.56
"Developer, desktop or enterprise applications",85.71,58.22,9.46,16.27
"Developer, embedded applications or devices",92.04,63.55,26.34,37.25
"Developer, front-end",89.27,61.01,34.22,43.85


In [63]:
# Average each metric over all roles (column-wise mean of the test scores).
# These averages are what gets logged to MLflow as run metrics.
mean_test_scores = test_scores.mean()
mean_test_scores

accuracy_score     87.540000
precision_score    66.379375
recall_score       30.849375
f1_score           40.228750
dtype: float64

In [64]:
# Average each metric over all roles (column-wise mean of the train scores),
# shown for comparison with the test averages.
mean_train_scores = train_scores.mean()
mean_train_scores

accuracy_score     87.615000
precision_score    66.797500
recall_score       31.419375
f1_score           40.860000
dtype: float64

In [65]:
# Show the averaged test metrics (printed as text) followed by the per-role
# table (rendered as the cell's output).
print(mean_test_scores)
test_scores

accuracy_score     87.540000
precision_score    66.379375
recall_score       30.849375
f1_score           40.228750
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,85.23,68.11,30.06,41.71
Data or business analyst,89.83,67.82,29.26,40.89
Data scientist or machine learning specialist,89.75,76.84,58.14,66.2
Database administrator,86.44,59.13,16.01,25.2
DevOps specialist,91.48,70.14,35.8,47.41
"Developer, QA or test",88.57,60.47,2.52,4.85
"Developer, back-end",74.45,63.46,38.04,47.56
"Developer, desktop or enterprise applications",85.71,58.22,9.46,16.27
"Developer, embedded applications or devices",92.04,63.55,26.34,37.25
"Developer, front-end",89.27,61.01,34.22,43.85


# Log the run to MLflow

In [72]:
# Data details: where the data was read from, which index labels went into
# each split, and the names of the feature and target columns.
data_details = dict(
    data_path=DATA_PATH,
    training_indices=X_train.index.tolist(),
    test_indices=X_test.index.tolist(),
    # droplevel(0) removes the outer column level and keeps the inner names.
    features_names=X_train.columns.droplevel(0).tolist(),
    targets_names=Y_train.columns.tolist(),
)

# Written into LOG_PATH so it is picked up when that directory is logged.
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), mode="wb") as output_file:
    pickle.dump(data_details, output_file)

In [74]:
# Model: a short description (also used as the MLflow run name), the
# pipeline's string representation, and the fitted pipeline object itself.
model = dict(
    model_description="Baseline model: Logistic Regression ",
    model_details=str(baseline_model),
    model_object=baseline_model,
)

# Written into LOG_PATH so it is picked up when that directory is logged.
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), mode="wb") as output_file:
    pickle.dump(model, output_file)

In [75]:
# Performance details: the per-role score tables for both splits.
classes_metrics = dict(
    train_scores=train_scores,
    test_scores=test_scores,
)

# Written into LOG_PATH so it is picked up when that directory is logged.
with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), mode="wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [78]:
# Start a new run and track
# The run is created in the experiment looked up earlier and is named after
# the model description.
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles: everything currently in LOG_PATH becomes a run artifact.
    mlflow.log_artifacts(LOG_PATH)

    # Track metrics: the test-set averages, keyed by metric name.
    mlflow.log_metrics(mean_test_scores.to_dict())