In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.14.1-py3-none-any.whl (25.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.8/25.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!git clone https://github.com/MohamedKhalifa1/Stack-Overflow-Annual-Developer-Survey-Analysis

Cloning into 'Stack-Overflow-Annual-Developer-Survey-Analysis'...
remote: Enumerating objects: 179, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 179 (delta 49), reused 105 (delta 23), pack-reused 47[K
Receiving objects: 100% (179/179), 35.85 MiB | 14.94 MiB/s, done.
Resolving deltas: 100% (57/57), done.


In [3]:
DATA_PATH = '/content/Stack-Overflow-Annual-Developer-Survey-Analysis/data/preprocessed/04_cleaned_data.pkl.gz'
ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageHaveWorkedWith',
                  'DatabaseHaveWorkedWith',
                  'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith',
                  'ToolsTechHaveWorkedWith',
                  'NEWCollabToolsHaveWorkedWith',
                  ]
MLFLOW_TRACKING_URI = '/content/Stack-Overflow-Annual-Developer-Survey-Analysis/models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_job_stackoverflow"
LOG_PATH = "/content/Stack-Overflow-Annual-Developer-Survey-Analysis/models/temp"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"
SAMPLE_PER_CLASS = 1200
RANDOM_STATE = 42

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
import gzip
import os
import mlflow
from sklearn.model_selection import train_test_split
from mlflow.tracking import MlflowClient
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler , RobustScaler , MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , f1_score , recall_score , precision_score , accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition  import PCA
warnings.filterwarnings("ignore")

In [5]:
with open(DATA_PATH,'rb') as f:
  df = gzip.open(f, 'rb')
  df = pickle.load(df)

In [6]:
df.head()

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,Developer Advocate,Developer Experience,"Developer, QA or test",...,skills_group_20,skills_group_21,skills_group_22,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,9,0,0,0,0,0,0


In [7]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)

    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()

    return quality_scores

In [8]:
roles_df = df["DevType"].copy()
roles_df.sum(axis=0)

Academic researcher                               1117
Blockchain                                         257
Cloud infrastructure engineer                      792
Data or business analyst                           737
Data scientist or machine learning specialist     1286
Database administrator                             229
DevOps specialist                                 1018
Developer Advocate                                 167
Developer Experience                               258
Developer, QA or test                              503
Developer, back-end                              11604
Developer, desktop or enterprise applications     3377
Developer, embedded applications or devices       1632
Developer, front-end                              4484
Developer, full-stack                            19335
Developer, game or graphics                        734
Developer, mobile                                 2290
Engineer, data                                     992
Hardware E

In [9]:
resample_roles = []

for role_col in roles_df.columns:
  sub_df = roles_df.loc[roles_df[role_col] == 1].copy()

  if len(sub_df) > SAMPLE_PER_CLASS :
    sub_df = sub_df.sample(SAMPLE_PER_CLASS , replace = True , random_state = RANDOM_STATE)
    resample_roles.append(sub_df)
  if len(sub_df) < SAMPLE_PER_CLASS :
    sub_df = sub_df.sample(SAMPLE_PER_CLASS , replace = True , random_state = RANDOM_STATE)
  resample_roles.append(sub_df)

In [10]:
roles_df = pd.concat(resample_roles)

In [11]:
df = df.loc[roles_df.index].copy()

In [12]:
df.head()

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,Developer Advocate,Developer Experience,"Developer, QA or test",...,skills_group_20,skills_group_21,skills_group_22,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
66205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,3,0,0,0
87174,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,13,0,0,0
82093,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,9,0,5,0,0,1
8591,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,7,0,1,0
33455,1,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,10,0,0,0


In [13]:
roles_df.sum(axis=0)

Academic researcher                              1200
Blockchain                                       1200
Cloud infrastructure engineer                    1200
Data or business analyst                         1200
Data scientist or machine learning specialist    2400
Database administrator                           1200
DevOps specialist                                1200
Developer Advocate                               1200
Developer Experience                             1200
Developer, QA or test                            1200
Developer, back-end                              2400
Developer, desktop or enterprise applications    2400
Developer, embedded applications or devices      2400
Developer, front-end                             2400
Developer, full-stack                            2400
Developer, game or graphics                      1200
Developer, mobile                                2400
Engineer, data                                   1200
Hardware Engineer           

In [14]:
X = df.drop(columns=ROLE_COLS, axis =1)
y = df[ROLE_COLS]

In [15]:
X

Unnamed: 0_level_0,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,APL,Ada,Apex,Assembly,Bash/Shell (all shells),C,C#,C++,Clojure,Cobol,...,skills_group_20,skills_group_21,skills_group_22,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
66205,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,3,0,0,0
87174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,13,0,0,0
82093,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,9,0,5,0,0,1
8591,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,5,0,7,0,1,0
33455,0,0,0,0,1,0,0,0,0,0,...,0,0,0,2,0,0,10,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5299,0,0,0,0,1,1,1,1,0,0,...,0,1,0,3,2,0,1,0,0,2
3983,0,0,0,0,1,0,0,0,0,0,...,0,0,0,2,3,1,2,0,0,2
72623,0,0,0,0,1,1,1,1,0,0,...,0,0,0,11,2,0,0,0,0,2
47793,0,0,0,0,1,1,0,0,0,0,...,0,2,0,1,4,0,1,0,0,0


In [16]:
y

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,Developer Advocate,Developer Experience,"Developer, QA or test",...,"Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Hardware Engineer,Project manager,Research & Development role,Scientist,Security professional,System administrator
66205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87174,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82093,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8591,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33455,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5299,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3983,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
72623,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
47793,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [18]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((29760, 293), (7440, 293), (29760, 24), (7440, 24))

In [19]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
if exp is None:
    exp_id = mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
else:
    exp_id = exp.experiment_id

In [21]:
clf = make_pipeline(
    RobustScaler(),
    PCA(),
    RandomForestClassifier(n_jobs= 8 , verbose = 1 , random_state=RANDOM_STATE)
)

In [23]:
tuned_params = [{
    'pca__n_components': [0.8, 0.9],
    'randomforestclassifier__n_estimators': [300,500],
    'randomforestclassifier__max_depth': [10,None]
}]

In [24]:
hp_seach = GridSearchCV(clf, tuned_params)
hp_seach.fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   16.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:  2.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.1s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    6.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   15.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:  2.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 184 tasks      |

In [25]:
hp_seach.best_params_

{'pca__n_components': 0.8,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__n_estimators': 500}

In [26]:
predictions = pd.DataFrame(hp_seach.predict(X_train), columns=y_train.columns)
train_scores = {score.__name__:calculate_quality(y_train ,predictions , score)
for score in  [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores, axis=1)
train_scores

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   18.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   43.4s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   48.9s finished


Unnamed: 0,Unnamed: 1,accuracy_score,precision_score,recall_score,f1_score
DevType,Academic researcher,99.98,99.9,99.48,99.69
DevType,Blockchain,99.98,99.89,99.47,99.68
DevType,Cloud infrastructure engineer,100.0,100.0,99.9,99.95
DevType,Data or business analyst,99.98,99.69,99.69,99.69
DevType,Data scientist or machine learning specialist,100.0,99.95,100.0,99.97
DevType,Database administrator,99.98,99.89,99.57,99.73
DevType,DevOps specialist,100.0,100.0,100.0,100.0
DevType,Developer Advocate,100.0,100.0,100.0,100.0
DevType,Developer Experience,99.98,99.9,99.38,99.64
DevType,"Developer, QA or test",99.99,100.0,99.79,99.89


In [27]:
predictions = pd.DataFrame(hp_seach.predict(X_test), columns=y_test.columns)
test_scores = {score.__name__:calculate_quality(y_test ,predictions , score)
for score in  [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores, axis=1)
mean_test_scores = test_scores.mean()
test_scores

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    8.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    9.7s finished


Unnamed: 0,Unnamed: 1,accuracy_score,precision_score,recall_score,f1_score
DevType,Academic researcher,98.47,100.0,52.5,68.85
DevType,Blockchain,99.92,100.0,97.67,98.82
DevType,Cloud infrastructure engineer,99.18,100.0,74.15,85.16
DevType,Data or business analyst,99.15,98.92,75.1,85.38
DevType,Data scientist or machine learning specialist,99.33,97.06,92.79,94.88
DevType,Database administrator,99.91,99.61,97.7,98.65
DevType,DevOps specialist,98.64,100.0,58.44,73.77
DevType,Developer Advocate,100.0,100.0,100.0,100.0
DevType,Developer Experience,99.8,99.53,93.83,96.6
DevType,"Developer, QA or test",99.45,100.0,83.98,91.3


In [28]:
print(mean_test_scores)

accuracy_score     99.329583
precision_score    99.553333
recall_score       83.083333
f1_score           89.828333
dtype: float64


In [30]:
data_details = {
    'data_path':DATA_PATH ,
    'training_indices' : X_train.index.tolist(),
    'testing_indices' : X_test.index.tolist(),
    'feature_name' :X_train.columns.droplevel(0).tolist(),
    'target_name' :y_train.columns.tolist(),
}
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), 'wb') as f:
    pickle.dump(data_details, f)

In [31]:
model = {"model_description": "Random Forest: with PCA + Hyperparamter tuning",
         "model_details": str(hp_seach),
         "model_object": hp_seach}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [32]:
classes_metrics = {"train_scores": train_scores,
                   "test_scores":  test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [33]:
if exp_id:
    with mlflow.start_run(experiment_id=exp_id, run_name=model["model_description"]):
        mlflow.log_artifacts(LOG_PATH)
        for metric, score in mean_test_scores.items():
            mlflow.log_metric(metric, score)
else:
    print(f"Failed to fetch or create experiment '{MLFLOW_EXPERIMENT_NAME}'")