In [1]:
# --- MLflow run to load ---
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "336de09f80cb4d16935f30f82e1e6ed8"

# --- Pickle file names looked up under the run's artifacts directory ---
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# --- YAML file describing the skills clusters ---
CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [2]:
# load packages
import os
import pickle
from urllib.parse import urlparse
from urllib.request import url2pathname

import mlflow
import pandas as pd
import sklearn
import yaml
from mlflow.tracking import MlflowClient

## _Initialize_

### _1. MLflow_

In [37]:
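# NOTE (Windows): the artifacts path stored in the file (presumably the run's
# MLflow metadata -- confirm) must be converted to a plain local path of this form:
# -> 'C:\\Users\\DELL\\End2End_DS_Projects\\JobTechGuide\\notebooks\\..\\models\\mlruns\\830408098758583715\\336de09f80cb4d16935f30f82e1e6ed8\\artifacts'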

In [35]:
# Initialize the MLflow client and look up the run whose artifacts are loaded below
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

run = mlflow.get_run(MLFLOW_RUN_ID)

# `artifact_uri` is a URI, not necessarily a filesystem path: it may come back as
# "file:///...", which `os.path.join` + `open()` in the loading cells cannot read.
# Convert file URIs to a native local path (this also decodes %-escapes and, on
# Windows, restores the drive letter and backslashes). Any other value -- e.g. an
# already-plain local path -- is passed through unchanged.
artifact_uri = run.info.artifact_uri
parsed_artifact_uri = urlparse(artifact_uri)
if parsed_artifact_uri.scheme == "file":
    artifacts_path = url2pathname(parsed_artifact_uri.path)
else:
    artifacts_path = artifact_uri

### _Load model_

In [38]:
# Deserialize the logged model bundle from the run's artifacts directory.
# NOTE: unpickling executes arbitrary code -- only load artifacts from trusted runs.
model_path = os.path.join(artifacts_path, LOG_MODEL_PKL)
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

# Display the bundle (description, details string, and the pipeline object)
model

{'model_description': 'Random Forest: with PCA - Basic',
 'model_details': "Pipeline(steps=[('robustscaler', RobustScaler()),\n                ('pca', PCA(n_components=0.95)),\n                ('randomforestclassifier',\n                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])",
 'model_object': Pipeline(steps=[('robustscaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95)),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])}

In [39]:
# Inspect the artifacts location that the model/data pickles are read from
artifacts_path

'C:\\\\Users\\\\DELL\\\\End2End_DS_Projects\\\\JobTechGuide\\\\notebooks\\\\..\\\\models\\\\mlruns\\\\830408098758583715\\\\336de09f80cb4d16935f30f82e1e6ed8\\\\artifacts'

In [40]:
# Read the data description pickle logged next to the model.
# NOTE: unpickling executes arbitrary code -- only load artifacts from trusted runs.
data_path = os.path.join(artifacts_path, LOG_DATA_PKL)
with open(data_path, "rb") as data_file:
    data = pickle.load(data_file)

# Show which entries the pickle provides
data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [41]:
# Pull the pipeline object out of the model bundle, and wrap the logged
# feature / target names in Series so they can be used for masking and indexing
classifier = model["model_object"]
features_names = pd.Series(data["features_names"])
targets_names = pd.Series(data["targets_names"])

In [42]:
# Display the pipeline taken from model['model_object']
classifier

### _Load skills clusters_


In [43]:
# Parse the skills-clusters YAML; its top-level mapping is iterated with .items() below
with open(CLUSTERS_YAML_PATH, "r") as clusters_file:
    clusters_config = yaml.safe_load(clusters_file)


In [44]:
# Flatten the {cluster_name: [skills]} mapping into one (cluster_name, skill) row per skill
molten_clusters = [
    (name, skill)
    for name, skills in clusters_config.items()
    for skill in skills
]

clusters_df = pd.DataFrame(
    molten_clusters,
    columns=["cluster_name", "skill"],
)
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,C#
1,skills_group_0,F#
2,skills_group_0,PowerShell
3,skills_group_0,SQL
4,skills_group_0,VBA
...,...,...
120,skills_group_9,Chef
121,skills_group_9,Flow
122,skills_group_9,Pulumi
123,skills_group_9,Puppet


## _Predict sample entry_

In [45]:
# Example input: the skills of the sample entry to score
sample_skills = [
    "Scala",
    "Hadoop",
    "Python",
]

In [46]:
# Verify: flag, per sample skill, whether it appears among the model's feature names
# (a False here means the skill would be silently ignored by the encoding below)
pd.Series(sample_skills).isin(features_names)

0    True
1    True
2    True
dtype: bool

### _1. Recreate cluster features_

In [47]:
# Tag every (cluster, skill) row with whether that skill is one of the sample's skills;
# `assign` returns a new frame, so `clusters_df` itself is left untouched
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df["skill"].isin(sample_skills)
)
sample_clusters

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group_0,C#,False
1,skills_group_0,F#,False
2,skills_group_0,PowerShell,False
3,skills_group_0,SQL,False
4,skills_group_0,VBA,False
...,...,...,...
120,skills_group_9,Chef,False
121,skills_group_9,Flow,False
122,skills_group_9,Pulumi,False
123,skills_group_9,Puppet,False


In [48]:
# Count how many of the sample's skills fall in each cluster (True sums as 1)
cluster_features = (
    sample_clusters
    .groupby("cluster_name")["sample_skills"]
    .sum()
)
cluster_features

cluster_name
skills_group_0     0
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    0
skills_group_15    0
skills_group_16    0
skills_group_17    0
skills_group_18    0
skills_group_19    2
skills_group_2     1
skills_group_20    0
skills_group_21    0
skills_group_22    0
skills_group_3     0
skills_group_4     0
skills_group_5     0
skills_group_6     0
skills_group_7     0
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

### _2. Create one-hot encoded skills_

In [49]:
# Individual-skill features = every feature name that is not a cluster name
skills_names = features_names.loc[
    ~features_names.isin(cluster_features.index)
]
skills_names

0                     APL
1                Assembly
2              Bash/Shell
3                       C
4                      C#
              ...        
120                   Vim
121         Visual Studio
122    Visual Studio Code
123              Webstorm
124                 Xcode
Length: 125, dtype: object

In [50]:
# 1 for each skill feature present in the sample, 0 otherwise, labelled by skill name.
# Going through a plain list drops the original positional index before relabelling.
ohe_skills = pd.Series(
    skills_names.isin(sample_skills).astype(int).tolist(),
    index=skills_names,
)
ohe_skills

APL                   0
Assembly              0
Bash/Shell            0
C                     0
C#                    0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 125, dtype: int64

### _3. Combine features_

In [51]:
# Stack the one-hot skill flags and the per-cluster counts into one labelled vector
features = pd.concat([ohe_skills, cluster_features])


In [52]:
# Reorder the entries to follow `features_names` (the feature names loaded from the
# data pickle); `.loc` raises a KeyError if any listed feature is missing
features = features.loc[features_names]
features

APL               0
Assembly          0
Bash/Shell        0
C                 0
C#                0
                 ..
skills_group_5    0
skills_group_6    0
skills_group_7    0
skills_group_8    0
skills_group_9    0
Length: 148, dtype: int64

### _4. Predict_

In [53]:
# Raw feature vector (labels dropped) that is fed to the classifier
features.to_numpy()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [54]:
# Score the single sample: shape the feature vector as one row of a 2-D array
predictions = classifier.predict_proba(features.to_numpy().reshape(1, -1))
predictions

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


[array([[0.78, 0.22]]),
 array([[0.87, 0.13]]),
 array([[0.91, 0.09]]),
 array([[0.93, 0.07]]),
 array([[0.97, 0.03]]),
 array([[0.91, 0.09]]),
 array([[0.77, 0.23]]),
 array([[0.87, 0.13]]),
 array([[0.98, 0.02]]),
 array([[0.96, 0.04]]),
 array([[0.96, 0.04]]),
 array([[0.92, 0.08]]),
 array([[0.98, 0.02]]),
 array([[0.68, 0.32]]),
 array([[0.87, 0.13]]),
 array([[0.94, 0.06]])]

In [55]:
# `predictions` holds one (1, 2) array per target: take row 0, column 1 from each,
# then rank the targets by that probability
positive_probs = [target_probs[0, 1] for target_probs in predictions]
pd.Series(positive_probs, index=targets_names).sort_values(ascending=False)

Engineer, data                                   0.32
Developer, back-end                              0.23
Academic researcher                              0.22
Data or business analyst                         0.13
Developer, desktop or enterprise applications    0.13
Scientist                                        0.13
Data scientist or machine learning specialist    0.09
Developer, QA or test                            0.09
Developer, game or graphics                      0.08
Database administrator                           0.07
System administrator                             0.06
Developer, front-end                             0.04
Developer, full-stack                            0.04
DevOps specialist                                0.03
Developer, embedded applications or devices      0.02
Developer, mobile                                0.02
dtype: float64