In [22]:
import os
import pickle
from urllib.parse import urlparse
from urllib.request import url2pathname

import mlflow
import pandas as pd
import sklearn
import yaml
from mlflow.tracking import MlflowClient


In [33]:
# MLflow tracking store (a relative path) and the run whose artifacts are loaded below.
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "88dd333d1e6a46029a6910bef05c43f3"

# File names of the pickled artifacts; they are joined onto the run's artifact path.
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# YAML file describing which skills belong to each skills cluster.
CLUSTERS_YAML_PATH = "../data/processed/feature_engineering/features_skills_clusters_description.yaml"

In [24]:
# Point MLflow at the configured tracking store before looking up any run.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

# Fetch the run's metadata and read the URI under which its artifacts are stored.
# NOTE(review): `artificats_path` is a misspelling of "artifacts"; the name is kept
# because later cells reference it.
run = mlflow.get_run(MLFLOW_RUN_ID)
artificats_path = run.info.artifact_uri
artificats_path

'file:///C:/Users/wannatry/Documents/Mellouky/Projects/jobme/notebooks/../models/mlruns/826215840783647024/88dd333d1e6a46029a6910bef05c43f3/artifacts'

# Load Model

In [25]:
# Display the artifact URI again before it is converted to a filesystem path.
artificats_path

'file:///C:/Users/wannatry/Documents/Mellouky/Projects/jobme/notebooks/../models/mlruns/826215840783647024/88dd333d1e6a46029a6910bef05c43f3/artifacts'

In [26]:
# Convert the artifact URI (e.g. "file:///C:/...") into a local filesystem path.
# urlparse drops the "file://" scheme prefix and url2pathname turns the remaining
# URL path into the platform's native form, decoding any percent-escapes.
# The startswith guard leaves an already-converted path untouched, so re-running
# this cell does not corrupt the value (slicing a fixed number of characters did).
if artificats_path.startswith("file:"):
    artificats_path = url2pathname(urlparse(artificats_path).path)

In [76]:
# Display the file name of the pickled model artifact.
LOG_MODEL_PKL

'model.pkl'

In [27]:
# Locate the pickled model inside the run's artifact directory and show the path.
model_path = os.path.join(artificats_path, LOG_MODEL_PKL)
print(model_path)

# NOTE(security): pickle.load can execute arbitrary code; only load artifacts
# from a tracking store you trust.
with open(model_path, mode="rb") as model_file:
    model = pickle.load(model_file)

C:/Users/wannatry/Documents/Mellouky/Projects/jobme/notebooks/../models/mlruns/826215840783647024/88dd333d1e6a46029a6910bef05c43f3/artifacts\model.pkl


In [77]:
# Inspect the unpickled model artifact (a dict; the pipeline itself is read from
# its 'model_object' key further down).
model

{'model_description': 'Random Forest: with PCA - Basic',
 'model_details': "Pipeline(steps=[('robustscaler', RobustScaler()),\n                ('pca', PCA(n_components=0.95)),\n                ('randomforestclassifier',\n                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])",
 'model_object': Pipeline(steps=[('robustscaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95)),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])}

In [29]:
# Load the pickled data description logged alongside the model.
# NOTE(security): pickle.load can execute arbitrary code; only load artifacts
# from a tracking store you trust.
data_path = os.path.join(artificats_path, LOG_DATA_PKL)
with open(data_path, mode="rb") as data_file:
    data = pickle.load(data_file)

# Show which entries the data description provides.
data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [30]:
# Pull out the estimator stored under 'model_object' in the model artifact.
classifier = model["model_object"]

# Feature and target names from the data artifact, wrapped as Series so they can
# be used with .isin() and as an index later on.
features_names = pd.Series(data["features_names"])
targets_names = pd.Series(data["targets_names"])

In [81]:
# Display the loaded estimator.
classifier

In [31]:
# Display the loaded estimator.
# NOTE(review): this repeats the previous cell and could be removed.
classifier

# Load feature clusters

In [47]:
# Read the cluster-name -> skills mapping from YAML.
# NOTE(security): yaml.unsafe_load can construct arbitrary Python objects, so the
# file must come from a trusted source. It is presumably required here because the
# values are serialized numpy arrays, which the safe loader rejects -- confirm.
with open(CLUSTERS_YAML_PATH) as clusters_file:
    clusters_config = yaml.unsafe_load(clusters_file)

clusters_config

{'skills_group_0': array(['C#', 'VBA', 'Microsoft SQL Server', 'Microsoft Azure', 'Windows',
        'ASP.NET', 'ASP.NET Core', '.NET', '.NET Core', 'Xamarin'],
       dtype=object),
 'skills_group_1': array(['Python', 'R', 'Django', 'Flask', 'Keras', 'Pandas', 'TensorFlow',
        'Torch/PyTorch'], dtype=object),
 'skills_group_10': array(['Kotlin', 'Firebase', 'SQLite', 'Android', 'Google Cloud Platform'],
       dtype=object),
 'skills_group_11': array(['Unity 3D', 'Unreal Engine'], dtype=object),
 'skills_group_12': array(['TypeScript', 'Angular', 'Angular.js', 'Cordova'], dtype=object),
 'skills_group_13': array(['Ruby', 'Ruby on Rails'], dtype=object),
 'skills_group_14': array(['Bash/Shell/PowerShell', 'Perl', 'Linux'], dtype=object),
 'skills_group_15': array(['Haskell', 'Julia', 'Rust'], dtype=object),
 'skills_group_16': array(['Objective-C', 'Swift', 'MacOS', 'iOS'], dtype=object),
 'skills_group_17': array(['Dart', 'Flutter'], dtype=object),
 'skills_group_18': array(['Dyn

In [83]:
# Reformat into data frame
cluster_skill_map = [(cluster_name, cluster_skill)
                   for cluster_name, cluster_skills in clusters_config.items()
                   for cluster_skill in cluster_skills]

cluster_skill_map

[('skills_group_0', 'C#'),
 ('skills_group_0', 'VBA'),
 ('skills_group_0', 'Microsoft SQL Server'),
 ('skills_group_0', 'Microsoft Azure'),
 ('skills_group_0', 'Windows'),
 ('skills_group_0', 'ASP.NET'),
 ('skills_group_0', 'ASP.NET Core'),
 ('skills_group_0', '.NET'),
 ('skills_group_0', '.NET Core'),
 ('skills_group_0', 'Xamarin'),
 ('skills_group_1', 'Python'),
 ('skills_group_1', 'R'),
 ('skills_group_1', 'Django'),
 ('skills_group_1', 'Flask'),
 ('skills_group_1', 'Keras'),
 ('skills_group_1', 'Pandas'),
 ('skills_group_1', 'TensorFlow'),
 ('skills_group_1', 'Torch/PyTorch'),
 ('skills_group_10', 'Kotlin'),
 ('skills_group_10', 'Firebase'),
 ('skills_group_10', 'SQLite'),
 ('skills_group_10', 'Android'),
 ('skills_group_10', 'Google Cloud Platform'),
 ('skills_group_11', 'Unity 3D'),
 ('skills_group_11', 'Unreal Engine'),
 ('skills_group_12', 'TypeScript'),
 ('skills_group_12', 'Angular'),
 ('skills_group_12', 'Angular.js'),
 ('skills_group_12', 'Cordova'),
 ('skills_group_13', 'R

In [84]:
# One row per (cluster, skill) pair.
clusters_df = pd.DataFrame(
    data=cluster_skill_map,
    columns=["cluster_name", "skill"],
)
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,C#
1,skills_group_0,VBA
2,skills_group_0,Microsoft SQL Server
3,skills_group_0,Microsoft Azure
4,skills_group_0,Windows
...,...,...
85,skills_group_8,Spring
86,skills_group_9,HTML/CSS
87,skills_group_9,JavaScript
88,skills_group_9,SQL


# Make predictions 

In [63]:
# Example skill set used to demonstrate a prediction.
sample_skills = [
    "Python",
    "Pandas",
    "Numpy",
    "Keras",
]


In [64]:
# Check which sample skills appear among the feature names. The cells below
# match skills by exact name, so a skill reported as False here contributes nothing.
pd.Series(sample_skills).isin(features_names)

0     True
1     True
2    False
3     True
dtype: bool

In [65]:
# Flag, for every (cluster, skill) row, whether the skill is one of the sample
# skills. assign() returns a new frame, so clusters_df itself is left unchanged.
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df["skill"].isin(sample_skills)
)
sample_clusters

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group_0,C#,False
1,skills_group_0,VBA,False
2,skills_group_0,Microsoft SQL Server,False
3,skills_group_0,Microsoft Azure,False
4,skills_group_0,Windows,False
...,...,...,...
85,skills_group_8,Spring,False
86,skills_group_9,HTML/CSS,False
87,skills_group_9,JavaScript,False
88,skills_group_9,SQL,False


In [66]:
# Count how many of the sample skills fall into each cluster (summing the
# boolean flags gives the number of True values per cluster).
cluster_features = (
    sample_clusters
    .groupby("cluster_name")["sample_skills"]
    .sum()
)
cluster_features

cluster_name
skills_group_0     0
skills_group_1     3
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    0
skills_group_15    0
skills_group_16    0
skills_group_17    0
skills_group_18    0
skills_group_2     0
skills_group_3     0
skills_group_4     0
skills_group_5     0
skills_group_6     0
skills_group_7     0
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

# One-hot encode skills

In [67]:
# Every feature that is not a cluster count is treated as an individual skill.
is_cluster_feature = features_names.isin(cluster_features.index)
skills_names = features_names[~is_cluster_feature]
skills_names

0                  Assembly
1     Bash/Shell/PowerShell
2                         C
3                        C#
4                       C++
              ...          
85                 Teraform
86            Torch/PyTorch
87                 Unity 3D
88            Unreal Engine
89                  Xamarin
Length: 90, dtype: object

In [68]:
# 1 if the skill is among the sample skills, 0 otherwise; indexed by skill name.
# tolist() yields plain Python ints before the Series is rebuilt.
skill_flags = skills_names.isin(sample_skills).astype(int).tolist()
ohe_skills = pd.Series(skill_flags, index=skills_names)
ohe_skills

Assembly                 0
Bash/Shell/PowerShell    0
C                        0
C#                       0
C++                      0
                        ..
Teraform                 0
Torch/PyTorch            0
Unity 3D                 0
Unreal Engine            0
Xamarin                  0
Length: 90, dtype: int64

In [85]:
# Stack the per-skill indicators and the per-cluster counts into one Series.
feature_parts = [ohe_skills, cluster_features]
features = pd.concat(feature_parts)
features

Assembly                 0
Bash/Shell/PowerShell    0
C                        0
C#                       0
C++                      0
                        ..
skills_group_5           0
skills_group_6           0
skills_group_7           0
skills_group_8           0
skills_group_9           0
Length: 109, dtype: int64

In [70]:
# Reorder the features to follow features_names; .loc raises a KeyError if any
# of those names is missing from the Series.
# NOTE(review): presumably this is the column order the model was trained with -- confirm.
features = features.loc[features_names]
features

Assembly                 0
Bash/Shell/PowerShell    0
C                        0
C#                       0
C++                      0
                        ..
skills_group_5           0
skills_group_6           0
skills_group_7           0
skills_group_8           0
skills_group_9           0
Length: 109, dtype: int64

In [71]:
# predict_proba expects a 2-D input, so the single sample is wrapped in a list
# to form a batch of one row.
sample_row = features.values
predictions = classifier.predict_proba([sample_row])
predictions

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


[array([[0.30443398, 0.69556602]]),
 array([[0.37920238, 0.62079762]]),
 array([[0.37, 0.63]]),
 array([[1., 0.]]),
 array([[1., 0.]]),
 array([[1., 0.]]),
 array([[0.99, 0.01]]),
 array([[1., 0.]]),
 array([[1., 0.]]),
 array([[0.98, 0.02]]),
 array([[1., 0.]]),
 array([[1., 0.]]),
 array([[1., 0.]]),
 array([[0.99, 0.01]]),
 array([[0.95, 0.05]]),
 array([[1., 0.]])]

In [72]:
# Turn the per-target probability arrays into one Series ranked from most to
# least likely.
#
# At this point `predictions` is a list with one array per target; prob[0][1]
# takes row 0 (the only sample) and column 1 of each array.
# NOTE(review): column 1 is assumed to be the positive class -- confirm against
# the classifier's classes_.
#
# The name `predictions` is rebound to the resulting Series. The isinstance guard
# makes the cell safe to re-run: without it a second run would iterate over the
# Series' floats and fail on prob[0][1].
if not isinstance(predictions, pd.Series):
    positive_probs = [prob[0][1] for prob in predictions]
    predictions = pd.Series(positive_probs, index=targets_names).sort_values(ascending=False)
predictions

Academic researcher                              0.695566
Data scientist or machine learning specialist    0.630000
Data or business analyst                         0.620798
Scientist                                        0.050000
Developer, front-end                             0.020000
Engineer, data                                   0.010000
Developer, back-end                              0.010000
Database administrator                           0.000000
Developer, desktop or enterprise applications    0.000000
Developer, QA or test                            0.000000
DevOps specialist                                0.000000
Developer, embedded applications or devices      0.000000
Developer, game or graphics                      0.000000
Developer, full-stack                            0.000000
Developer, mobile                                0.000000
System administrator                             0.000000
dtype: float64

In [74]:
# Rescale the scores so they sum to 100, i.e. each target's share of the total.
# NOTE(review): the scores come from separate per-target outputs, so this looks
# like a relative ranking aid rather than a probability distribution -- confirm.
# The result is NaN for every target if all scores are 0.
predictions / predictions.sum(axis=0) * 100

Academic researcher                              34.157260
Data scientist or machine learning specialist    30.937500
Data or business analyst                         30.485597
Scientist                                         2.455357
Developer, front-end                              0.982143
Engineer, data                                    0.491071
Developer, back-end                               0.491071
Database administrator                            0.000000
Developer, desktop or enterprise applications     0.000000
Developer, QA or test                             0.000000
DevOps specialist                                 0.000000
Developer, embedded applications or devices       0.000000
Developer, game or graphics                       0.000000
Developer, full-stack                             0.000000
Developer, mobile                                 0.000000
System administrator                              0.000000
dtype: float64

In [75]:
# Sanity check: the rescaled shares should add up to 100 (up to floating-point error).
(predictions / predictions.sum(axis=0) * 100).sum()

np.float64(99.99999999999999)