In [6]:
# Constants
# MLflow tracking store and the run whose artifacts this notebook reads
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "623eb74c190d43db818db63cba2ac287"

# File names of the pickles logged with the run
# (the metrics pickle is not read in the cells visible here)
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# YAML file describing the skills clusters
CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [7]:
# load packages
import os
import pickle
from urllib.parse import urlparse
from urllib.request import url2pathname

import mlflow
import pandas as pd
import sklearn
import yaml
from mlflow.tracking import MlflowClient

## _Initialize_

### _1. MLflow_

In [8]:
# Point MLflow at the tracking store first, then create the client
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

# Fetch the configured run and keep the URI of its artifact directory
run = mlflow.get_run(run_id=MLFLOW_RUN_ID)
artificats_path = run.info.artifact_uri

In [12]:
# Show the artifact URI obtained from the run info
artificats_path

'file:///C:/Users/DELL/End2End_DS_Projects/JobTechGuide/notebooks/../models/mlruns/391059065298274482/623eb74c190d43db818db63cba2ac287/artifacts'

### _Load model_

In [15]:
# Convert the MLflow artifact URI into a local filesystem path.
# Stripping 'file:///' by hand and forcing backslashes only works on Windows
# (on POSIX it drops the leading '/' and corrupts the separators) and leaves
# percent-escapes such as %20 undecoded. url2pathname handles drive letters,
# POSIX roots and escapes on every platform.
# The guard makes re-running this cell a no-op once the path is converted.
if artificats_path.startswith("file:"):
    artificats_path = url2pathname(urlparse(artificats_path).path)
model_path = os.path.join(artificats_path, LOG_MODEL_PKL)

# Load model
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# artifacts written by a run you trust.
with open(model_path, "rb") as f:
    model = pickle.load(f)

model

{'model_description': 'Random Forest: with PCA - Basic',
 'model_details': "Pipeline(steps=[('robustscaler', RobustScaler()),\n                ('pca', PCA(n_components=0.95)),\n                ('randomforestclassifier',\n                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])",
 'model_object': Pipeline(steps=[('robustscaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95)),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])}

In [16]:
# Read the data pickle stored next to the model in the artifact directory
data_path = os.path.join(artificats_path, LOG_DATA_PKL)
with open(data_path, mode="rb") as data_file:
    raw_bytes = data_file.read()
data = pickle.loads(raw_bytes)

# Show which entries the pickle contains
data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [17]:
# Wrap the feature and target name lists in Series, then grab the model object
features_names, targets_names = (
    pd.Series(data[key]) for key in ("features_names", "targets_names")
)
classifier = model["model_object"]

In [18]:
# Display the object stored under 'model_object' in the model dictionary
classifier

### _Load skills clusters_


In [19]:
# Load skills clusters
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform
# locale (e.g. a Windows code page), which can garble or reject non-ASCII
# skill names in the YAML file.
with open(CLUSTERS_YAML_PATH, "r", encoding="utf-8") as stream:
    clusters_config = yaml.safe_load(stream)

clusters_config

{'skills_group_0': ['Groovy',
  'Java',
  'Oracle',
  'Oracle Cloud Infrastructure',
  'Spring',
  'Atom',
  'Eclipse',
  'IntelliJ',
  'NetBeans',
  'Sublime Text'],
 'skills_group_1': ['C#',
  'F#',
  'PowerShell',
  'SQL',
  'VBA',
  'Microsoft SQL Server',
  'Microsoft Azure',
  'ASP.NET',
  'ASP.NET Core ',
  '.NET Core / .NET 5',
  '.NET Framework',
  'Xamarin',
  'Notepad++',
  'Rider',
  'Visual Studio'],
 'skills_group_10': ['Cassandra',
  'Couchbase',
  'Elasticsearch',
  'Redis',
  'DigitalOcean'],
 'skills_group_11': ['Ruby', 'Ruby on Rails', 'RubyMine', 'TextMate'],
 'skills_group_12': ['Assembly', 'C', 'C++', 'Qt'],
 'skills_group_13': ['Unity 3D', 'Unreal Engine'],
 'skills_group_14': ['APL',
  'COBOL',
  'Crystal',
  'Delphi',
  'IBM DB2',
  'IBM Cloud or Watson'],
 'skills_group_15': ['Svelte', 'Deno'],
 'skills_group_16': ['TypeScript', 'Angular', 'Angular.js', 'Cordova'],
 'skills_group_17': ['Objective-C', 'Swift', 'Xcode'],
 'skills_group_18': ['Bash/Shell', 'Perl'

In [20]:
# Flatten the {cluster: [skills]} mapping into one (cluster, skill) pair per row
molten_clusters = []
for cluster_name, cluster_skills in clusters_config.items():
    molten_clusters.extend(
        (cluster_name, cluster_skill) for cluster_skill in cluster_skills
    )

# Long-format table: one row per skill, labelled with its cluster
clusters_df = pd.DataFrame(molten_clusters, columns=["cluster_name", "skill"])
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,Groovy
1,skills_group_0,Java
2,skills_group_0,Oracle
3,skills_group_0,Oracle Cloud Infrastructure
4,skills_group_0,Spring
...,...,...
120,skills_group_9,Firebase
121,skills_group_9,SQLite
122,skills_group_9,Google Cloud Platform
123,skills_group_9,Flutter
