In [1]:
# Location of the YAML file that is read into `clusters_skills` further down.
# Written with forward slashes, like the other paths in this cell, so it resolves
# on every OS (a backslash-separated path only works on Windows).
CLUSTERS_SKILLS = '../data/processed/clusters_skills.yaml'
# Tracking store handed to mlflow.set_tracking_uri().
MLFLOW_TRACKING_URI = '../models/mlruns'
# NOTE(review): not referenced by any cell visible in this notebook.
MLFLOW_EXPERIMENT_NAME = "rf_job_predict"
# Name of the directory under ARTIFACT_PATH that contains the 'artifacts' folder.
EXPERIMENT_ID = '7998af4e6d8a4ab7b9365772bff3a01c'
# Root directory under which the pickled model is looked up.
ARTIFACT_PATH = '../models/MODELS'

In [2]:
import mlflow
import mlflow.sklearn
import os
import pickle
import yaml
import pandas as pd
import numpy as np

In [3]:
# Point MLflow at the tracking store configured in MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


In [4]:
# Directory that holds the stored artifacts: <ARTIFACT_PATH>/<EXPERIMENT_ID>/artifacts
target_path = f"{ARTIFACT_PATH}/{EXPERIMENT_ID}/artifacts"

In [5]:
# Display the artifacts directory path for a visual check.
target_path

'../models/MODELS/7998af4e6d8a4ab7b9365772bff3a01c/artifacts'

In [6]:
# Collect the sub-directories of the artifacts directory. os.scandir() yields
# entries in arbitrary order, so the names are sorted to make folders[0] reproducible.
folders = sorted(entry.name for entry in os.scandir(target_path) if entry.is_dir())

# Fail with a clear message instead of a bare IndexError when nothing is there.
if not folders:
    raise FileNotFoundError(f"No artifact folders found in {target_path!r}")

# Print the first folder; it is the one used to locate the model file
print(folders[0])

 Basic RandomForestClassifier with PCA


In [7]:
# Full path of the pickled model inside the first artifact folder.
# Rebuilt from the configuration constants instead of appended with `+=`, so
# re-running this cell yields the same path rather than appending the suffix again.
target_path = ARTIFACT_PATH + '/' + EXPERIMENT_ID + '/artifacts' + '/' + folders[0] + '/model.pkl'
target_path

'../models/MODELS/7998af4e6d8a4ab7b9365772bff3a01c/artifacts/ Basic RandomForestClassifier with PCA/model.pkl'

In [8]:
# Deserialize the model object stored at target_path.
# NOTE(review): pickle.load can execute arbitrary code while loading; only open
# model files that come from a trusted source.
with open(target_path, mode='rb') as model_file:
    loaded_model = pickle.load(model_file)

In [9]:
# Display the loaded object to inspect its pipeline steps.
loaded_model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', RobustScaler(),
                                                  Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2', 'skills_group_20', 'skills_group_21',
       'skills_group_22', 'skills_group_23', 'skills_group_24',
       'skills_group_3', 'skills_group_4', 'skills_group_5', 'skills_group_6',
       'skills_group_7', 'skills_group_9'],
      dtype='object'))])),
                ('pca', PCA(n_components=0.99)),
                ('classifier',
                 RandomForestClassifier(n_jobs=-1, random_state=42,
                                        verbose=1))])

In [10]:
# Read the cluster -> skills mapping from the YAML file.
# NOTE(review): this uses PyYAML's FullLoader; if the file only contains plain
# mappings and lists, yaml.safe_load would be the more restrictive choice — confirm
# before switching.
with open(CLUSTERS_SKILLS, mode='r') as skills_file:
    clusters_skills = yaml.full_load(skills_file)
    

In [11]:
# clusters_skills = pd.Series(clusters_skills)

In [12]:
# clusters_skills = clusters_skills.reset_index()

In [13]:
# clusters_skills.columns = ['Cluster', 'Skills']

In [14]:
# Example input: the skills that will be scored by the model.
sample_skills = [
    'Python',
    'Pandas',
]


In [15]:
# Start from an empty frame with one column per key of clusters_skills.
test = pd.DataFrame(columns=list(clusters_skills))

In [16]:
# Remove the 'skills_group_8' column; the column list shown in the loaded
# pipeline's preprocessor does not include it.
test = test.drop(columns=['skills_group_8'])

In [17]:
# Create row 0 and set every column to 0 as the starting count.
test.loc[0] = 0

In [18]:
# For every skill cluster, count how many of its skills appear in sample_skills
# and store that count in the cluster's feature column of `test`.
for cluster, skills in clusters_skills.items():
    # Clusters that have no column in `test` (for example one that was dropped)
    # are skipped; touching them would raise KeyError.
    if cluster not in test.columns:
        continue
    # Assign the count instead of incrementing, so re-running this cell does not
    # double the values. An empty cluster entry counts as zero matches.
    test[cluster] = sum(skill in sample_skills for skill in (skills or ()))

In [19]:
# Display the feature frame that will be passed to the model.
test

Unnamed: 0,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_13,skills_group_14,skills_group_15,skills_group_16,skills_group_17,...,skills_group_21,skills_group_22,skills_group_23,skills_group_24,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0


In [20]:
# Class probabilities for the rows of `test`; later cells index the result as
# predictions[row, class].
predictions = loaded_model.predict_proba(test)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


In [21]:
# Class labels exposed by the model, converted to a plain Python list.
class_names = loaded_model.classes_.tolist()
class_names

['Academic researcher',
 'Cloud infrastructure engineer',
 'Data or business analyst',
 'Data scientist or machine learning specialist',
 'Database administrator',
 'DevOps specialist',
 'Developer, QA or test',
 'Developer, back-end',
 'Developer, desktop or enterprise applications',
 'Developer, embedded applications or devices',
 'Developer, front-end',
 'Developer, full-stack',
 'Developer, game or graphics',
 'Developer, mobile',
 'Engineer, data',
 'Hardware Engineer',
 'Scientist',
 'Security professional',
 'System administrator']

In [22]:
# Probabilities as a flat Python list of floats.
proba_values = predictions.ravel().tolist()


In [23]:
# Index of the most probable class for the first (and here only) input row.
# np.argmax without an axis works on the flattened array, which is only a valid
# class index when there is a single row; selecting row 0 makes that explicit.
predicted_class_index = np.argmax(predictions[0])


In [24]:
# Look up the label at the index of the highest probability and display it.
predicted_class_name = class_names[predicted_class_index]
predicted_class_name

'Hardware Engineer'

In [25]:
# Order the class names from highest to lowest probability. Pairs are compared as
# (probability, name) tuples, so equal probabilities fall back to reverse name order.
ranked_pairs = sorted(zip(proba_values, class_names), reverse=True)
sorted_class_names = [name for _probability, name in ranked_pairs]


In [29]:
# Print every class with its probability for the first input row, most likely first.
first_row = predictions[0]
for class_name in sorted_class_names:
    # Map the class name back to its column position in the probability row.
    proba = first_row[class_names.index(class_name)]
    print(f"{class_name}: {proba:.4f}")

Hardware Engineer: 0.2914
Academic researcher: 0.2829
Scientist: 0.2123
Data or business analyst: 0.0975
Developer, QA or test: 0.0577
Engineer, data: 0.0308
Data scientist or machine learning specialist: 0.0147
Developer, full-stack: 0.0127
System administrator: 0.0000
Security professional: 0.0000
Developer, mobile: 0.0000
Developer, game or graphics: 0.0000
Developer, front-end: 0.0000
Developer, embedded applications or devices: 0.0000
Developer, desktop or enterprise applications: 0.0000
Developer, back-end: 0.0000
DevOps specialist: 0.0000
Database administrator: 0.0000
Cloud infrastructure engineer: 0.0000
