In [1]:
DF = r'../data/processed/clusters_skills_df.pkl'
CLUSTERS_SKILLS = r'..\data\processed\clusters_skills.yaml'
MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "rf_job_predict"
EXPERIMENT_ID = 'c410d8ab2d42489ba0b8738b000d13e5'
ARTIFACT_PATH = '../models/MODELS'

In [2]:
import mlflow
import mlflow.sklearn
import os
import pickle
import yaml
import pandas as pd
import numpy as np

In [3]:
df = pd.read_pickle(DF)

In [4]:
x = df.droplevel(0, axis=1)

In [5]:
x =x.iloc[:,25:]

In [6]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


In [7]:
target_path = ARTIFACT_PATH + '/' + EXPERIMENT_ID + '/artifacts'

In [8]:
target_path

'../models/MODELS/c410d8ab2d42489ba0b8738b000d13e5/artifacts'

In [9]:
# Get a list of folders in the specified path
folders = [entry.name for entry in os.scandir(target_path) if entry.is_dir()]

# Print the list of folders
print(folders[0])

 Basic RandomForestClassifier with PCA


In [10]:
target_path += '/' + folders[0]+ '/model.pkl'
target_path

'../models/MODELS/c410d8ab2d42489ba0b8738b000d13e5/artifacts/ Basic RandomForestClassifier with PCA/model.pkl'

In [11]:
with open(target_path, 'rb') as file:
    loaded_model = pickle.load(file)

In [12]:
loaded_model

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', RobustScaler(),
                                                  Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2', 'skills_group_20', 'skills_group_21',
       'skills_group_22', 'skills_group_23', 'skills_group_24',
       'skills_group_3', 'skills_group_4', 'skills_group_5', 'skills_group_6',
       'skills_group_7', 'skills_group_9'],
      dtype='object'))])),
                ('pca', PCA(n_components=0.99)),
                ('classifier',
                 RandomForestClassifier(n_jobs=-1, random_state=42,
                                        verbose=1))])

In [13]:
with open(CLUSTERS_SKILLS, 'r') as file :
    clusters_skills = yaml.load(file, Loader=yaml.FullLoader)
    

In [14]:
del clusters_skills['skills_group_8']

In [15]:
# clusters_skills = pd.Series(clusters_skills)

In [16]:
# clusters_skills = clusters_skills.reset_index()

In [17]:
# clusters_skills.columns = ['Cluster', 'Skills']

In [18]:
sample_skills =  ['JavaScript', 'Python', 'Angular', 'MySQL', 'Selenium', 'Appium']
target_class = "Engineer, data"
target_score = 0

In [19]:
test = pd.DataFrame(columns=list(clusters_skills.keys()) + x.columns.to_list())

In [20]:
test.loc[0] = 0

In [21]:
for cluster, skills in clusters_skills.items() :
    for skill in skills :
        if skill in sample_skills:
            test[cluster] +=1

In [22]:
for skill in sample_skills :
    if skill in test.columns.tolist() :
        test[skill] += 1

In [23]:
test.sum()

skills_group_0     0
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_12    0
                  ..
liblittletest      0
npm                0
pnpm               0
snitch             0
tunit              0
Length: 260, dtype: int64

In [24]:
class_names = loaded_model.classes_
class_names

array(['Academic researcher', 'Cloud infrastructure engineer',
       'Data or business analyst',
       'Data scientist or machine learning specialist',
       'Database administrator', 'DevOps specialist',
       'Developer, QA or test', 'Developer, back-end',
       'Developer, desktop or enterprise applications',
       'Developer, front-end', 'Developer, full-stack',
       'Developer, game or graphics', 'Developer, mobile',
       'Engineer, data', 'Scientist', 'Security professional',
       'System administrator'], dtype=object)

In [25]:
predictions = loaded_model.predict_proba(test)


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


In [26]:
for class_name, probability in zip(class_names, predictions[0]):  # Assuming predictions has shape (1, n_classes)
    print(f"Class: {class_name}, Probability: {probability:.4f}")
    if class_name == target_class :
        target_score = probability
        

Class: Academic researcher, Probability: 0.2309
Class: Cloud infrastructure engineer, Probability: 0.0000
Class: Data or business analyst, Probability: 0.0300
Class: Data scientist or machine learning specialist, Probability: 0.0100
Class: Database administrator, Probability: 0.0000
Class: DevOps specialist, Probability: 0.0100
Class: Developer, QA or test, Probability: 0.0067
Class: Developer, back-end, Probability: 0.1100
Class: Developer, desktop or enterprise applications, Probability: 0.0520
Class: Developer, front-end, Probability: 0.1250
Class: Developer, full-stack, Probability: 0.2758
Class: Developer, game or graphics, Probability: 0.0000
Class: Developer, mobile, Probability: 0.0800
Class: Engineer, data, Probability: 0.0071
Class: Scientist, Probability: 0.0000
Class: Security professional, Probability: 0.0000
Class: System administrator, Probability: 0.0625


In [27]:
sorted_classes = sorted(zip(class_names, predictions[0]), key=lambda x: x[1], reverse=True)
for class_name, probability in sorted_classes:
    print(f"Class: {class_name}, {probability:.4f}")

Class: Developer, full-stack, 0.2758
Class: Academic researcher, 0.2309
Class: Developer, front-end, 0.1250
Class: Developer, back-end, 0.1100
Class: Developer, mobile, 0.0800
Class: System administrator, 0.0625
Class: Developer, desktop or enterprise applications, 0.0520
Class: Data or business analyst, 0.0300
Class: Data scientist or machine learning specialist, 0.0100
Class: DevOps specialist, 0.0100
Class: Engineer, data, 0.0071
Class: Developer, QA or test, 0.0067
Class: Cloud infrastructure engineer, 0.0000
Class: Database administrator, 0.0000
Class: Developer, game or graphics, 0.0000
Class: Scientist, 0.0000
Class: Security professional, 0.0000


In [28]:
idx = np.argmax(predictions)
class_names[idx]

'Developer, full-stack'

In [29]:
sorted_classes

[('Developer, full-stack', 0.2758333333333333),
 ('Academic researcher', 0.23085714285714284),
 ('Developer, front-end', 0.125),
 ('Developer, back-end', 0.10999999999999999),
 ('Developer, mobile', 0.08),
 ('System administrator', 0.0625),
 ('Developer, desktop or enterprise applications', 0.052000000000000005),
 ('Data or business analyst', 0.03),
 ('Data scientist or machine learning specialist', 0.01),
 ('DevOps specialist', 0.01),
 ('Engineer, data', 0.007142857142857143),
 ('Developer, QA or test', 0.006666666666666666),
 ('Cloud infrastructure engineer', 0.0),
 ('Database administrator', 0.0),
 ('Developer, game or graphics', 0.0),
 ('Scientist', 0.0),
 ('Security professional', 0.0)]

In [30]:
d = {}

In [31]:
others = []
for skill in x.columns.tolist() :
    if skill not in sample_skills:
        others.append(skill)

In [32]:
for skill in others :
    new = test.copy()
    for cluster, skills in clusters_skills.items() :
        if skill in skills:
            new[cluster] +=1
    predictions = loaded_model.predict_proba(new)
    target_class_index = list(class_names).index(target_class)
    target_class_score = predictions[0][target_class_index]
    target_class_score = (target_class_score - target_score) / target_score
    d[skill] = target_class_score
    

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0

[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurr

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0

In [33]:
sorted_items = sorted(d.items(), key=lambda x: x[1], reverse=True)


In [37]:
threshold = 2
suggestions = []
for key, value in sorted_items:
    if value > threshold:
        suggestions.append(key)
    print(f"{key}: {value}")

Scala: 7.558333333333334
Cassandra: 7.558333333333334
Snowflake: 7.558333333333334
Play Framework: 7.558333333333334
Apache Kafka: 7.558333333333334
Apache Spark: 7.558333333333334
Hadoop: 7.558333333333334
Cloudflare: 5.65
Digital Ocean: 5.65
Hetzner: 5.65
Linode, now Akamai: 5.65
Vultr: 5.65
R: 3.3951754385964916
Django: 3.3951754385964916
FastAPI: 3.3951754385964916
Flask: 3.3951754385964916
CUDA: 3.3951754385964916
Hugging Face Transformers: 3.3951754385964916
Keras: 3.3951754385964916
NumPy: 3.3951754385964916
Opencv: 3.3951754385964916
Pandas: 3.3951754385964916
Scikit-Learn: 3.3951754385964916
TensorFlow: 3.3951754385964916
Tidyverse: 3.3951754385964916
Torch/PyTorch: 3.3951754385964916
Pip: 3.3951754385964916
GDScript: 3.2
Go: 3.2
Lua: 3.2
Rust: 3.2
Dynamodb: 3.2
Elasticsearch: 3.2
InfluxDB: 3.2
Neo4J: 3.2
PostgreSQL: 3.2
Redis: 3.2
Amazon Web Services (AWS): 3.2
OpenShift: 3.2
OpenStack: 3.2
RabbitMQ: 3.2
Tauri: 3.2
Ansible: 3.2
Cargo: 3.2
Docker: 3.2
Godot: 3.2
Homebrew: 3.2


In [35]:
print ('Your skill set is {}:'.format(sample_skills))
print ('============================')
print ('Your Target Role is {}:'.format(target_class))
print ('============================')
print ('You may consider learning {}:'.format(suggestions))


Your skill set is ['JavaScript', 'Python', 'Angular', 'MySQL', 'Selenium', 'Appium']:
Your Target Role is Engineer, data:
You may consider learning ['Scala', 'Cassandra', 'Snowflake', 'Play Framework', 'Apache Kafka', 'Apache Spark', 'Hadoop', 'Cloudflare', 'Digital Ocean', 'Hetzner', 'Linode, now Akamai', 'Vultr', 'R', 'Django', 'FastAPI', 'Flask', 'CUDA', 'Hugging Face Transformers', 'Keras', 'NumPy', 'Opencv', 'Pandas', 'Scikit-Learn', 'TensorFlow', 'Tidyverse', 'Torch/PyTorch', 'Pip', 'GDScript', 'Go', 'Lua', 'Rust', 'Dynamodb', 'Elasticsearch', 'InfluxDB', 'Neo4J', 'PostgreSQL', 'Redis', 'Amazon Web Services (AWS)', 'OpenShift', 'OpenStack', 'RabbitMQ', 'Tauri', 'Ansible', 'Cargo', 'Docker', 'Godot', 'Homebrew', 'Kubernetes', 'Nix', 'Pacman', 'Podman', 'Terraform', 'Delphi', 'Firebird', 'Qwik', 'Solid.js', 'Chef', 'Puppet', 'Unity 3D', 'Unreal Engine', 'C#', 'Clojure', 'F#', 'Haskell', 'Lisp', 'Nim', 'Objective-C', 'PowerShell', 'Prolog', 'SAS', 'Solidity', 'Swift', 'VBA', 'Visual

In [36]:
print (sample_skills)

['JavaScript', 'Python', 'Angular', 'MySQL', 'Selenium', 'Appium']
