In [47]:
import mlflow
from mlflow.tracking import MlflowClient
import os
import pickle
import pandas as pd
import yaml

In [48]:
MLFLOW_TRACKING_URI='../models/mlruns'
MLFLOW_EXPERIMENT_NAME="skills_jobs_stackoverflow"
MLFLOW_RUN_ID="cb06262089ad4c5c82d2da3046219870"


LOG_DATA_PKL="data.pkl"
LOG_MODEL_PKL="model.pkl"
LOG_METRICS_PKL="metrics.pkl"

CLUSTERS_YAML_PATH="features_skills_clusters_description.yaml"

In [49]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client=MlflowClient()

run=mlflow.get_run(MLFLOW_RUN_ID)

artifacts_path=run.info.artifact_uri

# Model loading

In [50]:
model_path=os.path.join(artifacts_path,LOG_MODEL_PKL)

with open(model_path,"rb") as f:
    
    model=pickle.load(f)
    
model

{'model_description': 'Random Forest: with PCA - Basic',
 'model_details': "Pipeline(steps=[('robustscaler', RobustScaler()),\n                ('pca', PCA(n_components=0.95)),\n                ('randomforestclassifier',\n                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])",
 'model_object': Pipeline(steps=[('robustscaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95)),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])}

In [51]:
# load data pkl 

data_path=os.path.join(artifacts_path,LOG_DATA_PKL)
print(data_path)

with open(data_path,"rb") as handle:
    
    data=pickle.load(handle)


../models/mlruns/0/cb06262089ad4c5c82d2da3046219870/artifacts\data.pkl


In [52]:
print(data.keys())

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])


In [53]:
data['features_names']

['APL',
 'Assembly',
 'Bash/Shell',
 'C',
 'C#',
 'C++',
 'COBOL',
 'Clojure',
 'Crystal',
 'Dart',
 'Delphi',
 'Elixir',
 'Erlang',
 'F#',
 'Go',
 'Groovy',
 'HTML/CSS',
 'Haskell',
 'Java',
 'JavaScript',
 'Julia',
 'Kotlin',
 'LISP',
 'Matlab',
 'Node.js',
 'Objective-C',
 'PHP',
 'Perl',
 'PowerShell',
 'Python',
 'R',
 'Ruby',
 'Rust',
 'SQL',
 'Scala',
 'Swift',
 'TypeScript',
 'VBA',
 'Cassandra',
 'Couchbase',
 'DynamoDB',
 'Elasticsearch',
 'Firebase',
 'IBM DB2',
 'MariaDB',
 'Microsoft SQL Server',
 'MongoDB',
 'MySQL',
 'Oracle',
 'PostgreSQL',
 'Redis',
 'SQLite',
 'AWS',
 'DigitalOcean',
 'Google Cloud Platform',
 'Heroku',
 'IBM Cloud or Watson',
 'Microsoft Azure',
 'Oracle Cloud Infrastructure',
 'ASP.NET',
 'ASP.NET Core ',
 'Angular',
 'Angular.js',
 'Django',
 'Drupal',
 'Express',
 'FastAPI',
 'Flask',
 'Gatsby',
 'Laravel',
 'React.js',
 'Ruby on Rails',
 'Spring',
 'Svelte',
 'Symfony',
 'Vue.js',
 'jQuery',
 '.NET Core / .NET 5',
 '.NET Framework',
 'Apache Spar

In [54]:
features_name=pd.Series(data['features_names'])

target_names=pd.Series(data['targets_names'])

classifier=model['model_object']

In [55]:
classifier

In [86]:
features_name

0                APL
1           Assembly
2         Bash/Shell
3                  C
4                 C#
           ...      
144    skills_group5
145    skills_group6
146    skills_group7
147    skills_group8
148    skills_group9
Length: 149, dtype: object

# Load skills clusters

In [58]:
with open(CLUSTERS_YAML_PATH,"r") as stream:
    
    clusters_config=yaml.safe_load(stream)
    
clusters_config

{'skills_group0': ['C',
  'C',
  'C',
  'C',
  'Assembly',
  'Assembly',
  'Assembly',
  'Assembly'],
 'skills_group1': ['DigitalOcean',
  'Ruby',
  'Ruby',
  'Ruby',
  'Ruby',
  'Ruby on Rails',
  'Elixir',
  'Elixir',
  'Symfony',
  'Elixir',
  'Elixir',
  'RubyMine'],
 'skills_group10': ['SQLite',
  'Xcode',
  'Swift',
  'Swift',
  'Swift',
  'Swift',
  'Cordova',
  'Xamarin',
  'Objective-C',
  'Objective-C',
  'Objective-C',
  'Objective-C'],
 'skills_group11': ['R', 'R', 'R', 'R', 'RStudio', 'Julia', 'Julia'],
 'skills_group12': ['Sublime Text',
  'Atom',
  'IBM Cloud or Watson',
  'APL',
  'APL',
  'APL',
  'APL',
  'COBOL',
  'COBOL'],
 'skills_group13': ['SQL',
  'SQL',
  'SQL',
  'SQL',
  'Microsoft SQL Server',
  'Microsoft Azure',
  'PowerShell',
  'PowerShell',
  'Oracle',
  'PowerShell',
  'PowerShell',
  'F#',
  'F#'],
 'skills_group14': ['NumPy',
  'Pandas',
  'IPython/Jupyter',
  'TensorFlow',
  'Keras',
  'Torch/PyTorch'],
 'skills_group15': ['Kotlin',
  'Kotlin',
  '

In [59]:
clusters=[(cluster_name,cluster_skill)
         for cluster_name,cluster_skills in clusters_config.items()
         for cluster_skill in cluster_skills]

cluster_df=pd.DataFrame(clusters,columns=['cluster_name','skill'])

cluster_df

Unnamed: 0,cluster_name,skill
0,skills_group0,C
1,skills_group0,C
2,skills_group0,C
3,skills_group0,C
4,skills_group0,Assembly
...,...,...
234,skills_group9,Yarn
235,skills_group9,Heroku
236,skills_group9,Vue.js
237,skills_group9,Angular.js


# Some prediction

In [66]:
features_name

0                APL
1           Assembly
2         Bash/Shell
3                  C
4                 C#
           ...      
144    skills_group5
145    skills_group6
146    skills_group7
147    skills_group8
148    skills_group9
Length: 149, dtype: object

In [92]:
sample_skill=['Scala','Hadoop','Python']

pd.Series(sample_skill).isin(features_name)

0    True
1    True
2    True
dtype: bool

In [114]:
sample_cluster=cluster_df.copy()

sample_cluster['sample_skills']=sample_cluster['skill'].isin(sample_skill)
sample_cluster

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group0,C,False
1,skills_group0,C,False
2,skills_group0,C,False
3,skills_group0,C,False
4,skills_group0,Assembly,False
...,...,...,...
234,skills_group9,Yarn,False
235,skills_group9,Heroku,False
236,skills_group9,Vue.js,False
237,skills_group9,Angular.js,False


In [71]:
cluster_features=sample_cluster.groupby('cluster_name')['sample_skills'].sum()
cluster_features

cluster_name
skills_group0     0
skills_group1     0
skills_group10    0
skills_group11    0
skills_group12    0
skills_group13    0
skills_group14    0
skills_group15    0
skills_group16    0
skills_group17    0
skills_group18    0
skills_group19    0
skills_group2     9
skills_group20    0
skills_group21    0
skills_group22    0
skills_group23    0
skills_group3     0
skills_group4     0
skills_group5     0
skills_group6     0
skills_group7     0
skills_group8     0
skills_group9     0
Name: sample_skills, dtype: int64

In [93]:
features_name


0                APL
1           Assembly
2         Bash/Shell
3                  C
4                 C#
           ...      
144    skills_group5
145    skills_group6
146    skills_group7
147    skills_group8
148    skills_group9
Length: 149, dtype: object

In [102]:
skills_names=features_name[~features_name.isin(cluster_features.index)]
skills_names

0                     APL
1                Assembly
2              Bash/Shell
3                       C
4                      C#
              ...        
120                   Vim
121         Visual Studio
122    Visual Studio Code
123              Webstorm
124                 Xcode
Length: 125, dtype: object

In [104]:
skills_names.isin(sample_cluster)

0      False
1      False
2      False
3      False
4      False
       ...  
120    False
121    False
122    False
123    False
124    False
Length: 125, dtype: bool

In [107]:
ohe_skills = pd.Series(skills_names.isin(sample_cluster).astype(int).tolist(), 
                       index=skills_names)
ohe_skills

APL                   0
Assembly              0
Bash/Shell            0
C                     0
C#                    0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 125, dtype: int64

In [109]:
features = pd.concat([ohe_skills,
                      cluster_features])
features

APL              0
Assembly         0
Bash/Shell       0
C                0
C#               0
                ..
skills_group5    0
skills_group6    0
skills_group7    0
skills_group8    0
skills_group9    0
Length: 149, dtype: int64

In [110]:
features = features.loc[features_name]
features

APL              0
Assembly         0
Bash/Shell       0
C                0
C#               0
                ..
skills_group5    0
skills_group6    0
skills_group7    0
skills_group8    0
skills_group9    0
Length: 149, dtype: int64

# Prediction

In [111]:
prediction=classifier.predict_proba([features.values])
prediction

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


[array([[0.85, 0.15]]),
 array([[0.81, 0.19]]),
 array([[0.96, 0.04]]),
 array([[0.93, 0.07]]),
 array([[0.97, 0.03]]),
 array([[0.83, 0.17]]),
 array([[0.82, 0.18]]),
 array([[0.92, 0.08]]),
 array([[0.97, 0.03]]),
 array([[0.96, 0.04]]),
 array([[0.94, 0.06]]),
 array([[0.9, 0.1]]),
 array([[1., 0.]]),
 array([[0.86, 0.14]]),
 array([[0.87, 0.13]]),
 array([[0.9, 0.1]])]

In [112]:
positive_probs=[prob[0][1] for prob in prediction]

pd.Series(positive_probs,index=target_names).sort_values(ascending=False)


Data or business analyst                         0.19
Developer, back-end                              0.18
Developer, QA or test                            0.17
Academic researcher                              0.15
Engineer, data                                   0.14
Scientist                                        0.13
Developer, game or graphics                      0.10
System administrator                             0.10
Developer, desktop or enterprise applications    0.08
Database administrator                           0.07
Developer, full-stack                            0.06
Data scientist or machine learning specialist    0.04
Developer, front-end                             0.04
DevOps specialist                                0.03
Developer, embedded applications or devices      0.03
Developer, mobile                                0.00
dtype: float64