In [1]:
import pandas as pd
import re
import nltk

In [2]:
!pip install wordcloud
!pip install keybert

Collecting keybert
  Downloading keybert-0.5.0.tar.gz (19 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 4.6 MB/s 
Collecting rich>=10.4.0
  Downloading rich-11.2.0-py3-none-any.whl (217 kB)
[K     |████████████████████████████████| 217 kB 31.5 MB/s 
[?25hCollecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 3.7 MB/s 
Collecting colorama<0.5.0,>=0.4.0
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 39.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 36.9 MB/s 
[?25hCollecting huggingface-hub
  Downloadi

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
responsibility = """Used my experience in Data Engineering to create end-to-end products that collect, transform, and visualize data without any manual intervention.

Supported studies in various industries, working mostly on Sentiment Analysis, A/B testing, and statistical analyses through interactive dashboards.

As a Technical Team Lead, I had several responsibilities including designing solutions, managing products, project management, and operations optimizations"""

df_raw_input = pd.DataFrame(data={'Job Description': [responsibility],'KeyWords':['']})
df_raw_input

Unnamed: 0,Job Description,KeyWords
0,Used my experience in Data Engineering to crea...,


In [5]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    
    #remove number
    text = re.sub('\d+', '', text)
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from keybert import KeyBERT

#preprocessing
#vectorization
class Preprocessor(BaseEstimator, TransformerMixin):
    
    #lst_stopwords = set(nltk.corpus.stopwords.words("english"))

    def __init__(self):
        self.lst_stopwords = set(nltk.corpus.stopwords.words("english"))    
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        X['Job Description'] = X['Job Description'].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, lst_stopwords=self.lst_stopwords))
        return X

class KeywordsExtraction(BaseEstimator, TransformerMixin):       
    def __init__(self):
        self.kw_model = KeyBERT()

    def key_words(self,text):
        keywords = self.kw_model.extract_keywords(text,keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.5,top_n=10)
        keywords =  [i[0] for i in keywords]
        return keywords    
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        print(X['Job Description'])
        X['KeyWords'] = X['Job Description'].apply(self.key_words)

        return X[['KeyWords']]

  # X_list = [i[0] for i in X ]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import feature_extraction
from google.colab import drive
import joblib
drive.mount('/content/drive')
model_testing = joblib.load('/content/drive/MyDrive/Colab Notebooks/pipeline.joblib')

Mounted at /content/drive


In [8]:

full_pipeline = Pipeline([("preprocessor",Preprocessor()),
                  ("keywords_extraction",KeywordsExtraction()),
                  ("model", model_testing)
                  ])

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
full_pipeline.predict_proba(df_raw_input)

0    used experience data engineering create endtoe...
Name: Job Description, dtype: object


array([[0.00228386, 0.01306107, 0.01141509, 0.01195732, 0.00057276,
        0.01177402, 0.00171517, 0.01232499, 0.00538395, 0.00152481,
        0.01304762, 0.00901566, 0.01679079, 0.00057277, 0.02123137,
        0.01341381, 0.00133501, 0.01340938, 0.01030352, 0.00095398,
        0.00152472, 0.00095392, 0.00379519, 0.00209401, 0.00114443,
        0.01268288, 0.01322961, 0.0039842 , 0.02437643, 0.01772362,
        0.01630454, 0.00247268, 0.00019107, 0.00038199, 0.00228419,
        0.01732932, 0.00304071, 0.01233014, 0.00019108, 0.00586409,
        0.01625555, 0.01249907, 0.00190423, 0.00398355, 0.00322894,
        0.01232168, 0.01968189, 0.00931483, 0.0108562 , 0.00148039,
        0.00623292, 0.01756803, 0.00038201, 0.00247302, 0.00697983,
        0.01012747, 0.01432155, 0.0177754 , 0.00529589, 0.00076352,
        0.00883695, 0.0130538 , 0.00133439, 0.01341863, 0.00038202,
        0.01342011, 0.01824855, 0.01952613, 0.0191906 , 0.0058616 ,
        0.00057282, 0.01267923, 0.00641951, 0.00

In [10]:
joblib.dump(full_pipeline,'/content/drive/MyDrive/Colab Notebooks/full_model_v1.joblib')

['/content/drive/MyDrive/Colab Notebooks/full_model_v1.joblib']

In [11]:
# preproc_pipe = ColumnTransformer([
#     ('process', preprocessing_pipeline,['KeyWords','Job Description']),
# ], remainder="drop")

In [12]:
# preproc_pipe

In [13]:
# preproc_pipe.transform(df_raw_input)

In [16]:
model_v1 = joblib.load('/content/drive/MyDrive/Colab Notebooks/full_model_v1.joblib')

In [18]:
model_v1.predict(df_raw_input)

0    used experience data engineering create endtoe...
Name: Job Description, dtype: object


array(['delivery manager'], dtype='<U29')

In [51]:
a = model_v1.predict_proba(df_raw_input)
a

0    used experience data engineering create endtoe...
Name: Job Description, dtype: object


array([[0.00228386, 0.01306107, 0.01141509, 0.01195732, 0.00057276,
        0.01177402, 0.00171517, 0.01232499, 0.00538395, 0.00152481,
        0.01304762, 0.00901566, 0.01679079, 0.00057277, 0.02123137,
        0.01341381, 0.00133501, 0.01340938, 0.01030352, 0.00095398,
        0.00152472, 0.00095392, 0.00379519, 0.00209401, 0.00114443,
        0.01268288, 0.01322961, 0.0039842 , 0.02437643, 0.01772362,
        0.01630454, 0.00247268, 0.00019107, 0.00038199, 0.00228419,
        0.01732932, 0.00304071, 0.01233014, 0.00019108, 0.00586409,
        0.01625555, 0.01249907, 0.00190423, 0.00398355, 0.00322894,
        0.01232168, 0.01968189, 0.00931483, 0.0108562 , 0.00148039,
        0.00623292, 0.01756803, 0.00038201, 0.00247302, 0.00697983,
        0.01012747, 0.01432155, 0.0177754 , 0.00529589, 0.00076352,
        0.00883695, 0.0130538 , 0.00133439, 0.01341863, 0.00038202,
        0.01342011, 0.01824855, 0.01952613, 0.0191906 , 0.0058616 ,
        0.00057282, 0.01267923, 0.00641951, 0.00

In [54]:
model_v1.classes_

array(['account executive', 'account manager', 'administrator',
       'analytics', 'application analyst', 'application developer',
       'application engineer', 'applications developer',
       'automation engineer', 'back end developer', 'business analyst',
       'business intelligence', 'c++ software engineer',
       'cloud architect', 'consultant', 'data', 'data administrator',
       'data analyst', 'data architect', 'data developer',
       'data engineer', 'data entry', 'data manager', 'data migration',
       'data scientist', 'data warehouse', 'database administrator',
       'database analyst', 'delivery manager', 'design engineer',
       'developer', 'devops engineer', 'digital engineer',
       'digital manager', 'embedded engineer', 'engineer',
       'engineering manager', 'front end developer', 'front end engineer',
       'graphic design', 'helpdesk', 'infrastructure engineer',
       'integration engineer', 'it analyst', 'it engineer', 'it manager',
       'it supp