In [2]:
import pandas as pd
pd.options.plotting.backend = "plotly" #interactive plots will be useful in this context
import plotly.express as px
import numpy as np
import gensim
from sklearn.decomposition import PCA
import sent2vec
from sent2vec.vectorizer import Vectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from nltk.cluster import KMeansClusterer
import nltk
import pickle
import torch
from sklearn.cluster import KMeans
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

import dtale
import umap
from sklearn.cluster import DBSCAN

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df=pd.read_csv('../datasets/df_r.csv')

In [4]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['requirement'].to_list())

In [5]:
kmeans = KMeans(n_clusters=30, random_state=0).fit(embeddings)

In [7]:
filename = 'all-MiniLM-L6-v2_k30.sav'
pickle.dump(kmeans, open(filename, 'wb'))
filename = 'all-MiniLM-L6-v2_embeddings'
pickle.dump(embeddings, open(filename, 'wb'))

In [4]:
k30 = pickle.load(open('all-MiniLM-L6-v2_k30.sav', 'rb'))
embeddings = pickle.load(open('all-MiniLM-L6-v2_embeddings', 'rb'))

In [5]:
clusters = k30.predict(embeddings)
df['cluster']=clusters

In [12]:
def pca_embeddings(vectors):
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(vectors)
    _df = pd.DataFrame(data = principal_components, columns = ['_pc1','_pc2'])
    return(_df)
_df = pca_embeddings(embeddings)
df['pc1'] = _df.iloc[:,0]
df['pc2'] = _df.iloc[:,1]

In [6]:
def umap_embeddings(vectors):
    reducer = umap.UMAP()
    umap_embeddings = reducer.fit_transform(vectors)
    _df = pd.DataFrame(data = umap_embeddings, columns = ['_pc1','_pc2'])
    return(_df, umap_embeddings)
#print(embeddings[:1])
foo = umap_embeddings(embeddings)
_df = foo[0]
df['umap1'] = _df.iloc[:,0]
df['umap2'] = _df.iloc[:,1]
umap_embeddings = foo[1]

In [None]:
dbscan = DBSCAN(eps=5, min_samples=2).fit(umap_embeddings)

df['cluster_dbscan']=dbscan.labels_

In [None]:
df['cluster_dbscan_str']= df['cluster_dbscan'].astype(str)
fig = px.scatter(df,'umap1','umap2', hover_data = ['requirement'],color = 'cluster_dbscan_str')
fig.update_traces(marker=dict(size=1))
fig.show()

In [8]:
df['cluster_str']= df['cluster'].astype(str)
fig = px.scatter(df,'umap1','umap2', hover_data = ['requirement'],color = 'cluster_str' )
fig.update_traces(marker=dict(size=1))
fig.show()

In [87]:
def get_cluster_top_n(cluster, n, df):
    result_ids = []
    result_scores = []
    result_requirements = []
    #embeddings_f = embeddings.astype(float)
    top_k = min(n, len(embeddings))
    
    cos_scores = util.pytorch_cos_sim(kmeans.cluster_centers_[cluster], embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    for score, idx in zip(top_results[0], top_results[1]):
        #print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
        result_ids.append(idx.item())
        result_requirements.append(df.at[idx.item(),'requirement'])
        result_scores.append(score.item())
    return([result_ids[:n],result_requirements[:n],result_scores[:n]])
get_cluster_top_n(20,3,df)

[[6537, 17529, 17079],
 ['share your knowledge in and across project teams',
  'Working closely with stakeholders from various analytical, DevOps, and product teams',
  'Collaborate with a team of professionals with whom you are building a fast-growing company'],
 [0.7798840999603271, 0.7635129690170288, 0.7566394805908203]]

In [24]:
for i in range(30):
    print(get_cluster_top_n(i,3))
    print('-----------------------')

[[5224, 10954, 438], ['An international environment: We have offices in Leuven, Warsaw, London and Sydney and are present in many countries in EU but also other continents.', 'Based in the Benelux – we have offices in Amsterdam, Eindhoven and Brussels within an European and Global networ', 'Work in one of our offices or remote (from germany, spain, italy, france, netherlands, poland or the UK)'], [0.7427778244018555, 0.7249656915664673, 0.7087485790252686]]
-----------------------
[[14079, 17760, 11756], ['Flexible working hours, mobile working.', 'work environment with flexible working hours', 'Flexible working hours.'], [0.8282475471496582, 0.8218463659286499, 0.8086700439453125]]
-----------------------
[[18149, 14087, 1888], ['You are keen on bringing scientific findings into application, creating tangible innovatio', 'You will communicate closely with your interdisciplinary teams and stakeholder and present your scientific findings', 'Communicate and present analytical findings to

[[10860, 18133, 16080], ['Fluent in written and spoken English', 'Fluent in written and spoken English', 'Fluent spoken and written English'], [0.8687751293182373, 0.8687751293182373, 0.8683462738990784]]
-----------------------
[[13908, 1310, 9574], ['Strong analytical and problem-solving skills', 'Strong analytical and problem-solving skills', 'Strong analytical and problem-solving skills;'], [0.9054720401763916, 0.9054720401763916, 0.9039334058761597]]
-----------------------
[[6537, 17529, 17079], ['share your knowledge in and across project teams', 'Working closely with stakeholders from various analytical, DevOps, and product teams', 'Collaborate with a team of professionals with whom you are building a fast-growing company'], [0.7798840999603271, 0.7635129690170288, 0.7566394805908203]]
-----------------------
[[20413, 11580, 19378], ['Master’s degree or PhD in computer science, machine learning, data science, mathematics, statistics, or related quantitative field', 'Degree in m

In [29]:
def cluster_sentence(sentence):
    sentences = [sentence]
    embedding = model.encode(sentences)
    cluster = kmeans.predict(embedding)
    return cluster[0]

In [67]:
nonsense = []
nonsense.append(cluster_sentence('you can work from home if you want'))
nonsense.append(cluster_sentence('You will receive excellent compensation'))
nonsense.append(cluster_sentence('we have people from all over the world'))
nonsense.append(cluster_sentence('\\nMaster'))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [62]:
similarities = []
for index, row in df.iterrows():
    score = util.pytorch_cos_sim(kmeans.cluster_centers_[row['cluster']], embeddings[index])[0].item()
    similarities.append(score)
    #print(util.pytorch_cos_sim(kmeans.cluster_centers_[row['cluster']], embeddings[index])[0].item())
df['similarity_to_centroid']=similarities

In [93]:
df_filtered = df
for n in nonsense:
    print(n)
    df_filtered = df.loc[df['cluster'] != n]

1
21012
29
21011
0
21064
16
20933


In [99]:
cluster = cluster_sentence('Dashboarding: Power BI (expert) / Tibco Spotfire')
print(cluster)
get_cluster_top_n(cluster,3,df_filtered)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

9


[[20698, 1772, 17476],
 ['Create business intelligence, dashboards, visualizations, and or other advanced analytics reports to adequately tell the business narrative and offer recommendations that are practiced, actionable, and have material impact, in addition to being well-supported by analytical models and data',
  'Besides modelling tasks you will be part of generating new business insights for surrounding experts and stakeholders using advanced analytics',
  'Provide the business with analysis and BI tools (dashboards, APIs, reports, etc.)'],
 [0.842223584651947, 0.8156753778457642, 0.8118122220039368]]

In [105]:
skills_cv_koen = ['Partly responsible for product quality and technological yield of cheese production process',\
                  'By Data Driven approach I automated 75% of my original role and realized a yield improvement of >€500.000/year',\
                  'After working as an operator I took advantage of a growth opportunity to lead a team of 7 operators in 3 production departments',\
                  ' Development of physics lab excercise program',\
                  'Data Science (MSc):IU International University (In progress, thesis writing phase)',\
                  ' Python','SQL','Visual Basic','M, Dax', 'Jupyter Notebook', 'Dataiku Data Science Studio','Docker',\
                  'Dashboarding: Power BI (expert) / Tibco Spotfire','Languages Dutch English']
cluster_skill = []
similarities = []
for skill in skills_cv_koen:
    cluster_skill.append(cluster_sentence(skill))
    score = util.pytorch_cos_sim(kmeans.cluster_centers_[cluster], model.encode(skill)[0].item())
print(cluster_skill)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [103]:
print('-------------------')
print('Skillset Analysis')
print('Skills present in given skillset')
print('=================================')
for cluster in range(30):
    if cluster not in nonsense and cluster in cluster_skill:
        requirement_list = get_cluster_top_n(cluster,3, df_filtered)[1]
        print('cluster '+str(cluster)+' is present in the given skillset, requirements close to the centroid of cluster '+str(cluster)+' are:')
        for r in requirement_list:
            if len(r)<300:
                print(r)
        print('-------------------------------')

-------------------
Skillset Analysis
Skills present in given skillset
cluster 3 is present in the given skillset, requirements close to the centroid of cluster 3 are:
Programming experience in Python, plus ideally some exposure to R, Scala or SQL
Proficiency in programming in either Python or R and relevant data manipulation packages
Experience in any programming language (Python, R, …)
-------------------------------
cluster 6 is present in the given skillset, requirements close to the centroid of cluster 6 are:
Establish scalable, efficient, automated processes for model development and validation\n 
Develop software tools, hardware infrastructure or data products that exploit state of the art research to place technology in the hands of the operational users.
Frame, design and execute your own solutions to modeling problems that will improve user experience and or optimize key metrics important to the business.
-------------------------------
cluster 9 is present in the given skill

In [97]:
for cluster in range(30):
    if cluster not in nonsense and cluster not in cluster_skill:
        requirement_list = get_cluster_top_n(cluster,3, df_filtered)[1]
        print('cluster '+str(cluster)+' is missing from the given skillset, requirements close to the centroid of cluster '+str(cluster)+' are:')
        for r in requirement_list:
            if len(r)<300:
                print(r)
        print('-------------------------------')

cluster 2 is missing from the given skillset, requirements close to the centroid of cluster 2 are:
You are keen on bringing scientific findings into application, creating tangible innovatio
You will communicate closely with your interdisciplinary teams and stakeholder and present your scientific findings
Communicate and present analytical findings to researchers
-------------------------------
cluster 4 is missing from the given skillset, requirements close to the centroid of cluster 4 are:
Experience with data visualisation and related tools
Experience in data visualization and reporting
Experience with data visualization tools
-------------------------------
cluster 5 is missing from the given skillset, requirements close to the centroid of cluster 5 are:
Substantial hands-on experience and demonstrated knowledge leading scientific projects in biological areas including cell biology, biochemistry, toxicology, pharmacology.
By keeping up-to-date with the latest computational capabilit