In [1]:
import pandas as pd
pd.options.plotting.backend = "plotly" #interactive plots will be useful in this context
import plotly.express as px
import numpy as np
import gensim
from sklearn.decomposition import PCA
import sent2vec
from sent2vec.vectorizer import Vectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from nltk.cluster import KMeansClusterer
import nltk
import torch
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
df=pd.read_csv('../Data_Acquisition/df_toy.csv')
row_id = range(0,len(df),1)
df['row_id'] = row_id
df.head()

Unnamed: 0.1,Unnamed: 0,dt,url,title,location,country,full_text,list_elements,row_id
0,0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0
1,1,2021-08-08 21:01:09.824428,https://se.indeed.com/viewjob?jk=dbfe7d7c087065d7,"Data Scientist, Search Insights",Stockholm,se,"Data, Research & Insights\nData Science\nThe P...",['\nExperimentation strategy including AB test...,1
2,2,2021-08-08 21:01:18.030788,https://se.indeed.com/viewjob?jk=841a8fdd1bb00f2f,Junior data scientist,411 21 Göteborg,se,Utilifeed is a rapidly expanding startup with ...,"['\nImproving pour data ingestion pipeline, we...",2
3,3,2021-08-08 21:01:20.651507,https://se.indeed.com/viewjob?jk=88c25a48d2da8a7a,Data Scientist,118 72 Stockholm,se,"AI Data scientistLocation: Stockholm, SwedenDi...",['Proven experience in Machine Learning (ML) a...,3
4,4,2021-08-08 21:01:27.135797,https://se.indeed.com/viewjob?jk=1b4b8ee005f06350,Data Scientist,Solna,se,"Currently, we are looking for a talented data ...","['\nYou have a start-up mentality, with delive...",4


In [3]:
def preprocess_for_bert(string):
    string = string.strip("\n.’:")
    string = string.strip("’")
    string = string.strip("\\n")
    string = string.replace("/"," ")
    return(string)

In [42]:
#Pivots df of vacancy texts into a df of job requirements retrieved from 'list_elements' in vacancy df
columns = df.columns.values.tolist()
df_r = pd.DataFrame(columns = ['requirement_raw','requirement_tokenized']+columns)
for index, row in df.iterrows():
    string = row['list_elements']
    remove = ['\,','[',']','\\n']
    for char in remove:
        string.replace(char, '')
    string = string.strip('\n')
    splitted = string.split('\'') #list of requirements is stored as string so split on '
    list = []
    for item in splitted:
        if len(item)>2:
            requirement_raw = item
            requirement_tokenized = preprocess_for_bert(item)
            requirement_tokenized = gensim.utils.simple_preprocess(requirement_tokenized)
            row_list = [requirement_raw,requirement_tokenized]
            for col in columns:
                row_list = row_list+[df.loc[index,col]]          
            df_r.loc[len(df_r)] = row_list 

In [43]:
requirements = []
for index, row in df_r.iterrows():
    requirements.append(preprocess_for_bert(row['requirement_raw']))
df_r['requirement'] = requirements

In [6]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
sentences = requirements
embeddings = model.encode(sentences)

In [379]:
df_r['embeddings'] = embeddings.tolist()

In [7]:
def pca_embeddings(vectors, name):
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(vectors)
    _df = pd.DataFrame(data = principal_components, columns = [name+'_pc1',name+'_pc2'])
    return(_df)

In [44]:
_df = pca_embeddings(embeddings, 'sbert')
df_r['sbert_pc1'] = _df.iloc[:,0]
df_r['sbert_pc2'] = _df.iloc[:,1]

In [45]:
X = vector_list = np.array(embeddings.tolist())
n_clusters = 25
kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance,repeats=25,avoid_empty_clusters=True)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

df_r['cluster_sbert'] = pd.Series(assigned_clusters, index=df_r.index)

In [10]:
clusters = kclusterer.means()

In [11]:
def get_most_similar(query, n):
    top_k = min(n, len(embeddings))
    query_embedding = model.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    for score, idx in zip(top_results[0], top_results[1]):
        print(df_r.at[idx.item(),'requirement']+" "+str(score.item()))
        
queries = ['proficient in python and R', 'work hard play hard', 'statistics, modelling and math', 'this should be an outlier']
for q in queries: 
    get_most_similar(q,3)

Proficient in using Python, R 0.9712088704109192
Python and or R skills 0.8707777261734009
Programming skills in R, Python or a similar language 0.8440821170806885
Strong work ethic and a can-do attitude 0.5846492052078247
Ability to work effectively & independently 0.5582389831542969
Diverse and challenging tasks 0.5527569055557251
Strong Mathematical Background: statistics, probability, and ideally operations research 0.6722531318664551
Experience with advanced analytical and statistical modelling techniques 0.6244396567344666
Strong background in Statistics and Mathematics 0.6138238310813904
t care if you are weak in other areas",  0.36669251322746277
Document the limitations of the approaches and recommend constraints to improve the performance of the Anomaly Detection algorithms 0.35262495279312134
A big data and analytics nerd 0.3523988723754883


In [12]:
def get_cluster_top_n(cluster, n):
    embeddings_f = embeddings.astype(float)
    top_k = min(n, len(embeddings))
    
    cos_scores = util.pytorch_cos_sim(kclusterer.means()[cluster], embeddings_f)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    for score, idx in zip(top_results[0], top_results[1]):
        print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
get_cluster_top_n(20,3)

Experience in any programming language (Python, R, …) Score: 0.8301967728967619
Experience with Python programming Score: 0.8285388875186669
Strong coding skills in Python and the Python data ecosystem, including: pandas, numpy, scikit-learn, scipy, SQL, visualization tools (matplotlib, seaborn, bokeh, plotly, etc.) Score: 0.8122851281217455


In [395]:
2 in df_r[(df_r.cluster_sbert == '0')].index


True

In [423]:
def get_cluster_bottom_n(cluster, n):
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings.astype(float)
    top_k = min(n, len(embeddings))
    
    cos_scores = util.pytorch_cos_sim(kclusterer.means()[cluster], embeddings_f)[0]
    top_results = torch.sort(cos_scores)
#    for score, idx in zip(top_results[0][:10], top_results[1][:10]):
    for score, idx in zip(top_results[0], top_results[1]):
        if idx.item() in df_r[(df_r.cluster_sbert == '0')].index:
            #print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
            result_ids.append(idx.item())
            result_requirements.append(df_r.at[idx.item(),'requirement'])
            result_scores.append(score.item())
    return([result_ids[-n:],result_requirements[-n:],result_scores[-n:]])


Work with product managers, data analysts, business representatives, and other stakeholders to understand their problems and design clever algorithmic solutions, and make sure we properly define success for improvements, establish baselines, and measure effects Score: 0.7879772583427895
Work with product managers, data analysts, business representatives, and other stakeholders to understand their problems and design clever algorithmic solutions, and make sure we properly define success for improvements, establish baselines, and measure effects Score: 0.7879772583427895
Collaborate with the business facing teams to help our customers with new insights Score: 0.7771676981391009
None
[[1495, 1817, 977], ['Collaborate with Product and Business teams in order to embed Machine Learning into our apps, services and products and create predictive models and generate insights from our data', 'Use advanced statistical methods including Artificial Intelligence, Machine Learning and quantitative an

In [425]:
def get_cluster_sorted_n(cluster):
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings.astype(float)
    top_k = min(n, len(embeddings))
    
    cos_scores = util.pytorch_cos_sim(kclusterer.means()[cluster], embeddings_f)[0]
    top_results = torch.sort(cos_scores)
#    for score, idx in zip(top_results[0][:10], top_results[1][:10]):
    for score, idx in zip(top_results[0], top_results[1]):
        if idx.item() in df_r[(df_r.cluster_sbert == '0')].index:
            #print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
            result_ids.append(idx.item())
            result_requirements.append(df_r.at[idx.item(),'requirement'])
            result_scores.append(score.item())
    return([result_ids,result_requirements,result_scores])

In [438]:
def score_requirements_by_centroid(cluster):
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings.astype(float)
    
    cos_scores = util.pytorch_cos_sim(kclusterer.means()[cluster], embeddings_f)[0]
    top_results = torch.sort(cos_scores, descending=True)

    for score, idx in zip(top_results[0], top_results[1]):
        result_ids.append(idx.item())
        result_requirements.append(df_r.at[idx.item(),'requirement'])
        result_scores.append(score.item())
    return[result_ids,result_requirements,result_scores]

In [451]:
results = score_requirements_by_centroid(0)
for result in results[0]:
    requirement = results[1][results[0].index(result)]
    score = results[2][results[0].index(result)]
    print(str(score)+" "+requirement)

0.9121549964351906 Experience with Machine Learning
0.8895509241308155 Machine Learning
0.8750788231823406 Desired experience in machine learning ;
0.8676226400152371 Experience with data science and machine learning methods
0.860725256858651 Passion for machine learning and data science
0.858923286210044 Proven experience in Machine Learning (ML) and data science
0.8534780465999184 Developing machine learning enhanced data products
0.8504953152778978 Experience with data analysis and machine learning libraries
0.8501132165492165 Working experience with state-of-the-art machine learning models in particular deep learning, CNN, LSTM, probabilistic models, etc
0.8501132165492165 Working experience with state-of-the-art machine learning models in particular deep learning, CNN, LSTM, probabilistic models, etc
0.8461559689822864 Strong knowledge of state-of-the-art machine learning and statistical methods
0.8445757903581443 Machine learning to improve the user experience
0.8445757903581443 

0.22981753163382346 Flexible working environment
0.2293024154097665 Ability to work effectively & independently
0.2292468191413657 , "Ability to work independently and manage one
0.2290007617591136 Meet customers to understand their business needs
0.22882318120769038 Strong experience coding in Python. Other programming languages like Scala, R, or Java are also valued
0.22866182475643407 Experience in a relevant professional role (2-5 years)
0.22822424349661613 Demonstrated excellent customer relations skills
0.22808857561990004 Excellent Written and verbal communication skills. Ability to communicate at a level appropriate to the audience
0.22753484062175744 Self-motivated, team spirit and service-oriented mindset
0.2274822377230259 Collaborative environment with our offices in the US and EMEA
0.2274349452337379 the possibility to work remotely for a good work-life balance
0.22706502774270393 A value driven company with an inclusive and autonomous culture
0.22691696360673508 Knowledge

In [325]:
def describe_requirement(query, n):
    cluster = kclusterer.classify_vectorspace(model.encode(query)) 
    print("Query "+query+" is most similar to these requirements are")
    get_most_similar(query,3)
    
    print("\n")
    print("This query belongs to K-cluster "+str(cluster))
    print("The sentences closest to the centroid of this cluster are ")
    get_cluster_top_n(cluster, 3)

In [327]:
describe_requirement("Proficient in SQL",3)

Query Proficient in SQL is most similar to these requirements are
Proficient in using SQL, 0.9665846228599548
Excellent SQL skills 0.8933648467063904
Advanced SQL 0.8318519592285156


This query belongs to K-cluster 21
The sentences closest to the centroid of this cluster are 
Experience with SQL and relational databases is a plus Score: 0.8975477853652286
Familiarity with relational databases and intermediate level knowledge of SQL Score: 0.8619069805522507
Working with Databases, SQL Score: 0.858118711030419


In [390]:
df_r['cluster_sbert']= df_r['cluster_sbert'].astype(str)
fig = px.scatter(df_r,'sbert_pc1','sbert_pc2', hover_data = ['requirement_raw'],color = 'cluster_sbert' )
fig.show()

In [46]:
columns = df.columns.values.tolist()
df_clusters = pd.DataFrame(columns = [*range(0, n_clusters, 1)])

In [144]:
list_of_vec = []
for index, row in df.iterrows():
    df_r_job = df_r[df_r.row_id == index]
    cluster_list = []
    cluster_vec = []

    for i, row_r in df_r_job.iterrows():
        cluster = row_r['cluster_sbert']
        if cluster not in cluster_list:
            cluster_list.append(cluster)
    for i in range(0,n_clusters):
        if(i in cluster_list):
            cluster_vec.append(1)
        else:
            cluster_vec.append(0)
    list_of_vec.append(cluster_vec)

cols = []
for i in range(0,n_clusters):
    cols.append(str(i))
df_clusters = pd.DataFrame(list_of_vec, columns = cols)
df_c = pd.concat([df, df_clusters], axis=1)

In [319]:
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', None)
df_c

Unnamed: 0.1,Unnamed: 0,dt,url,title,location,country,full_text,list_elements,row_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0
1,1,2021-08-08 21:01:09.824428,https://se.indeed.com/viewjob?jk=dbfe7d7c087065d7,"Data Scientist, Search Insights",Stockholm,se,"Data, Research & Insights\nData Science\nThe P...",['\nExperimentation strategy including AB test...,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,1,0,0
2,2,2021-08-08 21:01:18.030788,https://se.indeed.com/viewjob?jk=841a8fdd1bb00f2f,Junior data scientist,411 21 Göteborg,se,Utilifeed is a rapidly expanding startup with ...,"['\nImproving pour data ingestion pipeline, we...",2,0,1,0,1,1,0,0,0,0,0,1,1,1,1,0,1,1,1,0,0,1,1,1,0,0
3,3,2021-08-08 21:01:20.651507,https://se.indeed.com/viewjob?jk=88c25a48d2da8a7a,Data Scientist,118 72 Stockholm,se,"AI Data scientistLocation: Stockholm, SwedenDi...",['Proven experience in Machine Learning (ML) a...,3,1,0,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0
4,4,2021-08-08 21:01:27.135797,https://se.indeed.com/viewjob?jk=1b4b8ee005f06350,Data Scientist,Solna,se,"Currently, we are looking for a talented data ...","['\nYou have a start-up mentality, with delive...",4,1,0,1,0,0,0,0,0,1,1,0,0,1,1,1,0,0,0,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,1,2021-08-09 08:58:22.818522,https://at.indeed.com/viewjob?jk=1eadddfd7f2dd6c0,HR Data Scientist (m/f/d),"Wien, W",at,HR Data Scientist (m/f/d)\n!\nIn this internat...,['Processing and analyzing complex structured ...,125,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
126,2,2021-08-09 08:59:04.860975,https://at.indeed.com/viewjob?jk=81f4400146e29934,Data Scientist (m/f/d),"Wien, W",at,Our hearts are burning for sport in all its fa...,"['Understand business problems, challenge and ...",126,1,0,1,1,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,1,1,1,1,0
127,3,2021-08-09 08:59:26.520043,https://at.indeed.com/viewjob?jk=b101a6cca6341e9e,Data Scientist Consultant (m/f/d),"Wien, W",at,At Machine Learning Reply Austria we strive to...,['\nIdentify and analyze problems in an analyt...,127,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,0,1,1,0
128,4,2021-08-09 08:59:59.803772,https://at.indeed.com/viewjob?jk=82a7a322b2b970e4,Data Scientist (P4),"Wien, W",at,Data Scientist (P4) - (2021/0363 (211264))\nOr...,"['Drive divisional efforts to scope, design, d...",128,1,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0


In [303]:
def complement(skillset):
    clusters = []
    sums = []
    for s in skillset:
        clusters.append(str(kclusterer.classify_vectorspace(model.encode(s))))
    print(clusters)
    for i, row in df_c.iterrows():
        row = df_c.loc[i,clusters]
        sums.append(row.sum(axis =0))
    max_sum = max(sums)
    indices = []
    for i, row in df_c.iterrows():
        row = df_c.loc[i,clusters]
        if row.sum(axis =0) == max_sum:
            indices.append(i)
    df_sim = df_c.loc[indices]
    
    cols = []
    for i in range(0,n_clusters):
        if (str(i) not in clusters) & (str(i) not in cols):
            cols.append(str(i))
    print(cols)
    df_disjoint = df_c.loc[indices, cols]
    
    max_score = 0
    max_cluster = None
    
    for col in cols:
        print(col)
        score = df_disjoint[col].sum()
        similarity_percentage = score/len(df_disjoint)
        print("For cluster "+col+" the sum is "+str(df_disjoint[col].sum())+" which equals a similarity of "+str(similarity_percentage))
        if score>max_score:
            max_score = score
            max_cluster = int(col)
    return max_cluster

In [305]:
most_similar_cluster = complement(["python","communication skills", "Data Science"])
print("\nThe most similar cluster to your query is "+ str(most_similar_cluster))
print("\nThe sentences closest to the centroid of this cluster are ")
get_cluster_top_n(most_similar_cluster,3)

['20', '23', '17']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '18', '19', '21', '22', '24']
0
For cluster 0 the sum is 8 which equals a similarity of 0.7272727272727273
1
For cluster 1 the sum is 0 which equals a similarity of 0.0
2
For cluster 2 the sum is 5 which equals a similarity of 0.45454545454545453
3
For cluster 3 the sum is 8 which equals a similarity of 0.7272727272727273
4
For cluster 4 the sum is 4 which equals a similarity of 0.36363636363636365
5
For cluster 5 the sum is 5 which equals a similarity of 0.45454545454545453
6
For cluster 6 the sum is 3 which equals a similarity of 0.2727272727272727
7
For cluster 7 the sum is 5 which equals a similarity of 0.45454545454545453
8
For cluster 8 the sum is 6 which equals a similarity of 0.5454545454545454
9
For cluster 9 the sum is 6 which equals a similarity of 0.5454545454545454
10
For cluster 10 the sum is 6 which equals a similarity of 0.5454545454545454
11
For cluster 11 th

In [339]:
df_r.head()

Unnamed: 0.1,requirement_raw,requirement_tokenized,Unnamed: 0,dt,url,title,location,country,full_text,list_elements,row_id,requirement,sbert_pc1,sbert_pc2,cluster_sbert
0,Collaborate with stakeholders and other engine...,"[collaborate, with, stakeholders, and, other, ...",0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0,Collaborate with stakeholders and other engine...,-0.203189,-1.463076,7
1,\nApply the right tools for the job and solve ...,"[apply, the, right, tools, for, the, job, and,...",0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0,Apply the right tools for the job and solve bu...,0.756698,-1.494982,2
2,\nDevelop knowledge representations and virtua...,"[develop, knowledge, representations, and, vir...",0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0,Develop knowledge representations and virtual-...,-1.035903,0.009097,0
3,\nProvide model explanations and apply structu...,"[provide, model, explanations, and, apply, str...",0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0,Provide model explanations and apply structure...,-0.9629,-0.126159,2
4,\nLearn desired behavior from examples using c...,"[learn, desired, behavior, from, examples, usi...",0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0,Learn desired behavior from examples using cau...,-0.196047,1.583192,0


In [359]:
df_r[(df_r.cluster_sbert == '1')]['requirement']
#df_r[(df_r.cluster_sbert == '1')]

45                                    Private health care
46                                         Private pensio
187                  Experience dealing with medical data
331                  Experience dealing with medical data
402     Skilled in antibody-based assays, including im...
                              ...                        
1762    AI helps in clinical trials of cancer - Clinic...
1765    s not forget our nursing services for your sic...
1881                            Optional health insurance
1887    Support design, test and deployment of populat...
2107                                           Homeoffice
Name: requirement, Length: 29, dtype: object

In [None]:
example_skills