In [11]:
import pandas as pd
pd.options.plotting.backend = "plotly" #interactive plots will be useful in this context
import plotly.express as px
import numpy as np
import gensim
from sklearn.decomposition import PCA
import sent2vec
from sent2vec.vectorizer import Vectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from nltk.cluster import KMeansClusterer
import nltk
import pickle
import torch
from sklearn.cluster import KMeans
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
df=pd.read_csv('../datasets/df_full.csv')

#Give retracible ID before shuffling the dataset
row_id = range(0,len(df),1)
df['row_id'] = row_id

df = df.sample(frac=1, random_state=1).reset_index(drop=True)

In [13]:
#Train/Test/Evaluation split on shuffled rows
train_frac = 0.7
test_frac = 0.9

n_rows_train = int(len(df)*train_frac)
n_rows_test = int(len(df)*test_frac)

df_train = df.iloc[:n_rows_train]
df_test = df.iloc[n_rows_train:n_rows_test]
df_eval = df.iloc[n_rows_test:]

In [14]:
def preprocess_for_bert(string):
    string = string.strip("\n.’:")
    string = string.strip("’")
    string = string.strip("\\n")
    string = string.replace("/"," ")
    return(string)

In [15]:
def pivot(df):#Pivots df of vacancy texts into a df of job requirements retrieved from 'list_elements' in vacancy df
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    columns = df.columns.values.tolist()
    df_r = pd.DataFrame(columns = ['requirement_raw','requirement_tokenized']+columns)
    for index, row in df.iterrows():
        string = str(row['list_elements'])
        remove = ['\,','[',']','\\n']
        for char in remove:
            string.replace(char, '')
        string = string.strip('\n')
        splitted = string.split('\'') #list of requirements is stored as string so split on '
        list = []
        for item in splitted:
            if len(item)>2:
                requirement_raw = item
                requirement_tokenized = preprocess_for_bert(item)
                requirement_tokenized = gensim.utils.simple_preprocess(requirement_tokenized)
                row_list = [requirement_raw,requirement_tokenized]
                for col in columns:
                    row_list = row_list+[df.loc[index,col]]          
                df_r.loc[len(df_r)] = row_list
    requirements = []
    for index, row in df_r.iterrows():
        requirements.append(preprocess_for_bert(row['requirement_raw']))
    df_r['requirement']=requirements
    embeddings = model.encode(requirements)
    df_r['embedding'] = embeddings.tolist()
    return df_r, embeddings
result_train = pivot(df_train)
result_test = pivot(df_test)
df_r_train = result_train[0]
embeddings_train = result_train[1] #good to keep the embeddings in mem as array
df_r_test = result_test[0]
embeddings_test = result_test[1]

In [229]:
X_train = np.array(embeddings_train.tolist())
kmeans = KMeans(n_clusters=21, random_state=0).fit(X_train)
#df_r['cluster_k100']= kmeans.labels_
#kmeans.predict([[0, 0], [12, 3]])
#kmeans.cluster_centers_
filename = 'k21.sav'
pickle.dump(kmeans, open(filename, 'wb'))

In [159]:
clusters_test = kmeans.predict(np.array(embeddings_test.tolist()))
df_r_test['cluster_k21']=clusters_test

In [18]:
def pca_embeddings(vectors, name):
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(vectors)
    _df = pd.DataFrame(data = principal_components, columns = [name+'_pc1',name+'_pc2'])
    return(_df)

In [20]:
_df = pca_embeddings(embeddings_test, 'sbert')
df_r_test['sbert_pc1'] = _df.iloc[:,0]
df_r_test['sbert_pc2'] = _df.iloc[:,1]

In [160]:
df_r_test['cluster_k21']= df_r_test['cluster_k21'].astype(str)
fig = px.scatter(df_r_test,'sbert_pc1','sbert_pc2', hover_data = ['requirement_raw'],color = 'cluster_k21' )
fig.update_traces(marker=dict(size=3))
fig.show()

In [22]:
from collections import Counter
def get_bigrams(requirement_list):
    list = []
    for requirement in requirement_list[1]:
        for word in requirement.split():
            word = word.lower()
            if (len(word) >1) & (word not in stop_words):
                list.append(word)
    
    bigrams = zip(list,list[1:])
    bigrams_count = Counter(bigrams)
    return(bigrams_count)


In [187]:
def get_word_count(requirement_list):
    list = []
    for requirement in requirement_list[1]:
        for word in requirement.split():
            word = word.lower()
            if (len(word) >1) & (word not in stop_words):
                list.append(word)
    word_count = Counter(list)
    return(word_count)

In [161]:
def get_cluster_top_n(cluster, n):
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings_test.astype(float)
    top_k = min(n, len(embeddings_test))
    
    cos_scores = util.pytorch_cos_sim(kmeans.cluster_centers_[cluster], embeddings_f)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    for score, idx in zip(top_results[0], top_results[1]):
        #print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
        result_ids.append(idx.item())
        result_requirements.append(df_r_test.at[idx.item(),'requirement'])
        result_scores.append(score.item())
    return([result_ids[:n],result_requirements[:n],result_scores[:n]])
get_cluster_top_n(20,3)

[[3999, 3992, 2604],
 ['Hands-on experience with cloud platforms like AWS, Azure, …',
  'Experience with cloud platforms like AWS and Azure',
  'Experience in a cloud environments i.e. AWS, Azure, GCE, Kubernetes'],
 [0.9156034794824083, 0.914896260603691, 0.8939065190092181]]

In [188]:
for cluster in range(0,21):
    print("cluster "+str(cluster))
    cluster_length = len(df_r_test.loc[df_r_test['cluster_k21']==str(cluster)])
    print("has length "+str(cluster_length))
    print("has most common bigrams")
    print(get_bigrams(get_cluster_top_n(cluster,cluster_length)).most_common(5))
    print("has most common words")
    print(get_word_count(get_cluster_top_n(cluster,cluster_length)).most_common(5))
    print("----------")
df_r_test.head()
df_r_test.dtypes

cluster 0
has length 240
has most common bigrams
[(('communication', 'skills'), 11), (('ability', 'work'), 11), (('team', 'members'), 8), (('data', 'scientists'), 7), (('product', 'teams'), 6)]
has most common words
[('work', 56), ('team', 55), ('teams', 41), ('business', 38), ('stakeholders', 36)]
----------
cluster 1
has length 326
has most common bigrams
[(('data', 'science'), 22), (('scientific', 'technical'), 8), (('data', 'analysis'), 8), (('related', 'field'), 7), (('years', 'experience'), 6)]
has most common words
[('data', 109), ('scientific', 72), ('experience', 71), ('research', 40), ('development', 40)]
----------
cluster 2
has length 154
has most common bigrams
[(('team', 'player'), 18), (('ability', 'work'), 14), (('communication', 'skills'), 10), (('team', 'members'), 7), (('skills', 'ability'), 6)]
has most common words
[('team', 94), ('teams', 43), ('work', 39), ('skills', 26), ('ability', 25)]
----------
cluster 3
has length 350
has most common bigrams
[(('software', 

requirement_raw           object
requirement_tokenized     object
Unnamed: 0                object
dt                        object
url                       object
title                     object
location                  object
country                   object
full_text                 object
list_elements             object
row_id                    object
requirement               object
embedding                 object
cluster_k100              object
sbert_pc1                float32
sbert_pc2                float32
cluster_k21               object
dtype: object

In [261]:
k20_bigrams = {}
for cluster in range(0,21):
    bigrams = get_bigrams(get_cluster_top_n(cluster,50)).most_common(10)
    bigram_string = ""
    for tuple in bigrams:
        bigram_string+=tuple[0][0]+" "+tuple[0][1]+", "
    k20_bigrams[cluster] =bigram_string
    #print(get_bigrams(get_cluster_top_n(cluster,50)).most_common(5))
filename = 'k20_bigrams'
pickle.dump(k20_bigrams, open(filename, 'wb'))

'team player, ability work, communication skills, team environment, excellent communication, collaboration skills, skills ability, within team, communication teamwork, good communication, '

In [25]:
n = 10
most_common_clusters = df_r_test['cluster_k100'].value_counts()[:n].index.tolist()

In [26]:
for cluster in most_common_clusters:
    print(get_bigrams(get_cluster_top_n(int(cluster),100)).most_common(5))

[(('data', 'analysis'), 9), (('machine', 'learning'), 7), (('model', 'performance'), 5), (('data', 'science'), 5), (('statistical', 'methods'), 5)]
[(('data', 'science'), 16), (('large', 'datasets'), 9), (('big', 'data'), 6), (('data', 'engineering'), 6), (('large', 'data'), 5)]
[(('software', 'development'), 18), (('software', 'engineering'), 6), (('development', 'tools'), 6), (('tools', 'processes'), 6), (('processes', '(docker,'), 6)]
[(('r&d', 'projects'), 4), (('multiple', 'projects'), 4), (('identify', 'opportunities'), 4), (('stakeholders', 'throughout'), 4), (('development', 'projects'), 3)]
[(('machine', 'learning'), 90), (('learning', 'techniques'), 20), (('learning', 'algorithms'), 10), (('learning', 'models'), 8), (('data', 'science'), 7)]
[(('working', 'people'), 5), (('good', 'positive'), 3), (('positive', 'impact.'), 3), (('impact.', 'strive'), 3), (('strive', 'transparent'), 3)]
[(('product', 'teams'), 7), (('customer', 'value'), 5), (('product', 'team'), 4), (('stakeho

In [226]:
df_synthetic_requirements = pd.read_csv('../synthetic_requirements_long.csv', sep = ';')

The synthetic requirements were created manually by brainstorming, see visualization. This list is in no way meant to be exhaustive. The idea behind this list is that in order for the output of the model to be useful the number of clusters should be sufficiently large to determine the difference among the synthetic requirements. This will help choosing an appropriate number of clusters.

In [227]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
synthetic_requirements = []
for requirement in df_synthetic_requirements['requirement'].tolist():
    synthetic_requirements.append(preprocess_for_bert(requirement))
synthetic_embeddings = model.encode(synthetic_requirements)
synthetic_clusters =  kmeans.predict(np.array(synthetic_embeddings.tolist()))

In [62]:
r_index = 0
for requirement in df_synthetic_requirements['requirement'].tolist():
    print(requirement)
    print(synthetic_clusters[r_index])
    print(get_bigrams(get_cluster_top_n(synthetic_clusters[r_index],100)).most_common(5))
    print("")
    r_index+=1
    

Cloud Compute with AWS / GCP / Azure
26
[(('experience', 'cloud'), 12), (('google', 'cloud'), 10), (('cloud', 'platform'), 9), (('experience', 'working'), 9), (('platform', 'technology'), 6)]

Amazon web services experience
26
[(('experience', 'cloud'), 12), (('google', 'cloud'), 10), (('cloud', 'platform'), 9), (('experience', 'working'), 9), (('platform', 'technology'), 6)]

Work experience on Google Cloud Compute
26
[(('experience', 'cloud'), 12), (('google', 'cloud'), 10), (('cloud', 'platform'), 9), (('experience', 'working'), 9), (('platform', 'technology'), 6)]

Working with microsoft Azure
26
[(('experience', 'cloud'), 12), (('google', 'cloud'), 10), (('cloud', 'platform'), 9), (('experience', 'working'), 9), (('platform', 'technology'), 6)]

Containterization with Docker / Kubernetes / Kubeflow
24
[(('new', 'technologies'), 5), (('scientific', 'technical'), 5), (('data', 'science'), 5), (('enable', 'acast'), 4), (('drive', 'learn'), 3)]

Virtualization skills
28
[(('software',

In [63]:
def cluster_synthetic(k):
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X_train)
    synthetic_clusters =  kmeans.predict(np.array(synthetic_embeddings.tolist()))
    return synthetic_clusters

In [64]:
len(np.unique(cluster_synthetic(5)))

3

In [228]:
kclusters = range(1,50,1)
df_score = df_synthetic_requirements #make a copy to manipulate
n_synth = []
for i in kclusters:
    print(i)
    arr = cluster_synthetic(i) #train model for this iteration of k-clusters on training data, use it to cluster synthetic r
    df_score[i]=arr #create a column in score dataframe with the cluster for each synthetic requirement for this model instance
    n_synth.append(len(np.unique(arr))) #append the amount of unique clusters for the current model instance to a list

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [230]:
for column in df_score:
    if type(column) == int:
        loss_list = []
        for requirement_index, requirement_row in df_score.iterrows(): #iterate over each row of synthetic df
            df_label = df_score.loc[df_score['label']==requirement_row['label']] #sub dataframe of requirements with same label
            df_unlabeled = df_score.loc[df_score['label']!=requirement_row['label']] #disjoint of above
            loss = 0

            for labeled_index, labeled_row in df_label.iterrows(): # iterate over sub dataframe 
                if labeled_row[column] != requirement_row[column]: # incorrectly not in same cluster
                    loss+=1/len(df_label)
                    
            for unlabeled_index, unlabeled_row in df_unlabeled.iterrows():
                if unlabeled_row[column] == requirement_row[column]: # incorrectly in same cluster
                    loss+=1/len(df_unlabeled)
                    
            loss_list.append(loss)
        df_score['loss_'+str(column)] = loss_list

In [231]:
df_score

Unnamed: 0,requirement,label,1,2,3,4,5,6,7,8,...,loss_40,loss_41,loss_42,loss_43,loss_44,loss_45,loss_46,loss_47,loss_48,loss_49
0,Cloud Compute with AWS / GCP / Azure,1,0,1,2,0,2,3,2,6,...,0.022222,0.0,0.0,0.022222,0.0,0.0,0.0,0.022222,0.0,0.0
1,Amazon web services experience,1,0,1,2,0,2,3,2,6,...,0.022222,0.0,0.0,0.022222,0.0,0.0,0.0,0.022222,0.0,0.0
2,Work experience on Google Cloud Compute,1,0,1,2,0,2,3,2,6,...,0.022222,0.0,0.0,0.022222,0.0,0.0,0.0,0.022222,0.0,0.0
3,Working with microsoft Azure,1,0,1,2,0,2,3,2,6,...,0.022222,0.0,0.0,0.022222,0.0,0.0,0.0,0.022222,0.0,0.0
4,Containterization with Docker / Kubernetes / K...,2,0,1,2,0,2,0,6,0,...,0.816667,0.588889,0.816667,0.566667,0.316667,0.588889,0.611111,0.361111,0.066667,0.75
5,Virtualization skills,2,0,1,2,0,2,0,6,5,...,0.794444,0.75,0.816667,0.75,0.316667,0.75,0.794444,0.361111,0.066667,0.566667
6,Containerizing work environments using Docker,2,0,1,0,0,2,0,1,4,...,0.838889,0.75,0.772222,0.838889,0.75,0.772222,0.75,0.838889,0.066667,0.75
7,Deployment using Kubeflow,2,0,1,2,0,2,0,1,3,...,0.838889,0.588889,0.838889,0.566667,0.316667,0.588889,0.611111,0.361111,0.066667,0.566667
8,Automation / Scripting Linux proficiency / Bin...,3,0,1,2,0,2,0,6,0,...,0.294444,0.5,0.294444,0.272222,0.5,0.25,0.5,0.361111,0.5,0.522222
9,Automating workflows using shell scripting tools,3,0,1,2,0,2,0,6,0,...,0.772222,0.75,0.772222,0.272222,0.75,0.772222,0.75,0.361111,0.75,0.522222


In [223]:
df_score.iloc[:, 103:].mean().plot(kind='bar')

In [67]:
px.line(x=kclusters,y=n_synth)

In [None]:
for cluster in range(0,99):
    print(get_bigrams(get_cluster_top_n(cluster,100)).most_common(5))


In [157]:
def get_clusterer_top_n(cluster, n, n_clusters):
    clusterer = KMeans(n_clusters=n_clusters, random_state=0).fit(X_train)
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings_test.astype(float)
    top_k = min(n, len(embeddings_test))
    
    
    cos_scores = util.pytorch_cos_sim(clusterer.cluster_centers_[cluster], embeddings_f)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    for score, idx in zip(top_results[0], top_results[1]):
        #print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
        result_ids.append(idx.item())
        result_requirements.append(df_r_test.at[idx.item(),'requirement'])
        result_scores.append(score.item())
    return([result_ids[:n],result_requirements[:n],result_scores[:n]])
get_clusterer_top_n(20,3,21)

[[3999, 3992, 2604],
 ['Hands-on experience with cloud platforms like AWS, Azure, …',
  'Experience with cloud platforms like AWS and Azure',
  'Experience in a cloud environments i.e. AWS, Azure, GCE, Kubernetes'],
 [0.9156034794824083, 0.914896260603691, 0.8939065190092181]]

In [None]:
clusterer_21 = KMeans(n_clusters = 21, random_state=0).fit(X_train)


In [190]:
df_score.head(10)

Unnamed: 0,requirement,label,10,15,20,score_10,score_15,score_20,1,2,...,score_40,score_41,score_42,score_43,score_44,score_45,score_46,score_47,score_48,score_49
0,Cloud Compute with AWS / GCP / Azure,1,7,12,11,0.777778,0.977778,1.0,0,1,...,0.977778,1.0,1.0,0.977778,1.0,1.0,1.0,0.977778,1.0,1.0
1,Amazon web services experience,1,7,12,11,0.777778,0.977778,1.0,0,1,...,0.977778,1.0,1.0,0.977778,1.0,1.0,1.0,0.977778,1.0,1.0
2,Work experience on Google Cloud Compute,1,7,12,11,0.777778,0.977778,1.0,0,1,...,0.977778,1.0,1.0,0.977778,1.0,1.0,1.0,0.977778,1.0,1.0
3,Working with microsoft Azure,1,7,12,11,0.777778,0.977778,1.0,0,1,...,0.977778,1.0,1.0,0.977778,1.0,1.0,1.0,0.977778,1.0,1.0
4,Containterization with Docker / Kubernetes / K...,2,7,12,4,0.777778,0.161111,0.616667,0,1,...,0.183333,0.411111,0.183333,0.433333,0.683333,0.411111,0.388889,0.638889,0.933333,0.25
5,Virtualization skills,2,7,7,4,0.777778,0.388889,0.616667,0,1,...,0.205556,0.25,0.183333,0.25,0.683333,0.25,0.205556,0.638889,0.933333,0.433333
6,Containerizing work environments using Docker,2,7,10,15,0.777778,0.25,0.25,0,1,...,0.161111,0.25,0.227778,0.161111,0.25,0.227778,0.25,0.161111,0.933333,0.25
7,Deployment using Kubeflow,2,7,7,4,0.777778,0.388889,0.616667,0,1,...,0.161111,0.411111,0.161111,0.433333,0.683333,0.411111,0.388889,0.638889,0.933333,0.433333
8,Automation / Scripting Linux proficiency / Bin...,3,7,7,4,0.777778,0.933333,0.888889,0,1,...,0.705556,0.5,0.705556,0.727778,0.5,0.75,0.5,0.638889,0.5,0.477778
9,Automating workflows using shell scripting tools,3,7,7,4,0.777778,0.933333,0.888889,0,1,...,0.227778,0.25,0.227778,0.727778,0.25,0.227778,0.25,0.638889,0.25,0.477778
