In [2]:
import pandas as pd
pd.options.plotting.backend = "plotly" #interactive plots will be useful in this context
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import gensim
from sentence_transformers import SentenceTransformer, util
import pickle
import torch
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv('../datasets/df_full.csv')
df_r=pd.read_csv('../datasets/df_k31_k39.csv')
df_cv = pd.read_csv('../datasets/df_cv.csv')
df_handbook = pd.read_csv('../datasets/df_handbook_DS_60.csv')
k39_umap = pickle.load(open('../Model_Selection/k_39_umap', 'rb'))
k39_full = pickle.load(open('../Model_Selection/k_39_full', 'rb'))
k31_umap = pickle.load(open('../Model_Selection/k_31_umap', 'rb'))
k31_full = pickle.load(open('../Model_Selection/k_31_full', 'rb'))

In [4]:
list_of_vec = []
job_list = []
for index, row in df_r.iterrows():
    job = row['url']
    if job not in job_list:
        job_list.append(job)
print(len(job_list))
for job in job_list:
    df_r_job = df_r[df_r.url == job]
        
    cluster_list = []
    cluster_vec = []

    for i, row_r in df_r_job.iterrows():
        cluster = row_r['cluster_k31_full']
        if cluster not in cluster_list:
            cluster_list.append(cluster)
            
    cluster_vec.append(job)
    for i in range(0,31):
        if(i in cluster_list):
            cluster_vec.append(1)
        else:
            cluster_vec.append(0)
    list_of_vec.append(cluster_vec)

cols = []
cols.append('job_link')
for i in range(0,31):
    cols.append(str(i))
df_clusters = pd.DataFrame(list_of_vec, columns = cols)

1086


In [5]:
df_clusters

Unnamed: 0,job_link,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,30
0,https://se.indeed.com/viewjob?jk=b0669075c820856d,0,0,0,0,0,0,1,0,0,...,1,1,0,1,1,0,0,0,0,0
1,https://se.indeed.com/viewjob?jk=dbfe7d7c087065d7,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,1,0,0,0,0
2,https://se.indeed.com/viewjob?jk=841a8fdd1bb00f2f,0,1,1,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
3,https://se.indeed.com/viewjob?jk=88c25a48d2da8a7a,1,0,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
4,https://se.indeed.com/viewjob?jk=1b4b8ee005f06350,1,0,0,0,1,0,0,0,0,...,1,0,0,0,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,https://www.indeed.com/viewjob?jk=e6fee8e1f133...,0,0,0,0,0,0,0,1,1,...,0,0,1,0,1,0,0,0,1,1
1082,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,0,0,0,0,0,0,0,1,1,...,1,1,0,1,1,1,0,0,0,1
1083,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
1084,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,0,0,0,0,0,0,1,0,1,...,0,0,1,1,0,0,0,0,0,1


In [6]:
from collections import Counter
def get_bigrams(requirement_list):
    list = []
    for requirement in requirement_list:
        for word in requirement.split():
            word = word.lower()
            if (len(word) >1) & (word not in stop_words):
                list.append(word)
    
    bigrams = zip(list,list[1:])
    #print(list)
    bigrams_count = Counter(bigrams)
    return(bigrams_count)

In [7]:
def get_label(cluster):
    df_r_cluster = df_r[df_r.cluster_k31_full == cluster]
    cluster_requirement_list = []
    for i, row in df_r_cluster.iterrows():
        cluster_requirement_list.append(row['requirement'])
    tuple = get_bigrams(cluster_requirement_list).most_common(3)
    label = ""
    for bigram in range(3):   
        for word in tuple[bigram][0]:
            if bigram ==0:
                label+='<b>'+word+"</b> "
            else:
                label+=word+" "
        label=label[:-1]
        label+=", "
    label = label[:-2]
    label = label.replace(",,",",")
    return(label)
get_label(2)

'<b>ability</b> <b>work</b>, work independently, fast-paced environment'

In [11]:
x = []
y = []
for i in range(0,len(k31_full.cluster_centers_)):
    y.append(get_label(i))
    x.append(df_clusters[str(i)].mean())
    
sorted = list(zip(x,y))
sorted.sort()
x=[]
y=[]
for tuple in sorted:
    x.append(tuple[0])
    y.append(tuple[1])

fig = go.Figure([go.Bar(x=x, y=y,orientation='h')])
fig.update_layout(height=800, \
                  title = '3 most common bigrams of each cluster and relative cluster precense in job postings',\
                  barmode='stack',\
                  xaxis_title="Relative Cluster Presence in Job Postings",\
                  yaxis_title="3 most common Bigrams in Cluster",)
fig.show()

In [12]:
y = []
x=[]
for i in range(0,len(k31_full.cluster_centers_)):
    y.append(get_label(i))
    x.append(df_clusters[str(i)].mean())
filename = 'cluster_label_bigrams'
pickle.dump(y, open(filename, 'wb'))
filename = 'cluster_importance'
pickle.dump(x, open(filename, 'wb'))

In [11]:
cv_koen = ['Data Driven Process Technologist',\
         'Partly responsible for product quality and technological yield of cheese production process',\
         'By Data Driven approach I automated 75/% of my original role and realized a yield improvement of > €500.000 / year',\
         'Responsible for setting up and maintaining on-the-job training programs and masterclasses for on-boarding and up-skilling of operators',\
         'After working as an operator I took advantage of a growth opportunity to lead a team of 7 operators in 3 production departments',\
         'Training & Education specialist production',\
         'Development of physics lab excercise program',\
         'Mentor of freshman students post-secondary vocational education',\
         'Data Science (MSc) IU International University',\
         'Environment: Python', 'SQL', 'Visual Basic, M, Dax',\
         'Jupyter Notebook', 'Dataiku Data Science Studio', 'Docker',\
         'Dashboarding: Power BI (expert) / Tibco Spotfire',\
         'English', 'Dutch']

In [12]:
def get_df(cv, plot):
    df_sim = pd.DataFrame()
    df_sim['cluster'] = range(len(k31_full.cluster_centers_))
    
    labels = []
    presence = []
    
    for i in range(0,len(k31_full.cluster_centers_)):
        labels.append(get_label(i))
        presence.append(df_clusters[str(i)].mean())
    
    model = 'all-distilroberta-v1'
    model = SentenceTransformer(model)
    embeddings_cv = model.encode(cv)
    embeddings_f = embeddings_cv.astype(float)
    clusters_cv = k31_full.predict(embeddings_f)
    clusters_cv_l  = clusters_cv.tolist()
    
    cv_scores = []
    for i, cluster in enumerate(clusters_cv):
        cv_scores.append(util.pytorch_cos_sim(k31_full.cluster_centers_[cluster], embeddings_f[i]).item())
    
    scores = []
    for cluster in range(len(k31_full.cluster_centers_)):
        if cluster not in clusters_cv_l:
            scores.append(0)
        else:
            score = 0
            indexes = np.where(clusters_cv==cluster)[0]
            for i in indexes:
                if cv_scores[i] > score:
                    score = cv_scores[i]
            scores.append(score)   
    
    df_sim['score'] = scores
    df_sim['presence'] = presence
    df_sim['labels'] = labels
    
    df_sim['CV_similarity'] = df_sim['presence']*df_sim['score']
    df_sim['Presence_Cluster_in_Job_Postings'] = df_sim['presence']-df_sim['CV_similarity']
    df_sim = df_sim.sort_values('presence')
    
    if plot:
        fig = px.bar(df_sim, y="labels", x=["CV_similarity","Presence_Cluster_in_Job_Postings"], hover_data = ['presence'])
        fig.update_layout(height=800, \
                          title = 'Author\'s CV similarity to requirement clusters in context of relative cluster presence',\
                          barmode='stack', \
                          yaxis_title="3 most common Bigrams in Cluster",\
                          xaxis_title="CV Similarity to Cluster")
        fig.show()
        fig = px.bar(df_sim, y="labels", x=["CV_similarity","Presence_Cluster_in_Job_Postings"], hover_data = ['presence'])
        fig.show()
    return df_sim
    
df_sim = get_df(cv_koen, plot = True)

In [19]:
df_cv.head()
len(df_cv)

5738

In [None]:
scores = []
for index, row in df_cv.iterrows:
    scores.append()

In [14]:
df_candidate = df_cv[df_cv['id']==8]
skills = df_candidate['skill'].to_list()

In [15]:
df_sim = plot_cv(skills)
df_sim['CV_similarity'].sum()/df_sim['presence'].sum()

NameError: name 'plot_cv' is not defined

In [18]:
df_cv.head()

Unnamed: 0.1,Unnamed: 0,id,title,skill,cluster,embedding
0,0,0,Data Scientist/Analytics Consultant,"design sql, r & python scripts to directly qu...",11,"[0.020906170830130577, -0.02053937129676342, -..."
1,1,0,Data Scientist/Analytics Consultant,aid in the development of predictive modeling...,6,"[0.045750461518764496, -0.03818187490105629, -..."
2,2,0,Data Scientist/Analytics Consultant,test model fit using cross-validation and boo...,19,"[-0.011868440546095371, -0.0615849532186985, -..."
3,3,0,Data Scientist/Analytics Consultant,develop in an scrum environment with team mem...,16,"[0.04099218547344208, -0.02379242517054081, 0...."
4,4,0,Data Scientist/Analytics Consultant,"model data using pandas/r dataframes, powerpi...",19,"[-0.0024511003866791725, -0.02079622820019722,..."


In [17]:
df_sim_all = pd.DataFrame()

for i in df_cv['id'].unique():
    df_ = df_cv[df_cv['id']==i]
    skills = df_['skill'].to_list()
    if len(df_sim_all)==0:
        df_sim_all = get_df(skills,False)
    else:
        df_sim_all = df_sim_all.append(get_df(skills,False),ignore_index=True)
    print(i)

0
1
2
3
4
5


KeyboardInterrupt: 

In [None]:
len(df_sim_all)

In [None]:
df_sim_all.to_csv('../datasets/df_sim_cv_all.csv')

In [None]:
df_sim_all[df_sim_all['cluster']==1]

In [None]:
print(df_sim_all.columns)

In [None]:
df_sim_avg = pd.DataFrame(columns = df_sim_all.columns)
for cluster in range(len(k31_full.cluster_centers_)):
    df_=df_sim_all[df_sim_all['cluster']==cluster]
    df_ = df_.mean()
    df_['labels'] = get_label(cluster)
    df_sim_avg = df_sim_avg.append(df_, ignore_index = True)
df_sim_avg = df_sim_avg.sort_values('presence')
df_sim_avg.mean()

In [None]:
fig = px.bar(df_sim_avg, y="labels", x=["CV_similarity","Presence_Cluster_in_Job_Postings"], hover_data = ['presence'])
fig.update_layout(height=800, \
                          title = 'Mean similarity of 65 Data Scientist CV\'s to requirement clusters in context of relative cluster presence',\
                          barmode='stack',\
                          yaxis_title="3 most common Bigrams in Cluster",\
                          xaxis_title="CV Similarity to Cluster")
fig.show()

In [None]:
df_handbook = get_df(df_handbook['objective'].to_list(),plot = True)

In [None]:
handbook_objective_similarity = df_handbook['CV_similarity'].to_list()

In [None]:
df_sim_avg['handbook_similarity'] = handbook_objective_similarity

In [None]:
df_sim_avg

In [None]:
fig = px.bar(df_sim_avg, y="labels", x=["handbook_similarity","CV_similarity","presence"],\
             color_discrete_sequence =['green','blue','red'])
fig.update_layout(height=850, \
                          title = 'Similarity of CV\'s and Learning Curriculum to Requirements in Job Postings',\
                          barmode='group',\
                          yaxis_title="3 most common Bigrams in Cluster",\
                          xaxis_title="CV Similarity to Cluster")
fig.show()