In [2]:
import pandas as pd
pd.options.plotting.backend = "plotly" #interactive plots will be useful in this context
import plotly.express as px
import numpy as np
import gensim
from sklearn.decomposition import PCA
import sent2vec
from sent2vec.vectorizer import Vectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
from nltk.cluster import KMeansClusterer
import nltk
import torch
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df=pd.read_csv('../Data_Acquisition/df_toy.csv')
row_id = range(0,len(df),1)
df['row_id'] = row_id
df.head()

Unnamed: 0.1,Unnamed: 0,dt,url,title,location,country,full_text,list_elements,row_id
0,0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0
1,1,2021-08-08 21:01:09.824428,https://se.indeed.com/viewjob?jk=dbfe7d7c087065d7,"Data Scientist, Search Insights",Stockholm,se,"Data, Research & Insights\nData Science\nThe P...",['\nExperimentation strategy including AB test...,1
2,2,2021-08-08 21:01:18.030788,https://se.indeed.com/viewjob?jk=841a8fdd1bb00f2f,Junior data scientist,411 21 Göteborg,se,Utilifeed is a rapidly expanding startup with ...,"['\nImproving pour data ingestion pipeline, we...",2
3,3,2021-08-08 21:01:20.651507,https://se.indeed.com/viewjob?jk=88c25a48d2da8a7a,Data Scientist,118 72 Stockholm,se,"AI Data scientistLocation: Stockholm, SwedenDi...",['Proven experience in Machine Learning (ML) a...,3
4,4,2021-08-08 21:01:27.135797,https://se.indeed.com/viewjob?jk=1b4b8ee005f06350,Data Scientist,Solna,se,"Currently, we are looking for a talented data ...","['\nYou have a start-up mentality, with delive...",4


In [4]:
def preprocess_for_bert(string):
    string = string.strip("\n.’:")
    string = string.strip("’")
    string = string.strip("\\n")
    string = string.replace("/"," ")
    return(string)

In [5]:
#Pivots df of vacancy texts into a df of job requirements retrieved from 'list_elements' in vacancy df
columns = df.columns.values.tolist()
df_r = pd.DataFrame(columns = ['requirement_raw','requirement_tokenized']+columns)
for index, row in df.iterrows():
    string = row['list_elements']
    remove = ['\,','[',']','\\n']
    for char in remove:
        string.replace(char, '')
    string = string.strip('\n')
    splitted = string.split('\'') #list of requirements is stored as string so split on '
    list = []
    for item in splitted:
        if len(item)>2:
            requirement_raw = item
            requirement_tokenized = preprocess_for_bert(item)
            requirement_tokenized = gensim.utils.simple_preprocess(requirement_tokenized)
            row_list = [requirement_raw,requirement_tokenized]
            for col in columns:
                row_list = row_list+[df.loc[index,col]]          
            df_r.loc[len(df_r)] = row_list 

In [6]:
requirements = []
for index, row in df_r.iterrows():
    requirements.append(preprocess_for_bert(row['requirement_raw']))
df_r['requirement'] = requirements

In [7]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
sentences = requirements
embeddings = model.encode(sentences)

In [8]:
df_r['embeddings'] = embeddings.tolist()

In [9]:
def pca_embeddings(vectors, name):
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(vectors)
    _df = pd.DataFrame(data = principal_components, columns = [name+'_pc1',name+'_pc2'])
    return(_df)

In [10]:
_df = pca_embeddings(embeddings, 'sbert')
df_r['sbert_pc1'] = _df.iloc[:,0]
df_r['sbert_pc2'] = _df.iloc[:,1]

In [11]:
X = vector_list = np.array(embeddings.tolist())
n_clusters = 25
kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance,repeats=25,avoid_empty_clusters=True)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

df_r['cluster_sbert'] = pd.Series(assigned_clusters, index=df_r.index)

In [12]:
X = vector_list = np.array(embeddings.tolist())
n_clusters = 50
k50 = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance,repeats=25,avoid_empty_clusters=True)
assigned_clusters = k50.cluster(X, assign_clusters=True)

df_r['cluster_k50'] = pd.Series(assigned_clusters, index=df_r.index)

In [13]:
df_r[df_r['cluster_k50']]

KeyError: "None of [Int64Index([17,  9,  0,  0,  0,  5, 43,  4, 26, 26,\n            ...\n            22, 37, 46, 21, 11, 42, 42, 47, 11, 27],\n           dtype='int64', length=2110)] are in the [columns]"

In [14]:
clusters = kclusterer.means()

In [223]:
c50 = k50.means()

In [15]:
def get_most_similar(query, n):
    top_k = min(n, len(embeddings))
    query_embedding = model.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    for score, idx in zip(top_results[0], top_results[1]):
        print(df_r.at[idx.item(),'requirement']+" "+str(score.item()))
        
queries = ['proficient in python and R', 'work hard play hard', 'statistics, modelling and math', 'this should be an outlier']
for q in queries: 
    get_most_similar(q,3)

Proficient in using Python, R 0.9712088704109192
Python and or R skills 0.8707777261734009
Programming skills in R, Python or a similar language 0.8440821170806885
Strong work ethic and a can-do attitude 0.5846492052078247
Ability to work effectively & independently 0.5582389831542969
Diverse and challenging tasks 0.5527569055557251
Strong Mathematical Background: statistics, probability, and ideally operations research 0.6722531318664551
Experience with advanced analytical and statistical modelling techniques 0.6244396567344666
Strong background in Statistics and Mathematics 0.6138238310813904
t care if you are weak in other areas",  0.36669251322746277
Document the limitations of the approaches and recommend constraints to improve the performance of the Anomaly Detection algorithms 0.35262495279312134
A big data and analytics nerd 0.3523988723754883


In [16]:
distances = []
for index, row in df_r.iterrows():
    cluster = int(row.loc['cluster_sbert'])
    cos_score = util.pytorch_cos_sim(clusters[cluster], embeddings[index].astype(float))
    cos_score = cos_score[0][0].item()
    distances.append(cos_score)

df_r['distance_from_centroid'] = distances

In [17]:
def get_cluster_top_n(cluster, n):
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings.astype(float)
    top_k = min(n, len(embeddings))
    
    cos_scores = util.pytorch_cos_sim(kclusterer.means()[cluster], embeddings_f)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    for score, idx in zip(top_results[0], top_results[1]):
        #print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
        result_ids.append(idx.item())
        result_requirements.append(df_r.at[idx.item(),'requirement'])
        result_scores.append(score.item())
    return([result_ids[:n],result_requirements[:n],result_scores[:n]])
get_cluster_top_n(20,3)

[[1587, 731, 156],
 ['Experience with Python programming',
  'Experience in any programming language (Python, R, …)',
  'Strong coding skills in Python and the Python data ecosystem, including: pandas, numpy, scikit-learn, scipy, SQL, visualization tools (matplotlib, seaborn, bokeh, plotly, etc.)'],
 [0.8331620578856123, 0.8322848577996512, 0.8025982636413285]]

In [23]:
from collections import Counter
def get_bigrams(requirement_list):
    list = []
    for requirement in requirement_list[1]:
        for word in requirement.split():
            word = word.lower()
            if (len(word) >1) & (word not in stop_words):
                list.append(word)
    
    bigrams = zip(list,list[1:])
    bigrams_count = Counter(bigrams)
    return(bigrams_count)

In [24]:
#get_bigrams(get_cluster_top_n(26,3)).most_common(3)
for cluster in range(0,24):
    print(get_bigrams(get_cluster_top_n(cluster,10)).most_common(3))

[(('machine', 'learning'), 10), (('experience', 'machine'), 3), (('data', 'science'), 3)]
[(('develop', 'processes'), 2), (('processes', 'tools'), 2), (('tools', 'monitor'), 2)]
[(('temporarily', 'due'), 7), (('due', 'covid-19'), 7), (('covid-19', 'temporarily'), 6)]
[(('communicating', 'analysis'), 3), (('stakeholders', 'possibility'), 2), (('possibility', 'involved'), 2)]
[(('competitive', 'salary'), 3), (('benefits', 'package'), 2), (('package', 'competitive'), 2)]
[(('(preferred)",', '(preferred)",'), 2), (('(preferred)",', 'preferred'), 1), (('preferred', 'privacy'), 1)]
[(('work', 'product'), 2), (('product', 'managers,'), 2), (('managers,', 'data'), 2)]
[(('share', 'present'), 2), (('present', 'ideas'), 2), (('ideas', 'successes'), 2)]
[(('excellent', 'collaboration'), 2), (('collaboration', 'skills'), 2), (('skills', 'working'), 2)]
[(('innovative', 'work'), 2), (('creativity,', 'insight'), 2), (('insight', 'passion'), 2)]
[(('experience', 'working'), 3), (('prior', 'experience

In [14]:
def get_cluster_bottom_n(cluster, n):
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings.astype(float)
    top_k = min(n, len(embeddings))
    
    cos_scores = util.pytorch_cos_sim(kclusterer.means()[cluster], embeddings_f)[0]
    top_results = torch.sort(cos_scores)
#    for score, idx in zip(top_results[0][:10], top_results[1][:10]):
    for score, idx in zip(top_results[0], top_results[1]):
        if idx.item() in df_r[(df_r.cluster_sbert == '0')].index:
            #print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
            result_ids.append(idx.item())
            result_requirements.append(df_r.at[idx.item(),'requirement'])
            result_scores.append(score.item())
    return([result_ids[-n:],result_requirements[-n:],result_scores[-n:]])


In [122]:
def get_cluster_sorted(cluster):
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings.astype(float)
    
    cos_scores = util.pytorch_cos_sim(kclusterer.means()[cluster], embeddings_f)[0]
    top_results = torch.sort(cos_scores)
#    for score, idx in zip(top_results[0][:10], top_results[1][:10]):
    for score, idx in zip(top_results[0], top_results[1]):
        if idx.item() in df_r[(df_r.cluster_sbert == '0')].index:
            #print(df_r.at[idx.item(),'requirement']+" Score: "+str(score.item()))
            result_ids.append(idx.item())
            result_requirements.append(df_r.at[idx.item(),'requirement'])
            result_scores.append(score.item())
    return([result_ids,result_requirements,result_scores])

In [16]:
def score_requirements_by_centroid(cluster):
    result_ids = []
    result_scores = []
    result_requirements = []
    embeddings_f = embeddings.astype(float)
    
    cos_scores = util.pytorch_cos_sim(kclusterer.means()[cluster], embeddings_f)[0]
    top_results = torch.sort(cos_scores, descending=True)

    for score, idx in zip(top_results[0], top_results[1]):
        result_ids.append(idx.item())
        result_requirements.append(df_r.at[idx.item(),'requirement'])
        result_scores.append(score.item())
    return[result_ids,result_requirements,result_scores]

In [17]:
results = score_requirements_by_centroid(0)
for result in results[0]:
    requirement = results[1][results[0].index(result)]
    score = results[2][results[0].index(result)]
    print(str(score)+" "+requirement)

0.9129305463523905 Experience with Machine Learning
0.890990087143819 Machine Learning
0.8756335145931344 Desired experience in machine learning ;
0.867906864484677 Experience with data science and machine learning methods
0.8627440865939148 Proven experience in Machine Learning (ML) and data science
0.8605188189859011 Passion for machine learning and data science
0.8528523405369661 Developing machine learning enhanced data products
0.8501203358781196 Experience with data analysis and machine learning libraries
0.8481442059829636 Working experience with state-of-the-art machine learning models in particular deep learning, CNN, LSTM, probabilistic models, etc
0.8481442059829636 Working experience with state-of-the-art machine learning models in particular deep learning, CNN, LSTM, probabilistic models, etc
0.8469897273170661 Strong knowledge of state-of-the-art machine learning and statistical methods
0.8460564803334591 Machine learning to improve the user experience
0.8460564803334591 

0.4835999412106789 Perform complex data analysis, communicate insights tailored to different audiences, and make recommendations based on your findings that contribute to company strategic decisions. We strive to preach and practice decision intelligence in organizational decision-making, and as a member of Data & Insight, you play a key role in these efforts
0.4835999412106789 Perform complex data analysis, communicate insights tailored to different audiences, and make recommendations based on your findings that contribute to company strategic decisions. We strive to preach and practice decision intelligence in organizational decision-making, and as a member of Data & Insight, you play a key role in these efforts
0.48348542786413795 Experience with visualization tools (e.g. Tableau, PowerBI, Metabase, QLik)
0.48190844987301845 You will be doing data analysis for different clients
0.48188740009586817 Hands-on experience with production-level data science projects
0.48185919068531946 Da

0.3125532720245764 Extracting value from microbiome and medical data with statistical methods
0.3124449321838391 You feel confident when interacting with senior management stakeholders, can quickly digest new information, have a keen attention to detail and are able to develop strategic concepts
0.3122606438046852 Training with a personal trainer once a week
0.31194881643941597 Putting analytical solutions into productio
0.31174384928635945 A setting that allows for concentrated work and encourages space for your ideas
0.3116436570900869 Training and conferences;
0.3111609527239266 Clear, efficient communication skills
0.31085143694326267 You constantly observe the market and optimize existing models and technology
0.31078575122276897 Excellent personal and professional development possibilities
0.31078575122276897 Excellent personal and professional development possibilities
0.31072485010795003 Lead data mining and collection procedures
0.3106958505906645 PhD or MS degree in Engineeri

0.12506402314751083 You bring passion and playfulness to your work and those around you
0.12494473029337895 PRIVACY PROTECTION
0.12491786121944656 Lots of good coffee
0.12486668214407598 , "\nA master
0.12474184331408553 Automatic Warranty Claim Approval
0.12474184331408553 Automatic Warranty Claim Approval
0.12470943669852277 International and dynamic environment (over 29 nationalities and 24 languages spoken!)
0.12165387843882279 The positions are remote freelance roles with a duration of 2 years.
0.12109819652540058 s trust brand for online commerce in the heart of Cologne", 
0.11781623425334159 Willingness to work in a multicultural multilingual environment
0.11679381450321054 Fluent in English, both written and orally. Dutch, French, German, Korean, Japanese, Chinese capabilities are an asset
0.11653859685085885 Stakeholder management
0.11558611391829272 Fluent FR and EN
0.1153740938582365 Yes
0.1153740938582365 Yes
0.1153740938582365 Yes
0.11519623472366013 MSc Degree
0.114784968

In [159]:
result = get_cluster_top_n(0,10)

bins = np.array([0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95])
for bin in bins:
    

x = np.array(results[2])
bins = np.digitize(x,bins,right=True)
print(bins)

for i in range (0,len(result[0])):
    score = result[2][i]
    requirement = result[1][i]
    bin = bins[i]
    print(str(score)+" "+requirement+ " "+ str(bin))
    
    

[6 6 6 ... 0 0 0]
0.9129305463523905 Experience with Machine Learning 6
0.890990087143819 Machine Learning 6
0.8756335145931344 Desired experience in machine learning ; 6
0.867906864484677 Experience with data science and machine learning methods 6
0.8627440865939148 Proven experience in Machine Learning (ML) and data science 5
0.8605188189859011 Passion for machine learning and data science 5
0.8528523405369661 Developing machine learning enhanced data products 5
0.8501203358781196 Experience with data analysis and machine learning libraries 5
0.8481442059829636 Working experience with state-of-the-art machine learning models in particular deep learning, CNN, LSTM, probabilistic models, etc 5
0.8481442059829636 Working experience with state-of-the-art machine learning models in particular deep learning, CNN, LSTM, probabilistic models, etc 5


In [18]:
df_eval = pd.DataFrame(columns = ['cluster','requirement','result'])
for centroid in range(0,n_clusters):
    results = score_requirements_by_centroid(centroid)
    print(centroid)
    for result in results[0][:100]:
        results = score_requirements_by_centroid(centroid)
        requirement = results[1][results[0].index(result)]
        score = results[2][results[0].index(result)]
        new_row = {'cluster':centroid, 'requirement':requirement, 'score':score}
        df_eval = df_eval.append(new_row, ignore_index = True)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [35]:
df_labeled = pd.read_csv('evaluation_sample_labeled.csv', sep = ';')
df_labeled = df_labeled.dropna()
df_labeled.plot.scatter("score","label")

In [107]:
from sklearn.linear_model import LogisticRegression

X = df_labeled['score'].to_numpy()
y = df_labeled['label'].to_numpy()
logistic_regression = LogisticRegression(random_state=0).fit(X.reshape(-1,1), y)

In [221]:
x = np.linspace(-5,5,100)
y = logistic_regression.predict_proba(x.reshape(-1,1))[:,1]
print(y)
px.line(x =x,y =y)

[9.77682736e-08 1.30595867e-07 1.74445960e-07 2.33019568e-07
 3.11260392e-07 4.15772074e-07 5.55375550e-07 7.41853546e-07
 9.90945045e-07 1.32367366e-06 1.76812203e-06 2.36180196e-06
 3.15482037e-06 4.21410816e-06 5.62906890e-06 7.51912397e-06
 1.00437906e-05 1.34161433e-05 1.79207934e-05 2.39379040e-05
 3.19752634e-05 4.27111210e-05 5.70513917e-05 7.62060455e-05
 1.01791102e-04 1.35964795e-04 1.81609327e-04 2.42573396e-04
 3.23995738e-04 4.32736527e-04 5.77952250e-04 7.71861164e-04
 1.03076141e-03 1.37638322e-03 1.83768136e-03 2.45320487e-03
 3.27421907e-03 4.36879877e-03 5.82715995e-03 7.76854236e-03
 1.03499841e-02 1.37773134e-02 1.83185724e-02 2.43198024e-02
 3.22225246e-02 4.25811634e-02 5.60768602e-02 7.35214285e-02
 9.58417049e-02 1.24031013e-01 1.59052945e-01 2.01686948e-01
 2.52320066e-01 3.10717059e-01 3.75835680e-01 4.45775554e-01
 5.17930638e-01 5.89345604e-01 6.57183572e-01 7.19155357e-01
 7.73780635e-01 8.20434013e-01 8.59216564e-01 8.90738392e-01
 9.15893264e-01 9.356750

In [20]:
closest = []
closest_m1 = []
closest_m2 = []
for index, row in df_eval.iterrows():
    cluster = row['cluster']
    results = get_cluster_top_n(cluster, 3)
    closest.append(results[1][0])
    closest_m1.append(results[1][1])
    closest_m2.append(results[1][2])
df_eval['closest_to_centroid'] = closest
df_eval['closest_to_centroid_m1'] = closest_m1
df_eval['closest_to_centroid_m2'] = closest_m2

In [21]:
df_eval = df_eval.sort_values(by=['score'], ascending = False)
df_sample = df_eval.sample(weights = 'score', frac = 0.1)
df_sample.to_csv('evaluation_sample')

In [22]:
print(df_sample.head(25))

     cluster                                        requirement result  \
270        2  Manage and drive interactions between the LIMS...    NaN   
1240      12  Familiarity with deployment workflow tools: Az...    NaN   
609        6  You get a competitive compensation package, in...    NaN   
504        5                        Temporarily due to COVID-19    NaN   
744        7                            Problem-solving ability    NaN   
680        6  The possibility to take stock in the success o...    NaN   
2439      24  Strong Analytical skills – has ability to star...    NaN   
1307      13  Experience with cloud platforms (AWS, Google C...    NaN   
938        9  Identify opportunities for leveraging company ...    NaN   
449        4  Are able to communicate and empathize with cus...    NaN   
1455      14  Building, setting up and managing data ingesti...    NaN   
460        4  Supporting Sartorius sales teams as subject ma...    NaN   
1488      14  Responsible for data onb

In [23]:
def cluster_similarity(cluster):
    result_ids = []
    result_scores = []
    cos_scores = util.pytorch_cos_sim(clusters[cluster],clusters)
    sorted_results = torch.sort(cos_scores, descending = True)
    for score, idx in zip(sorted_results[0][0], sorted_results[1][0]):
        result_ids.append(idx.item())
        result_scores.append(score.item())
    return[result_ids,result_scores]
    
#closest cluster to cluster
cluster_similarity(1)[1][1]

#now find a n_clusters that matches the desired similarity!

0.7455646928315189

In [24]:
closest_score_sum = 0
for cluster in range(0,n_clusters-1):
    closest_score = cluster_similarity(cluster)[1][1]
    closest_score_sum+=closest_score
closest_score_sum/n_clusters

0.6249182374330726

In [25]:
def describe_requirement(query, n):
    cluster = kclusterer.classify_vectorspace(model.encode(query)) 
    print("Query "+query+" is most similar to these requirements are")
    get_most_similar(query,3)
    
    print("\n")
    print("This query belongs to K-cluster "+str(cluster))
    print("The sentences closest to the centroid of this cluster are ")
    get_cluster_top_n(cluster, 3)

In [26]:
describe_requirement("Proficient in SQL",3)

Query Proficient in SQL is most similar to these requirements are
Proficient in using SQL, 0.9665846228599548
Excellent SQL skills 0.8933648467063904
Advanced SQL 0.8318519592285156


This query belongs to K-cluster 21
The sentences closest to the centroid of this cluster are 


In [27]:
df_r['cluster_sbert']= df_r['cluster_sbert'].astype(str)
fig = px.scatter(df_r,'sbert_pc1','sbert_pc2', hover_data = ['requirement_raw'],color = 'cluster_sbert' )
fig.show()

In [235]:
df_r['cluster_k50']= df_r['cluster_k50'].astype(str)
fig = px.scatter(df_r,'sbert_pc1','sbert_pc2', hover_data = ['requirement'],color = 'cluster_k50' )
fig.show()

In [237]:
df_r['cluster_sbert']= df_r['cluster_sbert'].astype(str)
df_cluster = df_r[df_r['cluster_sbert']==str(9)]
fig = px.scatter(df_cluster,'sbert_pc1','sbert_pc2', hover_data = ['requirement'],color = 'distance_from_centroid' )
fig.show()

In [217]:
df_r[df_r['distance_from_centroid']<0.5]
print(len(df_r))

2110


In [28]:
columns = df.columns.values.tolist()
df_clusters = pd.DataFrame(columns = [*range(0, n_clusters, 1)])

In [29]:
list_of_vec = []
for index, row in df.iterrows():
    df_r_job = df_r[df_r.row_id == index]
    cluster_list = []
    cluster_vec = []

    for i, row_r in df_r_job.iterrows():
        cluster = row_r['cluster_sbert']
        if cluster not in cluster_list:
            cluster_list.append(cluster)
    for i in range(0,n_clusters):
        if(i in cluster_list):
            cluster_vec.append(1)
        else:
            cluster_vec.append(0)
    list_of_vec.append(cluster_vec)

cols = []
for i in range(0,n_clusters):
    cols.append(str(i))
df_clusters = pd.DataFrame(list_of_vec, columns = cols)
df_c = pd.concat([df, df_clusters], axis=1)

In [30]:
pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', None)
df_c

Unnamed: 0.1,Unnamed: 0,dt,url,title,location,country,full_text,list_elements,row_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,0,2021-08-08 21:01:03.303868,https://se.indeed.com/viewjob?jk=b0669075c820856d,Data Scientist,Jobba hemifrån,se,"About us\n\nHere at Mavenoid, we are building ...",['Collaborate with stakeholders and other engi...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2021-08-08 21:01:09.824428,https://se.indeed.com/viewjob?jk=dbfe7d7c087065d7,"Data Scientist, Search Insights",Stockholm,se,"Data, Research & Insights\nData Science\nThe P...",['\nExperimentation strategy including AB test...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,2021-08-08 21:01:18.030788,https://se.indeed.com/viewjob?jk=841a8fdd1bb00f2f,Junior data scientist,411 21 Göteborg,se,Utilifeed is a rapidly expanding startup with ...,"['\nImproving pour data ingestion pipeline, we...",2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,2021-08-08 21:01:20.651507,https://se.indeed.com/viewjob?jk=88c25a48d2da8a7a,Data Scientist,118 72 Stockholm,se,"AI Data scientistLocation: Stockholm, SwedenDi...",['Proven experience in Machine Learning (ML) a...,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,2021-08-08 21:01:27.135797,https://se.indeed.com/viewjob?jk=1b4b8ee005f06350,Data Scientist,Solna,se,"Currently, we are looking for a talented data ...","['\nYou have a start-up mentality, with delive...",4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,1,2021-08-09 08:58:22.818522,https://at.indeed.com/viewjob?jk=1eadddfd7f2dd6c0,HR Data Scientist (m/f/d),"Wien, W",at,HR Data Scientist (m/f/d)\n!\nIn this internat...,['Processing and analyzing complex structured ...,125,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
126,2,2021-08-09 08:59:04.860975,https://at.indeed.com/viewjob?jk=81f4400146e29934,Data Scientist (m/f/d),"Wien, W",at,Our hearts are burning for sport in all its fa...,"['Understand business problems, challenge and ...",126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
127,3,2021-08-09 08:59:26.520043,https://at.indeed.com/viewjob?jk=b101a6cca6341e9e,Data Scientist Consultant (m/f/d),"Wien, W",at,At Machine Learning Reply Austria we strive to...,['\nIdentify and analyze problems in an analyt...,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
128,4,2021-08-09 08:59:59.803772,https://at.indeed.com/viewjob?jk=82a7a322b2b970e4,Data Scientist (P4),"Wien, W",at,Data Scientist (P4) - (2021/0363 (211264))\nOr...,"['Drive divisional efforts to scope, design, d...",128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
def complement(skillset):
    clusters = []
    sums = []
    for s in skillset:
        clusters.append(str(kclusterer.classify_vectorspace(model.encode(s))))
    print(clusters)
    for i, row in df_c.iterrows():
        row = df_c.loc[i,clusters]
        sums.append(row.sum(axis =0))
    max_sum = max(sums)
    indices = []
    for i, row in df_c.iterrows():
        row = df_c.loc[i,clusters]
        if row.sum(axis =0) == max_sum:
            indices.append(i)
    df_sim = df_c.loc[indices]
    
    cols = []
    for i in range(0,n_clusters):
        if (str(i) not in clusters) & (str(i) not in cols):
            cols.append(str(i))
    print(cols)
    df_disjoint = df_c.loc[indices, cols]
    
    max_score = 0
    max_cluster = None
    
    for col in cols:
        print(col)
        score = df_disjoint[col].sum()
        similarity_percentage = score/len(df_disjoint)
        print("For cluster "+col+" the sum is "+str(df_disjoint[col].sum())+" which equals a similarity of "+str(similarity_percentage))
        if score>max_score:
            max_score = score
            max_cluster = int(col)
    return max_cluster

In [32]:
most_similar_cluster = complement(["python","communication skills", "Data Science"])
print("\nThe most similar cluster to your query is "+ str(most_similar_cluster))
print("\nThe sentences closest to the centroid of this cluster are ")
get_cluster_top_n(most_similar_cluster,3)

['20', '24', '14']
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '15', '16', '17', '18', '19', '21', '22', '23']
0
For cluster 0 the sum is 0 which equals a similarity of 0.0
1
For cluster 1 the sum is 0 which equals a similarity of 0.0
2
For cluster 2 the sum is 0 which equals a similarity of 0.0
3
For cluster 3 the sum is 0 which equals a similarity of 0.0
4
For cluster 4 the sum is 0 which equals a similarity of 0.0
5
For cluster 5 the sum is 0 which equals a similarity of 0.0
6
For cluster 6 the sum is 0 which equals a similarity of 0.0
7
For cluster 7 the sum is 0 which equals a similarity of 0.0
8
For cluster 8 the sum is 0 which equals a similarity of 0.0
9
For cluster 9 the sum is 0 which equals a similarity of 0.0
10
For cluster 10 the sum is 0 which equals a similarity of 0.0
11
For cluster 11 the sum is 0 which equals a similarity of 0.0
12
For cluster 12 the sum is 0 which equals a similarity of 0.0
13
For cluster 13 the sum is 0 which equals a 

TypeError: list indices must be integers or slices, not NoneType

In [None]:
df_r.head()

In [None]:
df_r[(df_r.cluster_sbert == '1')]['requirement']
#df_r[(df_r.cluster_sbert == '1')]