In [2]:
import pandas as pd

In [3]:
# Relative path of the raw CSV input read by this notebook.
file_path = "../data/raw/potential-talents.csv"

In [4]:
# Load the raw table, using the 'id' column as the row index.
df = pd.read_csv(file_path, index_col = 'id')
title_df = df['job_title']
# Unique job titles in first-appearance order. Building the list through
# set() orders strings by their hash, which is randomized per interpreter
# start, so positions in `sentences` would change between kernel restarts.
# Missing titles (NaN) are dropped because they are not strings.
sentences = title_df.dropna().drop_duplicates().tolist()

In [5]:
# Query phrases that the job titles are compared against.
keywords = [
    "Aspiring human resources",
    "seeking human resources",
]

In [6]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# 1. Load the pretrained BERT tokenizer and encoder from the same checkpoint
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, return_dict=True)

def get_bert_embeddings(sentences, pooling="pooler", batch_size=None):
    """Return one BERT embedding per input sentence.

    Relies on the module-level ``tokenizer`` and ``model``. Inputs are padded
    to the longest sentence in each forward pass and truncated to 512 tokens.

    Args:
        sentences: A single string or an iterable of strings to embed.
        pooling: ``"pooler"`` (default) returns the model's ``pooler_output``.
            ``"mean"`` averages the last-layer token embeddings, weighted by
            the attention mask so padding tokens do not contribute.
        batch_size: Number of sentences per forward pass. ``None`` (default)
            encodes all sentences in one pass.

    Returns:
        A ``torch.Tensor`` with one row per sentence; zero rows when
        ``sentences`` is empty.

    Raises:
        ValueError: If ``pooling`` is not a supported strategy or
            ``batch_size`` is smaller than 1.
    """
    # Validate arguments before any tokenization or model work is done.
    if pooling not in ("pooler", "mean"):
        raise ValueError(f"pooling must be 'pooler' or 'mean', got {pooling!r}")
    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size!r}")

    # Normalise the input to a list so it can be measured and sliced.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    # Nothing to encode: return an empty matrix with the model's hidden width
    # instead of handing an empty batch to the tokenizer.
    if not sentences:
        return torch.empty((0, model.config.hidden_size))

    step = len(sentences) if batch_size is None else batch_size
    chunks = []
    for start in range(0, len(sentences), step):
        batch = sentences[start:start + step]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        if pooling == "mean":
            # Zero out padding positions, then divide by the real token count.
            mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.last_hidden_state.dtype)
            token_sum = (outputs.last_hidden_state * mask).sum(dim=1)
            embeddings = token_sum / mask.sum(dim=1).clamp(min=1)
        else:
            embeddings = outputs.pooler_output
        chunks.append(embeddings)
    return torch.cat(chunks, dim=0)

# 2-3. Tokenize both sentence sets, extract their BERT embeddings, and
# convert the resulting tensors straight to numpy arrays
embeddings_set1 = get_bert_embeddings(keywords).numpy()
embeddings_set2 = get_bert_embeddings(sentences).numpy()

# 4. Pairwise cosine similarity: one row per keyword, one column per sentence
similarities = cosine_similarity(embeddings_set1, embeddings_set2)


In [7]:
# For every keyword, collect its ten highest-scoring sentences, best first
top_10_similar_sentences = []

for row in range(len(keywords)):
    keyword = keywords[row]
    scores = similarities[row]
    # argsort is ascending, so reverse it and keep the leading ten positions
    top_10_indices = scores.argsort()[::-1][:10]

    top_10_for_sentence = []
    for col in top_10_indices:
        top_10_for_sentence.append((sentences[col], scores[col]))
    top_10_similar_sentences.append((keyword, top_10_for_sentence))

# Report the ranked matches for each keyword
for keyword, matches in top_10_similar_sentences:
    print(f"Most similar sentences to '{keyword}':")
    for title, score in matches:
        print(f"   '{title}' with a similarity of {score:.4f}.")
    print("---------")

Most similar sentences to 'Aspiring human resources':
   'Seeking Human Resources Position' with a similarity of 0.9868.
   'Seeking Human Resources Opportunities' with a similarity of 0.9862.
   'People Development Coordinator at Ryan' with a similarity of 0.9847.
   'HR Senior Specialist' with a similarity of 0.9829.
   'Aspiring Human Resources Specialist' with a similarity of 0.9696.
   'Always set them up for Success' with a similarity of 0.9682.
   'Lead Official at Western Illinois University' with a similarity of 0.9625.
   'Human Resources Specialist at Luxottica' with a similarity of 0.9614.
   'Student at Chapman University' with a similarity of 0.9578.
   'Human Resources Professional' with a similarity of 0.9519.
---------
Most similar sentences to 'seeking human resources':
   'Seeking Human Resources Opportunities' with a similarity of 0.9946.
   'Seeking Human Resources Position' with a similarity of 0.9908.
   'People Development Coordinator at Ryan' with a similarity 