In [8]:
import numpy as np
import pandas as pd

In [9]:
keywords =["Aspiring human resources", "seeking human resources"]

In [12]:
data_file_path = '../data/raw/potential-talents.csv'
google_model_path = '../models/GoogleNews-vectors-negative300.bin'

In [13]:
# Load dataset
df = pd.read_csv(data_file_path, index_col = 'id')

# Inspect data
df.head()

Unnamed: 0_level_0,job_title,location,connection,fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
4,People Development Coordinator at Ryan,"Denton, Texas",500+,
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [15]:
title_df = df['job_title']
sentences = list(set(title_df.values))

In [16]:
from gensim.models import KeyedVectors

# assuming the model is downloaded and stored in the path 'path/to/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(google_model_path, binary=True)


In [17]:
def average_word2vec(sentence, model):
    words = sentence.split()
    word_vectors = [model[word] for word in words if word in model.index_to_key]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)  # return zero vector if no words in the sentence is in the vocabulary of the model


In [18]:
keyword_vectors = [average_word2vec(sentence, model) for sentence in keywords]
sentence_vectors = [average_word2vec(sentence, model) for sentence in sentences]

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(keyword_vectors, sentence_vectors)

In [20]:
top_10_similar_sentences = []

for i, sentence in enumerate(keywords):
    # Getting the indices of top 10 similar sentences
    top_10_indices = similarities[i].argsort()[-10:][::-1] 
    
    top_10_for_sentence = [(sentences[j], similarities[i][j]) for j in top_10_indices]
    top_10_similar_sentences.append((sentence, top_10_for_sentence))

# Printing the results
for s1, top_10 in top_10_similar_sentences:
    print(f"Most similar sentences to '{s1}':")
    for s2, sim in top_10:
        print(f"   '{s2}' with a similarity of {sim:.4f}.")
    print("---------")

Most similar sentences to 'Aspiring human resources':
   'Aspiring Human Resources Professional' with a similarity of 0.6287.
   'Aspiring Human Resources Manager, seeking internship in Human Resources.' with a similarity of 0.6136.
   'Aspiring Human Resources Specialist' with a similarity of 0.5983.
   'Aspiring Human Resources Professional | Passionate about helping to create an inclusive and engaging work environment' with a similarity of 0.5976.
   'Aspiring Human Resources Professional | An energetic and Team-Focused Leader' with a similarity of 0.5776.
   'Aspiring Human Resources Management student seeking an internship' with a similarity of 0.5763.
   'Student at Humber College and Aspiring Human Resources Generalist' with a similarity of 0.5338.
   'Liberal Arts Major. Aspiring Human Resources Analyst.' with a similarity of 0.5240.
   'Aspiring Human Resources Manager | Graduating May 2020 | Seeking an Entry-Level Human Resources Position in St. Louis' with a similarity of 0.

In [22]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def plot_embeddings(embeddings, labels):
    tsne = TSNE(n_components=2, random_state=0)
    reduced_embeddings = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 10))
    for i, label in enumerate(labels):
        x, y = reduced_embeddings[i]
        plt.scatter(x, y, marker='x', color='red')
        plt.text(x+0.01, y, label, fontsize=9)
    plt.show()

plot_embeddings(sentence_vectors, sentences)


AttributeError: 'list' object has no attribute 'shape'