In [2]:
import os
import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm
import pickle

In [None]:
# Pretrained sentence-transformers checkpoint used by the embedding cells below.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# NOTE: "__file__" is a quoted string literal here, not the module attribute.
# abspath() resolves it against the current working directory, so dirname()
# yields the working directory itself (presumably a workaround for notebooks
# not defining __file__ — confirm the kernel is started from this folder).
notebook_anchor = os.path.abspath("__file__")
current_dir = os.path.dirname(notebook_anchor)

# Everything the notebook reads or writes lives one level above current_dir.
parent_dir = os.path.join(current_dir, "..")
output_dir = os.path.join(parent_dir, "outputs")
job_csv_path = os.path.join(parent_dir, "Resume-Dataset", "training_data.csv")

In [None]:
# Read every extracted resume text into memory, keyed by resume id
# (the file name with its final extension removed).
texts_dir = os.path.join(output_dir, "extracted_texts")
extracted_texts = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the
# processing order (and the resulting dict order) reproducible across runs.
for filename in sorted(os.listdir(texts_dir)):
    text_path = os.path.join(texts_dir, filename)
    # Sub-directories would make open() raise, so only regular files are read.
    if not os.path.isfile(text_path):
        continue
    # splitext() strips only the last extension, so an id that itself contains
    # dots is no longer truncated at the first dot (which could also make two
    # different files collide on the same key).
    resume_id = os.path.splitext(filename)[0]
    with open(text_path, 'r', encoding='utf-8') as f:
        extracted_texts[resume_id] = f.read()

# Encode each resume text into a normalized embedding vector (NumPy array).
resume_embeddings = {}
for resume_id, text in tqdm(extracted_texts.items(), desc="Generating resume embeddings"):
    resume_embeddings[resume_id] = model.encode(
        text, convert_to_numpy=True, normalize_embeddings=True
    )

Generating resume embeddings: 100%|██████████| 2457/2457 [04:52<00:00,  8.40it/s]


In [None]:
def sanitize_filename(title):
    """Return ``title`` with every file-name-unsafe character replaced by ``_``.

    The replaced set is: ASCII digits ``0-9`` and the characters
    ``< > : " / \\ | ? * %``. Each matching character becomes exactly one
    underscore; all other characters are returned unchanged.

    Args:
        title: The text (e.g. a job title) to turn into a file-name-safe string.

    Returns:
        The sanitized string, same length as ``title``.
    """
    unsafe_chars = re.compile(r'[0-9<>:"/\\|?*%]')
    return unsafe_chars.sub('_', title)

In [None]:
# Load the job postings and embed each description, keyed by its sanitized title.
job_descriptions = pd.read_csv(job_csv_path)
job_embeddings = {}
# Sanitized titles seen more than once; their earlier embeddings get replaced.
overwritten_titles = set()

# Iterating the two needed columns directly avoids building a Series per row
# (as iterrows() does); passing total= lets tqdm show a real percentage/ETA.
job_rows = zip(job_descriptions['position_title'], job_descriptions['job_description'])
for job_title, job_description in tqdm(
    job_rows, total=len(job_descriptions), desc="Generating job embeddings"
):
    sanitized_job_title = sanitize_filename(job_title)
    if sanitized_job_title in job_embeddings:
        overwritten_titles.add(sanitized_job_title)
    # Last occurrence wins, exactly as before, so existing keys stay valid.
    job_embeddings[sanitized_job_title] = model.encode(
        job_description, convert_to_numpy=True, normalize_embeddings=True
    )

# Make the previously silent data loss visible to whoever runs the notebook.
if overwritten_titles:
    print(
        f"Warning: {len(overwritten_titles)} sanitized job title(s) occurred more than once; "
        f"only the last embedding for each was kept "
        f"({len(job_embeddings)} embeddings for {len(job_descriptions)} rows)."
    )

Generating job embeddings: 853it [01:23, 10.17it/s]


In [None]:
# Write each resume embedding to its own .npy file, named after the resume id.
resume_embeddings_dir = os.path.join(output_dir, "embeddings", "resumes")
os.makedirs(resume_embeddings_dir, exist_ok=True)

for resume_id, vector in resume_embeddings.items():
    np.save(os.path.join(resume_embeddings_dir, f'{resume_id}.npy'), vector)

print("Embeddings generated and saved in 'outputs/embeddings'.")

Embeddings generated and saved in 'outputs/embeddings'.


In [None]:
# Write each job-description embedding to its own .npy file, named after the job title.
job_embeddings_dir = os.path.join(output_dir, "embeddings", "job_description")
os.makedirs(job_embeddings_dir, exist_ok=True)

for job_title, vector in job_embeddings.items():
    # The title is passed through sanitize_filename() before being used as a file name.
    safe_name = sanitize_filename(job_title)
    np.save(os.path.join(job_embeddings_dir, f'{safe_name}.npy'), vector)

print("Embeddings generated and saved in 'outputs/embeddings'.")

Embeddings generated and saved in 'outputs/embeddings'.


In [None]:
# Also store both embedding dictionaries whole, as one pickle file each.
embeddings_dir = os.path.join(output_dir, "embeddings")
resume_emb_path = os.path.join(embeddings_dir, "resumes.pkl")
job_emb_path = os.path.join(embeddings_dir, "jobs.pkl")

with open(resume_emb_path, "wb") as resume_sink:
    pickle.dump(resume_embeddings, resume_sink)

with open(job_emb_path, "wb") as job_sink:
    pickle.dump(job_embeddings, job_sink)

In [None]:
import pickle

# Read the pickled resume embeddings back and print a quick summary of the object.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# pickles this notebook (or another trusted source) produced.
with open(resume_emb_path, "rb") as source:
    resumes = pickle.load(source)

print(f"Type of object: {type(resumes)}")

# Both supported containers report their size; only the sample line differs.
if isinstance(resumes, (dict, list)):
    print(f"Number of resumes: {len(resumes)}")
    if isinstance(resumes, dict):
        print(f"Sample keys: {list(resumes.keys())[:5]}")
    else:
        print(f"Sample resume embeddings: {resumes[:2]}")


Type of object: <class 'dict'>
Number of resumes: 2457
Sample keys: ['10001727', '10005171', '10030015', '10041713', '10062724']


In [13]:
# Record the array shape of every loaded embedding, then collapse to the
# distinct shapes to see whether all embeddings share one shape.
embedding_shapes = {
    resume_key: np.array(vector).shape
    for resume_key, vector in resumes.items()
}
unique_shapes = {shape for shape in embedding_shapes.values()}

print(f"Unique embedding shapes: {unique_shapes}")


Unique embedding shapes: {(384,)}
