In [5]:
## Phase 2 Embedding and Model Integration
### CSC 4444: Artificial Intelligence

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os


In [None]:
# Load the cleaned dataset.
# Preferred source: the CSV at DATA_PATH. Fallback: a `df_cleaned` DataFrame
# that is already defined in the current kernel. If neither is available the
# cell stops with an explicit error instead of an opaque NameError, so the
# notebook fails clearly under "Restart Kernel -> Run All".
DATA_PATH = "cleaned_resumes.csv"  # or whatever the file path is

if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)
    print("Loaded cleaned dataset from file.")
elif "df_cleaned" in globals():
    # Copy so later cells cannot mutate the upstream frame by accident.
    df = df_cleaned.copy()  # or from another DataFrame
    print("Using df_cleaned from previous notebook cell.")
else:
    raise FileNotFoundError(
        f"Could not find '{DATA_PATH}' and no 'df_cleaned' DataFrame is defined in "
        "this kernel. Run the data-cleaning step first or update DATA_PATH."
    )

df.head()


In [None]:
# Instantiate the sentence-embedding model used by the cells below.
# The checkpoint name is kept in its own variable so it can be swapped in one place.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name_or_path=model_name)

print(f"Loaded embedding model: {model_name}")


In [None]:
# Validate the input columns, then embed every resume and job description.
# The column names can be changed to match the cleaned dataset.
required_cols = ["resume_text", "job_description"]

# Report exactly which columns are absent rather than a fixed message.
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Dataset is missing required columns: {', '.join(missing_cols)}")

# Convert to lists of strings. Missing cells (e.g. NaN for empty CSV cells)
# become "" so that no non-string value reaches the encoder, while row i of
# each embedding matrix still corresponds to row i of df.
resume_list = df["resume_text"].fillna("").astype(str).tolist()
jd_list = df["job_description"].fillna("").astype(str).tolist()

# Generate embeddings (one vector per input text, encoded in batches of 32).
resume_embeddings = model.encode(resume_list, batch_size=32, show_progress_bar=True)
jd_embeddings = model.encode(jd_list, batch_size=32, show_progress_bar=True)

print("Embedding generation complete.")
print(f"Resume embeddings shape: {resume_embeddings.shape}")
print(f"JD embeddings shape: {jd_embeddings.shape}")


In [None]:
# Persist both embedding matrices as .npy files so later steps can reload
# them without re-running the encoder.
os.makedirs("data/embeddings", exist_ok=True)

for _npy_path, _vectors in (
    ("data/embeddings/resume_embeddings.npy", resume_embeddings),
    ("data/embeddings/jd_embeddings.npy", jd_embeddings),
):
    np.save(_npy_path, _vectors)

print("Embeddings saved to data/embeddings/")


In [None]:
# Sanity check: rank all job descriptions by cosine similarity to one resume.
idx = 0  # row index of the resume to compare against every JD embedding
top_k = 5  # number of best matches to display

# cosine_similarity works on 2-D inputs, so the single vector is reshaped to (1, dim).
resume_vec = resume_embeddings[idx].reshape(1, -1)
similarities = cosine_similarity(resume_vec, jd_embeddings)[0]

# argsort is ascending; reverse it and keep the top_k highest scores.
top_matches = np.argsort(similarities)[::-1][:top_k]

# The heading follows idx so it stays correct when another resume is chosen.
print(f"Top matching job descriptions for Resume {idx}:")
for i in top_matches:
    print(f"JD {i} — similarity: {similarities[i]:.4f}")
