In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import os
from IPython.display import display


In [6]:
# Input locations, resolved relative to the current working directory:
# DATA_DIR is the "data" folder under the parent directory, and EMB_DIR is
# its "embeddings" subfolder.
DATA_DIR = os.path.join(os.pardir, "data")
EMB_DIR = os.path.join(DATA_DIR, "embeddings")

In [7]:
# Load the saved resume and job embedding arrays from EMB_DIR.
# NOTE(review): later cells index these arrays positionally against
# resume_df / job_df, so row i is presumably the embedding of CSV row i —
# confirm against the step that produced the .npy files.
resume_embeddings = np.load(os.path.join(EMB_DIR, "resume_embeddings.npy"))
job_embeddings    = np.load(os.path.join(EMB_DIR, "job_embeddings.npy"))


In [8]:
# Load the cleaned resume and job tables from DATA_DIR. Later cells read rows
# by position (.iloc) and probe for optional columns such as "ID", "Title",
# "Resume_clean" and "job_text_clean" before using them.
resume_df = pd.read_csv(os.path.join(DATA_DIR, "resumes_cleaned.csv"))
job_df    = pd.read_csv(os.path.join(DATA_DIR, "jobs_cleaned.csv"))

In [6]:
# Later cells look rows up by position: sim_matrix[j] is paired with
# job_df.iloc[j], and column i with resume_df.iloc[i]. Fail fast if the
# embeddings and the CSV metadata are out of step, instead of silently
# showing the wrong resumes or hitting an IndexError further down.
if job_embeddings.shape[0] != len(job_df) or resume_embeddings.shape[0] != len(resume_df):
    raise ValueError(
        "Embedding/metadata row mismatch: "
        f"jobs {job_embeddings.shape[0]} vs {len(job_df)}, "
        f"resumes {resume_embeddings.shape[0]} vs {len(resume_df)}"
    )

# Cosine similarity of every job (rows) against every resume (columns).
sim_matrix = cosine_similarity(job_embeddings, resume_embeddings)


In [7]:

# Randomly sample jobs and show their top-k resume matches
import numpy as np
from IPython.display import display

# Sampling configuration for the demo further down in this cell.
rng_seed = 42         # change for a different random draw
num_random_jobs = 3   # how many random jobs to show
top_k = 5             # top-k resumes to display
rng = np.random.default_rng(rng_seed)

# Resume columns worth showing; keep only the ones this dataset actually has.
display_cols = [
    col
    for col in (
        "Name", "Current_Title", "YearsExperience", "Skills", "Location", "Resume_clean", "Resume"
    )
    if col in resume_df.columns
]

def snippet(text, max_chars=280):
    """Shorten a long string for display.

    Strings longer than `max_chars` are cut to `max_chars` characters, trimmed
    back to the last space inside that cut (if there is one) and suffixed with
    " ...". Shorter strings and non-string values are returned unchanged.
    """
    if not isinstance(text, str) or len(text) <= max_chars:
        return text
    clipped = text[:max_chars]
    head, sep, _ = clipped.rpartition(" ")
    # No space in the clipped part: keep the hard cut as it is.
    return (head if sep else clipped) + " ..."

def topk_indices(scores: np.ndarray, k: int) -> np.ndarray:
    """Return the indices of the `k` largest entries of `scores`, best first.

    Args:
        scores: 1-D array of scores.
        k: Number of indices wanted; capped at ``len(scores)``.

    Returns:
        Integer index array ordered by descending score. Empty when `k` is
        zero or negative, or when `scores` is empty.
    """
    k = min(k, scores.shape[0])
    if k <= 0:
        # Without this guard `[-0:]` would select every element (and
        # argpartition raises on an empty array) instead of returning nothing.
        return np.empty(0, dtype=np.intp)
    # argpartition isolates the k largest in linear time; only those k are sorted.
    idx = np.argpartition(scores, -k)[-k:]
    return idx[np.argsort(scores[idx])[::-1]]

def show_matches_for_job_idx(job_idx: int, k: int = 5):
    """Display a table of the `k` resumes most similar to one job.

    Reads the job's row of `sim_matrix`, picks the best resumes with
    `topk_indices`, and shows one table row per resume holding its rank, its
    "ID" value (or its row position when `resume_df` has no "ID" column), the
    similarity score, and every column listed in `display_cols`.
    """
    scores = sim_matrix[job_idx]
    has_id = "ID" in resume_df.columns
    records = []
    for position, res_idx in enumerate(topk_indices(scores, k), start=1):
        resume = resume_df.iloc[res_idx]
        record = {
            "rank": position,
            "ResumeID": resume["ID"] if has_id else res_idx,
            "score": float(scores[res_idx]),
        }
        for col in display_cols:
            value = resume[col]
            # Columns whose name starts with "resume" hold long text; shorten it.
            if isinstance(value, str) and col.lower().startswith("resume"):
                value = snippet(value, 300)
            record[col] = value
        records.append(record)
    display(pd.DataFrame(records))

# Draw distinct job row positions (sampling without replacement)
job_count = len(job_df)
job_indices = rng.choice(job_count, size=min(num_random_jobs, job_count), replace=False)

# Column availability does not change between jobs, so resolve it once.
has_title = "Title" in job_df.columns
jd_col = "job_text_clean" if "job_text_clean" in job_df.columns else None

for i in job_indices:
    job_row = job_df.iloc[i]
    title = job_row["Title"] if has_title else f"Job {i}"
    print(f"Job {i}: {title}")
    # Optional: a short job-description excerpt when the cleaned text exists
    if jd_col is not None:
        print("JD:", snippet(job_row[jd_col], 200))
    show_matches_for_job_idx(i, k=top_k)
    print()

Job 3566: Sales Manager
JD: description mobbis llc seeking sale manager responsible development performance sale activity assigned market hard work loading requirement search potential customer create maintain strong effective ...


Unnamed: 0,rank,ResumeID,score,Resume_clean
0,1,24946537,0.733859,multimedia sale consultant professional summar...
1,2,16694152,0.691963,business development executive summary result ...
2,3,19156751,0.675584,sale representative summary customer service r...
3,4,81310245,0.671528,business development executive professional su...
4,5,27213082,0.668817,business development director professional sum...



Job 486: Search Engine Optimization Specialist
JD: description linkgard system llc seeking ethical energetic highly motivated individual fulfill position seo specialist part search engine optimization group especially looking good student savvy ...


Unnamed: 0,rank,ResumeID,score,Resume_clean
0,1,18488289,0.62618,senior digital marketing specialist summary di...
1,2,15602094,0.587772,consultant professional summary talented profe...
2,3,62994611,0.574915,software developer professional summary enthus...
3,4,38535335,0.574472,engineering coordinator summary desire work co...
4,5,12011623,0.571208,engineering quality technician career overview...



Job 4215: Sales Manager
JD: description sale manager responsible promoting selling company product retail marketplace requirement work retailer assigned territory establish maintain relationship potential client analyze provide ...


Unnamed: 0,rank,ResumeID,score,Resume_clean
0,1,34303500,0.753544,sale director summary continue career organiza...
1,2,36970996,0.740031,sale consultant professional summary customer ...
2,3,14070138,0.738662,business development manager summary experienc...
3,4,24946537,0.727522,multimedia sale consultant professional summar...
4,5,18171955,0.727454,sale manager highlight m office proficiency mi...





In [9]:
# Rerank top-N cosine candidates with a cross-encoder (ms-marco-MiniLM-L-6-v2)


from sentence_transformers import CrossEncoder
import torch


# Run the cross-encoder on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
cross = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device=device)


# Helper: pick best available text columns
def _get_job_text(row):
    for c in ["job_text_clean", "job_text", "Description", "JobDescription", "JD", "Text"]:
        if c in row and isinstance(row[c], str) and row[c].strip():
            return row[c]
    return ""


def _get_resume_text(row):
    for c in ["Resume_clean", "Resume", "Text", "Summary"]:
        if c in row and isinstance(row[c], str) and row[c].strip():
            return row[c]
    return ""


def rerank_matches_for_job_idx(job_idx: int, preselect: int = 50, k: int = 5, batch_size: int = 32) -> pd.DataFrame:
    """Rerank the best cosine-similarity resumes for one job with the cross-encoder.

    Args:
        job_idx: Row position of the job in `job_df` / `sim_matrix`.
        preselect: How many top-cosine resumes to hand to the cross-encoder.
            Zero or a negative value selects no candidates.
        k: How many reranked resumes to return. Zero or a negative value
            yields a table with no rows.
        batch_size: Batch size passed to `cross.predict`.

    Returns:
        A DataFrame with columns "rank", "ce_score", "cosine", then "ID"
        (when present) and the display columns, ordered by descending
        cross-encoder score. A completely empty DataFrame is returned when no
        candidate has usable resume text.

    Raises:
        KeyError: If the job row has no usable text.
    """
    # 1) Take top-N by cosine similarity
    scores = sim_matrix[job_idx]
    m = min(preselect, scores.size)
    if m > 0:
        cand = np.argpartition(scores, -m)[-m:]
        cand = cand[np.argsort(scores[cand])[::-1]]  # sorted by cosine desc
    else:
        # Without this guard `[-0:]` would select *every* resume and send the
        # whole corpus through the cross-encoder.
        cand = np.empty(0, dtype=int)

    # 2) Build pairs (job_text, resume_text)
    job_text = _get_job_text(job_df.iloc[job_idx])
    if not job_text:
        raise KeyError("No usable job text column found (expected job_text_clean/job_text/Description).")

    cand_texts = [(_get_resume_text(resume_df.iloc[i]), i) for i in cand]
    # Candidates without any resume text cannot be scored; drop them.
    cand_nonempty = [(t, i) for (t, i) in cand_texts if t]
    if not cand_nonempty:
        return pd.DataFrame()

    pairs = [(job_text, t) for (t, _) in cand_nonempty]
    cand_idxs = np.array([i for (_, i) in cand_nonempty], dtype=int)

    # 3) Cross-encode to get relevance scores
    ce_scores = cross.predict(pairs, batch_size=batch_size)

    # 4) Take top-k by cross-encoder score
    # Clamp at zero: a negative k would make `[:k_final]` drop rows from the
    # end instead of returning none.
    k_final = max(0, min(k, len(cand_idxs)))
    order = np.argsort(-ce_scores)[:k_final]
    final_idxs = cand_idxs[order]
    final_scores = ce_scores[order]

    # 5) Assemble output (reuse display_cols if defined globally; otherwise fallback to a default list)
    cols = (["ID"] if "ID" in resume_df.columns else [])
    default_display_cols = [c for c in ["Name","Current_Title","YearsExperience","Skills","Location","Resume_clean","Resume"] if c in resume_df.columns]
    display_cols_local = globals().get("display_cols", default_display_cols)

    out = resume_df.iloc[final_idxs][cols + display_cols_local].copy()
    # Shorten the long resume text only when the notebook's snippet() helper exists.
    if "snippet" in globals():
        for c in ["Resume", "Resume_clean"]:
            if c in out.columns:
                out[c] = out[c].map(lambda t: snippet(t, 300))
    out.insert(0, "rank", np.arange(1, len(final_idxs)+1))
    out.insert(1, "ce_score", final_scores)          # cross-encoder score
    out.insert(2, "cosine", scores[final_idxs])      # original cosine (for reference)
    return out.reset_index(drop=True)


# Example: draw a few random jobs and show their cross-encoder reranked matches
num_random_jobs = 3
rng = np.random.default_rng(42)
job_indices = rng.choice(len(job_df), size=min(num_random_jobs, len(job_df)), replace=False)

# Column availability is the same for every job, so check it once up front.
has_title = "Title" in job_df.columns
has_jd = "job_text_clean" in job_df.columns

for j in job_indices:
    job_row = job_df.iloc[j]
    title = job_row["Title"] if has_title else f"Job {j}"
    print(f"Job {j}: {title}")
    if has_jd:
        print("JD:", snippet(job_row["job_text_clean"], 200))
    reranked = rerank_matches_for_job_idx(j, preselect=50, k=5, batch_size=32)
    display(reranked)
    print()

Job 3566: Sales Manager
JD: description mobbis llc seeking sale manager responsible development performance sale activity assigned market hard work loading requirement search potential customer create maintain strong effective ...


Unnamed: 0,rank,ce_score,cosine,ID,Resume_clean
0,1,2.758733,0.643974,18171955,sale manager highlight m office proficiency mi...
1,2,2.232316,0.691963,16694152,business development executive summary result ...
2,3,2.145585,0.642382,26932091,sale sale associate career focus performance m...
3,4,1.91975,0.733859,24946537,multimedia sale consultant professional summar...
4,5,1.851414,0.647908,14241621,business development center manager profession...



Job 486: Search Engine Optimization Specialist
JD: description linkgard system llc seeking ethical energetic highly motivated individual fulfill position seo specialist part search engine optimization group especially looking good student savvy ...


Unnamed: 0,rank,ce_score,cosine,ID,Resume_clean
0,1,-2.410043,0.62618,18488289,senior digital marketing specialist summary di...
1,2,-3.13685,0.533308,22754014,content strategist summary energetic persuasiv...
2,3,-3.205608,0.553911,19179079,recruiting hr manager summary experienced corp...
3,4,-3.375922,0.556241,18354623,digital marketing manager career focus digital...
4,5,-3.380011,0.563106,20628003,digital marketing specialist summary digital m...



Job 4215: Sales Manager
JD: description sale manager responsible promoting selling company product retail marketplace requirement work retailer assigned territory establish maintain relationship potential client analyze provide ...


Unnamed: 0,rank,ce_score,cosine,ID,Resume_clean
0,1,3.478582,0.694371,40987524,sale summary year sale operation management ex...
1,2,3.278801,0.727282,16694152,business development executive summary result ...
2,3,3.051945,0.669191,12082377,sale representative professional summary exper...
3,4,2.887556,0.682526,59422148,sale executive summary sale executive result d...
4,5,2.834988,0.668104,37540732,sale executive summary twenty year experience ...





In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: In the real product flow the *job* comes in as an external input.
# Here we deliberately choose ONLY ONE job embedding at random, because we
# pre-embedded a small set of jobs purely to have convenient, reusable
# test inputs. That is the only reason jobs were embedded ahead of time.

# We reuse the already-loaded objects from above:
# - job_embeddings, resume_embeddings
# - job_df (jobs_cleaned.csv)
# - resume_df (resumes_cleaned.csv)

# --- 1. Randomly select a single job ---
# Draw from a seeded Generator, like the sampling cells above, so that
# Restart & Run All reproduces the same job; edit the seed for another draw.
single_job_seed = 42
job_index = int(np.random.default_rng(single_job_seed).integers(0, job_embeddings.shape[0]))
selected_job_embedding = job_embeddings[job_index].reshape(1, -1)  # keep 2-D for cosine_similarity
selected_job_row = job_df.iloc[job_index]

print(f"Selected job index: {job_index}")
print("Selected job title:", selected_job_row.get("Title", "N/A"))
print("Selected job ID:", selected_job_row.get("ID", job_index))
print()

# --- 2. Compute cosine similarity between this one job and all resumes ---
cosine_scores = cosine_similarity(selected_job_embedding, resume_embeddings)[0]

# --- 3. Rank resumes by similarity to this one job ---
# Note: this rebinds the notebook-level `top_k` set by the sampling cell above.
top_k = 10  # change if you want more/less results

# argsort returns ascending, so take negative for descending
ranked_indices = np.argsort(-cosine_scores)[:top_k]
ranked_scores = cosine_scores[ranked_indices]

ranked_resumes = resume_df.iloc[ranked_indices].copy()
ranked_resumes["similarity"] = ranked_scores

# Choose sensible columns that actually exist in resumes_cleaned.csv
candidate_id_col = "ID" if "ID" in ranked_resumes.columns else None
name_col = "Name" if "Name" in ranked_resumes.columns else None
title_col = "Current_Title" if "Current_Title" in ranked_resumes.columns else None
summary_col = "Resume_clean" if "Resume_clean" in ranked_resumes.columns else (
    "Resume" if "Resume" in ranked_resumes.columns else None
)

def _snippet(text, max_chars=300):
    if isinstance(text, str) and len(text) > max_chars:
        return text[:max_chars].rsplit(" ", 1)[0] + " ..."
    return text

# --- 4. Print results ---
print("Top candidate resumes for the selected job:\n")

# Optional fields: each is printed only when its column was found above.
optional_fields = [
    ("Candidate ID:", candidate_id_col),
    ("Candidate name:", name_col),
    ("Current title:", title_col),
]

for rank, (idx, row) in enumerate(ranked_resumes.iterrows(), start=1):
    print(f"Rank {rank} | Resume index: {idx} | similarity: {row['similarity']:.4f}")
    for label, col in optional_fields:
        if col:
            print(label, row.get(col))
    if summary_col:
        print("Resume snippet:", _snippet(row.get(summary_col, "")))
    print('-' * 80)

Selected job index: 4358
Selected job title: Sales Manager
Selected job ID: 4358

Top candidate resumes for the selected job:

Rank 1 | Resume index: 1075 | similarity: 0.6829
Candidate ID: 18171955
Resume snippet: sale manager highlight m office proficiency microsoft outlook lotus note team leadership exceptional time management accomplishment managed successful sale team member consistently exceeded sale goal average month developed highly effective sale training strategy sale manager team coached ...
--------------------------------------------------------------------------------
Rank 2 | Resume index: 1027 | similarity: 0.6620
Candidate ID: 26932091
Resume snippet: sale sale associate career focus performance management consultant year experience planning developing implementing behavioral operationally focused procedure enable sale productivity currently r esponsible maximizing performance across retail team southwest territory working collaboratively ...
-------------------------