In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# -------------------------------
# Load cleaned datasets
# -------------------------------
resume_df = pd.read_csv("../data/original/resumes_cleaned.csv")
job_df    = pd.read_csv("../data/original/jobs_cleaned.csv")

resume_texts = resume_df["Resume_clean"].tolist()
job_texts    = job_df["job_text_clean"].tolist()

# -------------------------------
# Embedding models to test
# -------------------------------
embedding_models = {
    "multi_qa_mpnet": "multi-qa-mpnet-base-dot-v1",
    "all_mpnet": "all-mpnet-base-v2",
    "e5_large": "intfloat/e5-large-v2"
    "all-MiniLM": "all-MiniLM-L6-v2"
}

# -------------------------------
# Generate & save embeddings
# -------------------------------
for name, model_name in embedding_models.items():
    print(f"\nGenerating embeddings using: {model_name} ...")
    
    # Load model
    model = SentenceTransformer(model_name)
    
    # Resume embeddings
    resume_emb = model.encode(
        resume_texts,
        convert_to_numpy=True,
        show_progress_bar=True
    )
    
    # Job embeddings
    job_emb = model.encode(
        job_texts,
        convert_to_numpy=True,
        show_progress_bar=True
    )
    
    # L2 Normalization
    resume_emb = resume_emb / np.linalg.norm(resume_emb, axis=1, keepdims=True)
    job_emb    = job_emb    / np.linalg.norm(job_emb, axis=1, keepdims=True)
    
    # Save embeddings
    np.save(f"../data/embeddings/resume_emb_{name}.npy", resume_emb)
    np.save(f"../data/embeddings/job_emb_{name}.npy", job_emb)

    print(f"Saved: resume_emb_{name}.npy and job_emb_{name}.npy")



Generating embeddings using: multi-qa-mpnet-base-dot-v1 ...


Batches: 100%|██████████| 78/78 [09:47<00:00,  7.54s/it]
Batches: 100%|██████████| 171/171 [13:44<00:00,  4.82s/it] 


Saved: resume_emb_multi_qa_mpnet.npy and job_emb_multi_qa_mpnet.npy

Generating embeddings using: all-mpnet-base-v2 ...


Batches: 100%|██████████| 78/78 [07:32<00:00,  5.80s/it]
Batches: 100%|██████████| 171/171 [06:25<00:00,  2.25s/it]


Saved: resume_emb_all_mpnet.npy and job_emb_all_mpnet.npy

Generating embeddings using: intfloat/e5-large-v2 ...


Batches: 100%|██████████| 78/78 [29:55<00:00, 23.02s/it]
Batches: 100%|██████████| 171/171 [24:49<00:00,  8.71s/it]


Saved: resume_emb_e5_large.npy and job_emb_e5_large.npy


In [13]:
import numpy as np

def load_emb(model_name):
    res = np.load(f"../data/embeddings/resume_emb_{model_name}.npy")
    job = np.load(f"../data/embeddings/job_emb_{model_name}.npy")
    return res, job

models = ["multi_qa_mpnet", "all_mpnet", "e5_large","all-MiniLM"]

embeddings = {m: load_emb(m) for m in models}


In [14]:
from sentence_transformers import util

def get_top_k(job_idx, resume_emb, job_emb, k=10):
    sims = util.cos_sim(job_emb[job_idx], resume_emb)[0].cpu().numpy()
    idx = sims.argsort()[::-1][:k]
    return idx, sims[idx]


In [15]:
testing_jobs = {
    "admin": 0,                        # Secretary / Office Helper
    "finance": 914,                    # Financial Analyst
    "tech": 78,                        # Java Senior Developer / Architect
    "education": 3,                    # Research Analyst or replace with English Teacher index
    "customer_service": 146,           # Administrative Assistant (customer-facing)
}


In [16]:
from sentence_transformers import util

# -------------------------------------
# Helper: print job posting
# -------------------------------------
def show_job_posting(job_idx, job_df, preview_chars=2000):
    print("\n" + "="*120)
    print(f"JOB POSTING #{job_idx}")
    print("="*120)
    
    if "Title" in job_df.columns:
        print(f"TITLE: {job_df.iloc[job_idx]['Title']}")
    if "Company" in job_df.columns:
        print(f"COMPANY: {job_df.iloc[job_idx]['Company']}")
    
    print("\n--- RAW JOB TEXT ---")
    if "job_text" in job_df.columns:
        print(job_df.iloc[job_idx]["job_text"][:preview_chars])
    
    print("\n--- CLEANED JOB TEXT ---")
    print(job_df.iloc[job_idx]["job_text_clean"][:preview_chars])
    
    print("="*120 + "\n")


# -------------------------------------
# Helper: print resumes in readable format
# -------------------------------------
def show_ranked_resumes(model_name, top_idx, scores, resume_df, k=10, preview_chars=800):
    print("\n" + "="*120)
    print(f"TOP {k} RESUMES — MODEL: {model_name}")
    print("="*120 + "\n")
    
    for rank, (idx, score) in enumerate(zip(top_idx, scores), start=1):
        print("-"*120)
        print(f"RANK {rank}")
        print(f"Row index:  {idx}")
        print(f"Resume ID:  {resume_df.iloc[idx]['ID']}")
        print(f"Score:      {score:.4f}")
        print("-"*120)
        print(resume_df.iloc[idx]["Resume_clean"][:preview_chars])
        print("-"*120 + "\n")


# -------------------------------------
# TESTING JOBS ARRAY (EDIT THIS)
# -------------------------------------
testing_jobs = {
    "admin": 0,           # Secretary / Office Helper
    "finance": 914,       # Financial Analyst
    "tech": 78,           # Java Senior Developer
    "education": 3,       # Research Analyst
    "customer_service": 146,  # Admin Assistant
}

# -------------------------------------
# MAIN LOOP: iterate through ALL test jobs
# -------------------------------------

models = ["multi_qa_mpnet", "all_mpnet", "e5_large"]

for domain, job_index in testing_jobs.items():

    print("\n" + "="*140)
    print(f"TESTING DOMAIN: {domain.upper()}  |  JOB INDEX: {job_index}")
    print("="*140)

    # Show this job posting
    show_job_posting(job_index, job_df)

    # Run all embedding models for this job
    for model_name in models:
        
        print(f"\nProcessing model: {model_name}")
        
        resume_emb = np.load(f"../data/embeddings/resume_emb_{model_name}.npy")
        job_emb    = np.load(f"../data/embeddings/job_emb_{model_name}.npy")
        
        scores_all = util.cos_sim(job_emb[job_index], resume_emb)[0].cpu().numpy()
        top_idx = scores_all.argsort()[::-1][:5]
        top_scores = scores_all[top_idx]
        
        show_ranked_resumes(model_name, top_idx, top_scores, resume_df)



TESTING DOMAIN: ADMIN  |  JOB INDEX: 0

JOB POSTING #0
TITLE: Secretary / office helper
COMPANY: LadyDalieda

--- RAW JOB TEXT ---
Description: Secretary office helper, Typing transitions from
English to Armenian, able to speck and understand English, Armenian and
Russian, to answer the phone and make calls, arrange appointments,
answering the door and welcoming in customers. Requirements: To take care of the office on there own some
times, work under there own supervision, be self motivated to keeping
the office in good order and file papers correctly. Have good
communication skills with customers by phone and in person. Qualifications: MS Word & Excel programs About Company: New Company starting up soon English school for
students.

--- CLEANED JOB TEXT ---
description secretary office helper typing transition english armenian able speck understand english armenian russian answer phone make call arrange appointment answering door welcoming customer requirement take care office time 

In [18]:
import numpy as np
import pandas as pd
from sentence_transformers import util
from tqdm import tqdm
import random
import re

# -----------------------------------------------
# Load datasets
# -----------------------------------------------


# Embedding models (pick best)
resume_emb = np.load("../data/embeddings/resume_emb_e5_large.npy")
job_emb    = np.load("../data/embeddings/job_emb_e5_large.npy")

# -----------------------------------------------
# Feature helpers
# -----------------------------------------------

def keyword_overlap(a, b):
    s1, s2 = set(a.lower().split()), set(b.lower().split())
    return len(s1 & s2) / max(len(s1), 1)

def count_skills(text, skill_list):
    text = text.lower()
    return sum(1 for s in skill_list if s in text)

def extract_education_score(text, edu_map):
    text = text.lower()
    for edu, score in edu_map.items():
        if edu in text:
            return score
    return 0  # no education found

def count_experience_words(text, exp_words):
    text = text.lower()
    return sum(text.count(w.lower()) for w in exp_words)

def detect_domain(text, domain_keywords):
    text = text.lower()
    scores = {dom: 0 for dom in domain_keywords}

    for dom, keywords in domain_keywords.items():
        for kw in keywords:
            if kw in text:
                scores[dom] += 1

    # return the domain with the highest score
    best_domain = max(scores, key=scores.get)
    return best_domain, scores[best_domain]

def compute_hybrid_score(cos_sim, skill_score, exp_score, edu_score, domain_score, weights):
    return (
        weights["semantic"]   * cos_sim +
        weights["skills"]     * skill_score +
        weights["experience"] * exp_score +
        weights["education"]  * edu_score +
        weights["domain"]     * domain_score
    )

# -----------------------------------------------
# Build training dataset
# -----------------------------------------------

rows = []

for job_idx in tqdm(range(len(job_df)), desc="Building training data"):

    job_text = job_df.iloc[job_idx]["job_text_clean"]

    # similarity vector
    sims = util.cos_sim(job_emb[job_idx], resume_emb)[0].cpu().numpy()

    # positives = top 5
    pos_idx = sims.argsort()[::-1][:5]

    # negatives = random 20 from bottom 70%
    cutoff = int(len(sims) * 0.7)
    neg_pool = sims.argsort()[:cutoff]
    neg_idx = random.sample(list(neg_pool), 20)

    for r_idx, label in [(i, 1) for i in pos_idx] + [(i, 0) for i in neg_idx]:

        resume_text = resume_df.iloc[r_idx]["Resume_clean"]

        # compute features
        kw_overlap = keyword_overlap(job_text, resume_text)
        skill_count = count_skills(resume_text, skills)
        edu_score = extract_education_score(resume_text, education_levels)
        exp_score = count_experience_words(resume_text, experience_words)

        job_domain, job_dom_score = detect_domain(job_text, domain_keywords)
        res_domain, res_dom_score = detect_domain(resume_text, domain_keywords)

        # domain match = 1 if same domain, else 0
        domain_match = 1 if job_domain == res_domain else 0

        # hybrid score
        hybrid = compute_hybrid_score(
            sims[r_idx],
            skill_count,
            exp_score,
            edu_score,
            domain_match,
            weights
        )

        rows.append({
            "job_id": job_idx,
            "resume_id": r_idx,
            "cosine_sim": sims[r_idx],
            "keyword_overlap": kw_overlap,
            "skill_count": skill_count,
            "education_score": edu_score,
            "experience_score": exp_score,
            "domain_match": domain_match,
            "hybrid_score": hybrid,
            "label": label,
        })

train_df = pd.DataFrame(rows)
train_df.to_csv("../data/train_pairs_rich.csv", index=False)

print("Training dataset created → train_pairs_rich.csv")


Building training data:   0%|          | 0/5448 [00:00<?, ?it/s]


NameError: name 'skills' is not defined