In [21]:
# Resume-to-Job Recommender System using SBERT

import os
import re
import string
import time
import logging
import warnings
import pandas as pd
import numpy as np
import joblib
import torch
from sentence_transformers import SentenceTransformer, util

try:
    import fitz  # PyMuPDF
except ImportError:
    raise ImportError("Install PyMuPDF with: pip install PyMuPDF")

warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', filename='job_recommender_sbert.log')
logger = logging.getLogger()

# --- Utility Functions ---
def extract_resume_text(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        return " ".join([page.get_text() for page in doc])
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        return ""

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\s+', ' ', text).strip()

def weight_text(text, weight):
    if not isinstance(text, str):
        return ""
    weighted = text * int(weight)
    frac = weight - int(weight)
    if frac:
        words = text.split()
        weighted += " " + " ".join(words[:int(len(words) * frac)])
    return weighted

# --- SBERT Caching ---
def build_or_load_sbert_embeddings(jobs_df, cache_dir="./sbert_cache"):
    os.makedirs(cache_dir, exist_ok=True)
    vec_path = os.path.join(cache_dir, "job_embeddings.npy")
    model_path = os.path.join(cache_dir, "sbert_model.pkl")

    if os.path.exists(vec_path) and os.path.exists(model_path):
        logger.info("Loading cached SBERT embeddings...")
        return np.load(vec_path), joblib.load(model_path), jobs_df

    weights = {'Job Title': 3.0, 'Role': 2.5, 'skills': 2.0, 'Job Description': 1.0, 'Company': 0.8}
    jobs_df['Weighted_Text'] = ""
    for field, weight in weights.items():
        if field in jobs_df.columns:
            jobs_df['Weighted_Text'] += jobs_df[field].fillna('').apply(lambda x: weight_text(str(x), weight) + " ")
    jobs_df['Weighted_Text_Clean'] = jobs_df['Weighted_Text'].apply(clean_text)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    job_embeddings = model.encode(jobs_df['Weighted_Text_Clean'].tolist(), convert_to_numpy=True, show_progress_bar=True)

    np.save(vec_path, job_embeddings)
    joblib.dump(model, model_path)
    return job_embeddings, model, jobs_df

# --- Similarity Matching ---
def match_resume_sbert(resume_text, job_embeddings, model, jobs_df, top_n=10):
    resume_clean = clean_text(resume_text)
    resume_embedding = model.encode(resume_clean, convert_to_tensor=True)
    job_embeddings_tensor = torch.tensor(job_embeddings).to(resume_embedding.device)

    similarities = util.pytorch_cos_sim(resume_embedding, job_embeddings_tensor)[0].cpu().numpy()
    jobs_df = jobs_df.copy()
    jobs_df['similarity'] = similarities
    return jobs_df.sort_values('similarity', ascending=False).head(top_n)

# --- Main Recommender ---
def run_job_recommender_sbert(resume_path, jobs_csv_path, top_n=10):
    start = time.time()
    logger.info("Loading data...")
    jobs_df = pd.read_csv(jobs_csv_path)
    resume_text = extract_resume_text(resume_path)
    if not resume_text:
        logger.warning("Resume extraction failed.")
        return pd.DataFrame({'Message': ['Resume extraction failed']})

    job_embeddings, model, processed_df = build_or_load_sbert_embeddings(jobs_df)
    results = match_resume_sbert(resume_text, job_embeddings, model, processed_df, top_n)
    logger.info(f"Total execution time: {time.time() - start:.2f}s")
    return results

# --- Example Execution ---
if __name__ == "__main__":
    resume_path = 'sampleresume.pdf'
    jobs_csv_path = 'job_descriptions.csv'
    top_matches = run_job_recommender_sbert(resume_path, jobs_csv_path, top_n=5)
    print(top_matches[['Job Title', 'Company', 'Role', 'location', 'similarity']])

Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 16.44it/s]


                            Job Title                   Company  \
1111523  Digital Marketing Specialist  Associated British Foods   
1161255  Digital Marketing Specialist  Associated British Foods   
397716   Digital Marketing Specialist  Associated British Foods   
1529428  Digital Marketing Specialist  Associated British Foods   
513603   Digital Marketing Specialist  Associated British Foods   

                         Role  location  similarity  
1111523  Social Media Manager     Dakar     0.62881  
1161255  Social Media Manager    Bangui     0.62881  
397716   Social Media Manager    Madrid     0.62881  
1529428  Social Media Manager   Mbabane     0.62881  
513603   Social Media Manager  Funafuti     0.62881  
