# In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Read the cleaned resume and job-description tables (resumes first, then jobs).
resumes, jobs = (
    pd.read_csv(csv_path)
    for csv_path in ('resume_data_cleaned_final.csv', 'job_descriptions_cleaned.csv')
)

# Sentence-embedding model, loaded once at module level and reused for every
# resume/job comparison.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_relevancy(resume_row, job_row):
    """Return a 0-100 relevancy score between one resume and one job.

    The resume's ``skills``, ``education`` and ``experience`` fields and the
    job's ``required_skills``, ``required_education`` and
    ``required_experience`` fields are each joined into a single
    space-separated text, embedded with the module-level ``bert_model``, and
    compared by cosine similarity.

    Args:
        resume_row: Mapping-like row exposing the three resume fields above.
        job_row: Mapping-like row exposing the three job fields above.

    Returns:
        The cosine similarity shifted from [-1, 1] to [0, 1], scaled to a
        percentage and rounded to two decimal places.
    """
    resume_fields = ('skills', 'education', 'experience')
    job_fields = ('required_skills', 'required_education', 'required_experience')

    resume_text = " ".join(f"{resume_row[field]}" for field in resume_fields)
    job_text = " ".join(f"{job_row[field]}" for field in job_fields)

    resume_vec, job_vec = (
        bert_model.encode(text, convert_to_tensor=True)
        for text in (resume_text, job_text)
    )
    cosine = util.cos_sim(resume_vec, job_vec).item()

    # Shift [-1, 1] onto [0, 1] before turning it into a percentage.
    return round((cosine + 1) / 2 * 100, 2)

# Pair each resume with a random sample of jobs, score every pair, and write
# the labelled pairs to CSV.
import numpy as np

JOBS_PER_RESUME = 5

# Seed ONE generator up front and let it advance across iterations. Passing
# the integer seed to `sample` on every iteration would re-seed each time and
# hand every resume the exact same jobs; a shared RandomState keeps the run
# reproducible while still giving each resume its own sample.
rng = np.random.RandomState(42)

# `sample` raises ValueError when n exceeds the number of rows, so clamp it
# for small job tables.
sample_size = min(JOBS_PER_RESUME, len(jobs))

pairs = []
for _, resume in resumes.iterrows():
    sampled_jobs = jobs.sample(n=sample_size, random_state=rng)  # or use smarter sampling
    for _, job in sampled_jobs.iterrows():
        score = get_relevancy(resume, job)
        pairs.append({
            'resume_id': resume['resume_id'],
            'job_id': job['job_id'],
            'resume_text': f"{resume['skills']} {resume['education']} {resume['experience']}",
            'job_text': f"{job['required_skills']} {job['required_education']} {job['required_experience']}",
            'relevancy_score': score
        })

# Pin the column order explicitly so the CSV still gets a header row when no
# pairs were produced (e.g. an empty resume table).
labeled_df = pd.DataFrame(
    pairs,
    columns=['resume_id', 'job_id', 'resume_text', 'job_text', 'relevancy_score'],
)
labeled_df.to_csv('labeled_resume_job_pairs.csv', index=False)