In [1]:
# 1. Imports & Paths
import importlib.util
import os, sys, re, pandas as pd
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed


sys.path.append(os.path.join(os.pardir, "utils"))

DATA_DIR = os.path.join(os.pardir, "data/clusters")
resume_path = os.path.join(DATA_DIR, "resumes_clustered.csv")
jobs_path   = os.path.join(DATA_DIR, "jobs_clustered.csv")


EMB_DIR = os.path.join(os.pardir, "data/embeddings")
remb_path = os.path.join(EMB_DIR, "resume_embeddings.npy")
jemb_path   = os.path.join(EMB_DIR, "job_embeddings.npy")
SAVE_DIR = os.path.join(os.pardir, "data/embeddings")

In [2]:
%load_ext autoreload
%autoreload 2
from utils import *


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
resume_df = pd.read_csv(resume_path)
job_posts_df = pd.read_csv(jobs_path)

resume_df = resume_df[["ID", "Resume_clean"]]
job_posts_df = job_posts_df[["job_text_clean"]]

resume_embeddings = np.load(remb_path)
job_embeddings = np.load(jemb_path)

In [5]:
for j_idx, job_vec in enumerate(job_embeddings):
    # Compute similarity between this job and all resumes
    sims = cosine_similarity([job_vec], resume_embeddings)[0]
    
    # Get top 5 most similar resumes
    top_idx = sims.argsort()[-5:][::-1]  # highest 5 cosine scores
    pairs = []
    for r_idx in top_idx:
        pairs.append({
            "job_idx": j_idx,
            "resume_idx": r_idx,
            "cosine_similarity": sims[r_idx]
        })

pairs_df = pd.DataFrame(pairs)
# pairs_df.to_csv("job_resume_cosine_pairs.csv", index=False)
# print(f"✅ Saved {len(pairs_df):,} job–resume pairs")
pairs_df.head()

Unnamed: 0,job_idx,resume_idx,cosine_similarity
0,5447,553,0.666872
1,5447,1929,0.648839
2,5447,1299,0.645512
3,5447,581,0.640185
4,5447,1050,0.638735


In [6]:
pairs_df

Unnamed: 0,job_idx,resume_idx,cosine_similarity
0,5447,553,0.666872
1,5447,1929,0.648839
2,5447,1299,0.645512
3,5447,581,0.640185
4,5447,1050,0.638735


In [7]:
resumes = pd.read_csv(resume_path)
jobs= pd.read_csv(jobs_path)

In [8]:
# Merge resume text
pairs_df["resume_text"] = pairs_df["resume_idx"].apply(lambda i: resumes.loc[i, "Resume_clean"])
# Merge job text
pairs_df["job_text"] = pairs_df["job_idx"].apply(lambda j: jobs.loc[j, "job_text_clean"])


In [15]:
# ========== CELL 8: Save Training Dataset ==========
import os

finetune_df = pairs_df[["resume_text", "job_text", "cosine_similarity"]].rename(
    columns={"cosine_similarity": "label"}
)

# Create output directory if it doesn't exist
output_dir = os.path.join(os.pardir, "data_outputs")
os.makedirs(output_dir, exist_ok=True)

# Save with proper path
output_path = os.path.join(output_dir, "roberta_finetuning_dataset.csv")
finetune_df.to_csv(output_path, index=False)

print(f"✅ Saved {len(finetune_df):,} training pairs to:")
print(f"   {os.path.abspath(output_path)}")
print(f"\nDataset shape: {finetune_df.shape}")
print(f"Label range: [{finetune_df['label'].min():.4f}, {finetune_df['label'].max():.4f}]")
finetune_df.head()


✅ Saved 5 training pairs to:
   c:\Users\adity\Desktop\ML_PYTHON\ai_recruitment\ai-recruitment\data_outputs\roberta_finetuning_dataset.csv

Dataset shape: (5, 3)
Label range: [0.6387, 0.6669]


Unnamed: 0,resume_text,job_text,label
0,multimedia sale consultant professional summar...,description san lazzaro llc looking individual...,0.666872
1,concrete construction summary highly experienc...,description san lazzaro llc looking individual...,0.648839
2,digital marketing specialist summary digital m...,description san lazzaro llc looking individual...,0.645512
3,business development rep summary ambitious mar...,description san lazzaro llc looking individual...,0.640185
4,sale director summary continue career organiza...,description san lazzaro llc looking individual...,0.638735
