Uses model: https://huggingface.co/shashu2325/resume-job-matcher-lora

Uses dataset: https://huggingface.co/datasets/cnamuangtoun/resume-job-description-fit

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the dataset CSV is read from
# a path under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# pandas is used throughout the notebook but was never imported; import it here,
# at first use, so the notebook survives Restart & Run All.
import pandas as pd

# Location of the resume / job-description dataset on the mounted Google Drive.
DATASET_PATH = '/content/drive/MyDrive/AI-ML Self Learning/next_horizon/resume_job_recommendation/model-shashu2325-resume-job-matcher-lora/dataset.csv'

# Load the labelled dataset; later cells read the 'Resume', 'Job_Description'
# and 'Decision' columns from this frame.
df = pd.read_csv(DATASET_PATH)
df

In [50]:
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel
import torch
import torch.nn.functional as F
import openai
import numpy as np
import os
from google.colab import userdata

# The OpenAI key is read from the Colab secret named 'OPENAI_API_KEY'.
# When the secret does not exist the key is set to None, which later code
# checks before attempting any OpenAI call.
try:
    _openai_key = userdata.get('OPENAI_API_KEY')
except userdata.SecretNotFoundError:
    print("OpenAI API key not found in Colab secrets. Please set it as a secret named 'OPENAI_API_KEY'.")
    openai.api_key = None
else:
    openai.api_key = _openai_key


# Load the BGE base encoder, wrap it with the LoRA adapter, then load the
# tokenizer that matches the base encoder. If any step fails, the error is
# printed and all three names are reset to None so that
# calculate_bge_similarity can detect the missing model and bail out.
try:
    base_model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
    model = PeftModel.from_pretrained(base_model, "shashu2325/resume-job-matcher-lora")
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
    print("BGE models and tokenizer loaded successfully.")
except Exception as load_error:
    print(f"Failed to load BGE models or tokenizer: {load_error}")
    base_model = model = tokenizer = None


def calculate_bge_similarity(resume_text, job_text):
    """Score a resume against a job description with the BGE + LoRA encoder.

    Both texts are tokenized (truncated and padded to 512 tokens), passed
    through the module-level ``model``, averaged over dim 1 of
    ``last_hidden_state``, L2-normalized and combined with a dot product.
    The dot product is finally squashed with a sigmoid.

    Args:
        resume_text: Resume text to encode.
        job_text: Job description text to encode.

    Returns:
        The sigmoid of the similarity as a Python float, or ``None`` when the
        model/tokenizer are not loaded or any step raises.
    """
    encoder_ready = model is not None and tokenizer is not None
    if not encoder_ready:
        print("BGE models or tokenizer not loaded. Cannot calculate similarity.")
        return None

    try:
        # Tokenize both texts with identical settings before running the model
        tokenizer_kwargs = dict(return_tensors="pt", max_length=512, padding="max_length", truncation=True)
        encoded_pair = [tokenizer(text, **tokenizer_kwargs) for text in (resume_text, job_text)]

        with torch.no_grad():
            hidden_states = [model(**encoded).last_hidden_state for encoded in encoded_pair]

            # Plain mean over dim 1 (no attention-mask weighting is applied, so
            # padded positions are included), followed by unit normalization
            resume_vec, job_vec = (
                F.normalize(states.mean(dim=1), p=2, dim=1) for states in hidden_states
            )

            # Dot product of unit vectors, then sigmoid to produce the score
            dot_product = (resume_vec * job_vec).sum(dim=1)
            return torch.sigmoid(dot_product).item()
    except Exception as e:
        print(f"Error calculating BGE similarity: {e}")
        return None

def get_openai_embedding(text, model="text-embedding-3-small"):
    """Fetch the OpenAI embedding vector for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: Text to embed.
        model: Name of the OpenAI embedding model to request.

    Returns:
        The embedding of the first item in the API response, or ``None`` when
        no API key is configured or the request raises.
    """
    if openai.api_key is None:
        print("OpenAI API key not set. Cannot get embedding.")
        return None

    try:
        single_line = text.replace("\n", " ")
        response = openai.embeddings.create(input=[single_line], model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting OpenAI embedding: {e}")
        return None

def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors as a Python float.

    Both inputs are converted to float NumPy arrays. If either vector has
    zero norm the similarity is defined here as 0.0 rather than dividing by
    zero.
    """
    vec_a, vec_b = (np.asarray(values, dtype=float) for values in (a, b))
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)

    # A zero-length vector has no direction; report "no similarity"
    if not norm_a or not norm_b:
        return 0.0

    return float(vec_a.dot(vec_b) / (norm_a * norm_b))


def calculate_openai_similarity(resume_text, job_text):
    """Compare a resume and a job description via their OpenAI embeddings.

    Both embeddings are requested (resume first, then job description) and
    compared with ``cosine_similarity``.

    Returns:
        The cosine similarity as a float, or ``None`` when either embedding
        could not be obtained.
    """
    embeddings = [get_openai_embedding(text) for text in (resume_text, job_text)]

    if any(vector is None for vector in embeddings):
        print("Failed to get OpenAI embeddings. Cannot calculate similarity.")
        return None

    resume_vector, job_vector = embeddings
    return cosine_similarity(resume_vector, job_vector)

# Example usage to verify functions (optional)
# resume_text = "Software engineer with Python experience"
# job_text = "Looking for Python developer"

# bge_score = calculate_bge_similarity(resume_text, job_text)
# openai_score = calculate_openai_similarity(resume_text, job_text)

# print(f"BGE Match score: {bge_score:.4f}")
# print(f"OpenAI Match score: {openai_score:.4f}")

BGE models and tokenizer loaded successfully.


In [55]:
# Split the dataset by label so the evaluation sample can be balanced.
select_df = df[df['Decision'] == 'select']
reject_df = df[df['Decision'] == 'reject']

# Target sample size. Aim for a 50/50 split; when one class is too small to
# supply its half, the other class fills the remainder (up to what it has).
total_samples = 1000
num_select = min(len(select_df), total_samples // 2)
num_reject = min(len(reject_df), total_samples - num_select)
# If 'reject' could not fill its share, top up with extra 'select' rows.
# When the first two lines already reach total_samples this is a no-op.
num_select = min(len(select_df), total_samples - num_reject)

# Fixed random_state keeps the sample reproducible across runs
sampled_select_df = select_df.sample(n=num_select, random_state=42)
sampled_reject_df = reject_df.sample(n=num_reject, random_state=42)

# Combine both classes, then shuffle so select and reject rows are interleaved
sampled_df = (
    pd.concat([sampled_select_df, sampled_reject_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

display(sampled_df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the output.
sampled_df.info()

Unnamed: 0,Role,Resume,Decision,Reason_for_decision,Job_Description
0,Data Engineer,"Here's a sample resume for Diana Johnson, a Da...",reject,Lack of enthusiasm or motivation.,Here is a job description for a Data Engineer:...
1,Database Administrator,Here's a sample resume for Carrie Smith:\n\nCa...,reject,Lacked leadership skills for a senior position.,We're hiring a Database Administrator to devel...
2,E-commerce Specialist,"Here's a professional resume for Sarah Myers, ...",reject,Needs improvement in machine learning algorithms.,Join our fast-growing team and help us scale o...
3,Software Engineer,**Meera**\n**Software Engineer Candidate**\n\n...,reject,The candidate lacked sufficient expertise in k...,Looking for a skilled Software Engineer with a...
4,IT Support Specialist,Here's a sample resume for Lauren King:\n\nLau...,select,Strong technical skills in AI and ML.,Be part of a passionate team at the forefront ...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Role                 1000 non-null   object
 1   Resume               1000 non-null   object
 2   Decision             1000 non-null   object
 3   Reason_for_decision  1000 non-null   object
 4   Job_Description      1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


None

In [56]:
def _text_or_empty(value):
    """Return ``value`` converted to ``str``, or "" when pandas treats it as missing."""
    return str(value) if pd.notna(value) else ""


bge_scores = []
openai_scores = []

# Walk the sample row by row; each completed iteration appends one entry to
# each score list (None when the score could not be computed).
row_iter = zip(sampled_df.index, sampled_df['Resume'], sampled_df['Job_Description'])
for index, raw_resume, raw_job in row_iter:
    resume_text = _text_or_empty(raw_resume)
    job_text = _text_or_empty(raw_job)

    # BGE score, guarded so one bad row cannot abort the whole loop
    try:
        print(f"Calculating BGE similarity for row {index}")
        bge_score = calculate_bge_similarity(resume_text, job_text)
        print(f"BGE similarity for row {index}: {bge_score}")
    except Exception as e:
        print(f"Error calculating BGE similarity for row {index}: {e}")
        bge_score = None
    bge_scores.append(bge_score)

    # OpenAI score; skipped (recorded as None) when no API key is configured
    try:
        if openai.api_key is not None:
            print(f"Calculating OpenAI similarity for row {index}")
            openai_score = calculate_openai_similarity(resume_text, job_text)
            print(f"OpenAI similarity for row {index}: {openai_score}")
        else:
            print(f"Skipping OpenAI similarity calculation for row {index} because API key is not set.")
            openai_score = None
    except Exception as e:
        print(f"Error calculating OpenAI similarity for row {index}: {e}")
        openai_score = None
    openai_scores.append(openai_score)

print("Score calculation complete.")

Calculating BGE similarity for row 0
BGE similarity for row 0: 0.688313364982605
Calculating OpenAI similarity for row 0
OpenAI similarity for row 0: 0.6665256834471475
Calculating BGE similarity for row 1
BGE similarity for row 1: 0.6440492868423462
Calculating OpenAI similarity for row 1
OpenAI similarity for row 1: 0.4304944558833671
Calculating BGE similarity for row 2
BGE similarity for row 2: 0.6475348472595215
Calculating OpenAI similarity for row 2
OpenAI similarity for row 2: 0.5265663025480486
Calculating BGE similarity for row 3
BGE similarity for row 3: 0.647560715675354
Calculating OpenAI similarity for row 3
OpenAI similarity for row 3: 0.5287259579597844
Calculating BGE similarity for row 4
BGE similarity for row 4: 0.5631677508354187
Calculating OpenAI similarity for row 4
OpenAI similarity for row 4: 0.3744969984247195
Calculating BGE similarity for row 5
BGE similarity for row 5: 0.6059319376945496
Calculating OpenAI similarity for row 5
OpenAI similarity for row 5: 0

KeyboardInterrupt: 

In [None]:
# Threshold above which a similarity score is classified as 'select'.
# NOTE(review): calculate_bge_similarity returns the sigmoid of a dot product of
# unit vectors, so 0.5 on that scale corresponds to a raw similarity of 0 --
# confirm this cut-off is appropriate for the BGE scores.
PREDICTION_THRESHOLD = 0.5

# Add the scores as new float columns, then classify each row by threshold.
for _model_name, _scores in (('bge', bge_scores), ('openai', openai_scores)):
    _score_col = f'{_model_name}_similarity'
    _pred_col = f'{_model_name}_prediction'

    # The score lists are shorter than the frame if the scoring loop was interrupted
    if len(_scores) != len(sampled_df):
        raise ValueError(
            f"{_score_col}: got {len(_scores)} scores for {len(sampled_df)} rows; "
            "re-run the scoring cell to completion before classifying."
        )

    # Force a float column so that failed computations (None) become NaN.
    # Without this an all-None column stays object-typed and `None > 0.5`
    # raises TypeError.
    sampled_df[_score_col] = _scores
    sampled_df[_score_col] = sampled_df[_score_col].astype(float)

    _missing = int(sampled_df[_score_col].isna().sum())
    if _missing:
        print(f"Warning: {_missing} row(s) have no {_score_col}; they are labelled 'reject' by default.")

    # NaN compares False against the threshold, so missing scores map to 'reject'
    sampled_df[_pred_col] = (
        sampled_df[_score_col].gt(PREDICTION_THRESHOLD).map({True: 'select', False: 'reject'})
    )

# Display the comparative table with original decision and predictions
print("Comparative Table of Similarity Scores and Predictions:")
display(sampled_df[['Role', 'Resume', 'Job_Description', 'Decision', 'bge_similarity', 'bge_prediction', 'openai_similarity', 'openai_prediction']])

In [None]:
from sklearn.metrics import classification_report, accuracy_score


def report_performance(label, score_col, pred_col):
    """Print the classification report and accuracy for one model's predictions.

    Rows whose similarity score is missing are excluded, so that failed score
    computations are not counted as 'reject' predictions.

    Args:
        label: Display name of the model (used in the printed headings).
        score_col: Column of ``sampled_df`` holding the similarity scores.
        pred_col: Column of ``sampled_df`` holding the predicted labels.
    """
    print(f"Performance Comparison for {label} Predictions:")

    scored = sampled_df[sampled_df[score_col].notna()]
    excluded = len(sampled_df) - len(scored)
    if excluded:
        print(f"Excluded {excluded} row(s) with no {label} similarity score.")
    if scored.empty:
        print(f"No {label} similarity scores available; skipping metrics.")
        return

    print(classification_report(scored['Decision'], scored[pred_col]))
    print(f"Accuracy: {accuracy_score(scored['Decision'], scored[pred_col]):.4f}")


# Compare performance for BGE predictions
report_performance("BGE", 'bge_similarity', 'bge_prediction')

print("\n" + "="*50 + "\n")

# Compare performance for OpenAI predictions
report_performance("OpenAI", 'openai_similarity', 'openai_prediction')