In [2]:
pip install voyageai

Collecting voyageai
  Downloading voyageai-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting aiolimiter (from voyageai)
  Downloading aiolimiter-1.2.1-py3-none-any.whl.metadata (4.5 kB)
Downloading voyageai-0.3.5-py3-none-any.whl (28 kB)
Downloading aiolimiter-1.2.1-py3-none-any.whl (6.7 kB)
Installing collected packages: aiolimiter, voyageai
Successfully installed aiolimiter-1.2.1 voyageai-0.3.5


# Resume-to-Job Description Matching: Multi-Model Performance Comparison

This notebook compares the performance of different embedding models for resume-to-job description matching tasks. We evaluate multiple models including:

- **BGE** (BAAI General Embedding with LoRA fine-tuning)
- **OpenAI** (text-embedding-3-small)
- **BGE-M3** (Multilingual BGE)
- **CareerBERT** (Career-specific BERT)
- **ConFit V2** (Using all-mpnet-base-v2)
- **Voyage-3-Large** (Voyage AI embedding)
- **LLaMA-3.1** (Remote Inference via Inference Providers)

The analysis includes similarity score calculations, threshold-based predictions, and comprehensive performance metrics.

## 1. Setup Environment and Load Libraries

Import all required libraries for model loading, text processing, similarity calculations, and performance evaluation.

In [3]:
# Import required libraries
from transformers import AutoModel, AutoTokenizer, LlamaModel, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from peft import PeftModel
import torch
import torch.nn.functional as F
import openai
import numpy as np
import os
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import voyageai
import requests
import time
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns
import json

print("All libraries imported successfully!")

All libraries imported successfully!


## 2. Initialize API Keys and Environment Variables

Load environment variables from .env file and set up API keys for external services.

In [4]:
# Load environment variables from .env file
# load_dotenv()

# Get API keys from environment variables
#openai.api_key = os.getenv('OPENAI_API_KEY')
#voyage_api_key = os.getenv('VOYAGE_API_KEY')

from google.colab import userdata
openai.api_key = userdata.get('OPENAI_API_KEY')
voyage_api_key = userdata.get('VOYAGE_API_KEY')

# Validate API keys
if openai.api_key is None:
    print("‚ö†Ô∏è  OpenAI API key not found in .env file. Please set OPENAI_API_KEY in your .env file.")
else:
    print("‚úÖ OpenAI API key loaded successfully.")

if voyage_api_key is None:
    print("‚ö†Ô∏è  Voyage API key not found in .env file. Please set VOYAGE_API_KEY in your .env file.")
else:
    print("‚úÖ Voyage API key loaded successfully.")
    voyageai.api_key = voyage_api_key

‚úÖ OpenAI API key loaded successfully.
‚úÖ Voyage API key loaded successfully.


## 3. Load Embedding Models

Initialize all embedding models with proper error handling. This may take several minutes as models are downloaded and loaded.

In [5]:
# Load BGE models and tokenizer
print("Loading BGE models...")
try:
    base_model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
    # Load Peft model (LoRA adapter)
    model = PeftModel.from_pretrained(base_model, "shashu2325/resume-job-matcher-lora")
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
    print("‚úÖ BGE models and tokenizer loaded successfully.")
except Exception as e:
    print(f"‚ùå Failed to load BGE models or tokenizer: {e}")
    base_model = None
    model = None
    tokenizer = None

Loading BGE models...


config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

‚úÖ BGE models and tokenizer loaded successfully.


In [6]:
# Load BGE-M3 model
print("Loading BGE-M3 model...")
try:
    bge_m3_model = SentenceTransformer("BAAI/bge-m3")
    print("‚úÖ BGE-M3 model loaded successfully.")
except Exception as e:
    print(f"‚ùå Failed to load BGE-M3 model: {e}")
    bge_m3_model = None

Loading BGE-M3 model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

‚úÖ BGE-M3 model loaded successfully.


In [7]:
# Load ConFit V2 model (using all-mpnet-base-v2 as a proxy for ConFit)
print("Loading ConFit V2 model...")
try:
    confit_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    print("‚úÖ ConFit V2 model loaded successfully.")
except Exception as e:
    print(f"‚ùå Failed to load ConFit V2 model: {e}")
    confit_model = None

Loading ConFit V2 model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ ConFit V2 model loaded successfully.


In [8]:
# Load LLaMA-3.1 for embeddings
# print("Setting up LLaMA-3.1 model...")
# try:
#     base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
#
#     # Load the base model
#     # base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
#     # llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
#
#     # Load the LoRA adapter
#     # peft_model_id = "LlamaFactoryAI/Llama-3.1-8B-Instruct-cv-job-description-matching"
#     # config = PeftConfig.from_pretrained(peft_model_id)
#     # llama_model = PeftModel.from_pretrained(base_model, peft_model_id)
#
#    # print("‚úÖ LLaMA-3.1 model setup complete.")
# except Exception as e:
#    print(f"‚ùå Failed to load LLaMA-3.1 model: {e}")
#    llama_model = None
#    llama_tokenizer = None

In [24]:
import os
from openai import OpenAI
from google.colab import userdata

# Get Hugging Face token from Colab secrets
hf_token = userdata.get('HF_TOKEN')

if hf_token is None:
    print("‚ö†Ô∏è Hugging Face token not found in Colab secrets. Please add it as 'HF_TOKEN'.")
else:
    print("‚úÖ Hugging Face token loaded successfully.")
    # Initialize the client with the Hugging Face Inference API base URL
    client = OpenAI(
        base_url="https://router.huggingface.co/v1", # Updated base_url
        api_key=hf_token, # Use the HF token as the API key
    )

    def get_llama_remote_response(resume_text, job_text):
        """Gets a response from the remote LLaMA-3.1 model via Hugging Face Inference API."""
        if client is None:
            print("Hugging Face Inference API client not initialized.")
            return None

        messages = [
            {
                "role": "system",
                "content": """You are an advanced AI model designed to analyze the compatibility between a CV and a job description. You will receive a CV and a job description. Your task is to output a structured JSON format that includes the following:

1. matching_analysis: Analyze the CV against the job description to identify key strengths and gaps.
2. description: Summarize the relevance of the CV to the job description in a few concise sentences.
3. score: Provide a numerical compatibility score (0-100) based on qualifications, skills, and experience.
4. recommendation: Suggest actions for the candidate to improve their match or readiness for the role.

Your output must be in JSON format as follows:
{
  "matching_analysis": "Your detailed analysis here.",
  "description": "A brief summary here.",
  "score": 85,
  "recommendation": "Your suggestions here."
}
""",
            },
            {"role": "user", "content": f"<CV> {resume_text} </CV>\n<job_description> {job_text} </job_description>"},
        ]

        try:
            completion = client.chat.completions.create(
                model="meta-llama/Llama-3.1-8B-Instruct", # Specify the model ID
                messages=messages,
                max_tokens=256, # Set max tokens for the response
                response_format={ "type": "json_object" } # Request JSON object output
            )
            # The response structure might vary slightly, access the message content
            response_content = completion.choices[0].message.content
            return response_content

        except Exception as e:
            print(f"Error calling Hugging Face Inference API: {e}")
            return None

    # Example usage (optional, for testing)
    # example_resume = "Software Engineer with 5 years experience..."
    # example_job = "Looking for a Senior Software Engineer..."
    # remote_response = get_llama_remote_response(example_resume, example_job)
    # print("\nRemote API Response:")
    # print(remote_response)

    print("‚úÖ Hugging Face Inference API client setup complete.")

‚úÖ Hugging Face token loaded successfully.
‚úÖ Hugging Face Inference API client setup complete.


In [10]:
# Load CareerBERT model
print("Loading CareerBERT model...")
try:
    careerbert_model = SentenceTransformer("lwolfrum2/careerbert-g")
    print("‚úÖ CareerBERT model loaded successfully.")
except Exception as e:
    print(f"‚ùå Failed to load CareerBERT model: {e}")
    careerbert_model = None

Loading CareerBERT model...


modules.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/56.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/305 [00:00<?, ?B/s]

‚úÖ CareerBERT model loaded successfully.


In [11]:
print("\nüéâ Model loading complete!")


üéâ Model loading complete!


## 4. Define Similarity Calculation Functions

Implement functions for calculating similarity scores using different embedding models.

In [12]:
def cosine_similarity(a, b):
    """Calculate cosine similarity between two vectors"""
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    na = np.linalg.norm(a)
    nb = np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return float(a.dot(b) / (na * nb))

def calculate_bge_similarity(resume_text, job_text):
    """Calculates similarity between resume and job text using BGE embeddings."""
    if model is None or tokenizer is None:
        print("BGE models or tokenizer not loaded. Cannot calculate similarity.")
        return None

    try:
        # Process texts
        resume_inputs = tokenizer(resume_text, return_tensors="pt", max_length=512, padding="max_length", truncation=True)
        job_inputs = tokenizer(job_text, return_tensors="pt", max_length=512, padding="max_length", truncation=True)

        # Get embeddings
        with torch.no_grad():
            # Get embeddings using mean pooling
            resume_outputs = model(**resume_inputs)
            job_outputs = model(**job_inputs)

            # Mean pooling
            resume_emb = resume_outputs.last_hidden_state.mean(dim=1)
            job_emb = job_outputs.last_hidden_state.mean(dim=1)

            # Normalize and calculate similarity
            resume_emb = F.normalize(resume_emb, p=2, dim=1)
            job_emb = F.normalize(job_emb, p=2, dim=1)

            similarity = torch.sum(resume_emb * job_emb, dim=1)
            match_score = torch.sigmoid(similarity).item()

        return match_score
    except Exception as e:
        print(f"Error calculating BGE similarity: {e}")
        return None

print("‚úÖ BGE similarity function defined.")

‚úÖ BGE similarity function defined.


In [13]:
def get_openai_embedding(text, model="text-embedding-3-small"):
    """Gets OpenAI embedding for a given text."""
    if openai.api_key is None:
        print("OpenAI API key not set. Cannot get embedding.")
        return None
    try:
        text = text.replace("\n", " ")
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception as e:
        print(f"Error getting OpenAI embedding: {e}")
        return None

def calculate_openai_similarity(resume_text, job_text):
    """Calculates cosine similarity between resume and job text using OpenAI embeddings."""
    resume_embedding = get_openai_embedding(resume_text)
    job_embedding = get_openai_embedding(job_text)

    if resume_embedding is None or job_embedding is None:
        print("Failed to get OpenAI embeddings. Cannot calculate similarity.")
        return None

    # Calculate similarity using the cosine function
    similarity_score = cosine_similarity(resume_embedding, job_embedding)
    return similarity_score

print("‚úÖ OpenAI similarity functions defined.")

‚úÖ OpenAI similarity functions defined.


In [14]:
def calculate_bge_m3_similarity(resume_text, job_text):
    """Calculates similarity using BGE-M3 model."""
    if bge_m3_model is None:
        print("BGE-M3 model not loaded. Cannot calculate similarity.")
        return None

    try:
        resume_embedding = bge_m3_model.encode(resume_text)
        job_embedding = bge_m3_model.encode(job_text)
        similarity_score = cosine_similarity(resume_embedding, job_embedding)
        return similarity_score
    except Exception as e:
        print(f"Error calculating BGE-M3 similarity: {e}")
        return None

def calculate_careerbert_similarity(resume_text, job_text):
    """Calculates similarity using CareerBERT model."""
    if careerbert_model is None:
        print("CareerBERT model not loaded. Cannot calculate similarity.")
        return None

    try:
        resume_embedding = careerbert_model.encode(resume_text)
        job_embedding = careerbert_model.encode(job_text)
        similarity_score = cosine_similarity(resume_embedding, job_embedding)
        return similarity_score
    except Exception as e:
        print(f"Error calculating CareerBERT similarity: {e}")
        return None

def calculate_confit_similarity(resume_text, job_text):
    """Calculates similarity using ConFit V2 model."""
    if confit_model is None:
        print("ConFit V2 model not loaded. Cannot calculate similarity.")
        return None

    try:
        resume_embedding = confit_model.encode(resume_text)
        job_embedding = confit_model.encode(job_text)
        similarity_score = cosine_similarity(resume_embedding, job_embedding)
        return similarity_score
    except Exception as e:
        print(f"Error calculating ConFit similarity: {e}")
        return None

print("‚úÖ BGE-M3, CareerBERT, and ConFit similarity functions defined.")

‚úÖ BGE-M3, CareerBERT, and ConFit similarity functions defined.


In [15]:
def get_voyage_embedding(text, model="voyage-3-large"):
    """Gets Voyage AI embedding for a given text."""
    if voyage_api_key is None:
        print("Voyage API key not set. Cannot get embedding.")
        return None

    try:
        vo = voyageai.Client()
        result = vo.embed([text], model=model)
        return result.embeddings[0]
    except Exception as e:
        print(f"Error getting Voyage embedding: {e}")
        return None

def calculate_voyage_similarity(resume_text, job_text):
    """Calculates similarity using Voyage AI embeddings."""
    resume_embedding = get_voyage_embedding(resume_text)
    job_embedding = get_voyage_embedding(job_text)

    if resume_embedding is None or job_embedding is None:
        print("Failed to get Voyage embeddings. Cannot calculate similarity.")
        return None

    similarity_score = cosine_similarity(resume_embedding, job_embedding)
    return similarity_score

print("‚úÖ Voyage similarity functions defined.")

‚úÖ Voyage similarity functions defined.


In [16]:
#def calculate_llama_similarity(resume_text, job_text):
#    """Calculates similarity using LLaMA-3.1 model."""
#    if llama_model is None or llama_tokenizer is None:
#        print("LLaMA-3.1 model or tokenizer not loaded. Cannot calculate similarity.")
#        return None
#
#    try:
#        messages = [
#            {
#                "role": "system",
#                "content": """You are an advanced AI model designed to analyze the compatibility between a CV and a job description. You will receive a CV and a job description. Your task is to output a structured JSON format that includes the following:
#
#1. matching_analysis: Analyze the CV against the job description to identify key strengths and gaps.
#2. description: Summarize the relevance of the CV to the job description in a few concise sentences.
#3. score: Provide a numerical compatibility score (0-100) based on qualifications, skills, and experience.
#4. recommendation: Suggest actions for the candidate to improve their match or readiness for the role.
#
#Your output must be in JSON format as follows:
#{
#  "matching_analysis": "Your detailed analysis here.",
#  "description": "A brief summary here.",
#  "score": 85,
#  "recommendation": "Your suggestions here."
#}
#""",
#            },
#            {"role": "user", "content": f"<CV> {resume_text} </CV>\n<job_description> {job_text} </job_description>"},
#        ]
#        inputs = llama_tokenizer.apply_chat_template(
#            messages, add_generation_prompt=True, return_tensors="pt"
#        )
#        outputs = llama_model.generate(inputs, max_new_tokens=256) # Increased max_new_tokens
#        generated_text = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
#
#        # Attempt to parse the JSON output and extract the score
#        try:
#            # Find the JSON part of the output
#            start_index = generated_text.find('{')
#            end_index = generated_text.rfind('}') + 1
#            json_string = generated_text[start_index:end_index]
#
#            # Parse the JSON
#            result = json.loads(json_string)
#            score = result.get('score', None) # Get score, default to None if not found
#
#            if score is not None:
#                # Normalize score to 0-1 range
#                return score / 100.0
#            else:
#                print("Warning: 'score' not found in LLaMA-3.1 output JSON.")
#                return None
#
#        except json.JSONDecodeError as e:
#            print(f"Error decoding LLaMA-3.1 output JSON: {e}")
#            print(f"Generated text was: {generated_text}")
#            return None
#        except Exception as e:
#            print(f"Error processing LLaMA-3.1 output: {e}")
#            print(f"Generated text was: {generated_text}")
#            return None
#
#    except Exception as e:
#        print(f"Error calculating LLaMA-3.1 similarity: {e}")
#        return None
#
#
#print("‚úÖ LLaMA similarity functions defined.")

In [25]:
def calculate_llama_remote_similarity(resume_text, job_text):
    """Calculates similarity using remote LLaMA-3.1 model via Hugging Face Inference API."""
    remote_response_json_string = get_llama_remote_response(resume_text, job_text)

    if remote_response_json_string is None:
        print("Failed to get remote LLaMA-3.1 response.")
        return None

    try:
        # Parse the JSON string
        result = json.loads(remote_response_json_string)
        score = result.get('score', None) # Get score, default to None if not found

        if score is not None:
            # Normalize score to 0-1 range
            # Assuming the remote model returns a score between 0 and 100
            return score / 100.0
        else:
            print("Warning: 'score' not found in remote LLaMA-3.1 output JSON.")
            return None

    except json.JSONDecodeError as e:
        print(f"Error decoding remote LLaMA-3.1 output JSON: {e}")
        print(f"Generated text was: {remote_response_json_string}")
        return None
    except Exception as e:
        print(f"Error processing remote LLaMA-3.1 output: {e}")
        print(f"Generated text was: {remote_response_json_string}")
        return None

print("‚úÖ Remote LLaMA similarity function defined.")

‚úÖ Remote LLaMA similarity function defined.


In [18]:
print("üéâ All similarity calculation functions are ready!")

üéâ All similarity calculation functions are ready!


## 5. Load and Sample Dataset

Load the dataset and create a balanced sample for testing.

In [20]:
from google.colab import drive
drive.mount('/content/drive')
DATASET_PATH='/content/drive/MyDrive/AI-ML Self Learning/next_horizon/resume_job_recommendation/model-shashu2325-resume-job-matcher-lora'

Mounted at /content/drive


In [21]:
# Load the dataset
print("Loading dataset...")
df = pd.read_csv(f'{DATASET_PATH}/dataset.csv')
print(f"‚úÖ Dataset loaded successfully. Shape: {df.shape}")

# Display basic information about the dataset
print(f"\nDataset Info:")
print(f"Total records: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"\nDecision distribution:")
print(df['Decision'].value_counts())

# Split into select and reject categories
select_df = df[df['Decision'] == 'select']
reject_df = df[df['Decision'] == 'reject']

print(f"\nSelect records: {len(select_df)}")
print(f"Reject records: {len(reject_df)}")

# Determine sample sizes for balanced representation
total_samples = 100  # Adjust this number based on your needs
num_select = min(len(select_df), total_samples // 2)
num_reject = min(len(reject_df), total_samples - num_select)

# Adjust if one category is much smaller
if num_select + num_reject < total_samples:
    if len(select_df) > len(reject_df):
        num_select = min(len(select_df), total_samples - num_reject)
    else:
        num_reject = min(len(reject_df), total_samples - num_select)

print(f"\nSampling {num_select} select records and {num_reject} reject records...")

# Sample the data
sampled_select_df = select_df.sample(n=num_select, random_state=42)
sampled_reject_df = reject_df.sample(n=num_reject, random_state=42)

# Combine and shuffle
sampled_df = pd.concat([sampled_select_df, sampled_reject_df])
sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"‚úÖ Final sample size: {len(sampled_df)} records")
print(f"Sample distribution: {sampled_df['Decision'].value_counts().to_dict()}")

Loading dataset...
‚úÖ Dataset loaded successfully. Shape: (10174, 5)

Dataset Info:
Total records: 10174
Columns: ['Role', 'Resume', 'Decision', 'Reason_for_decision', 'Job_Description']

Decision distribution:
Decision
reject    5114
select    5060
Name: count, dtype: int64

Select records: 5060
Reject records: 5114

Sampling 50 select records and 50 reject records...
‚úÖ Final sample size: 100 records
Sample distribution: {'reject': 50, 'select': 50}


## 6. Calculate Similarity Scores for All Models

Process each resume-job pair through all available embedding models. This may take several minutes depending on the sample size and available models.

In [26]:
# Initialize score lists for all models
bge_scores = []
openai_scores = []
bge_m3_scores = []
careerbert_scores = []
confit_scores = []
voyage_scores = []
#llama_scores = [] # Commented out as local LLaMA is removed
llama_remote_scores = []

print("üöÄ Starting similarity calculation for all models...")
print(f"Processing {len(sampled_df)} resume-job pairs...")

for index, row in sampled_df.iterrows():
    resume_text = str(row['Resume']) if pd.notna(row['Resume']) else ""
    job_text = str(row['Job_Description']) if pd.notna(row['Job_Description']) else ""

    print(f"\nüìã Processing row {index+1}/{len(sampled_df)}")

    # Calculate BGE similarity
    try:
        print(f"  üîç Calculating BGE similarity...")
        bge_score = calculate_bge_similarity(resume_text, job_text)
        print(f"  ‚úÖ BGE similarity: {bge_score}")
    except Exception as e:
        print(f"  ‚ùå Error calculating BGE similarity: {e}")
        bge_score = None
    bge_scores.append(bge_score)

    # Calculate OpenAI similarity
    try:
        if openai.api_key is None:
            print(f"  ‚è≠Ô∏è  Skipping OpenAI similarity (API key not set)")
            openai_score = None
        else:
            print(f"  üîç Calculating OpenAI similarity...")
            openai_score = calculate_openai_similarity(resume_text, job_text)
            print(f"  ‚úÖ OpenAI similarity: {openai_score}")
    except Exception as e:
        print(f"  ‚ùå Error calculating OpenAI similarity: {e}")
        openai_score = None
    openai_scores.append(openai_score)

    # Calculate BGE-M3 similarity
    try:
        print(f"  üîç Calculating BGE-M3 similarity...")
        bge_m3_score = calculate_bge_m3_similarity(resume_text, job_text)
        print(f"  ‚úÖ BGE-M3 similarity: {bge_m3_score}")
    except Exception as e:
        print(f"  ‚ùå Error calculating BGE-M3 similarity: {e}")
        bge_m3_score = None
    bge_m3_scores.append(bge_m3_score)

    # Calculate CareerBERT similarity
    try:
        print(f"  üîç Calculating CareerBERT similarity...")
        careerbert_score = calculate_careerbert_similarity(resume_text, job_text)
        print(f"  ‚úÖ CareerBERT similarity: {careerbert_score}")
    except Exception as e:
        print(f"  ‚ùå Error calculating CareerBERT similarity: {e}")
        careerbert_score = None
    careerbert_scores.append(careerbert_score)

    # Calculate ConFit similarity
    try:
        print(f"  üîç Calculating ConFit similarity...")
        confit_score = calculate_confit_similarity(resume_text, job_text)
        print(f"  ‚úÖ ConFit similarity: {confit_score}")
    except Exception as e:
        print(f"  ‚ùå Error calculating ConFit similarity: {e}")
        confit_score = None
    confit_scores.append(confit_score)

    # Calculate Voyage similarity
    try:
        if voyage_api_key is None:
            print(f"  ‚è≠Ô∏è  Skipping Voyage similarity (API key not set)")
            voyage_score = None
        else:
            print(f"  üîç Calculating Voyage similarity...")
            voyage_score = calculate_voyage_similarity(resume_text, job_text)
            print(f"  ‚úÖ Voyage similarity: {voyage_score}")
    except Exception as e:
        print(f"  ‚ùå Error calculating Voyage similarity: {e}")
        voyage_score = None
    voyage_scores.append(voyage_score)

    # Calculate LLaMA similarity
    #try: # Commented out as local LLaMA is removed
    #    print(f"  üîç Calculating LLaMA similarity...")
    #    llama_score = calculate_llama_similarity(resume_text, job_text)
    #    print(f"  ‚ö†Ô∏è  LLaMA similarity: {llama_score}")
    #except Exception as e:
    #    print(f"  ‚ùå Error calculating LLaMA similarity: {e}")
    #    llama_score = None
    #llama_scores.append(llama_score)

    # Calculate LLaMA remote similarity
    try:
        print(f"  üîç Calculating LLaMA remote similarity...")
        llama_remote_score = calculate_llama_remote_similarity(resume_text, job_text)
        print(f"  ‚ö†Ô∏è  LLaMA remote similarity: {llama_remote_score}")
    except Exception as e:
        print(f"  ‚ùå Error calculating LLaMA remote similarity: {e}")
        llama_remote_score = None
    llama_remote_scores.append(llama_remote_score)

print("\nüéâ Score calculation complete!")

üöÄ Starting similarity calculation for all models...
Processing 100 resume-job pairs...

üìã Processing row 1/100
  üîç Calculating BGE similarity...
  ‚úÖ BGE similarity: 0.6394322514533997
  üîç Calculating OpenAI similarity...
  ‚úÖ OpenAI similarity: 0.4729005547348723
  üîç Calculating BGE-M3 similarity...
  ‚úÖ BGE-M3 similarity: 0.6034197932107234
  üîç Calculating CareerBERT similarity...
  ‚úÖ CareerBERT similarity: 0.686802714282113
  üîç Calculating ConFit similarity...
  ‚úÖ ConFit similarity: 0.6205764148937876
  üîç Calculating Voyage similarity...
  ‚úÖ Voyage similarity: 0.6266660140820024
  üîç Calculating LLaMA remote similarity...
  ‚ö†Ô∏è  LLaMA remote similarity: 0.85

üìã Processing row 2/100
  üîç Calculating BGE similarity...
  ‚úÖ BGE similarity: 0.6075310111045837
  üîç Calculating OpenAI similarity...
  ‚úÖ OpenAI similarity: 0.4892628792201232
  üîç Calculating BGE-M3 similarity...
  ‚úÖ BGE-M3 similarity: 0.6195751762529121
  üîç Calculating 

## 7. Generate Predictions and Classification

Convert similarity scores to binary predictions using a threshold and create a comprehensive comparison table.

In [29]:
# Add the scores as new columns to the sampled DataFrame
sampled_df['bge_similarity'] = bge_scores
sampled_df['openai_similarity'] = openai_scores
sampled_df['bge_m3_similarity'] = bge_m3_scores
sampled_df['careerbert_similarity'] = careerbert_scores
sampled_df['confit_similarity'] = confit_scores
sampled_df['voyage_similarity'] = voyage_scores
#sampled_df['llama_similarity'] = llama_scores
sampled_df['llama_remote_similarity'] = llama_remote_scores

# Define threshold for classification
threshold = 0.5
print(f"Using threshold: {threshold} for classification")

# Create predictions for all models
def make_prediction(score, threshold=0.5):
    if score is None or pd.isna(score):
        return 'unknown'
    return 'select' if score > threshold else 'reject'

sampled_df['bge_prediction'] = sampled_df['bge_similarity'].apply(lambda x: make_prediction(x, threshold))
sampled_df['openai_prediction'] = sampled_df['openai_similarity'].apply(lambda x: make_prediction(x, threshold))
sampled_df['bge_m3_prediction'] = sampled_df['bge_m3_similarity'].apply(lambda x: make_prediction(x, threshold))
sampled_df['careerbert_prediction'] = sampled_df['careerbert_similarity'].apply(lambda x: make_prediction(x, threshold))
sampled_df['confit_prediction'] = sampled_df['confit_similarity'].apply(lambda x: make_prediction(x, threshold))
sampled_df['voyage_prediction'] = sampled_df['voyage_similarity'].apply(lambda x: make_prediction(x, threshold))
#sampled_df['llama_prediction'] = sampled_df['llama_similarity'].apply(lambda x: make_prediction(x, threshold))
sampled_df['llama_remote_prediction'] = sampled_df['llama_remote_similarity'].apply(lambda x: make_prediction(x, threshold))

print("‚úÖ Predictions generated for all models!")

# Display the comparative table
print("\nüìä Comparative Table of Similarity Scores and Predictions:")
display_columns = ['Role', 'Decision',
                  'bge_similarity', 'bge_prediction',
                  'openai_similarity', 'openai_prediction',
                  'bge_m3_similarity', 'bge_m3_prediction',
                  'careerbert_similarity', 'careerbert_prediction',
                  'confit_similarity', 'confit_prediction',
                  'voyage_similarity', 'voyage_prediction',
                  #'llama_similarity', 'llama_prediction',
                  'llama_remote_similarity', 'llama_remote_prediction']

comparison_table = sampled_df[display_columns]
print(comparison_table.to_string(index=False, float_format='%.4f'))

Using threshold: 0.5 for classification
‚úÖ Predictions generated for all models!

üìä Comparative Table of Similarity Scores and Predictions:
                      Role Decision  bge_similarity bge_prediction  openai_similarity openai_prediction  bge_m3_similarity bge_m3_prediction  careerbert_similarity careerbert_prediction  confit_similarity confit_prediction  voyage_similarity voyage_prediction  llama_remote_similarity llama_remote_prediction
         Robotics Engineer   reject          0.6394         select             0.4729            reject             0.6034            select                 0.6868                select             0.6206            select             0.6267            select                   0.8500                  select
     E-commerce Specialist   reject          0.6075         select             0.4893            reject             0.6196            select                 0.6223                select             0.5047            select             0.5

## 8. Evaluate Model Performance

Calculate comprehensive performance metrics for each embedding model including accuracy, precision, recall, and F1-score.

In [30]:
# Define models to evaluate
models = {
    'BGE': ('bge_prediction', 'bge_similarity'),
    'OpenAI': ('openai_prediction', 'openai_similarity'),
    'BGE-M3': ('bge_m3_prediction', 'bge_m3_similarity'),
    'CareerBERT': ('careerbert_prediction', 'careerbert_similarity'),
    'ConFit V2': ('confit_prediction', 'confit_similarity'),
    'Voyage-3-Large': ('voyage_prediction', 'voyage_similarity'),
    #'LLaMA-3.1': ('llama_prediction', 'llama_similarity'),
    'LLaMA-3.1-Remote': ('llama_remote_prediction', 'llama_remote_similarity')
}

# Store results for comparison
results_summary = []

print("üìä COMPREHENSIVE MODEL PERFORMANCE EVALUATION")
print("=" * 80)

for model_name, (pred_col, sim_col) in models.items():
    print(f"\nüîç {model_name} Model Performance:")
    print("-" * 50)

    # Filter out rows with unknown predictions
    valid_mask = sampled_df[pred_col] != 'unknown'
    valid_df = sampled_df[valid_mask]

    if len(valid_df) == 0:
        print(f"‚ùå No valid predictions for {model_name}")
        results_summary.append({
            'Model': model_name,
            'Accuracy': None,
            'Precision': None,
            'Recall': None,
            'F1': None,
            'Valid_Samples': 0
        })
        continue

    try:
        # Calculate metrics
        accuracy = accuracy_score(valid_df['Decision'], valid_df[pred_col])
        precision, recall, f1, _ = precision_recall_fscore_support(
            valid_df['Decision'], valid_df[pred_col], average='weighted'
        )

        print(f"‚úÖ Accuracy: {accuracy:.4f}")
        print(f"‚úÖ Precision: {precision:.4f}")
        print(f"‚úÖ Recall: {recall:.4f}")
        print(f"‚úÖ F1-Score: {f1:.4f}")
        print(f"üìà Valid Samples: {len(valid_df)}/{len(sampled_df)}")

        # Detailed classification report
        print(f"\nüìã Detailed Classification Report:")
        print(classification_report(valid_df['Decision'], valid_df[pred_col]))

        # Store results
        results_summary.append({
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'Valid_Samples': len(valid_df)
        })

    except Exception as e:
        print(f"‚ùå Error calculating metrics for {model_name}: {e}")
        results_summary.append({
            'Model': model_name,
            'Accuracy': None,
            'Precision': None,
            'Recall': None,
            'F1': None,
            'Valid_Samples': len(valid_df)
        })

print("\nüéâ Performance evaluation complete!")

üìä COMPREHENSIVE MODEL PERFORMANCE EVALUATION

üîç BGE Model Performance:
--------------------------------------------------
‚úÖ Accuracy: 0.5000
‚úÖ Precision: 0.2500
‚úÖ Recall: 0.5000
‚úÖ F1-Score: 0.3333
üìà Valid Samples: 100/100

üìã Detailed Classification Report:
              precision    recall  f1-score   support

      reject       0.00      0.00      0.00        50
      select       0.50      1.00      0.67        50

    accuracy                           0.50       100
   macro avg       0.25      0.50      0.33       100
weighted avg       0.25      0.50      0.33       100


üîç OpenAI Model Performance:
--------------------------------------------------
‚úÖ Accuracy: 0.4800
‚úÖ Precision: 0.4740
‚úÖ Recall: 0.4800
‚úÖ F1-Score: 0.4482
üìà Valid Samples: 100/100

üìã Detailed Classification Report:
              precision    recall  f1-score   support

      reject       0.49      0.72      0.58        50
      select       0.46      0.24      0.32        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 9. Create Summary Comparison Table

Generate a comprehensive summary table ranking all models by performance metrics.

In [31]:
# Create summary comparison table
print("üèÜ SUMMARY COMPARISON TABLE")
print("=" * 80)

results_df = pd.DataFrame(results_summary)
print(results_df.to_string(index=False, float_format='%.4f'))

# Find best performing model
valid_results = results_df[results_df['Accuracy'].notna()]
if len(valid_results) > 0:
    best_model = valid_results.loc[valid_results['Accuracy'].idxmax()]
    print(f"\nü•á Best Performing Model: {best_model['Model']} (Accuracy: {best_model['Accuracy']:.4f})")

    # Sort by accuracy for ranking
    sorted_results = valid_results.sort_values('Accuracy', ascending=False)
    print(f"\nüìä Model Ranking by Accuracy:")
    for i, (_, row) in enumerate(sorted_results.iterrows(), 1):
        medal = "ü•á" if i == 1 else "ü•à" if i == 2 else "ü•â" if i == 3 else f"{i}."
        print(f"{medal} {row['Model']}: {row['Accuracy']:.4f}")
else:
    print("\n‚ö†Ô∏è  No valid results found for comparison.")

print("\n" + "=" * 80)

üèÜ SUMMARY COMPARISON TABLE
           Model  Accuracy  Precision  Recall     F1  Valid_Samples
             BGE    0.5000     0.2500  0.5000 0.3333            100
          OpenAI    0.4800     0.4740  0.4800 0.4482            100
          BGE-M3    0.4700     0.3848  0.4700 0.3498            100
      CareerBERT    0.5000     0.5000  0.5000 0.3800            100
       ConFit V2    0.4900     0.4887  0.4900 0.4748            100
  Voyage-3-Large    0.4700     0.3848  0.4700 0.3498            100
LLaMA-3.1-Remote    0.5250     0.5571  0.5250 0.4473             40

ü•á Best Performing Model: LLaMA-3.1-Remote (Accuracy: 0.5250)

üìä Model Ranking by Accuracy:
ü•á LLaMA-3.1-Remote: 0.5250
ü•à CareerBERT: 0.5000
ü•â BGE: 0.5000
4. ConFit V2: 0.4900
5. OpenAI: 0.4800
6. BGE-M3: 0.4700
7. Voyage-3-Large: 0.4700

