In [None]:
from faker import Faker
import random

def generate_domain_specific_resumes(num_resumes, seed=None):
    """Generate synthetic, space-joined resume strings for a few tech domains.

    Each resume is built from a fake name, a title, an experience sentence,
    a skills string and a short block of filler text, joined with single
    spaces into one string.

    Args:
        num_resumes: Number of resumes to generate. Values <= 0 yield an
            empty list.
        seed: Optional seed for reproducible output. When given, both the
            Faker instance and the random choices are seeded, so repeated
            calls with the same seed return the same resumes. When ``None``
            (default) the module-level ``random`` state is used, exactly as
            before.

    Returns:
        list[str]: One string per generated resume.
    """
    fake = Faker()
    if seed is None:
        # Preserve the historical behaviour: draw from the global random state.
        rng = random
    else:
        fake.seed_instance(seed)
        rng = random.Random(seed)

    skills = [
        "Python, SQL, Tableau", "Machine Learning, Deep Learning",
        "Data Analysis, Visualization", "Project Management, Agile",
        "Cloud Computing, AWS, Azure", "DevOps, Kubernetes, Docker"
    ]
    industries = [
        "Data Science", "Software Engineering", "Project Management",
        "Business Analysis", "Cloud Solutions", "IT Infrastructure"
    ]

    resumes = []
    for _ in range(num_resumes):
        # NOTE: the title and the industry named in the experience sentence
        # are drawn independently, so they may differ within one resume.
        resume = {
            "name": fake.name(),
            "title": rng.choice(industries),
            "experience": f"{rng.randint(1, 10)} years experience in {rng.choice(industries)}",
            "skills": rng.choice(skills),
            "summary": fake.text(max_nb_chars=200)
        }
        resumes.append(" ".join(resume.values()))
    return resumes

# Build the synthetic resume corpus (15,000 entries)
resumes = generate_domain_specific_resumes(num_resumes=15000)


In [None]:
import pandas as pd

# Location of the job postings CSV (replace with the actual path)
job_postings_path = '/content/postings.csv'

# Read the file and keep only its first 15,000 rows
job_postings_df = pd.read_csv(job_postings_path).head(15000)

# Collect the non-null job descriptions as a plain Python list.
# Rows without a description are dropped, so the list may be shorter than 15,000.
job_descriptions = job_postings_df["description"].dropna().tolist()

# Preview the first three descriptions
print(job_descriptions[:3])


['Job descriptionA leading real estate firm in New Jersey is seeking an administrative Marketing Coordinator with some experience in graphic design. You will be working closely with our fun, kind, ambitious members of the sales team and our dynamic executive team on a daily basis. This is an opportunity to be part of a fast-growing, highly respected real estate brokerage with a reputation for exceptional marketing and extraordinary culture of cooperation and inclusion.Who you are:You must be a well-organized, creative, proactive, positive, and most importantly, kind-hearted person. Please, be responsible, respectful, and cool-under-pressure. Please, be proficient in Adobe Creative Cloud (Indesign, Illustrator, Photoshop) and Microsoft Office Suite. Above all, have fantastic taste and be a good-hearted, fun-loving person who loves working with people and is eager to learn.Role:Our office is a fast-paced environment. You’ll work directly with a Marketing team and communicate daily with o

In [None]:
# Install sentence-transformers if not already installed
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer
import torch
from sentence_transformers import util

# Load the all-MiniLM-L6-v2 sentence-embedding model, on the GPU when one is available
model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Function to encode text in batches
def encode_in_batches(text_list, model, batch_size=32):
    """Encode texts with ``model`` in fixed-size batches and stack the results.

    Args:
        text_list: Sequence of strings to embed.
        model: Object exposing ``encode(batch, convert_to_tensor=True)``;
            each call is expected to return a tensor with one row per text.
        batch_size: Maximum number of texts per ``encode`` call (must be >= 1).

    Returns:
        torch.Tensor: The per-batch embeddings concatenated along dim 0, in
        input order. For empty input a tensor with zero rows is returned; its
        width is the model's reported embedding dimension when the model
        exposes ``get_sentence_embedding_dimension()``, otherwise 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    num_texts = len(text_list)
    if num_texts == 0:
        # torch.cat() rejects an empty list, so build the empty result directly.
        get_dim = getattr(model, "get_sentence_embedding_dimension", None)
        dim = get_dim() if callable(get_dim) else None
        return torch.empty((0, dim or 0))

    # Ceiling division: "floor + 1" over-reports by one whenever the number of
    # texts is an exact multiple of batch_size.
    total_batches = (num_texts + batch_size - 1) // batch_size

    embeddings = []
    for batch_number, start in enumerate(range(0, num_texts, batch_size), start=1):
        batch = text_list[start:start + batch_size]
        print(f"Processing batch {batch_number} of {total_batches}...")
        embeddings.append(model.encode(batch, convert_to_tensor=True))
    return torch.cat(embeddings, dim=0)

# Embed both corpora with the same model
job_embeddings = encode_in_batches(text_list=job_descriptions, model=model)
resume_embeddings = encode_in_batches(text_list=resumes, model=model)

# Pairwise cosine similarity between every job embedding and every resume embedding
similarity_matrix = util.cos_sim(job_embeddings, resume_embeddings)


Processing batch 1 of 469...
Processing batch 2 of 469...
Processing batch 3 of 469...
Processing batch 4 of 469...
Processing batch 5 of 469...
Processing batch 6 of 469...
Processing batch 7 of 469...
Processing batch 8 of 469...
Processing batch 9 of 469...
Processing batch 10 of 469...
Processing batch 11 of 469...
Processing batch 12 of 469...
Processing batch 13 of 469...
Processing batch 14 of 469...
Processing batch 15 of 469...
Processing batch 16 of 469...
Processing batch 17 of 469...
Processing batch 18 of 469...
Processing batch 19 of 469...
Processing batch 20 of 469...
Processing batch 21 of 469...
Processing batch 22 of 469...
Processing batch 23 of 469...
Processing batch 24 of 469...
Processing batch 25 of 469...
Processing batch 26 of 469...
Processing batch 27 of 469...
Processing batch 28 of 469...
Processing batch 29 of 469...
Processing batch 30 of 469...
Processing batch 31 of 469...
Processing batch 32 of 469...
Processing batch 33 of 469...
Processing batch 34

In [None]:
import numpy as np

# Compute Ranking Metrics: ND@5, R@1, MRR, ND@10, R@5
def compute_metrics(similarity_matrix, top_n=10):
    """Compute averaged ranking metrics from a query-by-candidate similarity matrix.

    Ground truth assumption: the relevant candidate for row ``i`` is column
    ``i`` (the job/resume pairing is positional). Rows whose index has no
    corresponding column simply score 0 on every metric.

    Metrics (each averaged over rows):
        * ND@k  - NDCG@k with a single relevant item, i.e. ``1 / log2(rank + 1)``
          when the relevant item is ranked within the top ``k``, else 0.
        * R@k   - 1 when the relevant item is ranked within the top ``k``, else 0.
        * MRR   - reciprocal rank of the relevant item, 0 when it is not
          within the top ``top_n``.

    Only the ``top_n`` highest-scoring candidates per row are inspected, so
    cut-offs larger than ``top_n`` are effectively capped at ``top_n``.

    Args:
        similarity_matrix: 2-D torch tensor or array-like of similarity
            scores, one row per query and one column per candidate.
        top_n: Number of top-ranked candidates considered per row.

    Returns:
        tuple: ``(nd_at_5, r_at_1, mrr, r_at_5, nd_at_10)`` as floats.
        All zeros when the matrix has no rows.
    """
    num_queries = similarity_matrix.shape[0]
    if num_queries == 0:
        # np.mean([]) would emit a warning and return NaN.
        return 0.0, 0.0, 0.0, 0.0, 0.0

    nd_at_5 = []
    r_at_1 = []
    mrr_score = []
    r_at_5 = []
    nd_at_10 = []

    for i in range(num_queries):
        # Convert one row at a time so a large (possibly GPU-resident) matrix
        # is never duplicated in host memory as a whole.
        row = similarity_matrix[i]
        if hasattr(row, "cpu"):
            similarity_scores = row.detach().cpu().numpy()
        else:
            similarity_scores = np.asarray(row)

        # Candidate indices ordered by descending similarity, truncated to top_n
        top_n_indices = np.argsort(similarity_scores)[::-1][:top_n]

        # Positional ground truth: resume i is the match for job i
        relevant_resume_idx = i
        hits = np.where(top_n_indices == relevant_resume_idx)[0]
        rank = int(hits[0]) + 1 if hits.size > 0 else None  # 1-based rank

        if rank is None:
            reciprocal_rank = 0.0
            gain = 0.0
        else:
            reciprocal_rank = 1.0 / rank
            # With one relevant item the ideal DCG is 1, so NDCG equals the
            # discounted gain at the item's rank.
            gain = 1.0 / np.log2(rank + 1)

        r_at_1.append(1 if rank == 1 else 0)
        r_at_5.append(1 if rank is not None and rank <= 5 else 0)
        mrr_score.append(reciprocal_rank)
        nd_at_5.append(gain if rank is not None and rank <= 5 else 0.0)
        nd_at_10.append(gain if rank is not None and rank <= 10 else 0.0)

    # Compute the average for each metric
    nd_at_5_avg = np.mean(nd_at_5)
    r_at_1_avg = np.mean(r_at_1)
    mrr_avg = np.mean(mrr_score)
    r_at_5_avg = np.mean(r_at_5)
    nd_at_10_avg = np.mean(nd_at_10)

    return nd_at_5_avg, r_at_1_avg, mrr_avg, r_at_5_avg, nd_at_10_avg


# Evaluate the ranking implied by the similarity matrix.
# `similarity_matrix` must already exist (it is built in an earlier cell).
(nd_at_5, r_at_1, mrr_score, r_at_5, nd_at_10) = compute_metrics(similarity_matrix)

# Report every metric to four decimal places, one per line
print(
    f"ND@5: {nd_at_5:.4f}",
    f"R@1: {r_at_1:.4f}",
    f"MRR: {mrr_score:.4f}",
    f"R@5: {r_at_5:.4f}",
    f"ND@10: {nd_at_10:.4f}",
    sep="\n",
)


ND@5: 1.0000
R@1: 0.0001
MRR: 0.0002
R@5: 0.0002
ND@10: 1.0000


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import time

# Step 1: Load dataset
import pandas as pd
data = pd.read_csv("/content/postings.csv")

# Step 2: Keep only rows that have both text fields, then lowercase those fields
data = data.dropna(subset=['description', 'skills_desc'])
data = data.assign(
    description=data['description'].str.lower(),
    skills_desc=data['skills_desc'].str.lower(),
)

# Step 3: Fit an independent TF-IDF vocabulary (capped at 10,000 terms) per text field
vectorizer_desc = TfidfVectorizer(max_features=10000)
vectorizer_skills = TfidfVectorizer(max_features=10000)
X_desc = vectorizer_desc.fit_transform(data['description'])
X_skills = vectorizer_skills.fit_transform(data['skills_desc'])

# Step 4: Cosine-distance 5-nearest-neighbour index over each TF-IDF matrix
nn_model_desc = NearestNeighbors(n_neighbors=5, metric='cosine').fit(X_desc)
nn_model_skills = NearestNeighbors(n_neighbors=5, metric='cosine').fit(X_skills)

# Step 5: Pre-trained BERT tokenizer and sequence-classification model.
# The classification head is not part of the checkpoint and is newly
# initialized (see the warning printed when this cell runs).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
llm_model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Step 6: Function to perform blending (Classic + LLM)
def blend_models(query_description, query_skills):
    """Run the TF-IDF nearest-neighbour lookups and a BERT classification for one query.

    Relies on the module-level ``vectorizer_desc``, ``vectorizer_skills``,
    ``nn_model_desc``, ``nn_model_skills``, ``tokenizer`` and ``llm_model``.

    Args:
        query_description: Free-text description used for the description
            index lookup and as the BERT input.
        query_skills: Free-text skills string used for the skills index lookup.

    Returns:
        tuple: ``(indices_desc, indices_skills, predictions)`` where the two
        index arrays are the neighbour indices returned by ``kneighbors`` and
        ``predictions`` is the argmax class id (int) of the BERT logits.

    Note:
        The classifier head of ``llm_model`` is newly initialized rather than
        loaded from the checkpoint (see the warning emitted at load time), so
        ``predictions`` carries no meaning until the model is fine-tuned.
    """
    # Classic Model (TF-IDF + cosine nearest neighbours)
    query_desc_vec = vectorizer_desc.transform([query_description])
    query_skills_vec = vectorizer_skills.transform([query_skills])

    _, indices_desc = nn_model_desc.kneighbors(query_desc_vec)
    _, indices_skills = nn_model_skills.kneighbors(query_skills_vec)

    # LLM Model (BERT)
    inputs = tokenizer(query_description, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disable autograd so no graph is built for the forward pass.
    with torch.no_grad():
        outputs = llm_model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).item()

    return indices_desc, indices_skills, predictions

# Step 7: Sample queries for blending
query_description = "Data Scientist with experience in Python and Machine Learning"
query_skills = "Python, Machine Learning, Data Analysis, SQL"

# Step 8: Perform blending of models (timed)
start_time = time.time()
indices_desc, indices_skills, predictions = blend_models(query_description, query_skills)
end_time = time.time()

# Step 9: Calculate performance metrics (ND@5, ND@10, R@1, R@5, MRR)
# `indices_desc[0]` holds the neighbour row indices for the single query.
# NOTE(review): no labelled ground truth is used here. Dataset rows with
# index < 5 (and row 0 for MRR) are treated as the "relevant" items, and only
# 5 neighbours are retrieved, so ND@10 cannot exceed 0.5. Confirm this is the
# intended evaluation.
nd_5 = len([1 for idx in indices_desc[0] if idx < 5]) / 5
nd_10 = len([1 for idx in indices_desc[0] if idx < 10]) / 10
r_1 = 1 if indices_desc[0][0] < 5 else 0
r_5 = 1 if any(idx < 5 for idx in indices_desc[0]) else 0
# numpy arrays have no .index(); convert to a list before locating row 0,
# otherwise this line raises AttributeError whenever row 0 is a neighbour.
mrr = 1 / (list(indices_desc[0]).index(0) + 1) if 0 in indices_desc[0] else 0

# Time
execution_time = end_time - start_time

# Print performance metrics
print(f"ND@5: {nd_5:.2f}")
print(f"ND@10: {nd_10:.2f}")
print(f"R@1: {r_1}")
print(f"R@5: {r_5}")
print(f"MRR: {mrr:.4f}")
print(f"Time (seconds): {execution_time:.2f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ND@5: 0.00
ND@10: 0.00
R@1: 0
R@5: 0
MRR: 0.0000
Time (seconds): 0.32


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import time
import pandas as pd

# Step 1: Read the job postings CSV
data = pd.read_csv("/content/postings.csv")

# Step 2: Keep only rows that have both text fields, then lowercase those fields
data = data.dropna(subset=['description', 'skills_desc'])
data = data.assign(
    description=data['description'].str.lower(),
    skills_desc=data['skills_desc'].str.lower(),
)

# Step 3: Fit an independent TF-IDF vocabulary (capped at 10,000 terms) per text field
vectorizer_desc = TfidfVectorizer(max_features=10000)
vectorizer_skills = TfidfVectorizer(max_features=10000)
X_desc = vectorizer_desc.fit_transform(data['description'])
X_skills = vectorizer_skills.fit_transform(data['skills_desc'])

# Step 4: Cosine-distance 5-nearest-neighbour index over each TF-IDF matrix
nn_model_desc = NearestNeighbors(n_neighbors=5, metric='cosine').fit(X_desc)
nn_model_skills = NearestNeighbors(n_neighbors=5, metric='cosine').fit(X_skills)

# Step 5: Function to perform blending (Classic + Nearest Neighbors)
def blend_models(query_description, query_skills):
    """Look up the nearest postings for a query in both TF-IDF indexes.

    Uses the module-level ``vectorizer_desc`` / ``nn_model_desc`` for the
    description text and ``vectorizer_skills`` / ``nn_model_skills`` for the
    skills text.

    Args:
        query_description: Free-text description to search with.
        query_skills: Free-text skills string to search with.

    Returns:
        tuple: ``(indices_desc, indices_skills)``, the neighbour index arrays
        returned by each index's ``kneighbors`` call.
    """
    # Project each query string into its own TF-IDF space
    desc_query_matrix = vectorizer_desc.transform([query_description])
    skills_query_matrix = vectorizer_skills.transform([query_skills])

    # kneighbors returns (distances, indices); only the indices are needed
    indices_desc = nn_model_desc.kneighbors(desc_query_matrix)[1]
    indices_skills = nn_model_skills.kneighbors(skills_query_matrix)[1]

    return indices_desc, indices_skills

# Step 6: Sample queries for blending
query_description = "Data Scientist with experience in Python and Machine Learning"
query_skills = "Python, Machine Learning, Data Analysis, SQL"

# Step 7: Perform blending of models (timed)
start_time = time.time()
indices_desc, indices_skills = blend_models(query_description, query_skills)
end_time = time.time()

# Step 8: Calculate performance metrics (ND@5, ND@10, R@1, R@5, MRR) for the
# description model. `indices_desc[0]` holds the neighbour row indices for the
# single query.
# NOTE(review): no labelled ground truth is used here. Dataset rows with
# index < 5 (and row 0 for MRR) are treated as the "relevant" items, and only
# 5 neighbours are retrieved, so ND@10 cannot exceed 0.5. Confirm this is the
# intended evaluation.
nd_5 = len([1 for idx in indices_desc[0] if idx < 5]) / 5
nd_10 = len([1 for idx in indices_desc[0] if idx < 10]) / 10
r_1 = 1 if indices_desc[0][0] < 5 else 0
r_5 = 1 if any(idx < 5 for idx in indices_desc[0]) else 0
# numpy arrays have no .index(); convert to a list before locating row 0,
# otherwise this line raises AttributeError whenever row 0 is a neighbour.
mrr = 1 / (list(indices_desc[0]).index(0) + 1) if 0 in indices_desc[0] else 0

# Time
execution_time = end_time - start_time

# Print performance metrics
print(f"ND@5: {nd_5:.2f}")
print(f"ND@10: {nd_10:.2f}")
print(f"R@1: {r_1}")
print(f"R@5: {r_5}")
print(f"MRR: {mrr:.4f}")
print(f"Time (seconds): {execution_time:.2f}")


ND@5: 0.00
ND@10: 0.00
R@1: 0
R@5: 0
MRR: 0.0000
Time (seconds): 0.02
