In [None]:
import pandas as pd
import random

# Read the raw job-posting and resume tables from the Colab workspace
postings_df = pd.read_csv(filepath_or_buffer="/content/postings.csv")
resumes_df = pd.read_csv(filepath_or_buffer="/content/resume_data.csv")

# Define skill sets based on job categories
skills_pool = {
    "Marketing": ["SEO", "Content Creation", "Adobe Photoshop", "Google Analytics", "Social Media Marketing"],
    "Data Science": ["Python", "Machine Learning", "SQL", "Pandas", "Deep Learning"],
    "Software Engineering": ["Java", "C++", "JavaScript", "React", "Node.js"],
    "Therapy": ["Cognitive Behavioral Therapy", "Crisis Intervention", "Patient Assessment", "EMDR", "Psychotherapy"],
    "General": ["Communication", "Problem Solving", "Leadership", "Time Management", "Teamwork"]
}

# Function to assign random skills
def generate_skills(category, num_skills=3, rng=None):
    """Return a comma-separated string of distinct skills sampled for a category.

    Args:
        category: Key into ``skills_pool``; unknown categories fall back to
            the "General" pool.
        num_skills: Number of skills requested. Requests larger than the pool
            are clamped to the pool size instead of raising ``ValueError``.
        rng: Optional ``random.Random`` instance used for sampling. When
            omitted the module-level ``random`` generator is used, so existing
            callers behave exactly as before.

    Returns:
        The sampled skills joined with ", ".

    Raises:
        ValueError: If ``num_skills`` is negative (propagated from ``sample``).
    """
    pool = skills_pool.get(category, skills_pool["General"])
    sampler = random if rng is None else rng
    # random.sample refuses to draw more items than the population holds.
    count = min(num_skills, len(pool))
    return ", ".join(sampler.sample(pool, count))

# Process job postings
# Seed the module-level RNG so the synthetic skills are identical on every run.
random.seed(42)

# A missing "title" column is treated as an empty title for every row, which
# maps every posting to the "General" category.
posting_titles = postings_df["title"] if "title" in postings_df.columns else [""] * len(postings_df)

# Build the whole column in one pass (one draw per row, in row order) and
# assign it once, instead of writing cell-by-cell into the frame while
# iterating over it with iterrows().
postings_df["skills_desc"] = [
    generate_skills(
        "Marketing" if "Marketing" in str(title)
        else "Therapy" if "Therapist" in str(title)
        else "General",
        4,
    )
    for title in posting_titles
]

# Process resumes
# Seed the module-level RNG so the synthetic skills are identical on every run.
random.seed(43)

# Job ids that map a resume to a specific skill pool; every other id (or a
# missing "job_id" column) falls back to the "General" pool.
resume_job_categories = {921716: "Marketing", 1829192: "Therapy"}
resume_job_ids = resumes_df["job_id"] if "job_id" in resumes_df.columns else [None] * len(resumes_df)

# Build the whole column in one pass (one draw per row, in row order) and
# assign it once, instead of writing cell-by-cell into the frame while
# iterating over it with iterrows().
resumes_df["skills_desc"] = [
    generate_skills(resume_job_categories.get(job_id, "General"), 2)
    for job_id in resume_job_ids
]

# Write both augmented tables to the current working directory, without the index column.
# NOTE(review): the next cell reads these files from /content/, which only
# matches when the working directory is /content — confirm for non-Colab runs.
postings_df.to_csv(path_or_buf="updated_postings.csv", index=False)
resumes_df.to_csv(path_or_buf="updated_resumes.csv", index=False)

print("Updated datasets saved as 'updated_postings.csv' and 'updated_resumes.csv'")


Updated datasets saved as 'updated_postings.csv' and 'updated_resumes.csv'


In [None]:
import pandas as pd
import numpy as np
import torch
import time
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the skill-augmented tables written by the previous cell
postings = pd.read_csv("/content/updated_postings.csv")  # Adjust path for Colab
df_resume = pd.read_csv("/content/updated_resumes.csv")

# Both tables must carry the 'skills_desc' text column used by every model below
assert all('skills_desc' in frame.columns for frame in (postings, df_resume)), "Missing 'skills_desc' column in datasets"

# Load the pretrained "bert-base-uncased" tokenizer and encoder, and place the
# encoder on the GPU when one is available (CPU otherwise)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model = model.to(device)

# Function to generate BERT embeddings in batches
def get_bert_embeddings(texts, batch_size=16):
    """Embed texts with the module-level BERT model using masked mean pooling.

    Args:
        texts: Iterable of strings (pandas Series, list, array, ...).
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        ``np.ndarray`` with one row per input text. Each row is the mean of
        the model's ``last_hidden_state`` over that text's real tokens only.

    Notes:
        Relies on the module-level ``tokenizer``, ``model`` and ``device``.
        Padding positions are excluded from the mean via the attention mask;
        averaging over them would make a text's embedding depend on the
        longest text that happens to share its batch.
    """
    texts = list(texts)  # accept any iterable, not only objects with .tolist()
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for a row with no real tokens.
        token_counts = mask.sum(dim=1).clamp(min=1)
        batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

# Embed the skills text of both tables with BERT and report the wall-clock time.
# Each column cell stores one embedding row.
start_time = time.time()
postings["bert_embeddings"] = [vec for vec in get_bert_embeddings(postings["skills_desc"])]
df_resume["bert_embeddings"] = [vec for vec in get_bert_embeddings(df_resume["skills_desc"])]
print(f"BERT Embeddings Generated in {time.time() - start_time:.2f} sec.")

# LSM (Latent Semantic Matching) using TF-IDF + LSA (SVD)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit the TF-IDF vocabulary on job postings first: the SVD rank has to be
# chosen from the vocabulary size that was actually learned.
postings_tfidf = vectorizer.fit_transform(postings["skills_desc"])

# TruncatedSVD raises "n_components(300) must be <= n_features(...)" when the
# requested rank exceeds the number of TF-IDF features, so cap it at the
# vocabulary size. random_state makes the SVD repeatable between runs.
lsa = TruncatedSVD(n_components=min(300, postings_tfidf.shape[1]), random_state=42)
postings_lsm = lsa.fit_transform(postings_tfidf)

# Transform resumes using the same fitted vectorizer and SVD
resumes_tfidf = vectorizer.transform(df_resume["skills_desc"])
resumes_lsm = lsa.transform(resumes_tfidf)

# Placeholder for TallREC Model (Assuming pre-trained embeddings)
def get_tallrec_embeddings(texts, dim=300):
    """Return placeholder embeddings: uniform random values in [0, 1).

    The values do not depend on the content of ``texts``; only its length is
    used. They are drawn from NumPy's global random state.

    Args:
        texts: Sized collection of input texts.
        dim: Width of each embedding row (defaults to the original 300).

    Returns:
        ``np.ndarray`` of shape ``(len(texts), dim)``.
    """
    return np.random.rand(len(texts), dim)  # Replace with actual TallREC model

# Attach one placeholder TallREC row per record (postings first, then resumes)
postings["tallrec_embeddings"] = [vec for vec in get_tallrec_embeddings(postings["skills_desc"])]
df_resume["tallrec_embeddings"] = [vec for vec in get_tallrec_embeddings(df_resume["skills_desc"])]

# Fusion: Concatenation of all three embeddings
def fuse_embeddings(bert_emb, lsm_emb, tallrec_emb):
    """Stack the BERT, LSM and TallREC vectors horizontally, in that order."""
    parts = (bert_emb, lsm_emb, tallrec_emb)
    return np.hstack(parts)

# Fuse the three per-row vectors of each table into a single vector per row
postings["fused_embeddings"] = [
    fuse_embeddings(bert_vec, lsm_vec, tallrec_vec)
    for bert_vec, lsm_vec, tallrec_vec in zip(postings["bert_embeddings"], postings_lsm, postings["tallrec_embeddings"])
]
df_resume["fused_embeddings"] = [
    fuse_embeddings(bert_vec, lsm_vec, tallrec_vec)
    for bert_vec, lsm_vec, tallrec_vec in zip(df_resume["bert_embeddings"], resumes_lsm, df_resume["tallrec_embeddings"])
]

# Compute Similarities and Rankings
def compute_similarity(job_embeddings, resume_embeddings):
    """Cosine similarity with resumes passed as rows (X) and jobs as columns (Y)."""
    return cosine_similarity(X=resume_embeddings, Y=job_embeddings)

# Stack the per-row fused vectors into 2-D arrays and score every resume against every posting
similarity_matrix = compute_similarity(
    job_embeddings=np.vstack(postings["fused_embeddings"]),
    resume_embeddings=np.vstack(df_resume["fused_embeddings"]),
)

# Ranking Function
def get_rankings(similarity_matrix):
    """Return, per row, the column indices ordered from highest to lowest score."""
    negated = -similarity_matrix  # ascending sort of the negation == descending sort
    return np.argsort(negated, axis=1)

# rankings[i] lists job-posting row positions for resume i, most similar first
rankings = get_rankings(similarity_matrix)

# Evaluation Metrics
def evaluate(rankings, k_values=(1, 5, 10), ground_truth=None):
    """Compute Recall@k, NDCG@k and MRR with one relevant job per resume.

    Args:
        rankings: 2-D integer array; row ``i`` lists job indices for resume
            ``i`` ordered from best to worst match.
        k_values: Cut-offs at which Recall and NDCG are reported.
        ground_truth: Optional sequence giving the relevant job index for each
            resume. Defaults to the diagonal assumption (resume ``i`` matches
            job ``i``), which is the original behaviour.

    Returns:
        ``(recall_at_k, ndcg_at_k, mrr)`` where the first two are dicts keyed
        by ``k``. Every metric is averaged over all resumes; a resume whose
        relevant job never appears in its ranking contributes 0.

    Raises:
        ValueError: If ``ground_truth`` does not have one entry per resume.
    """
    rankings = np.asarray(rankings)
    k_values = list(k_values)
    num_resumes = rankings.shape[0]

    # Nothing to rank: report zeros instead of dividing by zero.
    if num_resumes == 0 or rankings.ndim < 2 or rankings.shape[1] == 0:
        return {k: 0.0 for k in k_values}, {k: 0.0 for k in k_values}, 0.0

    if ground_truth is None:
        ground_truth = np.arange(num_resumes)
    else:
        ground_truth = np.asarray(ground_truth)
        if ground_truth.shape != (num_resumes,):
            raise ValueError("ground_truth must contain exactly one job index per resume")

    # hits[i, j] is True where position j of resume i's ranking is its relevant job.
    hits = rankings == ground_truth[:, None]
    found = hits.any(axis=1)
    # 1-based rank of the first hit, only for resumes whose relevant job was ranked.
    ranks = hits.argmax(axis=1)[found] + 1

    mrr = float((1.0 / ranks).sum() / num_resumes)

    recall_at_k = {}
    ndcg_at_k = {}
    for k in k_values:
        within_k = ranks <= k
        recall_at_k[k] = float(within_k.sum() / num_resumes)
        # With a single relevant item the ideal DCG is 1, so NDCG reduces to the DCG.
        ndcg_at_k[k] = float((1.0 / np.log2(ranks[within_k] + 1)).sum() / num_resumes)

    return recall_at_k, ndcg_at_k, mrr

# Score the rankings at the three reported cut-offs
recall, ndcg, mrr = evaluate(rankings, k_values=[1, 5, 10])

# Emit the three result lines with a single print call
print(
    f"Recall@1: {recall[1]:.4f}, Recall@5: {recall[5]:.4f}, Recall@10: {recall[10]:.4f}",
    f"NDCG@5: {ndcg[5]:.4f}, NDCG@10: {ndcg[10]:.4f}",
    f"MRR: {mrr:.4f}",
    sep="\n",
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embeddings Generated in 120.61 sec.


ValueError: n_components(300) must be <= n_features(26).

In [None]:
import pandas as pd
import numpy as np
import torch
import time
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the skill-augmented tables written by the first cell
postings = pd.read_csv("/content/updated_postings.csv")  # Adjust path for Colab
df_resume = pd.read_csv("/content/updated_resumes.csv")

# Both tables must carry the 'skills_desc' text column used by every model below
assert all('skills_desc' in frame.columns for frame in (postings, df_resume)), "Missing 'skills_desc' column in datasets"

# Load the pretrained "bert-base-uncased" tokenizer and encoder, and place the
# encoder on the GPU when one is available (CPU otherwise)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model = model.to(device)

# Function to generate BERT embeddings in batches
def get_bert_embeddings(texts, batch_size=16):
    """Embed texts with the module-level BERT model using masked mean pooling.

    Args:
        texts: Iterable of strings (pandas Series, list, array, ...).
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        ``np.ndarray`` with one row per input text. Each row is the mean of
        the model's ``last_hidden_state`` over that text's real tokens only.

    Notes:
        Relies on the module-level ``tokenizer``, ``model`` and ``device``.
        Padding positions are excluded from the mean via the attention mask;
        averaging over them would make a text's embedding depend on the
        longest text that happens to share its batch.
    """
    texts = list(texts)  # accept any iterable, not only objects with .tolist()
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for a row with no real tokens.
        token_counts = mask.sum(dim=1).clamp(min=1)
        batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

# Embed the skills text of both tables with BERT and report the wall-clock time.
# Each column cell stores one embedding row.
start_time = time.time()
postings["bert_embeddings"] = [vec for vec in get_bert_embeddings(postings["skills_desc"])]
df_resume["bert_embeddings"] = [vec for vec in get_bert_embeddings(df_resume["skills_desc"])]
print(f"BERT Embeddings Generated in {time.time() - start_time:.2f} sec.")

# LSM (Latent Semantic Matching) using TF-IDF + LSA (SVD)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=1, max_df=0.95)

# Fit the TF-IDF vocabulary on job postings first: the SVD rank has to be
# chosen from the vocabulary size that was actually learned.
postings_tfidf = vectorizer.fit_transform(postings["skills_desc"])

# Cap the SVD rank at the number of TF-IDF features (at most 100). The SVD is
# constructed once, after the rank is known; random_state makes it repeatable
# between runs.
lsa_n_components = min(postings_tfidf.shape[1], 100)  # Adjust dynamically
lsa = TruncatedSVD(n_components=lsa_n_components, random_state=42)
postings_lsm = lsa.fit_transform(postings_tfidf)

# Transform resumes using the same fitted vectorizer and SVD
resumes_tfidf = vectorizer.transform(df_resume["skills_desc"])
resumes_lsm = lsa.transform(resumes_tfidf)

# Placeholder for TallREC Model (Assuming pre-trained embeddings)
def get_tallrec_embeddings(texts, dim=100):
    """Return placeholder embeddings: uniform random values in [0, 1).

    The values do not depend on the content of ``texts``; only its length is
    used. They are drawn from NumPy's global random state.

    Args:
        texts: Sized collection of input texts.
        dim: Width of each embedding row (defaults to the original 100).

    Returns:
        ``np.ndarray`` of shape ``(len(texts), dim)``.
    """
    return np.random.rand(len(texts), dim)  # Replace with actual TallREC model

# Attach one placeholder TallREC row per record (postings first, then resumes)
postings["tallrec_embeddings"] = [vec for vec in get_tallrec_embeddings(postings["skills_desc"])]
df_resume["tallrec_embeddings"] = [vec for vec in get_tallrec_embeddings(df_resume["skills_desc"])]

# Fusion: Concatenation of all three embeddings
def fuse_embeddings(bert_emb, lsm_emb, tallrec_emb):
    """Stack the BERT, LSM and TallREC vectors horizontally, in that order."""
    parts = (bert_emb, lsm_emb, tallrec_emb)
    return np.hstack(parts)

# Fuse the three per-row vectors of each table into a single vector per row
postings["fused_embeddings"] = [
    fuse_embeddings(bert_vec, lsm_vec, tallrec_vec)
    for bert_vec, lsm_vec, tallrec_vec in zip(postings["bert_embeddings"], postings_lsm, postings["tallrec_embeddings"])
]
df_resume["fused_embeddings"] = [
    fuse_embeddings(bert_vec, lsm_vec, tallrec_vec)
    for bert_vec, lsm_vec, tallrec_vec in zip(df_resume["bert_embeddings"], resumes_lsm, df_resume["tallrec_embeddings"])
]

# Compute Similarities and Rankings
def compute_similarity(job_embeddings, resume_embeddings):
    """Cosine similarity with resumes passed as rows (X) and jobs as columns (Y)."""
    return cosine_similarity(X=resume_embeddings, Y=job_embeddings)

# Stack the per-row fused vectors into 2-D arrays and score every resume against every posting
similarity_matrix = compute_similarity(
    job_embeddings=np.vstack(postings["fused_embeddings"]),
    resume_embeddings=np.vstack(df_resume["fused_embeddings"]),
)

# Ranking Function
def get_rankings(similarity_matrix):
    """Return, per row, the column indices ordered from highest to lowest score."""
    negated = -similarity_matrix  # ascending sort of the negation == descending sort
    return np.argsort(negated, axis=1)

# rankings[i] lists job-posting row positions for resume i, most similar first
rankings = get_rankings(similarity_matrix)

# Evaluation Metrics
def evaluate(rankings, k_values=(1, 5, 10), ground_truth=None):
    """Compute Recall@k, NDCG@k and MRR with one relevant job per resume.

    Args:
        rankings: 2-D integer array; row ``i`` lists job indices for resume
            ``i`` ordered from best to worst match.
        k_values: Cut-offs at which Recall and NDCG are reported.
        ground_truth: Optional sequence giving the relevant job index for each
            resume. Defaults to the diagonal assumption (resume ``i`` matches
            job ``i``), which is the original behaviour.

    Returns:
        ``(recall_at_k, ndcg_at_k, mrr)`` where the first two are dicts keyed
        by ``k``. Every metric is averaged over all resumes; a resume whose
        relevant job never appears in its ranking contributes 0.

    Raises:
        ValueError: If ``ground_truth`` does not have one entry per resume.
    """
    rankings = np.asarray(rankings)
    k_values = list(k_values)
    num_resumes = rankings.shape[0]

    # Nothing to rank: report zeros instead of dividing by zero.
    if num_resumes == 0 or rankings.ndim < 2 or rankings.shape[1] == 0:
        return {k: 0.0 for k in k_values}, {k: 0.0 for k in k_values}, 0.0

    if ground_truth is None:
        ground_truth = np.arange(num_resumes)
    else:
        ground_truth = np.asarray(ground_truth)
        if ground_truth.shape != (num_resumes,):
            raise ValueError("ground_truth must contain exactly one job index per resume")

    # hits[i, j] is True where position j of resume i's ranking is its relevant job.
    hits = rankings == ground_truth[:, None]
    found = hits.any(axis=1)
    # 1-based rank of the first hit, only for resumes whose relevant job was ranked.
    ranks = hits.argmax(axis=1)[found] + 1

    mrr = float((1.0 / ranks).sum() / num_resumes)

    recall_at_k = {}
    ndcg_at_k = {}
    for k in k_values:
        within_k = ranks <= k
        recall_at_k[k] = float(within_k.sum() / num_resumes)
        # With a single relevant item the ideal DCG is 1, so NDCG reduces to the DCG.
        ndcg_at_k[k] = float((1.0 / np.log2(ranks[within_k] + 1)).sum() / num_resumes)

    return recall_at_k, ndcg_at_k, mrr

# Score the rankings at the three reported cut-offs
recall, ndcg, mrr = evaluate(rankings, k_values=[1, 5, 10])

# Emit the three result lines with a single print call
print(
    f"Recall@1: {recall[1]:.4f}, Recall@5: {recall[5]:.4f}, Recall@10: {recall[10]:.4f}",
    f"NDCG@5: {ndcg[5]:.4f}, NDCG@10: {ndcg[10]:.4f}",
    f"MRR: {mrr:.4f}",
    sep="\n",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embeddings Generated in 116.10 sec.
