In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
import pandas as pd
import numpy as np
import torch
import time
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the job postings and the resumes from the attached Kaggle input datasets.
postings = pd.read_csv("/kaggle/input/updated-postings-csv/updated_postings.csv")
df_resume = pd.read_csv("/kaggle/input/updated-resumes-csv/updated_resumes.csv")

# Both frames must carry the 'skills_desc' text column that the steps below read.
assert all(
    'skills_desc' in frame.columns for frame in (postings, df_resume)
), "Missing 'skills_desc' column in datasets"

# Use the GPU when one is visible, then load bert-base-uncased onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model = model.to(device)

# Function to generate BERT embeddings in batches
def get_bert_embeddings(texts, batch_size=16):
    """Encode texts with the module-level BERT model using mask-aware mean pooling.

    Each text is tokenized (truncated to 512 tokens), run through ``model`` without
    gradients, and reduced to one vector by averaging the last hidden states of its
    real tokens only. Padding positions are excluded through the attention mask, so
    a text's embedding no longer depends on how long the other texts in its batch are.

    Args:
        texts: A pandas Series / NumPy array of strings, or any plain sequence of
            strings.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A NumPy array with one row per input text; an empty array for empty input.

    Relies on the globals ``tokenizer``, ``model`` and ``device``.
    """
    # Series/ndarray expose .tolist(); fall back to list() for other sequences.
    all_texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)

    pooled_batches = []
    for start in range(0, len(all_texts), batch_size):
        batch = all_texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        # clamp guards against a division by zero should a row have no real tokens.
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append((token_sum / token_count).cpu().numpy())

    if not pooled_batches:
        return np.array([])
    return np.concatenate(pooled_batches, axis=0)

# Embed the skills text of both frames with BERT and report the wall-clock cost.
start_time = time.time()
for frame in (postings, df_resume):
    frame["bert_embeddings"] = list(get_bert_embeddings(frame["skills_desc"]))
elapsed = time.time() - start_time
print(f"BERT Embeddings Generated in {elapsed:.2f} sec.")

# LSM (Latent Semantic Matching) using TF-IDF + LSA (SVD)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=1, max_df=0.95)

# Fit the TF-IDF vocabulary on the job postings only.
postings_tfidf = vectorizer.fit_transform(postings["skills_desc"])

# Use at most 100 latent dimensions, but never more than the vocabulary size.
# The SVD is created once (the earlier throw-away instance was never used) and
# seeded so that repeated runs produce the same projection.
lsa_n_components = min(postings_tfidf.shape[1], 100)
lsa = TruncatedSVD(n_components=lsa_n_components, random_state=42)
postings_lsm = lsa.fit_transform(postings_tfidf)

# Project the resumes with the vocabulary and SVD fitted on the postings.
resumes_tfidf = vectorizer.transform(df_resume["skills_desc"])
resumes_lsm = lsa.transform(resumes_tfidf)

# Placeholder for TallREC Model (Assuming pre-trained embeddings)
def get_tallrec_embeddings(texts, dim=100, seed=None):
    """Return placeholder "TallREC" embeddings: uniform random vectors in [0, 1).

    This is a stand-in until a real TallREC model is plugged in; the values carry
    no information about ``texts`` beyond its length.

    Args:
        texts: Any sized collection; only ``len(texts)`` is used.
        dim: Width of each embedding vector (defaults to the original 100).
        seed: Optional seed. When given, a dedicated generator is used so the
            output is reproducible; when ``None`` the global NumPy random state
            is used exactly as before.

    Returns:
        A float array of shape ``(len(texts), dim)``.
    """
    n_rows = len(texts)
    if seed is None:
        return np.random.rand(n_rows, dim)  # Replace with actual TallREC model
    return np.random.default_rng(seed).random((n_rows, dim))

# Attach one placeholder TallREC vector per row: postings first, then resumes.
for frame in (postings, df_resume):
    frame["tallrec_embeddings"] = list(get_tallrec_embeddings(frame["skills_desc"]))

# Fusion: Concatenation of all three embeddings
def fuse_embeddings(bert_emb, lsm_emb, tallrec_emb):
    """Join the three embeddings into one array, in the order BERT, LSM, TallREC.

    One-dimensional inputs are joined end to end; higher-dimensional inputs are
    joined along their second axis.
    """
    parts = [np.atleast_1d(part) for part in (bert_emb, lsm_emb, tallrec_emb)]
    join_axis = 0 if parts[0].ndim == 1 else 1
    return np.concatenate(parts, axis=join_axis)

# Build one fused vector per row by pairing each BERT vector with the LSM row
# and the TallREC vector found at the same position.
postings["fused_embeddings"] = [
    fuse_embeddings(bert_vec, lsm_vec, tallrec_vec)
    for bert_vec, lsm_vec, tallrec_vec in zip(
        postings["bert_embeddings"], postings_lsm, postings["tallrec_embeddings"]
    )
]
df_resume["fused_embeddings"] = [
    fuse_embeddings(bert_vec, lsm_vec, tallrec_vec)
    for bert_vec, lsm_vec, tallrec_vec in zip(
        df_resume["bert_embeddings"], resumes_lsm, df_resume["tallrec_embeddings"]
    )
]

# Compute Similarities and Rankings
def compute_similarity(job_embeddings, resume_embeddings):
    """Return the cosine similarity of every resume (rows) against every job (columns)."""
    resume_vs_job = cosine_similarity(X=resume_embeddings, Y=job_embeddings)
    return resume_vs_job

# Stack the per-row fused vectors into 2-D matrices and score resumes against postings.
similarity_matrix = compute_similarity(
    job_embeddings=np.vstack(postings["fused_embeddings"]),
    resume_embeddings=np.vstack(df_resume["fused_embeddings"]),
)

# Ranking Function
def get_rankings(similarity_matrix):
    """Return, for each row, the column indices ordered from highest to lowest score."""
    # Sorting the negated scores in ascending order yields a descending ranking.
    descending_keys = -similarity_matrix
    order = np.argsort(descending_keys, axis=1)
    return order

# Row i lists the job indices for resume i, best match first.
rankings = get_rankings(similarity_matrix=similarity_matrix)

# Evaluation Metrics
def evaluate(rankings, k_values=(1, 5, 10)):
    """Compute Recall@k, NDCG@k and MRR for a ranking matrix.

    Ground truth is assumed to be the diagonal: the single relevant job for
    resume ``i`` is job ``i``. With one relevant item the ideal DCG is 1, so
    NDCG reduces to ``1 / log2(rank + 1)`` when the item is inside the cut-off.

    NOTE(review): the diagonal assumption only makes sense if resumes and
    postings are row-aligned pairs. The metrics recorded in this notebook are
    ~0, which suggests they may not be -- confirm before trusting the numbers.

    Args:
        rankings: 2-D array; row ``i`` holds job indices for resume ``i``,
            best match first.
        k_values: Iterable of cut-offs ``k`` to report.

    Returns:
        ``(recall_at_k, ndcg_at_k, mrr)`` where the first two are dicts keyed
        by ``k`` and ``mrr`` is a float. All values are averaged over the
        number of resumes; everything is 0.0 when there are no resumes.
    """
    rankings = np.asarray(rankings)
    k_values = list(k_values)
    num_resumes = rankings.shape[0]

    recall_at_k = {k: 0.0 for k in k_values}
    ndcg_at_k = {k: 0.0 for k in k_values}
    mrr = 0.0

    # Nothing to rank: report zeros instead of dividing by zero below.
    if num_resumes == 0:
        return recall_at_k, ndcg_at_k, mrr

    for i in range(num_resumes):
        hits = np.flatnonzero(rankings[i] == i)
        if hits.size == 0:
            # The relevant job does not appear in this row; it contributes 0.
            continue

        rank = int(hits[0]) + 1  # Convert to 1-based index
        mrr += 1.0 / rank
        gain = float(1.0 / np.log2(rank + 1))

        for k in k_values:
            if rank <= k:
                recall_at_k[k] += 1
                ndcg_at_k[k] += gain

    # Normalize by number of resumes
    mrr /= num_resumes
    recall_at_k = {k: v / num_resumes for k, v in recall_at_k.items()}
    ndcg_at_k = {k: v / num_resumes for k, v in ndcg_at_k.items()}

    return recall_at_k, ndcg_at_k, mrr

# Score the rankings at the cut-offs reported below.
recall, ndcg, mrr = evaluate(rankings, k_values=[1, 5, 10])

# Report every metric with four decimals.
print(", ".join(f"Recall@{k}: {recall[k]:.4f}" for k in (1, 5, 10)))
print(", ".join(f"NDCG@{k}: {ndcg[k]:.4f}" for k in (5, 10)))
print(f"MRR: {mrr:.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embeddings Generated in 2687.97 sec.
Recall@1: 0.0000, Recall@5: 0.0000, Recall@10: 0.0000
NDCG@5: 0.0000, NDCG@10: 0.0000
MRR: 0.0001


In [1]:
import pandas as pd
import numpy as np
import torch
import time
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the job postings and the resumes from the attached Kaggle input datasets.
postings = pd.read_csv("/kaggle/input/updated-postings-csv/updated_postings.csv")
df_resume = pd.read_csv("/kaggle/input/updated-resumes-csv/updated_resumes.csv")

# Both frames must carry the 'skills_desc' text column that the steps below read.
assert all(
    'skills_desc' in frame.columns for frame in (postings, df_resume)
), "Missing 'skills_desc' column in datasets"

# Use the GPU when one is visible, and load the all-MiniLM-L6-v2 sentence encoder.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to generate SBERT embeddings
def get_sbert_embeddings(texts):
    """Encode ``texts`` with the global ``sbert_model`` and return NumPy embeddings.

    ``texts`` must expose ``.tolist()`` (e.g. a pandas Series); encoding runs on
    the global ``device``.
    """
    sentences = texts.tolist()
    encoded = sbert_model.encode(sentences, convert_to_numpy=True, device=device)
    return encoded

# Embed the skills text of both frames with SBERT and report the wall-clock cost.
start_time = time.time()
for frame in (postings, df_resume):
    frame["sbert_embeddings"] = list(get_sbert_embeddings(frame["skills_desc"]))
elapsed = time.time() - start_time
print(f"SBERT Embeddings Generated in {elapsed:.2f} sec.")

# TF-IDF + Latent Semantic Analysis (LSA)
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=2, max_df=0.95)

# Fit the TF-IDF vocabulary on the job postings only.
postings_tfidf = vectorizer.fit_transform(postings["skills_desc"])

# Use at most 100 latent dimensions, but never more than the vocabulary size.
# The SVD is created once (the earlier throw-away instance was never used) and
# seeded so that repeated runs produce the same projection.
lsa_n_components = min(postings_tfidf.shape[1], 100)
lsa = TruncatedSVD(n_components=lsa_n_components, random_state=42)
postings_lsa = lsa.fit_transform(postings_tfidf)

# Project the resumes with the vocabulary and SVD fitted on the postings.
resumes_tfidf = vectorizer.transform(df_resume["skills_desc"])
resumes_lsa = lsa.transform(resumes_tfidf)

# Placeholder for TallREC Model (Replace with actual embeddings if available)
def get_tallrec_embeddings(texts, dim=100, seed=None):
    """Return placeholder "TallREC" embeddings: uniform random vectors in [0, 1).

    The output is pure noise sized by ``len(texts)``; it stands in for real
    TallREC embeddings until those are available.

    Args:
        texts: Any sized collection; only its length is used.
        dim: Width of each embedding vector (defaults to the original 100).
        seed: Optional seed for a reproducible result. ``None`` keeps the
            original behaviour of drawing from the global NumPy random state.

    Returns:
        A float array of shape ``(len(texts), dim)``.
    """
    n_rows = len(texts)
    if seed is not None:
        return np.random.default_rng(seed).random((n_rows, dim))
    return np.random.rand(n_rows, dim)  # Replace with real TallREC embeddings

# Attach one placeholder TallREC vector per row: postings first, then resumes.
for frame in (postings, df_resume):
    frame["tallrec_embeddings"] = list(get_tallrec_embeddings(frame["skills_desc"]))

# Fusion of SBERT, LSA, and TallREC
def fuse_embeddings(sbert_emb, lsa_emb, tallrec_emb):
    """Join the three embeddings into one array, in the order SBERT, LSA, TallREC.

    One-dimensional inputs are joined end to end; higher-dimensional inputs are
    joined along their second axis.
    """
    pieces = [np.atleast_1d(piece) for piece in (sbert_emb, lsa_emb, tallrec_emb)]
    if pieces[0].ndim == 1:
        return np.concatenate(pieces, axis=0)
    return np.concatenate(pieces, axis=1)

# Build one fused vector per row by pairing each SBERT vector with the LSA row
# and the TallREC vector found at the same position.
postings["fused_embeddings"] = [
    fuse_embeddings(sbert_vec, lsa_vec, tallrec_vec)
    for sbert_vec, lsa_vec, tallrec_vec in zip(
        postings["sbert_embeddings"], postings_lsa, postings["tallrec_embeddings"]
    )
]
df_resume["fused_embeddings"] = [
    fuse_embeddings(sbert_vec, lsa_vec, tallrec_vec)
    for sbert_vec, lsa_vec, tallrec_vec in zip(
        df_resume["sbert_embeddings"], resumes_lsa, df_resume["tallrec_embeddings"]
    )
]

# Compute Similarities and Rankings
def compute_similarity(job_embeddings, resume_embeddings):
    """Score each resume (one per row) against each job (one per column) by cosine similarity."""
    scores = cosine_similarity(X=resume_embeddings, Y=job_embeddings)
    return scores

# Stack the per-row fused vectors into 2-D matrices and score resumes against postings.
similarity_matrix = compute_similarity(
    job_embeddings=np.vstack(postings["fused_embeddings"]),
    resume_embeddings=np.vstack(df_resume["fused_embeddings"]),
)

# Ranking Function
def get_rankings(similarity_matrix):
    """Return, per row, the column indices sorted by descending similarity."""
    # An ascending sort of the negated scores is a descending sort of the scores.
    flipped = -similarity_matrix
    return np.argsort(flipped, axis=1)

# Row i lists the job indices for resume i, best match first.
rankings = get_rankings(similarity_matrix=similarity_matrix)

# Evaluation Metrics
def evaluate(rankings, k_values=(1, 5, 10)):
    """Compute Recall@k, NDCG@k and MRR for a ranking matrix.

    The relevant job for resume ``i`` is taken to be job ``i`` (diagonal ground
    truth). With a single relevant item the ideal DCG is 1, so NDCG@k equals
    ``1 / log2(rank + 1)`` when the item is ranked within ``k`` and 0 otherwise.

    NOTE(review): this is only meaningful if resumes and postings are
    row-aligned pairs; the ~0 metrics recorded in this notebook suggest they
    may not be -- confirm before relying on the numbers.

    Args:
        rankings: 2-D array; row ``i`` holds job indices for resume ``i``,
            best match first.
        k_values: Iterable of cut-offs ``k`` to report.

    Returns:
        ``(recall_at_k, ndcg_at_k, mrr)``: two dicts keyed by ``k`` and a float,
        each averaged over the number of resumes. All zeros for empty input.
    """
    rankings = np.asarray(rankings)
    cutoffs = list(k_values)
    num_resumes = rankings.shape[0]

    # No resumes: return zeros rather than dividing by zero.
    if num_resumes == 0:
        return {k: 0.0 for k in cutoffs}, {k: 0.0 for k in cutoffs}, 0.0

    # 1-based rank of the relevant job for every resume whose row contains it.
    ranks = []
    for i, row in enumerate(rankings):
        hits = np.flatnonzero(row == i)
        if hits.size:
            ranks.append(int(hits[0]) + 1)

    mrr = sum(1.0 / rank for rank in ranks) / num_resumes
    recall_at_k = {
        k: sum(1 for rank in ranks if rank <= k) / num_resumes for k in cutoffs
    }
    ndcg_at_k = {
        k: sum(float(1.0 / np.log2(rank + 1)) for rank in ranks if rank <= k) / num_resumes
        for k in cutoffs
    }

    return recall_at_k, ndcg_at_k, mrr

# Score the rankings at the cut-offs reported below.
recall, ndcg, mrr = evaluate(rankings, k_values=[1, 5, 10])

# Report every metric with four decimals.
recall_line = ", ".join(f"Recall@{k}: {recall[k]:.4f}" for k in (1, 5, 10))
ndcg_line = ", ".join(f"NDCG@{k}: {ndcg[k]:.4f}" for k in (5, 10))
print(recall_line)
print(ndcg_line)
print(f"MRR: {mrr:.4f}")


Batches:   0%|          | 0/3871 [00:00<?, ?it/s]

Batches:   0%|          | 0/299 [00:00<?, ?it/s]

SBERT Embeddings Generated in 383.14 sec.
Recall@1: 0.0000, Recall@5: 0.0000, Recall@10: 0.0000
NDCG@5: 0.0000, NDCG@10: 0.0000
MRR: 0.0001


In [2]:
# Show the column names of each frame: job postings first, then resumes.
for frame in (postings, df_resume):
    print(frame.columns)


Index(['job_id', 'company_name', 'title', 'description', 'max_salary',
       'pay_period', 'location', 'company_id', 'views', 'med_salary',
       'min_salary', 'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry', 'closed_time',
       'formatted_experience_level', 'skills_desc', 'listed_time',
       'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'normalized_salary', 'zip_code', 'fips',
       'sbert_embeddings', 'tallrec_embeddings', 'fused_embeddings'],
      dtype='object')
Index(['address', 'career_objective', 'skills', 'educational_institution_name',
       'degree_names', 'passing_years', 'educational_results', 'result_types',
       'major_field_of_studies', 'professional_company_names', 'company_urls',
       'start_dates', 'end_dates', 'related_skils_in_job', 'positions',
       'locations', 'responsibilities', 'extra_curricular_activi

In [14]:
# Turn the comma-separated skills text into a clean list of lower-cased skill names.
def _split_skills(value):
    """Split one 'skills_desc' cell into a list of normalised skill strings.

    - A cell that is already a list is returned unchanged, so re-running this
      cell does not corrupt the column.
    - A missing value becomes an empty list rather than the literal ["nan"].
    - Each skill is lower-cased and stripped of surrounding whitespace, and
      empty fragments (e.g. from a trailing comma) are dropped.
    """
    if isinstance(value, list):
        return value
    if pd.isna(value):
        return []
    fragments = str(value).lower().split(",")
    return [skill.strip() for skill in fragments if skill.strip()]


postings["skills_desc"] = postings["skills_desc"].apply(_split_skills)
df_resume["skills_desc"] = df_resume["skills_desc"].apply(_split_skills)
