In [1]:
import pandas as pd
from typing import List


In [2]:
class SHLDataLoader:
    """Load a query/assessment spreadsheet and expose its contents as lists.

    The sheet is read with ``pd.read_excel`` and must provide the columns
    ``Query`` and ``Assessment_url``. Each ``Assessment_url`` cell is treated
    as a comma-separated list of URLs.

    NOTE(review): every accessor yields one entry per *row*. If the sheet
    repeats a query on several rows, those rows are kept as separate
    entries — confirm that this is the intended granularity.
    """

    # Columns that must be present (and non-null) for a row to be usable.
    _REQUIRED_COLUMNS = ("Query", "Assessment_url")

    def __init__(self, dataset_path: str):
        """
        dataset_path: path handed to ``pd.read_excel`` by ``load_dataset``
        """
        self.dataset_path = dataset_path
        # Stays None until load_dataset() succeeds.
        self.df = None

    @staticmethod
    def _split_urls(cell) -> List[str]:
        """Split one cell on commas, returning stripped, non-empty pieces."""
        return [piece.strip() for piece in str(cell).split(",") if piece.strip()]

    def _require_loaded(self) -> None:
        """Raise RuntimeError unless load_dataset() has populated ``self.df``."""
        if self.df is None:
            raise RuntimeError("Call load_dataset() first.")

    def load_dataset(self):
        """Read the spreadsheet, validate its columns, and drop incomplete rows.

        Returns the cleaned DataFrame, which is also cached on ``self.df``.

        Raises:
            ValueError: if a required column is missing. ``self.df`` is left
                untouched in that case, so the accessors keep reporting
                "Call load_dataset() first." instead of failing on a frame
                that never passed validation.
        """
        frame = pd.read_excel(self.dataset_path)

        missing = [col for col in self._REQUIRED_COLUMNS if col not in frame.columns]
        if missing:
            # Sorted lists keep the message stable between runs (a set's
            # repr order is not guaranteed).
            raise ValueError(
                f"Dataset must contain columns: {sorted(self._REQUIRED_COLUMNS)}; "
                f"missing: {sorted(missing)}"
            )

        # Only commit to instance state once validation has passed.
        self.df = frame.dropna(subset=list(self._REQUIRED_COLUMNS))
        return self.df

    def get_queries(self) -> List[str]:
        """Return the ``Query`` column as a list of strings, one per row."""
        self._require_loaded()
        return self.df["Query"].astype(str).tolist()

    def get_ground_truth(self) -> List[List[str]]:
        """Return, for each row, the list of URLs parsed from ``Assessment_url``."""
        self._require_loaded()
        return [self._split_urls(cell) for cell in self.df["Assessment_url"]]

    def build_assessment_corpus(self) -> List[str]:
        """Return the sorted list of distinct URLs found across all rows."""
        self._require_loaded()

        corpus = set()
        for cell in self.df["Assessment_url"]:
            corpus.update(self._split_urls(cell))
        return sorted(corpus)


In [3]:
import pandas as pd
# Absolute path of the Excel workbook holding the queries and assessment URLs.
DATASET_PATH = "/kaggle/input/gen-ai-shl-dataset/Gen_AI shl Dataset.xlsx"

# Read and validate the workbook once; the loader caches the cleaned frame.
loader = SHLDataLoader(DATASET_PATH)
df = loader.load_dataset()

# Derive the three views used downstream: query texts, per-row URL labels,
# and the sorted set of distinct URLs.
queries, ground_truth, assessment_corpus = (
    loader.get_queries(),
    loader.get_ground_truth(),
    loader.build_assessment_corpus(),
)

# Sanity report: sizes first, then the first row's query and labels.
print("Total Queries:", len(queries))
print("Unique Assessments:", len(assessment_corpus))
print("\nSample Query:", queries[0])
print("Ground Truth:", ground_truth[0])


Total Queries: 65
Unique Assessments: 54

Sample Query: I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.
Ground Truth: ['https://www.shl.com/solutions/products/product-catalog/view/automata-fix-new/']


In [4]:
!pip install -q sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np
class EmbeddingEngine:
    """Wraps a SentenceTransformer model to embed lists of strings."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """
        model_name: model identifier passed to SentenceTransformer
        """
        self.model = SentenceTransformer(model_name)

    def _embed(self, texts):
        """
        Run the model over ``texts`` with the options shared by both
        public encoders (progress bar on, NumPy output requested).
        """
        encode_options = {"show_progress_bar": True, "convert_to_numpy": True}
        return self.model.encode(texts, **encode_options)

    def encode_queries(self, queries):
        """
        Embed a list of query strings
        """
        return self._embed(queries)

    def encode_assessments(self, assessment_corpus):
        """
        Embed the assessment corpus (a list of URL strings)
        """
        return self._embed(assessment_corpus)


2025-12-16 11:07:06.570244: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765883226.592409     198 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765883226.598988     198 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
!pip install -q --upgrade protobuf==3.20.3
!pip install -q --upgrade sentence-transformers
from sentence_transformers import SentenceTransformer
import numpy as np
class EmbeddingEngine:
    """Embeds lists of strings with a SentenceTransformer model.

    NOTE(review): a class with this same name is also declared in an earlier
    cell of this notebook; whichever definition runs last is the one in effect.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """
        model_name: model identifier passed to SentenceTransformer
        """
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def encode_queries(self, queries):
        """
        Embed a list of query strings, requesting NumPy output.
        """
        query_vectors = self.model.encode(
            queries, show_progress_bar=True, convert_to_numpy=True
        )
        return query_vectors

    def encode_assessments(self, assessment_corpus):
        """
        Embed the assessment corpus, requesting NumPy output.
        """
        corpus_vectors = self.model.encode(
            assessment_corpus, show_progress_bar=True, convert_to_numpy=True
        )
        return corpus_vectors


In [8]:
# Build the embedding model wrapper with its default model name.
embedder = EmbeddingEngine()

# Embed queries first, then the assessment corpus (same order as before).
query_embeddings, assessment_embeddings = (
    embedder.encode_queries(queries),
    embedder.encode_assessments(assessment_corpus),
)

# Report the array shapes as a quick check on both embedding passes.
print("Query embeddings:", query_embeddings.shape)
print("Assessment embeddings:", assessment_embeddings.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Query embeddings: (65, 384)
Assessment embeddings: (54, 384)


In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
class Retriever:
    """Ranks a fixed assessment corpus against a query embedding."""

    def __init__(self, assessment_corpus, assessment_embeddings):
        """
        assessment_corpus: list of assessment URLs
        assessment_embeddings: embeddings array; row i is looked up against
            assessment_corpus[i] when results are returned
        """
        self.assessment_corpus = assessment_corpus
        self.assessment_embeddings = assessment_embeddings

    def retrieve_top_k(self, query_embedding, k=5):
        """
        Return the k corpus URLs with the highest cosine similarity to
        ``query_embedding``, most similar first.
        """
        # cosine_similarity expects 2-D inputs, so present the query as a
        # single-row matrix and take the one resulting row of scores.
        query_row = query_embedding.reshape(1, -1)
        similarity_matrix = cosine_similarity(query_row, self.assessment_embeddings)
        scores = similarity_matrix[0]

        # argsort is ascending; reversing it gives best-first order.
        best_first = np.argsort(scores)[::-1]

        return [self.assessment_corpus[position] for position in best_first[:k]]


In [13]:
# Build the retriever over the full corpus and its embeddings.
retriever = Retriever(assessment_corpus, assessment_embeddings)

# Use the first query (text and embedding) as a smoke test.
sample_query, sample_query_embedding = queries[0], query_embeddings[0]

top_5_results = retriever.retrieve_top_k(sample_query_embedding, k=5)

# Show the query followed by its ranked recommendations, one URL per line.
print("Query:")
print(sample_query)

print("\nTop-5 Recommended SHL Assessments:")
for url in top_5_results:
    print(url)


Query:
I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.

Top-5 Recommended SHL Assessments:
https://www.shl.com/solutions/products/product-catalog/view/global-skills-assessment/
https://www.shl.com/solutions/products/product-catalog/view/core-java-advanced-level-new/
https://www.shl.com/solutions/products/product-catalog/view/core-java-entry-level-new/
https://www.shl.com/solutions/products/product-catalog/view/java-8-new/
https://www.shl.com/solutions/products/product-catalog/view/occupational-personality-questionnaire-opq32r/


In [14]:
def recall_at_k(true_urls, predicted_urls, k):
    """
    Fraction of the distinct ground-truth URLs found in the first k predictions.

    true_urls: list of ground truth URLs
    predicted_urls: ranked list of predicted URLs (only the first k are used)
    Returns 0.0 when there are no ground-truth URLs.
    """
    relevant = set(true_urls)
    retrieved = set(predicted_urls[:k])

    # No labels means recall is undefined; report it as 0.0.
    if not relevant:
        return 0.0

    hits = relevant.intersection(retrieved)
    return len(hits) / len(relevant)


def mean_recall_at_k(
    queries,
    ground_truth,
    query_embeddings,
    retriever,
    k=5
):
    """
    Average Recall@k over all entries of ``queries``.

    queries: list of queries; only its length is used here
    ground_truth: ground_truth[i] is the list of true URLs for entry i
    query_embeddings: query_embeddings[i] is the embedding for entry i
    retriever: object exposing retrieve_top_k(query_embedding, k=...)
    k: cutoff forwarded to both the retriever and recall_at_k

    Returns 0.0 when ``queries`` is empty, rather than dividing by zero.

    NOTE(review): every index is scored independently, so a query text that
    appears at several indices contributes several terms to the mean —
    confirm that this is the intended averaging.
    """
    total = len(queries)
    if total == 0:
        # Same convention recall_at_k uses for an empty ground-truth set.
        return 0.0

    recalls = []
    for i in range(total):
        query_embedding = query_embeddings[i]
        true_urls = ground_truth[i]

        predicted_urls = retriever.retrieve_top_k(query_embedding, k=k)
        recalls.append(recall_at_k(true_urls, predicted_urls, k))

    return sum(recalls) / total


In [15]:
# Report mean recall at each cutoff, using the embeddings computed earlier.
for k in (1, 3, 5, 10):
    mr = mean_recall_at_k(
        queries=queries,
        ground_truth=ground_truth,
        query_embeddings=query_embeddings,
        retriever=retriever,
        k=k,
    )
    print(f"Mean Recall@{k}: {mr:.4f}")


Mean Recall@1: 0.0615
Mean Recall@3: 0.2000
Mean Recall@5: 0.3077
Mean Recall@10: 0.4308


In [16]:
import pandas as pd

def generate_submission_csv(
    queries,
    query_embeddings,
    retriever,
    k=10,
    output_path="submission.csv"
):
    """
    Write one CSV row per query containing its top-k predicted URLs.

    queries: list of query strings
    query_embeddings: query_embeddings[i] is the embedding for queries[i]
    retriever: object exposing retrieve_top_k(query_embedding, k=...)
    k: number of URLs requested per query
    output_path: destination CSV file (written without the index column)

    Each row has a "Query" column and a "Predicted_Assessment_URLs" column
    holding the predicted URLs joined by commas. The original docstring
    describes this as the SHL Appendix-3 format; that spec is not visible
    here, so treat the claim as unverified.

    Returns the DataFrame that was written.
    """

    def _build_row(position, query_text):
        # Rank the corpus for this query and flatten the URLs into one cell.
        ranked_urls = retriever.retrieve_top_k(query_embeddings[position], k=k)
        return {
            "Query": query_text,
            "Predicted_Assessment_URLs": ",".join(ranked_urls),
        }

    submission_df = pd.DataFrame(
        [_build_row(position, query_text) for position, query_text in enumerate(queries)]
    )
    submission_df.to_csv(output_path, index=False)

    return submission_df


In [17]:
# Write the top-10 predictions for every query to shl_submission.csv.
submission_df = generate_submission_csv(
    queries,
    query_embeddings,
    retriever,
    k=10,
    output_path="shl_submission.csv",
)

# Preview the first rows as the cell's displayed output.
submission_df.head()


Unnamed: 0,Query,Predicted_Assessment_URLs
0,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
1,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
2,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
3,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
4,I am hiring for Java developers who can also c...,https://www.shl.com/solutions/products/product...
