In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
# Keep it light and conflict-free.
!pip install --quiet beautifulsoup4 requests lxml rank-bm25 lightgbm==4.1.0 scikit-learn==1.4.2

In [10]:
import os, re, json, pickle, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from collections import defaultdict, Counter

import requests
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from rank_bm25 import BM25Okapi
import lightgbm as lgb

In [11]:
# Adjust these paths if you uploaded to kaggle datasets; below are common ones:
TRAIN_PATH = "/kaggle/input/shldatasetrev/Training Data.csv"     # rename to your dataset filename
TEST_PATH  = "/kaggle/input/shldatasetrev/Testing_data.csv"

# Fallback if names differ:
if not os.path.exists(TRAIN_PATH):
    TRAIN_PATH = "/kaggle/input/training-data/Training Data.csv"
if not os.path.exists(TEST_PATH):
    TEST_PATH = "/kaggle/input/testing-data/Testing_data.csv"

train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# Clean “Unnamed” columns from Excel-style exports
train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
test_df  = test_df.loc[:,  ~test_df.columns.str.contains('^Unnamed')]

assert {"Query","Assessment_url"}.issubset(set(train_df.columns)), "Training CSV must have Query and Assessment_url columns"
assert {"Query"}.issubset(set(test_df.columns)), "Testing CSV must have Query column"

print(train_df.shape, train_df.head(3))
print(test_df.shape, test_df.head(3))

(65, 2)                                                Query  \
0  I am hiring for Java developers who can also c...   
1  I am hiring for Java developers who can also c...   
2  I am hiring for Java developers who can also c...   

                                      Assessment_url  
0  https://www.shl.com/solutions/products/product...  
1  https://www.shl.com/solutions/products/product...  
2  https://www.shl.com/solutions/products/product...  
(9, 1)                                                Query
0  Looking to hire mid-level professionals who ar...
1  Job Description\n\n Join a community that is s...
2  I am hiring for an analyst and wants applicati...


In [12]:
SESSION = requests.Session()
SESSION.headers.update({"User-Agent":"Mozilla/5.0"})

def fetch_html(url: str, timeout: int = 10) -> str:
    try:
        r = SESSION.get(url, timeout=timeout)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(f"[WARN] fetch fail {url}: {e}")
        return ""

def parse_assessment(url: str) -> Dict:
    html = fetch_html(url) if url.startswith("http") else ""
    soup = BeautifulSoup(html, "lxml") if html else None

    # Name from URL as fallback
    name = url.strip("/").split("/")[-1].replace("-", " ").title()
    description = ""
    duration = ""
    test_type = ""

    if soup:
        # heuristic text extraction
        blocks = []
        for tag in soup.find_all(["h1","h2","p","li","div"]):
            txt = tag.get_text(" ", strip=True)
            if txt and len(txt) > 40:
                blocks.append(txt)
        description = " ".join(blocks)[:1200]

        # duration heuristic
        m = re.search(r"(\d+)\s*(?:min|minutes|minute|hr|hour|hours)", description, flags=re.I)
        duration = m.group(0) if m else ""

    # test-type heuristic from URL/text
    lower_blob = (name + " " + description + " " + url).lower()
    if any(k in lower_blob for k in ["opq","personality","behavior","behaviour"]):
        test_type = "P"
    elif any(k in lower_blob for k in ["cognitive","numerical","verbal","inductive","deductive","reasoning","aptitude"]):
        test_type = "C"
    else:
        test_type = "K"

    # keyword bag
    skill_lex = [
        "java","python","sql","javascript","js","html","css","selenium","excel","tableau",
        "communication","leadership","collaboration","stakeholder","analytical","problem solving",
        "sales","marketing","customer","support","data","programming","coding","qa","quality"
    ]
    keywords = sorted({w for w in skill_lex if w in lower_blob})

    return {
        "url": url,
        "name": name,
        "description": description,
        "duration": duration,
        "test_type": test_type,
        "keywords": keywords
    }

# Build catalog strictly from training URLs (works when internet off)
catalog_urls = sorted(train_df["Assessment_url"].dropna().unique().tolist())
catalog = []
for u in catalog_urls:
    catalog.append(parse_assessment(u))

catalog_df = pd.DataFrame(catalog)
print("Catalog size:", len(catalog_df))
catalog_df.head(3)

Catalog size: 54


Unnamed: 0,url,name,description,duration,test_type,keywords
0,https://www.shl.com/products/product-catalog/v...,Business Communication Adaptive,Outdated browser detected We recommend upgradi...,,K,"[communication, support]"
1,https://www.shl.com/products/product-catalog/v...,English Comprehension New,Outdated browser detected We recommend upgradi...,,K,[support]
2,https://www.shl.com/products/product-catalog/v...,Enterprise Leadership Report 2 0,Outdated browser detected We recommend upgradi...,,K,"[leadership, support]"


In [13]:
# Unified text for each assessment
def assessment_text(row) -> str:
    parts = [
        (row.get("name") or ""),
        (row.get("description") or ""),
        " ".join(row.get("keywords") or []),
        row.get("test_type") or ""
    ]
    return " ".join(parts)

catalog_df["text"] = catalog_df.apply(assessment_text, axis=1)

# Tokenization for BM25
def tokenize(s: str) -> List[str]:
    s = s.lower()
    s = re.sub(r"[^a-z0-9+.# ]+", " ", s)
    return s.split()

# Build BM25 corpus
bm25_corpus = [tokenize(t) for t in catalog_df["text"].tolist()]
bm25 = BM25Okapi(bm25_corpus)

# TF-IDF
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=1, max_df=0.9)
tfidf_mat = tfidf.fit_transform(catalog_df["text"])

In [14]:
# Lexicons for skills and soft-skills to detect "diverse" needs
TECH = set(["java","python","sql","javascript","js","html","css","selenium","excel","tableau","react","node","mongodb","mysql","postgresql"])
SOFT = set(["communication","leadership","teamwork","collaboration","stakeholder","problem solving","analytical","creativity","adaptability","management","interpersonal"])

def extract_requirements(query: str) -> Dict:
    q = query.lower()
    tech = sorted({w for w in TECH if w in q})
    soft = sorted({w for w in SOFT if w in q})
    needs_diversity = bool(tech and soft)
    # duration
    dur = None
    m = re.search(r"(\d+)\s*(min|minute|hour|hr)", q)
    if m:
        dur = int(m.group(1)) * (60 if "hour" in m.group(2) or "hr" in m.group(2) else 1)
    return {"tech":tech,"soft":soft,"needs_diversity":needs_diversity,"duration":dur}

def jaccard(a: List[str], b: List[str]) -> float:
    A, B = set(a), set(b)
    if not A and not B: return 0.0
    return len(A & B) / len(A | B)

def features_for(query: str, topN: int = 80) -> Tuple[np.ndarray, List[int]]:
    """Generate candidate set (BM25) then compute dense features for reranking."""
    q_tokens = tokenize(query)
    bm25_scores = bm25.get_scores(q_tokens)
    # Top candidates by BM25
    cand_idx = np.argsort(-bm25_scores)[:topN]

    # TF-IDF cosine vs catalog rows
    q_vec = tfidf.transform([query])
    cosines = cosine_similarity(q_vec, tfidf_mat[cand_idx]).ravel()

    req = extract_requirements(query)
    feats = []
    for rank_pos, i in enumerate(cand_idx):
        row = catalog_df.iloc[i]
        text_tok = bm25_corpus[i]

        f = []
        # 1) BM25 score
        f.append(bm25_scores[i])
        # 2) TF-IDF cosine
        f.append(cosines[rank_pos])
        # 3) token overlap
        f.append(len(set(q_tokens) & set(text_tok)))
        f.append(jaccard(q_tokens, text_tok))
        # 4) keyword overlaps
        f.append(sum(1 for w in req["tech"] if w in (row["text"].lower())))
        f.append(sum(1 for w in req["soft"] if w in (row["text"].lower())))
        # 5) test type match preferences (if both -> neutral; if only tech -> bias K; only soft -> bias P)
        tt = row.get("test_type","")
        prefer_k = bool(req["tech"] and not req["soft"])
        prefer_p = bool(req["soft"] and not req["tech"])
        f.append(1.0 if (prefer_k and tt=="K") else 0.0)
        f.append(1.0 if (prefer_p and tt=="P") else 0.0)
        # 6) url signal (token hits inside url)
        url = (row["url"] or "").lower()
        f.append(sum(1 for w in req["tech"] if w in url))
        f.append(sum(1 for w in req["soft"] if w in url))

        feats.append(f)

    return np.array(feats, dtype=np.float32), cand_idx.tolist()

In [15]:
# For each unique training query, mark positives from labels and sample negatives from catalog.
train_queries = train_df["Query"].unique().tolist()
q_records = []

for q in train_queries:
    pos_urls = set(train_df.loc[train_df["Query"]==q, "Assessment_url"].dropna().tolist())

    X, cand_idx = features_for(q, topN=min(120, len(catalog_df)))
    urls = catalog_df.iloc[cand_idx]["url"].tolist()

    y = np.array([1 if u in pos_urls else 0 for u in urls], dtype=np.int32)

    # Keep all positives + a set of hardest negatives (top BM25 but not in pos)
    keep_mask = (y==1)
    neg_candidates = np.where((y==0))[0].tolist()
    # take top pile of negatives (already sorted by BM25 order due to cand list)
    neg_keep = neg_candidates[:max(10, min(50, len(neg_candidates)))]
    keep_mask[neg_keep] = True

    X_keep = X[keep_mask]
    y_keep = y[keep_mask]
    urls_keep = np.array(urls)[keep_mask].tolist()

    q_records.append({"query": q, "X": X_keep, "y": y_keep, "urls": urls_keep})

# Concatenate into LGBMRanker input
X_list, y_list, group = [], [], []
for rec in q_records:
    X_list.append(rec["X"])
    y_list.append(rec["y"])
    group.append(len(rec["y"]))
X_train = np.vstack(X_list)
y_train = np.concatenate(y_list)

print("Train matrix:", X_train.shape, " Positives:", int(y_train.sum()), " Groups:", len(group))

Train matrix: (540, 10)  Positives: 65  Groups: 10


In [16]:
def recall_at_k_for_query(q: str, booster, k=10) -> float:
    feats, cidx = features_for(q, topN=min(120, len(catalog_df)))
    scores = booster.predict(feats)
    order = np.argsort(-scores)
    top_idx = [cidx[i] for i in order[:k]]
    rec_urls = set(catalog_df.iloc[top_idx]["url"].tolist())

    gold = set(train_df.loc[train_df["Query"]==q, "Assessment_url"].tolist())
    if not gold: return 0.0
    return len(rec_urls & gold) / len(gold)

def mean_recall_at_k(booster, k=10) -> float:
    vals = [recall_at_k_for_query(q, booster, k=k) for q in train_queries]
    return float(np.mean(vals)) if vals else 0.0

params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    num_leaves=63,
    learning_rate=0.06,
    min_data_in_leaf=10,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    max_depth=-1,
    verbose=-1
)

dtrain = lgb.Dataset(X_train, label=y_train, group=group, free_raw_data=False)

RECALL_K = 10
EVAL_EVERY = 10
NUM_BOOST_ROUNDS = 200

recall_history = []

def recall_callback(env):
    # Called at the end of each iteration
    iter_ = env.iteration + 1
    if iter_ % EVAL_EVERY == 0 or iter_ == 1:
        booster = env.model
        r = mean_recall_at_k(booster, k=RECALL_K)
        recall_history.append((iter_, r))
        print(f"[Round {iter_:3d}] Mean Recall@{RECALL_K}: {r:.4f}")

ranker = lgb.train(
    params,
    dtrain,
    num_boost_round=NUM_BOOST_ROUNDS,
    valid_sets=[dtrain],
    valid_names=["train"],
    callbacks=[lgb.log_evaluation(period=EVAL_EVERY), recall_callback]
)

print("\nFinal Mean Recall@10:", f"{mean_recall_at_k(ranker, k=10):.4f}")

[Round   1] Mean Recall@10: 0.5811
[Round  10] Mean Recall@10: 0.6856
[10]	train's ndcg@1: 0.9	train's ndcg@2: 0.861315	train's ndcg@3: 0.8	train's ndcg@4: 0.749562	train's ndcg@5: 0.716818
[Round  20] Mean Recall@10: 0.7356
[20]	train's ndcg@1: 1	train's ndcg@2: 0.922629	train's ndcg@3: 0.846928	train's ndcg@4: 0.805412	train's ndcg@5: 0.75222
[Round  30] Mean Recall@10: 0.7722
[30]	train's ndcg@1: 1	train's ndcg@2: 0.961315	train's ndcg@3: 0.876536	train's ndcg@4: 0.81323	train's ndcg@5: 0.759012
[Round  40] Mean Recall@10: 0.7922
[40]	train's ndcg@1: 1	train's ndcg@2: 0.961315	train's ndcg@3: 0.876536	train's ndcg@4: 0.81323	train's ndcg@5: 0.759012
[Round  50] Mean Recall@10: 0.7989
[50]	train's ndcg@1: 1	train's ndcg@2: 1	train's ndcg@3: 0.88268	train's ndcg@4: 0.835154	train's ndcg@5: 0.77806
[Round  60] Mean Recall@10: 0.7733
[60]	train's ndcg@1: 1	train's ndcg@2: 1	train's ndcg@3: 0.88268	train's ndcg@4: 0.851967	train's ndcg@5: 0.779546
[Round  70] Mean Recall@10: 0.7900
[70]	

In [24]:
def recommend(query: str, k: int = 10, balance=True) -> List[str]:
    feats, cidx = features_for(query, topN=min(200, len(catalog_df)))
    scores = ranker.predict(feats)
    order = np.argsort(-scores)

    if not balance:
        top_idx = [cidx[i] for i in order[:k]]
        return catalog_df.iloc[top_idx]["url"].tolist()

    req = extract_requirements(query)
    # split by test_type
    ranked = [{"idx": cidx[i], "score": float(scores[i]),
               "tt": catalog_df.iloc[cidx[i]]["test_type"]} for i in order]

    if req["needs_diversity"]:
        # 50% K, 30% P, 20% C
        tk = max(1, int(0.5 * k))
        tp = max(1, int(0.3 * k))
        tc = max(1, k - tk - tp)
        buckets = {"K":[], "P":[], "C":[]}
        for r in ranked:
            buckets.get(r["tt"], []).append(r)

        take = []
        take += buckets["K"][:tk]
        take += buckets["P"][:tp]
        take += buckets["C"][:tc]

        if len(take) < k:
            # fill remainder by global score
            used = set(x["idx"] for x in take)
            for r in ranked:
                if r["idx"] not in used:
                    take.append(r)
                if len(take) == k: break

        take = sorted(take, key=lambda x: -x["score"])[:k]
        top_idx = [x["idx"] for x in take]
    else:
        top_idx = [cidx[i] for i in order[:k]]

    return catalog_df.iloc[top_idx]["url"].tolist()

In [25]:
def eval_on_training(k=10):
    recalls = []
    for q in train_queries:
        gold = set(train_df.loc[train_df["Query"]==q, "Assessment_url"].tolist())
        pred = set(recommend(q, k=k, balance=True))
        rec = 0.0 if not gold else len(gold & pred) / len(gold)
        recalls.append(rec)
        print(f"Recall@{k}: {rec:.3f} | {q[:80]}...")
    print(f"\nMean Recall@{k}: {np.mean(recalls):.4f}")

eval_on_training(k=10)

Recall@10: 0.800 | I am hiring for Java developers who can also collaborate effectively with my bus...
Recall@10: 0.889 | I want to hire new graduates for a sales role in my company, the budget is for a...
Recall@10: 0.667 | I am looking for a COO for my company in China and I want to see if they are cul...
Recall@10: 0.800 | KEY RESPONSIBITILES:

Manage the sound-scape of the station through appropriate ...
Recall@10: 0.600 | Content Writer required, expert in English and SEO....
Recall@10: 0.556 | Find me 1 hour long assesment for the below job at SHL
Job Description

 Join a ...
Recall@10: 0.667 | ICICI Bank Assistant Admin, Experience required 0-2 years, test should be 30-40 ...
Recall@10: 1.000 | We're looking for a Marketing Manager who can drive Recro’s brand positioning, c...
Recall@10: 1.000 | Based on the JD below recommend me assessment for the Consultant position in my ...
Recall@10: 0.800 | I want to hire a Senior Data Analyst with 5 years of experience and expertise in...

In [26]:
pred_rows = []
for _, row in test_df.iterrows():
    q = row["Query"]
    urls = recommend(q, k=10, balance=True)
    for u in urls:
        pred_rows.append({"Query": q, "Assessment_url": u})

pred_df = pd.DataFrame(pred_rows)
pred_df.to_csv("predictions.csv", index=False)
print("Saved predictions.csv with", len(pred_df), "rows")

pred_df.head(12)

Saved predictions.csv with 90 rows


Unnamed: 0,Query,Assessment_url
0,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
1,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
2,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
3,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
4,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
5,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
6,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
7,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
8,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...
9,Looking to hire mid-level professionals who ar...,https://www.shl.com/solutions/products/product...


In [27]:
artifacts = {
    "catalog_df": catalog_df.to_dict(orient="list"),
    "tfidf_vocab": tfidf.vocabulary_,
    "tfidf_idf": tfidf.idf_.tolist(),
    "params": params,
}

# LightGBM model
ranker.save_model("lgbm_ltr.txt")

# BM25 components
with open("bm25_tokens.pkl", "wb") as f:
    pickle.dump(bm25_corpus, f)

with open("artifacts.json", "w") as f:
    json.dump(artifacts, f)

print("Saved: lgbm_ltr.txt, bm25_tokens.pkl, artifacts.json, predictions.csv")

Saved: lgbm_ltr.txt, bm25_tokens.pkl, artifacts.json, predictions.csv
