In [1]:
# === 📦 Step 1: Imports ===
import pandas as pd
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util

In [3]:
# === 🧹 Step 2: Load CSV and Preprocess ===
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_shl_assessments.csv")

In [5]:
# Ensure duration is integer
# An int64 column is reused as-is. Anything else is stringified, the first run
# of digits is pulled out, and rows without digits fall back to 0.
if df['duration'].dtype == 'int64':
    df['duration_minutes'] = df['duration']
else:
    first_digits = df['duration'].astype(str).str.extract(r'(\d+)')
    df['duration_minutes'] = first_digits.astype(float).fillna(0).astype(int)

In [7]:
# Fill NaNs in descriptions
# Missing descriptions become empty strings so the later `.lower().split()`
# tokenisation and the embedding encoder only ever see `str` values.
raw_descriptions = df['description']
df['description'] = raw_descriptions.fillna(value="")

In [8]:
# === 🔍 Step 3: Setup BM25 ===
def tokenizer(x):
    """Lower-case `x` and split it on whitespace."""
    return x.lower().split()

# One token list per description; the BM25 index is built over these lists.
df['tokenized_desc'] = df['description'].map(tokenizer)
bm25 = BM25Okapi(corpus=df['tokenized_desc'].tolist())

In [9]:
# === 🤖 Step 4: Load Embedding Model ===
# Sentence-embedding model identified by name; used for the semantic half of the score.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [13]:
# === 💡 Step 5: Define Recommendation Function ===
def recommend_assessments(query, df, bm25_model, tokenizer, embedding_model, top_k=5, desc_embeddings=None):
    """Rank assessments for a free-text query with a hybrid BM25 + embedding score.

    Parameters
    ----------
    query : str
        Free-text request. A duration cap mentioned in it ("within 45 mins",
        "under 30", "1 hour") is used to filter the candidates.
    df : pandas.DataFrame
        Must contain the columns 'name', 'description' and 'duration_minutes'.
        The frame itself is not modified; scoring happens on a copy.
    bm25_model : object
        Exposes `get_scores(tokens)` returning one score per row of `df`.
    tokenizer : callable
        Turns a string into the token list passed to `bm25_model.get_scores`.
    embedding_model : object
        Exposes `encode(text_or_list, convert_to_tensor=True)`.
    top_k : int, default 5
        Maximum number of rows to return (values below 1 are treated as 1).
    desc_embeddings : optional
        Pre-computed embeddings of `df['description']`, in the same row order
        as `df`. When omitted they are computed on every call, as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'name', 'duration_minutes', 'final_score', sorted by
        'final_score' descending, with a fresh 0..n-1 index.
    """
    def extract_duration_limit(text):
        """Return the duration cap in minutes found in `text`, or None."""
        lowered = text.lower()
        # Prefer a number that carries an explicit time unit. The look-behind
        # stops us from picking the tail of a longer number ("2024", "1.5").
        with_unit = re.search(
            r"(?<![\d.])(\d{1,3}(?:\.\d+)?)\s*-?\s*(min(?:ute)?s?|h(?:ou)?rs?)\b",
            lowered,
        )
        if with_unit:
            minutes = float(with_unit.group(1))
            if with_unit.group(2).startswith("h"):
                minutes *= 60
            limit = int(round(minutes))
        else:
            # Otherwise the number must follow a limiting phrase; a bare number
            # such as "Python 3" or "20 analysts" is not a duration.
            with_qualifier = re.search(
                r"\b(?:under|within|less than|max(?:imum)? of)\s+(\d{1,3})(?!\d)",
                lowered,
            )
            limit = int(with_qualifier.group(1)) if with_qualifier else None
        # A cap of 0 is treated as "no cap", matching the previous truthiness check.
        return limit if limit else None

    def min_max_normalize(scores):
        """Scale `scores` to [0, 1]; a constant column maps to all zeros."""
        lowest = scores.min()
        span = scores.max() - lowest
        if not span > 0:
            # All scores equal (e.g. no lexical overlap for BM25): dividing by
            # the zero span would turn every final score into NaN.
            return scores * 0.0
        return (scores - lowest) / span

    # 1. Extract duration from query
    duration_limit = extract_duration_limit(query)

    # 2. BM25 scoring
    tokenized_query = tokenizer(query.lower())
    bm25_scores = bm25_model.get_scores(tokenized_query)

    # 3. Embedding scoring
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    if desc_embeddings is None:
        desc_embeddings = embedding_model.encode(df['description'].tolist(), convert_to_tensor=True)
    embedding_scores = util.cos_sim(query_embedding, desc_embeddings)[0].cpu().numpy()

    # 4. Normalize scores and blend them with equal weight
    df_copy = df.copy()
    df_copy['bm25_score'] = bm25_scores
    df_copy['embedding_score'] = embedding_scores
    df_copy['bm25_score_norm'] = min_max_normalize(df_copy['bm25_score'])
    df_copy['embedding_score_norm'] = min_max_normalize(df_copy['embedding_score'])
    df_copy['final_score'] = 0.5 * df_copy['bm25_score_norm'] + 0.5 * df_copy['embedding_score_norm']

    # 5. Filter by duration
    filtered = df_copy
    if duration_limit is not None:
        within_limit = df_copy[df_copy['duration_minutes'] <= duration_limit]
        if not within_limit.empty:
            filtered = within_limit
        # else: fallback to the unfiltered set if no assessment is under the limit

    # 6. Return top-k results in tabular format
    top_k = min(len(filtered), max(1, top_k))
    ranked = filtered.sort_values(by='final_score', ascending=False).head(top_k)
    return ranked[['name', 'duration_minutes', 'final_score']].reset_index(drop=True)

In [14]:
# === 🧪 Step 6: Try a Query ===
# Sample request that mentions a 45-minute cap.
query = "I am hiring for an analyst and want applications to screen using Cognitive and personality tests, what options are available within 45 mins."
results = recommend_assessments(
    query=query,
    df=df,
    bm25_model=bm25,
    tokenizer=tokenizer,
    embedding_model=model,
)
display(results)

Unnamed: 0,name,duration_minutes,final_score
0,OPQ Universal Competency Report 1.0 | SHL,20,0.864851
1,Verify - Deductive Reasoning | SHL,20,0.856291
2,HiPo Assessment Report 2.0 | SHL,20,0.855825
3,HiPo Assessment Report 1.0 | SHL,20,0.84173
4,Verify Interactive Process Monitoring | SHL,18,0.830088
