# Method 1: Direct embedding retrieval (SentenceTransformer + FAISS)

In [8]:
# Install dependencies
!pip install sentence-transformers faiss-cpu pandas

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import re



In [9]:
# Read the detailed SHL catalog export into `df` and report how many rows it holds
df = pd.read_csv(filepath_or_buffer="shl_catalog_detailed.csv")
print(f"Loaded {len(df)} rows from shl_catalog_detailed.csv")

Loaded 518 rows from shl_catalog_detailed.csv


In [10]:
# Preprocess Assessment Length
def parse_duration(text):
    """Normalize a raw "Assessment Length" value.

    Returns:
        - ``np.nan`` when the value is missing, an empty string, or contains no digits;
        - the lowercase keyword itself for "variable" / "untimed" (any case,
          surrounding whitespace ignored);
        - otherwise the first run of digits found in the value, as a ``float``.
    """
    if pd.isna(text) or text == "":
        return np.nan

    normalized = str(text).strip().lower()
    if normalized == "variable" or normalized == "untimed":
        return normalized

    # Only the first number in the text is used
    digits = re.search(r'(\d+)', normalized)
    if digits is None:
        return np.nan
    return float(digits.group(1))

In [11]:
# Add a normalized duration column next to the raw one
df["Assessment Length Parsed"] = df["Assessment Length"].apply(parse_duration)

# Show raw vs. parsed values side by side for the first ten rows
duration_preview = df[["Assessment Length", "Assessment Length Parsed"]].head(10)
print("Sample of parsed durations:")
print(duration_preview)

Sample of parsed durations:
  Assessment Length Assessment Length Parsed
0                49                     49.0
1                36                     36.0
2                51                     51.0
3                30                     30.0
4                20                     20.0
5                35                     35.0
6                45                     45.0
7                45                     45.0
8                43                     43.0
9                49                     49.0


In [12]:
# Combine fields for embedding
def combine_text(row):
    """Build the text that gets embedded for one catalog row.

    Concatenates the solution name, its description and its test type into a
    single string. `row` only needs to support key lookup for those three fields.
    """
    solution_name = row['Pre-packaged Job Solutions']
    description = row['Description']
    test_type = row['Test Type']
    return f"{solution_name}. {description} Test Type: {test_type}"

# Row-wise application: one combined string per catalog entry
df["combined_text"] = df.apply(combine_text, axis="columns")

In [13]:
# Encode every catalog entry's combined text into a float32 matrix
model = SentenceTransformer("all-MiniLM-L6-v2")
catalog_texts = df["combined_text"].tolist()
embeddings = np.array(
    model.encode(catalog_texts, show_progress_bar=True),
    dtype="float32",
)
print(f"Generated embeddings shape: {embeddings.shape}")

# Build a flat L2 index sized to the embedding width and load all vectors
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors")

# Persist the index and the augmented catalog to disk
faiss.write_index(index, "shl_assessments_index.faiss")
df.to_csv("shl_catalog_with_text.csv", index=False)
print("Saved FAISS index and updated CSV")

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Generated embeddings shape: (518, 384)
FAISS index built with 518 vectors
Saved FAISS index and updated CSV


In [14]:
# Retrieval function
def _passes_duration_filter(duration, max_duration, wants_untimed, wants_variable):
    """Decide whether one parsed duration value survives the duration filter.

    - Missing durations are always kept.
    - "untimed" / "variable" are kept only when the query asked for them.
    - Float durations are compared against `max_duration` when one is given.
    - Anything else (e.g. a float when no `max_duration` was given) is kept.
    """
    if pd.isna(duration):
        return True
    if isinstance(duration, str):
        return (duration == "untimed" and wants_untimed) or \
               (duration == "variable" and wants_variable)
    if isinstance(duration, float) and max_duration is not None:
        return duration <= max_duration
    return True


def retrieve_assessments(query, k=10, max_duration=None):
    """Return up to `k` catalog rows most similar to `query`.

    Relies on the notebook globals `model` (query encoder), `index` (FAISS
    index) and `df` (catalog with an "Assessment Length Parsed" column).

    Args:
        query: Free-text search query. The words "untimed" / "variable"
            (case-insensitive) switch on the matching duration filter.
        k: Maximum number of rows to return. `2 * k` candidates are fetched
            from the index so that some remain after filtering.
        max_duration: Optional upper bound (minutes) for numeric durations.

    Returns:
        DataFrame with the display columns plus "similarity_score", sorted by
        score descending. It is empty (but keeps its columns) when every
        candidate is filtered out.
    """
    query_lower = query.lower()
    wants_untimed = "untimed" in query_lower
    wants_variable = "variable" in query_lower

    # Embed the query as a (1, dim) float32 matrix, the shape FAISS expects
    query_embedding = model.encode([query], show_progress_bar=False)[0]
    query_embedding = np.array([query_embedding], dtype='float32')

    # Search FAISS index; fetch extra candidates to leave room for filtering
    distances, indices = index.search(query_embedding, k * 2)

    # FAISS pads with label -1 when fewer than 2*k vectors exist; df.iloc[-1]
    # would silently return the last catalog row, so drop those slots.
    valid_hits = indices[0] >= 0
    results = df.iloc[indices[0][valid_hits]].copy()
    # NOTE(review): 1 - d/2 equals cosine similarity only if the embeddings are
    # unit-normalized and the index reports squared L2 distances — confirm.
    results["similarity_score"] = 1 - distances[0][valid_hits] / 2

    # Filter based on duration and query intent
    if max_duration is not None or wants_untimed or wants_variable:
        keep = np.array(
            [
                _passes_duration_filter(duration, max_duration, wants_untimed, wants_variable)
                for duration in results["Assessment Length Parsed"]
            ],
            dtype=bool,
        )
        # Boolean-mask selection keeps the column layout even when nothing
        # matches, so the sort below cannot raise KeyError on an empty frame.
        results = results.loc[keep]

    # Sort and limit
    results = results.sort_values("similarity_score", ascending=False).head(k)
    return results[["id", "Pre-packaged Job Solutions", "URL", "Remote Testing (y/n)",
                    "Adaptive/IRT (y/n)", "Assessment Length", "Test Type", "similarity_score"]]

In [15]:
# Test the pipeline
sample_queries = [
    "Java developers, 40 mins",
    "untimed test for managers",
    "variable length coding test"
]
for query in sample_queries:
    # Pull "<number> min..." out of the query. Matching first (instead of
    # testing `"min" in query`) avoids an AttributeError on queries that
    # contain "min" without a preceding number, e.g. "administrator".
    duration_match = re.search(r'(\d+)\s*min', query)
    max_duration = float(duration_match.group(1)) if duration_match else None
    results = retrieve_assessments(query, k=10, max_duration=max_duration)
    print(f"\nSample query: '{query}'")
    print(results)

print("\nRAG pipeline built successfully!")


Sample query: 'Java developers, 40 mins'
       id                      Pre-packaged Job Solutions  \
205  4034                Core Java (Advanced Level) (New)   
206  4032                   Core Java (Entry Level) (New)   
161  4160                       Android Development (New)   
237  4063                     Enterprise Java Beans (New)   
431  4130                    Salesforce Development (New)   
223  4221       Digital Readiness Development Report - IC   
224  4239  Digital Readiness Development Report - Manager   
298  4085                      Job Control Language (New)   
384  4156                    Oracle WebLogic Server (New)   
478   251                          Time Management (U.S.)   

                                                   URL Remote Testing (y/n)  \
205  https://www.shl.com/solutions/products/product...                  Yes   
206  https://www.shl.com/solutions/products/product...                  Yes   
161  https://www.shl.com/solutions/products/produ

In [18]:
# Test the pipeline with a long, natural-language query
sample_queries = [
    "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes"
]
for query in sample_queries:
    # Pull "<number> min..." out of the query. Matching first (instead of
    # testing `"min" in query`) avoids an AttributeError on queries that
    # contain "min" without a preceding number, e.g. "administrator".
    duration_match = re.search(r'(\d+)\s*min', query)
    max_duration = float(duration_match.group(1)) if duration_match else None
    results = retrieve_assessments(query, k=10, max_duration=max_duration)
    print(f"\nSample query: '{query}'")
    print(results)

print("\nRAG pipeline built successfully!")


Sample query: 'I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes'
       id                         Pre-packaged Job Solutions  \
502  4215         Virtual Assessment and Development Centers   
130   492  Technology Professional 8.0 Job Focused Assess...   
46    497                Graduate 8.0 Job Focused Assessment   
487  3733                       Verify - Deductive Reasoning   
454   116                         Software Business Analysis   
298  4085                         Job Control Language (New)   
44    496              Graduate + 8.0 Job Focused Assessment   
45    494                Graduate 7.1 Job Focused Assessment   
399  3484                               PJM Selection Report   
501  4290              Verify Interactive Process Monitoring   

                                                   URL Remote Testing (y/n)  \
502  https://www.shl.com/solutions/produc

# Method 2: LLM-preprocessed retrieval (Gemini summaries + SentenceTransformer + FAISS)

In [19]:
# Install dependencies
!pip install sentence-transformers faiss-cpu pandas google-generativeai

import os
import re
import time

import faiss
import google.generativeai as genai
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer



In [29]:
# Configure Gemini API
# SECURITY: the key is read from the environment instead of being embedded in
# the notebook. The key that was previously hardcoded in this cell has been
# exposed and should be revoked and replaced.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before running this cell."
    )
genai.configure(api_key=API_KEY)
# NOTE: this rebinds the global `model`, which Method 1 uses for its
# SentenceTransformer; Method 2 keeps its encoder in `embedding_model`.
model = genai.GenerativeModel('gemini-1.5-flash')  # Using 1.5-flash as proxy; update to 2.0 if available

In [30]:
# Reload the detailed SHL catalog export into `df` and report its row count
df = pd.read_csv(filepath_or_buffer="shl_catalog_detailed.csv")
print(f"Loaded {len(df)} rows from shl_catalog_detailed.csv")

Loaded 518 rows from shl_catalog_detailed.csv


In [31]:
# Preprocess Assessment Length
def parse_duration(text):
    """Normalize a raw "Assessment Length" value for the LLM-based pipeline.

    Returns:
        - ``np.nan`` when the value is missing, an empty string, or contains no digits;
        - the fixed label "flexible duration" for "variable" / "untimed"
          (any case, surrounding whitespace ignored);
        - otherwise the first run of digits found in the value, as a ``float``.
    """
    if pd.isna(text) or text == "":
        return np.nan

    cleaned = str(text).strip().lower()
    if cleaned in ("variable", "untimed"):
        # Both keywords collapse to one label, pre-interpreted for LLM consistency
        return "flexible duration"

    number = re.search(r'(\d+)', cleaned)
    if not number:
        return np.nan
    return float(number.group(1))

# Add a normalized duration column next to the raw one
df["Assessment Length Parsed"] = df["Assessment Length"].apply(parse_duration)

# Show raw vs. parsed values side by side for the first ten rows
duration_preview = df[["Assessment Length", "Assessment Length Parsed"]].head(10)
print("Sample of parsed durations:")
print(duration_preview)

Sample of parsed durations:
  Assessment Length Assessment Length Parsed
0                49                     49.0
1                36                     36.0
2                51                     51.0
3                30                     30.0
4                20                     20.0
5                35                     35.0
6                45                     45.0
7                45                     45.0
8                43                     43.0
9                49                     49.0


In [32]:
# Combine fields for LLM input
def combine_text(row):
    """Build the text handed to the LLM for one catalog row.

    Joins the solution name, description, test type and raw assessment length
    into one string. A missing assessment length is rendered as "unknown".
    """
    raw_length = row["Assessment Length"]
    if pd.notna(raw_length):
        duration = raw_length
    else:
        duration = "unknown"

    solution_name = row['Pre-packaged Job Solutions']
    description = row['Description']
    test_type = row['Test Type']
    return f"{solution_name}. {description} Test Type: {test_type}, Duration: {duration}"

# Row-wise application: one LLM input string per catalog entry
df["combined_text"] = df.apply(combine_text, axis="columns")

In [33]:
# LLM preprocessing with Gemini
def llm_summarize(text):
    """Ask the global Gemini `model` for a short summary of one catalog text.

    The instruction prompt and `text` are sent as a single request. If anything
    in that request fails, the error is printed and the unmodified `text` is
    returned, so the caller always gets a value back.
    """
    prompt = "Summarize this assessment text, retaining key skills, test type, and duration. Use ‘flexible duration’ for ‘Variable’ or ‘Untimed’. Keep it short and concise."
    try:
        reply = model.generate_content(prompt + "\n\n" + text)
        summary = reply.text.strip()
    except Exception as e:
        print(f"LLM error: {e}")
        summary = text  # Best-effort fallback: keep the raw text
    return summary

In [36]:
# Summarize every catalog row with the LLM. To stay within a 15-requests-per-
# minute limit, the loop pauses for 60 seconds after every 15th call.
summaries = []
total_rows = len(df)
for row_number, text in enumerate(df["combined_text"], start=1):
    summaries.append(llm_summarize(text))
    if row_number % 15 == 0:  # A batch of 15 calls has just completed
        time.sleep(60)
    # "\r" keeps the progress counter on a single, self-overwriting line
    print(f"Processed {row_number}/{total_rows} rows", end="\r")

df["llm_summary"] = summaries
print("\nLLM preprocessing complete. Sample summaries:")
print(df[["combined_text", "llm_summary"]].head(5))





LLM error: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
Processed 518/518 rows
LLM preprocessing complete. Sample summaries:
                                       combined_text  \
0  Account Manager Solution. The Account Manager ...   
1  Administrative Professional - Short Form. The ...   
2  Agency Manager Solution. The Agency Manager so...   
3  Apprentice + 8.0 Job Focused Assessment. The A...   
4  Apprentice 8.0 Job Focused Assessment. The App...   

                                         llm_summary  
0  The Account Manager assessment (CPAB type, fle...  
1  This 36-minute Ability-Knowledge-Personality (...  
2  This assessment, using A B P S test type, eval...  
3  The Apprentice+ 8.0 is a 30-minute, globally a...  
4 

In [37]:
# Encode the LLM summaries (not the raw combined text) into a float32 matrix
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
summary_texts = df["llm_summary"].tolist()
embeddings = np.array(
    embedding_model.encode(summary_texts, show_progress_bar=True),
    dtype="float32",
)
print(f"Generated embeddings shape: {embeddings.shape}")

# Build a flat L2 index sized to the embedding width and load all vectors
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors")

# Persist the index and the summarized catalog to disk.
# NOTE: the index file name is the same one Method 1 writes, so it is overwritten.
faiss.write_index(index, "shl_assessments_index.faiss")
df.to_csv("shl_catalog_with_summaries.csv", index=False)
print("Saved FAISS index and updated CSV")

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Generated embeddings shape: (518, 384)
FAISS index built with 518 vectors
Saved FAISS index and updated CSV


In [38]:
# Retrieval function
def retrieve_assessments(query, k=10, max_duration=None):
    """Return up to `k` catalog rows most similar to an LLM-rephrased `query`.

    Relies on the notebook globals `model` (Gemini client), `embedding_model`
    (query encoder), `index` (FAISS index) and `df` (catalog with an
    "Assessment Length Parsed" column).

    Args:
        query: Free-text search query. The words "untimed", "variable" or
            "flexible" (case-insensitive) request flexible-duration tests.
        k: Maximum number of rows to return. `2 * k` candidates are fetched
            from the index so that some remain after filtering.
        max_duration: Optional upper bound (minutes) for numeric durations.

    Returns:
        DataFrame with the display columns plus "similarity_score", sorted by
        score descending. If the duration filter would remove every candidate,
        the unfiltered candidates are returned instead.
    """
    query_lower = query.lower()
    wants_flexible = any(term in query_lower for term in ("untimed", "variable", "flexible"))

    # LLM preprocess query; on any failure fall back to the query as given
    prompt = "Rephrase this query to highlight skills, test type, and duration preferences."
    try:
        response = model.generate_content(prompt + "\n\n" + query)
        processed_query = response.text.strip()
    except Exception as e:
        print(f"Query LLM error: {e}")
        processed_query = query

    # Embed query as a (1, dim) float32 matrix, the shape FAISS expects
    query_embedding = embedding_model.encode([processed_query], show_progress_bar=False)[0]
    query_embedding = np.array([query_embedding], dtype='float32')

    # Search FAISS; fetch extra candidates to leave room for filtering
    distances, indices = index.search(query_embedding, k * 2)

    # FAISS pads with label -1 when fewer than 2*k vectors exist; df.iloc[-1]
    # would silently return the last catalog row, so drop those slots.
    valid_hits = indices[0] >= 0
    results = df.iloc[indices[0][valid_hits]].copy()
    # NOTE(review): 1 - d/2 equals cosine similarity only if the embeddings are
    # unit-normalized and the index reports squared L2 distances — confirm.
    results["similarity_score"] = 1 - distances[0][valid_hits] / 2

    # Filter by duration
    if max_duration is not None or wants_flexible:
        keep = []
        for duration in results["Assessment Length Parsed"]:
            if pd.isna(duration):
                keep.append(True)  # Keep unknown durations
            elif isinstance(duration, str):
                keep.append(duration == "flexible duration" and wants_flexible)
            else:
                keep.append(
                    isinstance(duration, float)
                    and max_duration is not None
                    and duration <= max_duration
                )
        keep = np.array(keep, dtype=bool)
        # Best-effort: when nothing passes, keep the unfiltered candidates
        if keep.any():
            results = results.loc[keep]

    # Sort and limit
    results = results.sort_values("similarity_score", ascending=False).head(k)
    return results[["id", "Pre-packaged Job Solutions", "URL", "Remote Testing (y/n)",
                    "Adaptive/IRT (y/n)", "Assessment Length", "Test Type", "similarity_score"]]

In [40]:
# Test the pipeline
sample_queries = [
    "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
]
for query in sample_queries:
    # Pull "<number> min..." out of the query. Matching first (instead of
    # testing `"min" in query`) avoids an AttributeError on queries that
    # contain "min" without a preceding number, e.g. "administrator".
    duration_match = re.search(r'(\d+)\s*min', query)
    max_duration = float(duration_match.group(1)) if duration_match else None
    results = retrieve_assessments(query, k=10, max_duration=max_duration)
    print(f"\nSample query: '{query}'")
    print(results)

print("\nRAG pipeline built successfully!")


Sample query: 'I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.'
       id              Pre-packaged Job Solutions  \
298  4085              Job Control Language (New)   
206  4032           Core Java (Entry Level) (New)   
454   116              Software Business Analysis   
205  4034        Core Java (Advanced Level) (New)   
328  4208  Microsoft Excel 365 - Essentials (New)   
329  4207               Microsoft Excel 365 (New)   
432  4131         SAP ABAP (Advanced Level) (New)   
227  4062                            Drupal (New)   
431  4130            Salesforce Development (New)   
189  4037                           BizTalk (New)   

                                                   URL Remote Testing (y/n)  \
298  https://www.shl.com/solutions/products/product...                  Yes   
206  https://www.shl.com/solutions/products/product...                  Yes   
454

In [41]:
# Now repeat the test, but shorten the query with the LLM before retrieval

In [42]:
# Test the pipeline: the same long natural-language query used in the previous
# test, this time to be shortened by the LLM before it is passed to retrieval.
sample_queries = [
    "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes."
]

# LLM preprocessing function for query
def llm_shorten_query(query):
    """Ask the global Gemini `model` for a concise version of `query`.

    The instruction prompt and `query` are sent as a single request. If
    anything in that request fails, the error is printed and the original
    `query` is returned unchanged.
    """
    prompt = "Shorten this query, retaining key skills, test type, and duration preferences. Keep it concise."
    try:
        reply = model.generate_content(prompt + "\n\n" + query)
        shortened = reply.text.strip()
    except Exception as e:
        print(f"Query LLM error: {e}")
        shortened = query  # Best-effort fallback: keep the raw query
    return shortened

for query in sample_queries:
    # Shorten query with LLM
    shortened_query = llm_shorten_query(query)
    print(f"Original query: '{query}'")
    print(f"Shortened query: '{shortened_query}'")

    # Extract max_duration from the ORIGINAL query, not the LLM output.
    # Matching first (instead of testing `"min" in query`) avoids an
    # AttributeError on queries that contain "min" without a preceding
    # number, e.g. "administrator".
    duration_match = re.search(r'(\d+)\s*min', query)
    max_duration = float(duration_match.group(1)) if duration_match else None

    # Retrieve results using shortened query
    results = retrieve_assessments(shortened_query, k=10, max_duration=max_duration)
    print(f"\nSample query (shortened): '{shortened_query}'")
    print(results)

print("\nRAG pipeline built successfully!")

Original query: 'I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.'
Shortened query: 'Hire Java developers with strong collaboration skills.  Need a 40-minute assessment.'

Sample query (shortened): 'Hire Java developers with strong collaboration skills.  Need a 40-minute assessment.'
       id              Pre-packaged Job Solutions  \
206  4032           Core Java (Entry Level) (New)   
298  4085              Job Control Language (New)   
227  4062                            Drupal (New)   
205  4034        Core Java (Advanced Level) (New)   
179  4222                      Automata Front End   
454   116              Software Business Analysis   
303   205             Linux Programming (General)   
388  4108                  Pega Development (New)   
328  4208  Microsoft Excel 365 - Essentials (New)   
329  4207               Microsoft Excel 365 (New)   

                     

In [43]:
# Leftover scratch cell: prints a fixed string and does not touch the pipeline state.
print("Nice")

Nice
