In [6]:
!pip install fastapi nest-asyncio pyngrok uvicorn scikit-learn flask


Collecting pyngrok
  Downloading pyngrok-7.2.7-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.7-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.7


In [2]:
!pip install gradio sentence-transformers numpy


Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [None]:
import requests
from bs4 import BeautifulSoup
import time

BASE_URL = "https://shl.com"
PAGE_STEP = 12
MAX_WORKERS = 8

def extract_data_from_page(soup):
    data = []
    rows = soup.select("tr[data-course-id], tr[data-entity-id]")
    for row in rows:
        course_id = row.get("data-course-id")
        entity_id = row.get("data-entity-id")
        item_id = course_id or entity_id
        item_type = "course" if course_id else "entity"

        tds = row.find_all("td")
        if len(tds) < 4:
            continue

        link_tag = tds[0].select_one("a[href]")
        link = BASE_URL + link_tag["href"] if link_tag else None
        name = link_tag.text.strip() if link_tag else None

        # Safe span check using class list
        def has_yes_icon(td):
            span = td.find("span")
            return span and "-yes" in span.get("class", [])

        is_remote = has_yes_icon(tds[1])
        is_adaptive = has_yes_icon(tds[2])
        keys = [k.text.strip() for k in tds[3].select(".product-catalogue__key")]

        data.append({
            "type": item_type,
            "id": item_id,
            "name": name,
            "link": link,
            "remote": is_remote,
            "adaptive_or_irt": is_adaptive,
            "keys": keys
        })
    return data



def scrape_all_pages_by_type(content_type):
    results = []
    start = 0
    while True:
        url = f"{BASE_URL}/products/product-catalog/?start={start}&type={content_type}"
        print(f"Scraping page: {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        page_data = extract_data_from_page(soup)
        if not page_data:
            break  # No more data

        results.extend(page_data)

        # Check if there's a next page
        next_button = soup.select_one("li.-next a")
        if not next_button:
            break

        start += PAGE_STEP
        time.sleep(1)

    return results

if __name__ == "__main__":
    all_data = []
    for content_type in [1, 2]:  # Adjust this if there are more types
        data = scrape_all_pages_by_type(content_type)
        all_data.extend(data)

    for item in all_data:
        print(item)


Scraping page: https://shl.com/products/product-catalog/?start=0&type=1
Scraping page: https://shl.com/products/product-catalog/?start=12&type=1
Scraping page: https://shl.com/products/product-catalog/?start=24&type=1
Scraping page: https://shl.com/products/product-catalog/?start=36&type=1
Scraping page: https://shl.com/products/product-catalog/?start=48&type=1
Scraping page: https://shl.com/products/product-catalog/?start=60&type=1
Scraping page: https://shl.com/products/product-catalog/?start=72&type=1
Scraping page: https://shl.com/products/product-catalog/?start=84&type=1
Scraping page: https://shl.com/products/product-catalog/?start=96&type=1
Scraping page: https://shl.com/products/product-catalog/?start=108&type=1
Scraping page: https://shl.com/products/product-catalog/?start=120&type=1
Scraping page: https://shl.com/products/product-catalog/?start=132&type=1
Scraping page: https://shl.com/products/product-catalog/?start=144&type=1
Scraping page: https://shl.com/products/product-

In [None]:
import json

# Assuming your scraped data is stored in `all_data`
data_to_save = all_data  # or `results` or whatever you named it

with open("shl_data_basic.json", "w", encoding="utf-8") as f:
    json.dump(data_to_save, f, ensure_ascii=False, indent=2)

print("[✓] Scraped data saved to 'shl_data_basic.json'")


[✓] Scraped data saved to 'shl_data_basic.json'


In [None]:
import requests
from bs4 import BeautifulSoup
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

BASE_URL = "https://shl.com"

def extract_detail_fields(item):
    detail_url = item.get("link")
    if not detail_url:
        return item

    if detail_url.startswith("/"):
        detail_url = BASE_URL + detail_url

    try:
        response = requests.get(detail_url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        detail = {
            "description": None,
            "job_levels": None,
            "languages": None,
            "assessment_length": None
        }

        rows = soup.select("div.product-catalogue-training-calendar__row.typ")
        for row in rows:
            heading = row.find("h4")
            value = row.find("p")
            if not heading or not value:
                continue

            key = heading.get_text(strip=True).lower()
            val = value.get_text(strip=True)

            if "description" in key:
                detail["description"] = val
            elif "job level" in key:
                detail["job_levels"] = val
            elif "language" in key:
                detail["languages"] = val
            elif "assessment length" in key or "completion time" in key:
                detail["assessment_length"] = val

        item.update(detail)
        return item

    except Exception as e:
        print(f"[ERROR] {item.get('name')} - {e}")
        return item  # Return at least the base item even if enrichment fails

if __name__ == "__main__":
    with open("shl_data_basic.json", "r") as f:
        data = json.load(f)

    enriched_data = []

    # Use ThreadPoolExecutor to speed up requests
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(extract_detail_fields, item) for item in data]

        for i, future in enumerate(as_completed(futures), 1):
            result = future.result()
            enriched_data.append(result)
            print(f"[{i}/{len(data)}] Processed: {result.get('name')}")

    with open("shl_data_enriched.json", "w") as f:
        json.dump(enriched_data, f, indent=2)

    print("\n✅ Done! Saved enriched data to `shl_data_enriched.json`.")


[1/542] Processed: Account Manager Solution
[2/542] Processed: Bank Collections Agent - Short Form
[3/542] Processed: Bank Operations Supervisor - Short Form
[4/542] Processed: Apprentice 8.0 Job Focused Assessment
[5/542] Processed: Agency Manager Solution
[6/542] Processed: Administrative Professional - Short Form
[7/542] Processed: Apprentice + 8.0 Job Focused Assessment
[8/542] Processed: Bilingual Spanish Reservation Agent Solution
[9/542] Processed: Bank Administrative Assistant - Short Form
[10/542] Processed: Bookkeeping, Accounting, Auditing Clerk Short Form
[11/542] Processed: .NET MVC (New)
[12/542] Processed: .NET MVVM (New)
[13/542] Processed: .NET XAML (New)
[14/542] Processed: Branch Manager - Short Form
[15/542] Processed: Cashier Solution
[16/542] Processed: Global Skills Development Report
[17/542] Processed: .NET WCF (New)
[18/542] Processed: Accounts Payable (New)
[19/542] Processed: .NET WPF (New)
[20/542] Processed: .NET Framework 4.5
[21/542] Processed: Accounts 

In [None]:
import json
import pandas as pd
import os

# Get the current working directory
current_directory = os.getcwd()

# Construct the correct file path
file_path = os.path.join(current_directory, "shl_data_enriched.json")


# Load the cleaned JSON data
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert the JSON data to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file in the current directory
excel_file = os.path.join(current_directory, "shl_data_enriched.xlsx")
df.to_excel(excel_file, index=False)

print(f"Data successfully saved to {excel_file}")

Data successfully saved to /content/shl_data_enriched.xlsx


In [None]:
import json
import pandas as pd
import os

# Get the current working directory
current_directory = os.getcwd()

# Construct the correct file path
file_path = os.path.join(current_directory, "shl_data_basic.json")


# Load the cleaned JSON data
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert the JSON data to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file in the current directory
excel_file = os.path.join(current_directory, "shl_data_basic.xlsx")
df.to_excel(excel_file, index=False)

print(f"Data successfully saved to {excel_file}")

Data successfully saved to /content/shl_data_basic.xlsx


In [4]:
import json
import re
import numpy as np
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ----------------------
# CONSTANTS
# ----------------------
KEY_MAPPING = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations"
}

# ----------------------
# LOAD & PREPROCESS DATA
# ----------------------

def load_and_prepare_data(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in data:
        item["embedding_text"] = f"{item['name']} {item['description']}"

        try:
            item["duration"] = int(item.get("assessment_length", "0").split('=')[-1].strip())
        except:
            item["duration"] = 0

        item["adaptive_support"] = "Yes" if item.get("adaptive_or_irt") else "No"
        item["remote_support"] = "Yes" if item.get("remote") else "No"

        keys = item.get("keys", [])
        item["test_type"] = [KEY_MAPPING.get(k, k) for k in keys]

        item["job_level"] = (item.get("job_levels") or "").strip().rstrip(",") if isinstance(item.get("job_levels"), str) else item.get("job_levels")
        item["languages"] = (item.get("languages") or "").strip().rstrip(",") if isinstance(item.get("languages"), str) else item.get("languages")

    return data

# Load and preprocess the SHL assessments
assessments = load_and_prepare_data("/content/shl_data_enriched.json")

# ----------------------
# EMBEDDINGS
# ----------------------

model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L12-cos-v5')
corpus_embeddings = embedding_model.encode([a["embedding_text"] for a in assessments])

# ----------------------
# RECOMMENDER
# ----------------------

def extract_duration_from_query(query):
    match = re.search(r'(\d+)\s*minutes?', query.lower())
    return int(match.group(1)) if match else None

def normalize_name(name):
    return name.lower().strip()

def recommend_assessments(user_query, max_duration=None):
    query_embedding = model.encode(user_query, convert_to_tensor=False)
    similarities = cosine_similarity([query_embedding], corpus_embeddings)[0]

    top_indices = np.argsort(similarities)[::-1]
    results = []

    for i in top_indices:
        a = assessments[i]
        if max_duration is not None and a.get("duration", 0) > max_duration:
            continue

        results.append({
            "Assessment Name": a.get('name', ''),
            "Job Level": a.get("job_level", ''),
            "Duration (min)": a.get("duration", 0),
            "Remote Testing": a.get("remote_support", ''),
            "Adaptive/IRT": a.get("adaptive_support", ''),
            "Test Type(s)": ', '.join(a.get("test_type", [])),
            "Languages": a.get("languages", ''),
            "Description": (a.get("description", '')[:200] + "...") if a.get("description") else ''
        })

        if len(results) == 10:
            break

    return pd.DataFrame(results)

# ----------------------
# ACCURACY METRICS
# ----------------------

def calculate_recall_at_k(relevant_assessments, recommended_assessments, k=3):
    recommended_set = set(recommended_assessments[:k])
    relevant_set = set(relevant_assessments)
    return len(recommended_set.intersection(relevant_set)) / len(relevant_set) if len(relevant_set) > 0 else 0

def calculate_map_at_k(relevant_assessments, recommended_assessments, k=3):
    ap_k = 0
    num_relevant = 0
    for i in range(min(k, len(recommended_assessments))):
        if recommended_assessments[i] in relevant_assessments:
            num_relevant += 1
            ap_k += num_relevant / (i + 1)
    return ap_k / min(k, len(relevant_assessments)) if len(relevant_assessments) > 0 else 0

def evaluate_model(test_queries, ground_truth, k=3):
    recalls_at_k = []
    map_at_k = []

    for query, relevant in zip(test_queries, ground_truth):
        max_duration = extract_duration_from_query(query)
        recommended_df = recommend_assessments(query, max_duration=max_duration)

        recommended_names = [normalize_name(a) for a in recommended_df["Assessment Name"].tolist()]
        relevant_names = [normalize_name(r) for r in relevant]

        recalls_at_k.append(calculate_recall_at_k(relevant_names, recommended_names, k))
        map_at_k.append(calculate_map_at_k(relevant_names, recommended_names, k))

    return np.mean(recalls_at_k), np.mean(map_at_k)

# ----------------------
# MAIN TESTING BLOCK
# ----------------------

if __name__ == "__main__":
    test_queries = [
        "I am hiring for Java developers who can also collaborate effectively with my business teams. Looking for an assessment(s) that can be completed in 40 minutes.",
        "I want to hire new graduates for a sales role in my company, the budget is for about an hour for each test. Give me some options.",
        "I am looking for a COO for my company in China and I want to see if they are culturally a right fit for our company. Suggest me an assessment that they can complete in about an hour.",
        "Content Writer required, expert in English and SEO.",
        "Find me 1 hour long assesment for the below job at SHL Job Description Join a community that is shaping the future of work! SHL, People Science. People Answers. Are you a seasoned QA Engineer with a flair for innovation? Are you ready to shape the future of talent assessment and empower organizations to unlock their full potential? If so, we want you to be a part of the SHL Team! As a QA Engineer, you will be involved in creating and implementing software solutions that contribute to the development of our groundbreaking products. An excellent benefit package is offered in a culture where career development, with ongoing manager guidance, collaboration, flexibility, diversity, and inclusivity are all intrinsic to our culture. There is a huge investment in SHL currently so there’s no better time to become a part of something transformational. What You Will Be Doing Getting involved in engineering quality assurance and providing inputs when required. Create and develop test plans for various forms of testing. Conducts and/or participates in formal and informal test case reviews. Develop and initiate functional tests and regression tests. Rolling out improvements for testing and quality processes. Essential What we are looking for from you: Development experience – Java or JavaScript, CSS, HTML (Automation) Selenium WebDriver and page object design pattern (Automation) SQL server knowledge Test case management experience. Manual Testing Desirable Knowledge the basic concepts of testing Strong solution-finding experience Strong verbal and written communicator. Get In Touch Find out how this one-off opportunity can help you achieve your career goals by making an application to our knowledgeable and friendly Talent Acquisition team. Choose a new path with SHL. #CareersAtSHL #SHLHiringTalent #TechnologyJobs #QualityAssuranceJobs #CareerOpportunities #JobOpportunities About Us We unlock the possibilities of businesses through the power of people, science and technology. We started this industry of people insight more than 40 years ago and continue to lead the market with powerhouse product launches, ground-breaking science and business transformation. When you inspire and transform people’s lives, you will experience the greatest business outcomes possible. SHL’s products insights, experiences, and services can help achieve growth at scale. What SHL Can Offer You Diversity, equity, inclusion and accessibility are key threads in the fabric of SHL’s business and culture (find out more about DEI and accessibility at SHL ) Employee benefits package that takes care of you and your family. Support, coaching, and on-the-job development to achieve career success A fun and flexible workplace where you’ll be inspired to do your best work (find out more LifeAtSHL ) The ability to transform workplaces around the world for others. SHL is an equal opportunity employer. We support and encourage applications from a diverse range of candidates. We can, and do make adjustments to make sure our recruitment process is as inclusive as possible. SHL is an equal opportunity employer.",
        "ICICI Bank Assistant Admin, Experience required 0-2 years, test should be 30-40 mins long",
        "SKEY RESPONSIBITILES: Manage the sound-scape of the station through appropriate creative and marketing interventions to Increase or Maintain the listenership Acts as an interface between Programming & sales team, thereby supporting the sales team by providing creative inputs in order to increase the overall ad spends by clients Build brand Mirchi by ideating fresh programming initiatives on air campaigns, programming led onground events & new properties to ensure brand differentiation & thus increase brand recall at station level Invest time in local RJs to grow & develop them as local celebrities Through strong networking, must focus on identifying the best of local talent and ensure to bring the creative minds from the market on board with Mirchi Build radio as a category for both listeners & advertisers People Management Identifying the right talent and investing time in developing them by frequent feedback on their performance Monitor, Coach and mentor team members on a regular basis Development of Jocks as per guidelines Must have an eye to spot the local talent to fill up vacancies locally TECHNICAL SKILLS & QUALIFICATION REQUIRED: Graduation / Post Graduation (Any specialisation) with 8 -12 years of relevant experience Experience in digital content conceptualisation Strong branding focus Must be well-read in variety of areas and must keep up with the latest events in the city / cluster / country Must know to read, write & speak English PERSONAL ATTRIBUTES: Excellent communication skills Good interpersonal skills People management Suggest me some tests for the above jd. The duration should be at most 90 mins"
    ]

    ground_truth = [
        [
            "Automata - Fix (New)",
            "Core Java (Entry Level) (New)",
            "Java 8 (New)",
            "Core Java (Advanced Level) (New)",
            "Agile Software Development",
            "Technology Professional 8.0 Job Focused Assessment",
            "Computer Science (New)"
        ],
        [
            "Entry level Sales 7.1 (International)",
            "Entry Level Sales Sift Out 7.1",
            "Entry Level Sales Solution",
            "Sales Representative Solution",
            "Sales Support Specialist Solution",
            "Technical Sales Associate Solution",
            "SVAR - Spoken English (Indian Accent) (New)",
            "Sales & Service Phone Solution",
            "Sales & Service Phone Simulation",
            "English Comprehension (New)"
        ],
        [
            "Motivation Questionnaire MQM5",
            "Global Skills Assessment",
            "Graduate 8.0 Job Focused Assessment"
        ],
        [
            "Drupal (New)",
            "Search Engine Optimization (New)",
            "Administrative Professional - Short Form",
            "Entry Level Sales Sift Out 7.1",
            "General Entry Level – Data Entry 7.0 Solution"
        ],
        [
            "Automata Selenium",
            "Automata - Fix (New)",
            "Automata Front End",
            "JavaScript (New)",
            "HTML/CSS (New)",
            "HTML5 (New)",
            "CSS3 (New)",
            "Selenium (New)",
            "SQL Server (New)",
            "Automata - SQL (New)",
            "Manual Testing (New)"
        ],
        [
            "Administrative Professional - Short Form",
            "Verify - Numerical Ability",
            "Financial Professional - Short Form",
            "Bank Administrative Assistant - Short Form",
            "General Entry Level – Data Entry 7.0 Solution",
            "Basic Computer Literacy (Windows 10) (New)",
            "Verify - Verbal Ability - Next Generation",
            "SHL Verify Interactive - Inductive Reasoning",
            "Occupational Personality Questionnaire OPQ32r"
        ],
        [
            "SHL Verify Interactive - Inductive Reasoning",
            "Occupational Personality Questionnaire OPQ32r"
        ]
    ]


    recall_k, map_k = evaluate_model(test_queries, ground_truth)
    print(f"Mean Recall@K: {recall_k:.3f}")
    print(f"Mean MAP@K: {map_k:.3f}")


Mean Recall@K: 0.059
Mean MAP@K: 0.135


In [36]:
import json
import numpy as np
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ----------------------
# CONSTANTS
# ----------------------

KEY_MAPPING = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations"
}

# ----------------------
# LOAD & PREPROCESS DATA
# ----------------------

def load_and_prepare_data(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in data:
        item["embedding_text"] = f"{item['name']} {item['description']}"

        try:
            item["duration"] = int(item.get("assessment_length", "0").split('=')[-1].strip())
        except:
            item["duration"] = 0

        item["adaptive_support"] = "Yes" if item.get("adaptive_or_irt") else "No"
        item["remote_support"] = "Yes" if item.get("remote") else "No"

        keys = item.get("keys", [])
        item["test_type"] = [KEY_MAPPING.get(k, k) for k in keys]

        item["job_level"] = (item.get("job_levels") or "").strip().rstrip(",")
        item["languages"] = (item.get("languages") or "").strip().rstrip(",")

        # Add a URL to each assessment (if available in the dataset)
        item["url"] = item.get("url", "No URL available")  # If no URL, assign a placeholder

    return data

assessments = load_and_prepare_data("/content/shl_data_enriched.json")

# ----------------------
# EMBEDDINGS
# ----------------------

model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = model.encode([a["embedding_text"] for a in assessments], convert_to_tensor=False)

# ----------------------
# RECOMMENDER
# ----------------------

def recommend_assessments(user_query):
    query_embedding = model.encode(user_query, convert_to_tensor=False)
    similarities = cosine_similarity([query_embedding], corpus_embeddings)[0]
    top_indices = np.argsort(similarities)[::-1][:10]

    results = []
    for i in top_indices:
        a = assessments[i]
        results.append({
            "Assessment Name": a['name'],
            "Job Level": a["job_level"],
            "Duration (min)": a["duration"],
            "Remote Testing": a["remote_support"],
            "Adaptive/IRT": a["adaptive_support"],
            "Test Type(s)": ', '.join(a["test_type"]),
            "Languages": a["languages"],
            "Description": a["description"][:200] + "...",
            "URL": a["link"]  # Include the URL in the result
        })

    return pd.DataFrame(results)

# ----------------------
# GRADIO UI
# ----------------------

demo = gr.Interface(
    fn=recommend_assessments,
    inputs=gr.Textbox(lines=3, label="Enter job description or query"),
    outputs=gr.Dataframe(label="Recommended SHL Assessments"),
    title="🔍 SHL Assessment Recommendation System",
    description="Paste a natural language job query or JD text. The system will recommend the most relevant SHL assessments (up to 10)."
)

# ----------------------
# RUN APP
# ----------------------

if __name__ == "__main__":
    demo.launch(share=True)  # Use share=True to generate public URL


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9f9c78fa70f32f940e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [24]:
from pyngrok import conf
from pyngrok import ngrok
ngrok.kill()
conf.get_default().auth_token = "2wAvyLm35oEfWS97ZpinVeRYms5_2Nv7RqV5is1MegWuaB8d9"


In [37]:
import json
import re
import numpy as np
from flask import Flask, request, jsonify
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from pyngrok import ngrok
from threading import Thread

KEY_MAPPING = {
    "A": "Ability & Aptitude",
    "B": "Biodata & Situational Judgement",
    "C": "Competencies",
    "D": "Development & 360",
    "E": "Assessment Exercises",
    "K": "Knowledge & Skills",
    "P": "Personality & Behavior",
    "S": "Simulations"
}

# Load and preprocess assessments
with open("shl_data_enriched.json", "r", encoding="utf-8") as f:
    assessments = json.load(f)

for item in assessments:
    item["embedding_text"] = f"{item['name']} {item['description']}"
    try:
        item["duration"] = int(item.get("assessment_length", "0").split('=')[-1].strip())
    except:
        item["duration"] = 0
    item["adaptive_support"] = "Yes" if item.get("adaptive_or_irt") else "No"
    item["remote_support"] = "Yes" if item.get("remote") else "No"
    keys = item.get("keys", [])
    item["test_type"] = [KEY_MAPPING.get(k, k) for k in keys]

    # Ensure URL is present in each assessment (if available)
    item["url"] = item.get("url", "No URL available")  # Provide a default URL if missing

model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L12-cos-v5')
corpus_embeddings = model.encode([a["embedding_text"] for a in assessments])

def extract_duration_from_query(query):
    match = re.search(r'(\d+)\s*minutes?', query.lower())
    return int(match.group(1)) if match else None

def recommend(query, max_duration=None):
    query_embedding = model.encode(query, convert_to_tensor=False)
    similarities = cosine_similarity([query_embedding], corpus_embeddings)[0]
    top_indices = np.argsort(similarities)[::-1]
    results = []

    for i in top_indices:
        a = assessments[i]
        if max_duration is not None and a.get("duration", 0) > max_duration:
            continue
        results.append({
            "Assessment Name": a.get("name", ""),
            "Job Level": a.get("job_level", "Unknown"),
            "Duration (min)": a.get("duration", 0),
            "Remote Testing": a.get("remote_support", "No"),
            "Adaptive/IRT": a.get("adaptive_support", "No"),
            "Test Type(s)": ', '.join(a.get("test_type", [])),
            "Languages": a.get("languages", "Not specified"),
            "Description": a.get("description", "")[:500],  # Truncate description for brevity
            "URL": a.get("link", "No URL available")  # Include URL in the result
        })
        if len(results) == 10:
            break
    return results

# Flask app
app = Flask(__name__)

@app.route("/shl-ai-apis/recommend", methods=["POST"])
def recommend_endpoint():
    data = request.get_json()
    query = data.get("query", "")
    if not query:
        return jsonify({"error": "Query is required"}), 400
    max_duration = extract_duration_from_query(query)
    recommended_assessments = recommend(query, max_duration)
    return jsonify({"recommended_assessments": recommended_assessments})

@app.route("/shl-ai-apis/health", methods=["GET"])
def health_check():
    return jsonify({"status": "ok"}), 200

# Start Flask app and ngrok in the right order
# Run Flask with ngrok
def run_app():
    app.run(port=5015)  # or any other free port like 5002, 8080, etc.

def start_ngrok():
    public_url = ngrok.connect(5015)
    print(f"🚀 Your public endpoint is: {public_url}")

# Start the Flask app and Ngrok in a thread
Thread(target=run_app).start()  # runs Flask in the background
start_ngrok()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5015
INFO:werkzeug:[33mPress CTRL+C to quit[0m


🚀 Your public endpoint is: NgrokTunnel: "https://f540-35-199-157-109.ngrok-free.app" -> "http://localhost:5015"


In [38]:
import requests
import json

# Making a GET request to the health endpoint
res = requests.get("https://f540-35-199-157-109.ngrok-free.app/shl-ai-apis/health")
print(res.status_code)
# Pretty print the JSON response from the health check endpoint
print(json.dumps(res.json(), indent=4))

# Query to send in the POST request
query = {
    "query": "I want to hire a content writer who is good at SEO and English"
}

# Making a POST request to the recommend endpoint
res = requests.post("https://f540-35-199-157-109.ngrok-free.app/shl-ai-apis/recommend", json=query)
print(res.status_code)
# Pretty print the JSON response from the recommend endpoint
print(json.dumps(res.json(), indent=4))


INFO:werkzeug:127.0.0.1 - - [09/May/2025 16:24:23] "GET /shl-ai-apis/health HTTP/1.1" 200 -


200
{
    "status": "ok"
}


INFO:werkzeug:127.0.0.1 - - [09/May/2025 16:24:23] "POST /shl-ai-apis/recommend HTTP/1.1" 200 -


200
{
    "recommended_assessments": [
        {
            "Adaptive/IRT": "No",
            "Assessment Name": "Smart Interview Live",
            "Description": "SHL Live Video Interview is areal-timevideo interview tool. It allows recruiters and line managers to remotely engage with candidates and reach hiring decisions faster. \r\nDelight shortlisted candidates with a face2face digital interview that goes way beyond a conference call. Interact with whiteboards, instant file-sharing, draw from an expertly-curated question bank and provide a remote end-to-end interview experience that increases your chance of conversion.Language AvailabilityThe Candidate a",
            "Duration (min)": 0,
            "Job Level": "Unknown",
            "Languages": "English (USA),",
            "Remote Testing": "Yes",
            "Test Type(s)": "Personality & Behavior",
            "URL": "https://shl.com/products/product-catalog/view/smart-interview-live/"
        },
        {
            "Ada