<a href="https://colab.research.google.com/github/LarshVakil/Semantic-Search-Engine/blob/main/SemanticSearchEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#To import faiss for GPU (AI CODE)
import torch
import os

# Detect CUDA version to pick the right package
if torch.cuda.is_available() and torch.version.cuda is not None:
    cuda_version = torch.version.cuda.split('.')[0]
    package_name = f"faiss-gpu-cu{cuda_version}"
    print(f"CUDA detected, installing {package_name}...")
else:
    package_name = "faiss-cpu"
    print("CUDA not detected or not available, installing faiss-cpu...")

!apt install -y libomp-dev
!pip install {package_name} sentence-transformers datasets

import faiss

print(f"FAISS version {faiss.__version__} installed successfully!")

CUDA detected, installing faiss-gpu-cu12...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libomp-dev is already the newest version (1:14.0-55~exp2).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
FAISS version 1.13.2 installed successfully!


In [2]:
#Other Imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses, CrossEncoder, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
import time

In [3]:
# Load the dataset from the Hugging Face Hub (only its 'train' split is used below).

dataset = load_dataset("nthakur/msmarco-passage-sampled-100k")

# Split into training, validation and testing sets (90% / 5% / 5%):
# first hold out 10% of the rows, then cut that held-out part in half.
# Both splits pass seed=0 so the same partition is produced on every run.
split_1 = dataset['train'].train_test_split(train_size=0.90, seed=0)
split_2 = split_1['test'].train_test_split(test_size=0.50, seed=0)

train_ds = split_1['train']
valid_ds = split_2['train']
test_ds  = split_2['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Preprocessing
# Rename the columns of every split to the anchor / positive / negative names
# used for sentence-transformers training.
train_ds, valid_ds, test_ds = (
    split.rename_columns({
        "query": "anchor",
        "positive_passages": "positive",
        "negative_passages": "negative",
    })
    for split in (train_ds, valid_ds, test_ds)
)

#Get text from lists
def flatten_data(example):
    """Replace each passage list with the text of its first passage.

    The 'positive' and 'negative' entries of ``example`` are expected to be lists
    of dicts carrying a 'text' key. The row is modified in place and returned.
    """
    for column in ('positive', 'negative'):
        first_passage = example[column][0]
        example[column] = first_passage['text']
    return example

# Run flatten_data over every row of each split.
train_ds, valid_ds, test_ds = (
    split.map(flatten_data) for split in (train_ds, valid_ds, test_ds)
)

In [5]:
# Load recommended model and set up loss function
# (pretrained 'all-MiniLM-L6-v2' checkpoint; the loss object is bound to this model instance)
model = SentenceTransformer('all-MiniLM-L6-v2')
train_loss = losses.MultipleNegativesRankingLoss(model)

# Ensure datasets have the correct column order
# NOTE: only the train and validation splits are narrowed here; test_ds is left as is.
train_ds = train_ds.select_columns(['anchor', 'positive', 'negative'])
valid_ds = valid_ds.select_columns(['anchor', 'positive', 'negative'])

# Configure training parameters optimized for Colab T4 with 100k samples
args = SentenceTransformerTrainingArguments(
    # Checkpoints are written here; the final model.save() below targets the same directory.
    output_dir="TunedModel",
    num_train_epochs=1,
    # 32 examples per forward pass x 2 accumulation steps = 64 examples per optimizer update (per device).
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    warmup_steps=150,
    fp16=True,
    # Evaluate and checkpoint on the same 500-step cadence, logging every 50 steps.
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=50,
    # After training, reload the checkpoint that scored best on eval_loss; keep at most 2 checkpoints on disk.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
)

#Start training
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    loss=train_loss,
)

# Fine-tunes `model` in place, then persists it so later sessions can load "TunedModel" directly.
trainer.train()
model.save("TunedModel")



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss
500,0.074791,0.053843
1000,0.063178,0.054482


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Build FAISS index
# The training cell already narrowed train_ds to these three columns; repeating the
# selection here is harmless and lets this cell run without that step.
train_ds = train_ds.select_columns(['anchor', 'positive', 'negative'])

# The searchable corpus is the 'positive' passage of the first 50,000 training rows.
# corpus_sentences[i] must stay aligned with row i of the FAISS index: advanced_search
# maps the ids FAISS returns straight back into this list.
corpus_sentences = [row['positive'] for row in train_ds.select(range(50000))]

print("Converting documents into searchable vectors")
corpus_embeddings = model.encode(
    corpus_sentences,
    show_progress_bar=True,
    convert_to_numpy=True
)

# Embedding width, taken from the second axis of the encoded matrix
# (assumes encode returned a 2-D (n_docs, dim) array — TODO confirm).
d = corpus_embeddings.shape[1]
index = faiss.IndexFlatL2(d)

# Vectors are cast to float32 before being handed to FAISS.
index.add(corpus_embeddings.astype('float32'))

print(f"Index built successfully with {index.ntotal} documents")
# Only the first 150 characters of the first passage are shown as a sanity check.
print(f"First document in library: '{corpus_sentences[0][:150]}'")

Converting documents into searchable vectors


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

Index built successfully with 50000 documents
First document in library: 'American Meaning: The name Lincoln is an American baby name. In American the meaning of the name Lincoln is: From the settlement by the pool. English '


In [7]:
# Pretrained cross-encoder that advanced_search uses to re-score (query, passage) pairs.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def advanced_search(query, top_k_faiss=50, final_k=5):
    """Retrieve candidates with FAISS, then rerank them with the cross-encoder.

    Relies on the notebook globals ``model``, ``index``, ``corpus_sentences`` and
    ``reranker``; row ``i`` of ``index`` must correspond to ``corpus_sentences[i]``.

    Args:
        query: Search text.
        top_k_faiss: How many nearest neighbours to pull from the index before
            reranking. Raised to ``final_k`` if smaller, and capped at the number
            of vectors in the index.
        final_k: How many reranked results to return.

    Returns:
        A list of ``(passage_text, reranker_score)`` tuples, highest score first,
        at most ``final_k`` long. Empty when ``final_k`` is not positive or the
        index yields no candidates.
    """
    if final_k <= 0:
        return []

    # Fetch at least final_k candidates, but never more than the index holds.
    n_candidates = min(max(top_k_faiss, final_k), index.ntotal)
    if n_candidates <= 0:
        return []

    query_vector = model.encode([query]).astype('float32')
    _, indices = index.search(query_vector, n_candidates)

    # FAISS pads the id array with -1 when it finds fewer than k neighbours; used as
    # a list index, -1 would silently return the last passage of the corpus.
    candidate_texts = [corpus_sentences[i] for i in indices[0] if i >= 0]
    if not candidate_texts:
        return []

    # Rerank the candidates to find the best matches
    pairs = [[query, doc] for doc in candidate_texts]
    scores = reranker.predict(pairs)

    results = sorted(zip(candidate_texts, scores), key=lambda x: x[1], reverse=True)

    return results[:final_k]

# Smoke test: run one example query through the full pipeline and print the hits.
my_query = "how to improve search engine ranking"
print(f"Searching for: {my_query}\n")

# No explicit arguments, so advanced_search's defaults apply (top_k_faiss=50, final_k=5).
results = advanced_search(my_query)

print("Results:\n")
# Ranks are printed 1-based; the number in parentheses is the reranker score, two decimals.
for i, (text, score) in enumerate(results, 1):
    print(f"{i}. ({score:.2f}) {text}\n")

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Searching for: how to improve search engine ranking

Results:

1. (-3.60) chumsearch.com is a fake Internet search engine that supposedly enhances the Internet browsing experience by generating improved search results.

2. (-5.24) Search engine industry going through big changes. Google, Yahoo! and MSN dominated search engine world changing based on user behavior and new competition.New styles of search have been launched by: Clusty.com, Blinkx, Snap, A9.com, Info.com, and a much-revived and revamped Ask Jeeves, reports EContent.oogle, Yahoo! and MSN dominated search engine world changing based on user behavior and new competition. New styles of search have been launched by: Clusty.com, Blinkx, Snap, A9.com, Info.com, and a much-revived and revamped Ask Jeeves, reports EContent.

3. (-6.00) There are five ranks per level, but again only four action points. The one exception is upon reaching level 20, when the character gets the final 4 points all at once, since there are no more ranks 

In [None]:
print("Adding test documents to the index...")

# Select the first 100 test rows once; queries and gold passages are built from the
# same selection, so test_queries[i] belongs to test_positives[i].
test_subset = test_ds.select(range(100))
test_positives = [row['positive'] for row in test_subset]
test_queries = [row['anchor'] for row in test_subset]

# Position in corpus_sentences / the FAISS index where newly added passages begin.
start_index_id = len(corpus_sentences)

test_embeddings = model.encode(test_positives, convert_to_numpy=True)

# Re-running this cell must not index the same passages again, so only passages that
# are not in the corpus yet (and each distinct passage only once) are added.
known_passages = set(corpus_sentences)
new_rows = []
for row_id, passage in enumerate(test_positives):
    if passage not in known_passages:
        known_passages.add(passage)
        new_rows.append(row_id)

if new_rows:
    # Add to the index first and extend the list last: if index.add fails, the list
    # and the index keep the same length and stay row-aligned.
    index.add(test_embeddings[new_rows].astype('float32'))
    corpus_sentences.extend(test_positives[row_id] for row_id in new_rows)

# advanced_search maps FAISS row ids straight into corpus_sentences, so the two must match.
assert index.ntotal == len(corpus_sentences), "FAISS index and corpus_sentences are out of sync"

print(f"Index updated! Now contains {index.ntotal} documents")

In [None]:
def evaluate_metrics(queries, true_positives, k_values=(1, 5, 10, 20)):
    """Print MRR, NDCG@k, Recall@k and Precision@k for the search pipeline.

    Each query is paired by position with exactly one gold passage. A retrieved
    passage counts as a hit when the gold text (stripped, lower-cased) is a substring
    of it. With a single gold passage per query, Recall@k is the hit rate,
    Precision@k is hit / k, and NDCG@k is 1 / log2(rank + 1).

    Args:
        queries: Sized sequence of query strings.
        true_positives: Gold passage text for each query, in the same order.
        k_values: Cutoffs to report. Results are reranked to the largest cutoff
            (at least 20), which is also how deep MRR looks for the gold passage.

    Returns:
        None. The metrics are printed as a table.
    """
    k_values = list(k_values)

    # Rerank deep enough to reach the largest requested cutoff; a fixed depth would
    # make any larger cutoff impossible to satisfy.
    final_k = max([20, *k_values])
    top_k_faiss = max(100, final_k)

    results = {
        'mrr': [],
        'ndcg': {k: [] for k in k_values},
        'recall': {k: [] for k in k_values},
        'precision': {k: [] for k in k_values}
    }

    print(f"Evaluating {len(queries)} queries...")

    for query, gold_text in zip(queries, true_positives):
        search_results = advanced_search(query, top_k_faiss=top_k_faiss, final_k=final_k)
        gold_clean = gold_text.strip().lower()

        # 1-based position of the first result containing the gold text, or None.
        rank = next(
            (
                position
                for position, (text, _score) in enumerate(search_results, start=1)
                if gold_clean in text.strip().lower()
            ),
            None,
        )

        results['mrr'].append(1 / rank if rank is not None else 0)

        for k in k_values:
            # The gold passage is inside the top k exactly when its rank is <= k,
            # so the result list does not need to be scanned again per cutoff.
            hit = rank is not None and rank <= k
            results['precision'][k].append(int(hit) / k)
            results['recall'][k].append(int(hit))
            results['ndcg'][k].append(1 / np.log2(rank + 1) if hit else 0)

    if not results['mrr']:
        # Averaging empty lists would only yield NaN plus runtime warnings.
        print("\nNo queries were evaluated; metrics are undefined.")
        return

    print(f"\nMRR: {np.mean(results['mrr']):.4f}")

    # Column headers follow k_values instead of being fixed to @1/@5/@10/@20.
    header = f"{'Metric':<12}"
    for k in k_values:
        label = f"@{k}"
        header += f" | {label:<8}"
    print(header)

    for metric in ['ndcg', 'recall', 'precision']:
        row = f"{metric.upper():<12}"
        for k in k_values:
            # Pad to the same width as the header cells so the columns line up.
            row += f" | {np.mean(results[metric][k]):<8.4f}"
        print(row)

# Score the pipeline on the test queries, using their paired positive passages as gold answers.
evaluate_metrics(test_queries, test_positives)

In [None]:
def run_cli():
    """Interactive search prompt built on advanced_search.

    Repeatedly asks for a query and a result count, runs the search, and prints the
    ranked passages with their reranker score and the elapsed time. Typing 'exit'
    (any case, surrounding whitespace ignored), Ctrl-C, or end-of-input leaves the
    loop. A blank query is ignored; a missing, non-numeric or non-positive result
    count falls back to 5.
    """
    default_k = 5
    preview_chars = 200

    print("Search Engine")
    print("Type 'exit' to quit\n")

    while True:
        try:
            query = input("Enter your search: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input stream or interrupted kernel: leave quietly, no traceback.
            print()
            break

        if query.lower() == 'exit':
            break

        if not query:
            continue

        try:
            k_input = input("How many results? (press enter for 5): ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break

        try:
            k = int(k_input) if k_input else default_k
        except ValueError:
            k = default_k
        if k < 1:
            # Zero or negative counts would turn into an empty or truncated slice.
            k = default_k

        # perf_counter is monotonic, so the measured interval cannot go negative
        # if the wall clock is adjusted mid-search.
        start = time.perf_counter()
        # Widen the FAISS stage when more than its default 50 candidates are requested,
        # otherwise the result list would be silently capped at 50.
        results = advanced_search(query, top_k_faiss=max(50, k), final_k=k)
        duration = (time.perf_counter() - start) * 1000

        print(f"\nFound {len(results)} results ({duration:.0f}ms)\n")
        for i, (text, score) in enumerate(results, 1):
            # Only mark the passage as truncated when something was actually cut off.
            preview = text if len(text) <= preview_chars else f"{text[:preview_chars]}..."
            print(f"{i}. ({score:.2f}) {preview}\n")

# Start the interactive prompt; this cell keeps running until the user types 'exit'.
run_cli()

Search Engine CLI
Type 'exit' to quit.

Enter query: HELLO
Number of results (default 5): 5

Found 5 results in 120.67ms:
1. [Score: 2.3043] Greet(verb) to weep; to cry; to lament. Greet(noun) mourning. Greet(verb) to address with salutations or expressions of kind wishes; to salute; to hail; to welcome; to accost with friendship; to pay r...
2. [Score: -1.4369] Aloha in the Hawaiian language means affection, peace, compassion, and mercy. Since the middle of the 19th century, it also has come to be used as an English greeting to say goodbye and hello.Aloha is...
3. [Score: -5.6749] Newbie, newb, noob, or n00b is a slang term for a novice or newcomer, or somebody inexperienced in a profession or activity. Contemporary use can particularly refer to a beginner or new user of comput...
4. [Score: -5.7980] from The American HeritageÂ® Dictionary of the English Language, 4th Edition. 1  adj. Warm and sincere; friendly: a cordial greeting; cordial relations. 2  adj. Strongly felt; fervent: a 