<a href="https://colab.research.google.com/github/Meetlalwani01/Document-Similarity-Search-API/blob/main/submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fastapi uvicorn faiss-cpu sentence-transformers scikit-learn nltk nest_asyncio pyngrok


Collecting fastapi
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.1-py3-none-any.whl.metadata (6.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manyli

In [2]:
import nltk

# Fetch the NLTK resources needed for tokenization, stopword removal and lemmatization.
for resource_name in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource_name)

print("NLTK data downloaded. Ready to proceed!")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK data downloaded. Ready to proceed!


In [3]:
import re
import numpy as np
import faiss

# For building the API
from fastapi import FastAPI, Query, Body
from pydantic import BaseModel
from typing import List

# For embeddings
from sentence_transformers import SentenceTransformer

# For dataset
from sklearn.datasets import fetch_20newsgroups

# For advanced preprocessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# For running in Colab
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# nest_asyncio patch (needed for running the server from a Colab notebook).
nest_asyncio.apply()

# ---- Embedding model -------------------------------------------------------
# all-MiniLM-L6-v2 is a smaller, fast model.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL)

# ---- Corpus state ----------------------------------------------------------
# Document texts, their embeddings and the FAISS index; all three start empty
# and are populated by later cells.
documents, doc_embeddings, faiss_index = [], None, None

# ---- FAISS hyperparameters -------------------------------------------------
nlist, m, nprobe = (
    50,  # number of centroids
    8,   # number of sub-vectors for product quantization
    10,  # clusters to search per query
)

print("Imports and global variables set up successfully.")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Imports and global variables set up successfully.


In [4]:
# English stopword set and WordNet lemmatizer, created once and reused by
# every call to advanced_preprocess().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def advanced_preprocess(text: str) -> str:
    """
    Normalise raw text into a space-separated string of lemmas.

    The text is lowercased and stripped, then split with ``word_tokenize``.
    A token survives only if it matches ``^[a-z0-9]+$`` and is not in
    ``stop_words``; each surviving token is lemmatized with ``lemmatizer``.
    Returns the lemmas joined by single spaces (an empty string when no
    token survives).
    """
    normalized = text.lower().strip()
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(normalized)
        if re.match(r"^[a-z0-9]+$", word) and word not in stop_words
    )

def embed_texts(texts: List[str]) -> np.ndarray:
    """
    Clean and encode a list of texts into a float32 embedding array.

    Every text is first run through advanced_preprocess(); the cleaned
    strings are then encoded with the module-level ``model`` (progress bar
    disabled) and the result is returned as a NumPy array cast to float32.
    """
    prepared = list(map(advanced_preprocess, texts))
    print(f"Embedding {len(prepared)} text(s) ...")
    vectors = model.encode(prepared, show_progress_bar=False)
    as_array = np.array(vectors)
    return as_array.astype("float32")

def build_ivfpq_index(embeddings: np.ndarray, nlist=50, m=8, nprobe=10, nbits=8) -> faiss.IndexIVFPQ:
    """
    Build, train and populate a FAISS IndexIVFPQ (IVF + product quantization).

    Args:
        embeddings: 2-D array of shape (n_vectors, dim). It is converted to a
            C-contiguous float32 array before being handed to FAISS.
        nlist: number of coarse centroids (inverted lists).
        m: number of sub-vectors for product quantization; ``dim`` must be
            divisible by ``m``.
        nprobe: number of clusters to search per query (set on the returned index).
        nbits: bits per sub-vector code. Defaults to 8, the value that used to
            be hard-coded here.

    Returns:
        The trained index containing every row of ``embeddings``.

    Raises:
        ValueError: if ``embeddings`` is not 2-D or ``dim`` is not divisible by ``m``.

    Note:
        FAISS itself raises during training when there are fewer training
        vectors than clusters (``nlist`` for the coarse quantizer, ``2**nbits``
        for each PQ sub-quantizer).
    """
    # FAISS's Python bindings expect contiguous float32 input; coerce so callers
    # holding e.g. float64 arrays do not fail deep inside the bindings.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    if embeddings.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_vectors, dim), got ndim={embeddings.ndim}"
        )

    dim = embeddings.shape[1]
    if dim % m != 0:
        raise ValueError(f"embedding dimension {dim} must be divisible by m={m}")

    quantizer = faiss.IndexFlatL2(dim)  # base quantizer
    index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, nbits)

    print("Training IVFPQ index...")
    index.train(embeddings)

    print(f"Adding {embeddings.shape[0]} vectors to the index...")
    index.add(embeddings)
    index.nprobe = nprobe

    print("FAISS index built successfully!")
    return index

print("Helper functions defined.")


Helper functions defined.


In [13]:
from datasets import load_dataset

def initialize_data_and_index(max_docs=1000):
    """
    Load Simple English Wikipedia articles, embed them and build the FAISS index.

    Populates the module-level ``documents``, ``doc_embeddings`` and
    ``faiss_index``. The globals are assigned only after every step has
    succeeded, so a failure part-way through does not leave them inconsistent.

    Args:
        max_docs: number of rows requested from the start of the train split.
            Rows whose text is blank are dropped, so fewer documents may be kept.
    """
    global documents, doc_embeddings, faiss_index

    print("Fetching Wikipedia dataset...")
    # The "wikipedia" dataset is script-based: without trust_remote_code=True,
    # load_dataset stops at an interactive [y/N] prompt, which blocks an
    # unattended "Run All". NOTE(review): this runs the dataset repository's
    # loading script; confirm that is acceptable for your environment.
    wiki_data = load_dataset(
        "wikipedia",
        "20220301.simple",
        split=f"train[:{max_docs}]",
        trust_remote_code=True,
    )
    raw_docs = [entry['text'] for entry in wiki_data if entry['text'].strip()]
    print(f"Total fetched Wikipedia docs: {len(raw_docs)}")

    # The split expression already caps the row count at max_docs, so no
    # further slicing is needed.
    new_embeddings = embed_texts(raw_docs)
    new_index = build_ivfpq_index(new_embeddings, nlist=nlist, m=m, nprobe=nprobe)

    # Commit all three globals together.
    documents = raw_docs
    doc_embeddings = new_embeddings
    faiss_index = new_index
    print("Wikipedia Data and FAISS index are ready!")



In [14]:
from fastapi import Request

# Pydantic models for request/response
# Response payload of GET /api/search (declared as that route's response_model).
class SearchResponse(BaseModel):
    # The query string as received by the endpoint.
    query: str
    # The number of results that was requested.
    top_k: int
    # Texts of the matching documents, in the order returned by the index search.
    results: List[str]

# Request body accepted by POST /api/add.
class AddDocRequest(BaseModel):
    # Raw text of the document to append to the corpus.
    text: str

# Create the FastAPI instance.
# The description names the corpus that is actually indexed (Simple English
# Wikipedia, loaded in initialize_data_and_index), not 20 Newsgroups.
app = FastAPI(
    title="Colab Document Similarity API",
    description="A simple doc similarity search using Simple English Wikipedia, FAISS, and SentenceTransformers.",
)

@app.get("/api/search", response_model=SearchResponse)
def search_documents(q: str = Query(..., description="User query"),
                     top_k: int = Query(5, ge=1, description="Number of documents to retrieve")):
    """
    Return up to ``top_k`` documents that are most similar to the query ``q``.

    An empty query yields an empty result list. If the index has not been
    built yet the endpoint answers 503 instead of failing with an
    AttributeError. Fewer than ``top_k`` results may be returned when the
    index cannot find that many neighbours.
    """
    # Imported locally so this endpoint does not depend on changes to other cells.
    from fastapi import HTTPException

    if not q:
        return {"query": q, "top_k": top_k, "results": []}

    # Read the globals once so both are taken at the same moment.
    index, docs = faiss_index, documents
    if index is None:
        raise HTTPException(status_code=503, detail="Search index is not initialized yet.")

    print(f"Received search query: '{q}' with top_k={top_k}")
    query_emb = embed_texts([q])  # shape (1, dim)

    _distances, indices = index.search(query_emb, top_k)

    # FAISS pads the label array with -1 when fewer than top_k neighbours are
    # found in the probed clusters. Indexing the list with -1 would silently
    # return the last document, so such entries are skipped.
    result_docs = [docs[int(idx)] for idx in indices[0] if 0 <= idx < len(docs)]

    print(f"Returning {len(result_docs)} results.")
    return {
        "query": q,
        "top_k": top_k,
        "results": result_docs
    }

@app.post("/api/add")
def add_document(request: AddDocRequest):
    """
    Add a new document to the corpus and re-build the FAISS index (simple approach).

    The new embedding matrix and index are built first, and the module-level
    state is updated only once both exist. If embedding or index training
    raises, ``documents``, ``doc_embeddings`` and ``faiss_index`` are left
    exactly as they were, so list positions keep matching index ids.
    """
    global documents, doc_embeddings, faiss_index

    new_doc = request.text
    print("Adding new doc:", new_doc[:60], "...")

    new_embedding = embed_texts([new_doc])

    # Extend the embeddings array (into a local; nothing is committed yet)
    if doc_embeddings is None:
        updated_embeddings = new_embedding
    else:
        updated_embeddings = np.concatenate((doc_embeddings, new_embedding), axis=0)

    # Rebuild the entire index; may raise, in which case no state has changed
    updated_index = build_ivfpq_index(updated_embeddings, nlist=nlist, m=m, nprobe=nprobe)

    # Commit: the document list, embeddings and index are updated together
    documents.append(new_doc)
    doc_embeddings = updated_embeddings
    faiss_index = updated_index

    return {
        "message": "Document added successfully!",
        "current_doc_count": len(documents)
    }

print("FastAPI app defined with two endpoints: /api/search and /api/add.")


FastAPI app defined with two endpoints: /api/search and /api/add.


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Port shared by the ngrok tunnel and the uvicorn server.
PORT = 8000

# Never hard-code the ngrok authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# NOTE(review): the token that was previously committed in this cell should be
# revoked in the ngrok dashboard, since it has been published with the notebook.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Create a public URL
public_url = ngrok.connect(PORT)
print("Public URL:", public_url.public_url)

# Load the corpus, build the index, then run uvicorn (blocks until interrupted)
print(f"Starting server on port {PORT}...")
initialize_data_and_index(max_docs=600)
uvicorn.run(app, host="0.0.0.0", port=PORT)

Public URL: https://b248-34-87-33-111.ngrok-free.app
Starting server on port 8000...
Fetching Wikipedia dataset...


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

wikipedia.py:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

The repository for wikipedia contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wikipedia.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train-00000-of-00001.parquet:   0%|          | 0.00/134M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/205328 [00:00<?, ? examples/s]

Total fetched Wikipedia docs: 600
Embedding 600 text(s) ...


INFO:     Started server process [555]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Training IVFPQ index...
Adding 600 vectors to the index...
FAISS index built successfully!
Wikipedia Data and FAISS index are ready!
INFO:     14.139.38.154:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     14.139.38.154:0 - "GET /openapi.json HTTP/1.1" 200 OK
