In [1]:
# ============================
# SEARCH ENGINES FOR AG NEWS
# ============================

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

# ----------------------------
# 1. Load Dataset
# ----------------------------
# Load the "ag_news" dataset and keep only the "text" field of every example
# in its "train" split; this list is the corpus all three engines search.
dataset = load_dataset("ag_news")
texts = [x["text"] for x in dataset["train"]]

# Sanity check: report how many documents ended up in the corpus.
print("Number of documents:", len(texts))


# ----------------------------
# 2. TF-IDF Search Engine
# ----------------------------
print("\nBuilding TF-IDF index...")

# Fit the vectorizer on the full corpus (vocabulary capped via max_features)
# and keep the resulting document-term matrix for query-time scoring.
tfidf = TfidfVectorizer(max_features=20000)
X_tfidf = tfidf.fit_transform(texts)

def search_tfidf(query, top_k=5):
    """Rank the corpus against ``query`` by TF-IDF cosine similarity.

    Args:
        query: Free-text query; vectorised with the already-fitted ``tfidf``.
        top_k: Maximum number of hits to return. ``None`` returns every
            document; zero or a negative value returns no hits.

    Returns:
        A list of ``(doc_index, score, text)`` tuples sorted by descending
        score, where ``doc_index`` is a position in ``texts``.
    """
    # A negative top_k would otherwise slice from the end (e.g. ``[:-1]``
    # keeps everything but the last hit) and silently return almost the
    # whole corpus instead of "no results".
    if top_k is not None and top_k <= 0:
        return []

    q_vec = tfidf.transform([query])
    # cosine_similarity returns a (1, n_docs) matrix; take its single row.
    sims = cosine_similarity(q_vec, X_tfidf)[0]
    # argsort is ascending, so reverse before truncating to the best top_k.
    idx = sims.argsort()[::-1][:top_k]
    return [(i, sims[i], texts[i]) for i in idx]


# ----------------------------
# 3. Embedding Search (MiniLM)
# ----------------------------
print("Building Embedding index...")

# Encode every document once up front with the "all-MiniLM-L6-v2" model so
# that each query later costs one encode call plus a matrix-vector product.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# Normalize for cosine similarity: each row is divided by its own L2 norm, so
# a dot product with a unit-length query vector is the cosine similarity.
emb_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

def search_embed(query, top_k=5):
    """Rank the corpus against ``query`` by embedding cosine similarity.

    Args:
        query: Free-text query; encoded with the same ``embedder`` that
            produced the document embeddings.
        top_k: Maximum number of hits to return. ``None`` returns every
            document; zero or a negative value returns no hits.

    Returns:
        A list of ``(doc_index, score, text)`` tuples sorted by descending
        score, where ``doc_index`` is a position in ``texts``.
    """
    # Guard first: a negative top_k would slice from the end and return
    # almost the whole corpus, and returning early also skips the encode.
    if top_k is not None and top_k <= 0:
        return []

    q_emb = embedder.encode([query])[0]
    # Scale the query to unit length; the rows of emb_norm are already
    # unit length, so the product below is the cosine similarity.
    q_emb = q_emb / np.linalg.norm(q_emb)
    sims = emb_norm @ q_emb
    # argsort is ascending, so reverse before truncating to the best top_k.
    idx = sims.argsort()[::-1][:top_k]
    return [(i, sims[i], texts[i]) for i in idx]


# ----------------------------
# 4. Hybrid Search (TF-IDF + Embedding)
# ----------------------------
def search_hybrid(query, top_k=5, alpha=0.5):
    """Rank the corpus by a weighted blend of TF-IDF and embedding scores.

    Both per-document similarity arrays are min-max rescaled over the whole
    corpus before blending, so the two signals are on a comparable scale.

    Args:
        query: Free-text query string.
        top_k: Maximum number of hits to return. ``None`` returns every
            document; zero or a negative value returns no hits.
        alpha: Weight of the TF-IDF component; the embedding component is
            weighted ``1 - alpha``. Intended to lie in ``[0, 1]``; the value
            is not validated here.

    Returns:
        A list of ``(doc_index, combined_score, text)`` tuples sorted by
        descending combined score, where ``doc_index`` is a position in
        ``texts``.
    """
    # Guard first: a negative top_k would slice from the end and return
    # almost the whole corpus, and returning early skips all scoring work.
    if top_k is not None and top_k <= 0:
        return []

    # TF-IDF scores
    q_vec = tfidf.transform([query])
    tfidf_sims = cosine_similarity(q_vec, X_tfidf)[0]

    # Embedding scores (unit-length query against unit-length rows)
    q_emb = embedder.encode([query])[0]
    q_emb = q_emb / np.linalg.norm(q_emb)
    embed_sims = emb_norm @ q_emb

    # Normalize both (simple min-max normalization)
    def normalize(arr):
        # The small epsilon keeps the division defined when every score is
        # identical (max == min); the result is then all zeros.
        return (arr - arr.min()) / (arr.max() - arr.min() + 1e-9)

    combined = alpha * normalize(tfidf_sims) + (1 - alpha) * normalize(embed_sims)

    # argsort is ascending, so reverse before truncating to the best top_k.
    idx = combined.argsort()[::-1][:top_k]
    return [(i, combined[i], texts[i]) for i in idx]


# ----------------------------
# 5. Test all search engines
# ----------------------------

query = "latest technology trends"

# Run the same query through each engine and print its hits under a banner:
# the score to three decimals, then the first 100 characters of the text.
for banner, engine in (
    ("\n=== TF-IDF Search ===", search_tfidf),
    ("\n=== Embedding Search ===", search_embed),
    ("\n=== Hybrid Search ===", search_hybrid),
):
    print(banner)
    for r in engine(query):
        score, text = r[1], r[2]
        print(f"Score: {score:.3f} | {text[:100]}...")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Number of documents: 120000

Building TF-IDF index...
Building Embedding index...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/3750 [00:00<?, ?it/s]


=== TF-IDF Search ===
Score: 0.297 | Retailers Stock Up on Latest Gadgets (AP) AP - Some of the biggest fashion trends at department stor...
Score: 0.284 | Infocus: Trends in Web Application Security This article discusses current trends in penetration tes...
Score: 0.259 | IBM beats Wall Street #39;s expectations IBM, a barometer of trends in the computing business, deliv...
Score: 0.254 | IBM #39;s 3rd-Quarter Profit Beats Analysts #39; Predictions I.BM, a barometer of trends in the comp...
Score: 0.253 | IBM #39;s profit growth shows tech recovery hanging on IBM, a barometer of trends in the computing b...

=== Embedding Search ===
Score: 0.572 | It Takes Time to Judge the True Impact of New Technology About this time last year, I wrote a column...
Score: 0.550 | Tech 2005: What's New and  What's Next The products you use are about to get smarter, faster, smalle...
Score: 0.526 | WiMAX just hype for now?  quot;These applications will not be large enough to sustain the multitude ...

## LLM-Judge Evaluation (Query 1)

For the query **"latest technology trends"**, the LLM judge rated the output of each search engine as follows (higher is better):

{"tfidf": 1, "embed": 3, "hybrid": 2}


**Interpretation:**
- **Embedding Search (3)** performed best — highly relevant articles.
- **Hybrid Search (2)** was moderately relevant.
- **TF-IDF (1)** was the least relevant.
