In [1]:
!pip install -q faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import re
import torch
import numpy as np
import faiss
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer

# ============================================================
# DEVICE
# ============================================================

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)
if DEVICE == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

# ============================================================
# MODULE 1 — DATA INGESTION (NO SPACY)
# ============================================================

def load_raw_wikipedia(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

def normalize_text(text: str) -> str:
    lines = text.splitlines()
    cleaned = [line.strip() for line in lines if line.strip()]
    return "\n".join(cleaned)

# Simple sentence splitting (stable with NumPy 2.x)
def split_into_sentences(text: str) -> List[str]:
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]

def ingest_wikipedia(path: str) -> List[str]:
    raw = load_raw_wikipedia(path)
    normalized = normalize_text(raw)
    return split_into_sentences(normalized)

# ============================================================
# MODULE 2 — CHUNKING
# ============================================================

def count_tokens(text: str) -> int:
    return int(len(text.split()) / 0.75)

def build_chunks(
    sentences: List[str],
    min_tokens=300,
    max_tokens=600,
    overlap_tokens=100
) -> List[Dict]:

    chunks = []
    current = []
    current_tokens = 0
    chunk_id = 0

    for sentence in sentences:
        t = count_tokens(sentence)

        if current_tokens + t > max_tokens:
            if current_tokens >= min_tokens:
                chunks.append({
                    "chunk_id": chunk_id,
                    "text": " ".join(current),
                    "token_count": current_tokens
                })
                chunk_id += 1

            overlap = []
            overlap_count = 0
            for s in reversed(current):
                st = count_tokens(s)
                if overlap_count + st > overlap_tokens:
                    break
                overlap.insert(0, s)
                overlap_count += st

            current = overlap
            current_tokens = overlap_count

        current.append(sentence)
        current_tokens += t

    if current_tokens >= min_tokens:
        chunks.append({
            "chunk_id": chunk_id,
            "text": " ".join(current),
            "token_count": current_tokens
        })

    return chunks

# ============================================================
# MODULE 3 — RETRIEVAL (E5-LARGE)
# ============================================================

def load_embedding_model():
    return SentenceTransformer("intfloat/e5-large", device=DEVICE)

def embed_chunks(model, chunks):
    texts = ["passage: " + c["text"] for c in chunks]
    embeddings = model.encode(
        texts,
        batch_size=16,
        show_progress_bar=True,
        normalize_embeddings=True
    )
    return np.array(embeddings).astype("float32")

def build_faiss_index(embeddings):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    return index

def retrieve_top_k(question, model, index, chunks, k=3):
    query_embedding = model.encode(
        ["query: " + question],
        normalize_embeddings=True
    ).astype("float32")

    scores, ids = index.search(query_embedding, k)

    results = []
    for idx, score in zip(ids[0], scores[0]):
        results.append((chunks[idx], float(score)))

    return results

# ============================================================
# MODULE 4 — FLAN-T5 ANSWERER
# ============================================================

def load_answerer_model():
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
    model.to(DEVICE)
    model.eval()
    return tokenizer, model

def build_prompt(question, retrieved_chunks):
    context = "\n\n".join(chunk["text"] for chunk in retrieved_chunks)

    return (
        "Answer the question using ONLY the provided text.\n"
        "Do not add information.\n"
        "If the answer is not in the text, say:\n"
        "\"Not enough information in the Simple Wikipedia dataset.\"\n"
        "Use simple English and write at most 3 short sentences.\n\n"
        f"Question: {question}\n\n"
        f"Text: {context}"
    )

def generate_raw_answer(question, retrieved_chunks, tokenizer, model):
    prompt = build_prompt(question, retrieved_chunks)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            do_sample=False,
            temperature=0.0
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ============================================================
# MODULE 5 — POST PROCESSING
# ============================================================

def post_process(answer, retrieved_chunks):

    answer = re.sub(r"\([^)]*\)", "", answer)

    sentences = re.split(r'(?<=[.!?])\s+', answer)
    sentences = [s.strip() for s in sentences if s.strip()]
    sentences = sentences[:3]

    final = " ".join(sentences).strip()

    if not final:
        return "Not enough information in the Simple Wikipedia dataset."

    retrieved_text = " ".join(c["text"] for c in retrieved_chunks).lower()
    retrieved_words = set(retrieved_text.split())
    answer_words = final.lower().split()

    unseen = [w for w in answer_words if w not in retrieved_words]

    if len(unseen) / max(len(answer_words), 1) > 0.2:
        return "Not enough information in the Simple Wikipedia dataset."

    return final

# ============================================================
# PIPELINE
# ============================================================

def answer_question(question, embedding_model, index, chunks, tokenizer, model):

    retrieval = retrieve_top_k(question, embedding_model, index, chunks, k=3)
    retrieved_chunks = [c for c, _ in retrieval]

    raw = generate_raw_answer(question, retrieved_chunks, tokenizer, model)
    final = post_process(raw, retrieved_chunks)

    return final

# ============================================================
# RUN
# ============================================================

wiki_path = "/kaggle/input/datasets/namanatgoel/allcombined/AllCombined.txt"

print("Loading dataset...")
sentences = ingest_wikipedia(wiki_path)

print("Building chunks...")
chunks = build_chunks(sentences)

print("Loading embedding model...")
embedding_model = load_embedding_model()

print("Embedding chunks...")
embeddings = embed_chunks(embedding_model, chunks)

print("Building FAISS index...")
index = build_faiss_index(embeddings)

print("Loading FLAN-T5...")
tokenizer, answer_model = load_answerer_model()

print("\nTest Question:")
print(answer_question("What is gravity?", embedding_model, index, chunks, tokenizer, answer_model))


2026-02-18 18:16:27.259740: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771438587.415020      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771438587.459408      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771438587.829169      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771438587.829206      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771438587.829209      24 computation_placer.cc:177] computation placer alr

Using device: cuda
GPU: Tesla T4
Loading dataset...
Building chunks...
Loading embedding model...


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Embedding chunks...


Batches:   0%|          | 0/4878 [00:00<?, ?it/s]

Building FAISS index...
Loading FLAN-T5...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Test Question:
The term gravity is used to describe the force that pulls objects together.


In [3]:
import os

for root, dirs, files in os.walk("/kaggle/input"):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/datasets/namanatgoel/allcombined/AllCombined.txt
