In [None]:
!pip install -q faiss-cpu sentence-transformers transformers pdfminer.six accelerate

from pdfminer.high_level import extract_text

# ✅ Step 1: Extract the raw text from the PDF
pdf_path = "a.pdf"
raw_text = extract_text(pdf_path)

# Image-only (scanned) PDFs can yield no text at all. Stop here with a clear
# message instead of passing an empty string on to chunking and indexing.
if not raw_text or not raw_text.strip():
    raise ValueError(f"No extractable text found in {pdf_path!r}.")

print("✅ Extracted text from PDF.")


from langchain.text_splitter import CharacterTextSplitter

# ✅ Step 2: Split the extracted text into overlapping chunks
text_splitter = CharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
# `texts` is the list of chunks that gets embedded and later looked up by
# position when answering a question.
texts = text_splitter.split_text(raw_text)

print(f"✅ Split into {len(texts)} chunks.")


from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# ✅ Step 3: Embed every chunk with a sentence-transformer model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# One embedding row per entry in `texts`, returned as a NumPy array.
embeddings = embed_model.encode(texts, convert_to_numpy=True)

# ✅ Step 4: Store embeddings in FAISS index
# The index width is taken from the second axis of the embeddings array.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# Rows are added in the same order as `texts`, so a row id returned by
# index.search() can be used directly as a position in `texts`.
index.add(embeddings)

print("✅ FAISS index created.")

# ✅ Step 5: Load an open-source language model
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hub id shared by the tokenizer and the model weights loaded below.
model_name = "tiiuae/falcon-7b-instruct"  # Swap to mistral if you have Colab Pro+
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

print("✅ Loaded Falcon-7B model.")

# ✅ Step 6: Ask a question or generate summary
def ask_question(query, top_k=3, max_new_tokens=200):
    """Answer *query* using the most similar document chunks as context.

    The query is embedded with the module-level ``embed_model``, its nearest
    neighbours are looked up in the FAISS ``index``, and the matching entries
    of ``texts`` are placed in a prompt for the language model.

    Args:
        query: Natural-language question or instruction.
        top_k: Number of chunks to retrieve as context.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The text generated by the model, stripped of surrounding whitespace.
    """
    query_embedding = embed_model.encode([query], convert_to_numpy=True)
    _, neighbor_ids = index.search(query_embedding, top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; texts[-1] would silently return the last chunk as a "hit".
    retrieved_chunks = [texts[i] for i in neighbor_ids[0] if i >= 0]
    context = "\n".join(retrieved_chunks)

    prompt = f"""Based on the context below, answer the question.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # Explicitly pick the pad token generate() would otherwise fall back
        # to, which avoids the per-call "Setting `pad_token_id`" warning.
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated after them rather than splitting on "Answer:", which
    # breaks when the model itself emits another "Answer:" marker.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()

# ✅ Ask for a summary of the document
# NOTE: ask_question() only passes the retrieved chunks to the model, so this
# "summary" reflects the chunks closest to the query, not the whole document.
summary = ask_question(query="Summarize the main points of the document.")
print("\n📄 Document Summary:\n", summary)


✅ Extracted text from PDF.




✅ Split into 69 chunks.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ FAISS index created.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


✅ Loaded Falcon-7B model.

📄 Document Summary:
 The document discusses a vast repository of handwritten images representing the efforts of 3,600 people, containing accurate ground truth classifications. It is a dynamic repository providing ideas and direction for further data collection activities. Advanced software tools simplify image administration and processing, making it useful for exploration and analysis.
