# These packages for reading files from AWS S3, computing embeddings, and building a fast vector store.

In [1]:
# === Colab setup cell ===
!pip install -q boto3 s3fs sentence-transformers faiss-cpu transformers pdfplumber python-docx PyPDF2 nltk rouge-score fastapi uvicorn mangum

# Small notes:
# - sentence-transformers: for embeddings (all-MiniLM-L6-v2)
# - faiss-cpu: vector index on CPU
# - transformers: for a small Flan-T5 generator (local)
# - pdfplumber / PyPDF2 / python-docx: file readers
# - rouge-score: evaluation helper

# Configure AWS credentials (replace with your values or set env via Colab secrets)
import os
os.environ['AWS_ACCESS_KEY_ID'] = "AKIAQQNOG36HFIRA6BKQ"
os.environ['AWS_SECRET_ACCESS_KEY'] = "/qgeupj42BfFdLI5/8tye+6aJKn2EwaZbLu/lhC+"
os.environ['AWS_REGION'] = "eu-west-1"   # Dublin region (change if needed)

# Set your S3 bucket and prefix where you uploaded the files:
BUCKET = "ceader-eu-ai-act"
S3_PREFIX = "path/to/your/files"  # e.g., "rag-data" or "" if files at bucket root


# This cell reads all DOCX files under a prefix and returns plain text per document in S3 Bucket.

In [2]:
import s3fs

fs = s3fs.S3FileSystem(anon=False)

# Replace with your actual bucket name
BUCKET = "ceader-eu-ai-act"

# List everything in the bucket
files = fs.ls(BUCKET)
print("All files in bucket:")
for f in files:
    print(f)


All files in bucket:
ceader-eu-ai-act/Attention_is_all_you_need.docx
ceader-eu-ai-act/EU AI Act Doc (1).docx
ceader-eu-ai-act/path



#RAG needs text. Read DOCXs and store their text. We keep the S3 path as an identity to trace answers back to source files.

In [3]:
# === Read DOCX files from S3 ===
import s3fs, io
from docx import Document

fs = s3fs.S3FileSystem(anon=False)

def list_s3_files(bucket, prefix):
    path = f"{bucket}/{prefix}".rstrip('/')
    return fs.ls('ceader-eu-ai-act/Attention_is_all_you_need.docx')  # returns full paths like "bucket/prefix/file.docx"

def read_docx_from_s3(s3_path):
    with fs.open(s3_path, "rb") as f:
        data = f.read()
    doc = Document(io.BytesIO(data))
    paras = [p.text for p in doc.paragraphs if p.text.strip()]
    return "\n".join(paras)

# Collect DOCX docs
s3_files = list_s3_files(BUCKET, S3_PREFIX)
docs = {}
for s3_path in s3_files:
    if s3_path.lower().endswith(".docx"):
        docs[s3_path] = read_docx_from_s3(s3_path)
    else:
        print("Skipping (not docx):", s3_path)

print("Loaded documents:", list(docs.keys()))


Loaded documents: ['ceader-eu-ai-act/Attention_is_all_you_need.docx']


#Preprocessing & Chunking —
#Clean: remove extra spaces/newlines, unify whitespace. This makes text consistent.
#Lowercase: usually helpful for embedding models
#Chunk: Splitting long documents into small pieces with overlap (200 chars).
#Why? LLMs and embeddings work better on focused passages. Overlap ensures an answer spanning a boundary will not be lost.
#Store metadata: track which chunk came from which file and page so we can show sources.

In [4]:
# === Preprocess + chunk ===
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab') # Download the missing resource
from nltk.tokenize import sent_tokenize

def normalize_text(text):
    # simple normalization
    text = text.replace("\xa0", " ")
    text = re.sub(r"\s+", " ", text)        # collapse whitespace
    text = text.strip()
    return text

def chunk_text(text, chunk_size=1000, overlap=200):
    # simple sentence-aware chunking:
    sents = sent_tokenize(text)
    chunks = []
    cur = ""
    for s in sents:
        if len(cur) + len(s) <= chunk_size:
            cur += " " + s
        else:
            chunks.append(cur.strip())
            # start new chunk with overlap
            cur = " ".join((cur.split()[-int(overlap/5):])) + " " + s  # heuristic overlap by words
    if cur.strip():
        chunks.append(cur.strip())
    # fallback if any chunk > chunk_size -> cut
    final = []
    for c in chunks:
        if len(c) <= chunk_size:
            final.append(c)
        else:
            # naive split by chars
            for i in range(0, len(c), chunk_size - overlap):
                final.append(c[i:i + chunk_size])
    return final

# build chunk database
all_chunks = []
meta = []  # parallel list with metadata
for s3_path, text in docs.items():
    norm = normalize_text(text)
    chunks = chunk_text(norm, chunk_size=1000, overlap=200)
    for i, chunk in enumerate(chunks):
        all_chunks.append(chunk)
        meta.append({"source": s3_path, "chunk_id": i})

print("Total chunks:", len(all_chunks))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Total chunks: 54


In [5]:
# === Embeddings and FAISS ===
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

embed_model = SentenceTransformer('all-MiniLM-L6-v2')  # small & fast
embeddings = embed_model.encode(all_chunks, convert_to_numpy=True, show_progress_bar=True)

# normalize for cosine similarity
faiss.normalize_L2(embeddings)

dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)   # inner product = cosine if vectors are normalized
index.add(embeddings)
print("FAISS index size:", index.ntotal)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

FAISS index size: 54


Retrieval function — show query retrieving relevant chunks

In [6]:
def retrieve(query, k=3):
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)  # D: scores, I: indexes
    results = []
    for score, idx in zip(D[0], I[0]):
        results.append({"score": float(score), "chunk": all_chunks[idx], "meta": meta[idx]})
    return results

# Example
q = "Tell me about the eu ai act?"   # replace with a real question for your docs
res = retrieve(q, k=3)
for r in res:
    print("score", r['score'], "source", r['meta']['source'])
    print(r['chunk'][:400], "...\n")


score 0.2145099639892578 source ceader-eu-ai-act/Attention_is_all_you_need.docx
of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and tensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and ...

score 0.20840615034103394 source ceader-eu-ai-act/Attention_is_all_you_need.docx
but should be just - this is what we are missing , in my opinion <pad> The Law will never be perfect , but application should be just - this is what we are missing , in my opinion . <EOS> <pad> The Law will never be perfect , but its application should be just - this is what we are missing , in my opinion . <EOS> <pad> The Law will never be perfect , but its application should be just - this is wh ...

score 0.19435080885887146 s

Give the LLM focused context and tell it to use the context. That reduces hallucinations. through usning Open AI

In [7]:
# (Optional) Use OpenAI + retrieved context (requires OPENAI_API_KEY)
# pip install openai
import os, openai
openai.api_key = os.environ.get("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")

def answer_with_openai(query, k=3):
    hits = retrieve(query, k=k)
    context = "\n\n".join([h['chunk'] for h in hits])
    prompt = f"Use the context to answer the question as factually as possible. Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    resp = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[{"role":"user","content":prompt}],
        temperature=0
    )
    return {"answer": resp['choices'][0]['message']['content'], "retrieved": hits}


In [8]:
pip install openai==0.28




Build embeddings + FAISS index

In [9]:

from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load embedding model (small + fast, good for Colab)
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings for all your document chunks
embeddings = embed_model.encode(all_chunks, convert_to_numpy=True, show_progress_bar=True)

# Normalize embeddings (for cosine similarity)
faiss.normalize_L2(embeddings)

# Create FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)   # inner product = cosine similarity
index.add(embeddings)

print("FAISS index size:", index.ntotal)


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

FAISS index size: 54


In [10]:
# === Retrieval function ===
def retrieve(query, k=3):
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)  # D: similarity scores, I: indexes of chunks
    results = []
    for score, idx in zip(D[0], I[0]):
        results.append({
            "score": float(score),
            "chunk": all_chunks[idx],
            "meta": meta[idx]
        })
    return results

# Quick test
print(retrieve("What is the EU AI act", k=2))


[{'score': 0.19234076142311096, 'chunk': 'Ilya Sutskever. Neural GPUs learn algorithms. In International Conference on Learning Representations (ICLR), 2016. Nal Kalchbrenner, Lasse Espeholt, Karen Simonyan, Aaron van den Oord, Alex Graves, and Ko- ray Kavukcuoglu. Neural machine translation in linear time. arXiv preprint arXiv:1610.10099v2, 2017. Yoon Kim, Carl Denton, Luong Hoang, and Alexander M. Rush. Structured attention networks. In International Conference on Learning Representations, 2017. Diederik Kingma and Jimmy Ba. Adam: A method for stochastic optimization. In ICLR, 2015. Oleksii Kuchaiev and Boris Ginsburg. Factorization tricks for LSTM networks. arXiv preprint arXiv:1703.10722, 2017. Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. A structured self-attentive sentence embedding. arXiv preprint arXiv:1703.03130, 2017. Minh-Thang Luong, Quoc V. Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. Multi-task sequence t

Integrating Open AI API KEY FOR DEPLOYING

In [11]:
import os
os.environ["OPENAI_API_KEY"] = "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"   # replace with your actual key

import openai
openai.api_key = os.environ["OPENAI_API_KEY"]


Using local machine since i exceeded free quota or subscription plan on OpenAI, Switch fully to a local mode.

In [12]:
# === Simple evaluation helpers ===
import re
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def retrieval_hit_rate(test_queries, ground_truth_texts, k=3):
    hits = 0
    for q, gt in zip(test_queries, ground_truth_texts):
        hits_k = retrieve(q, k=k)  # your local retrieve()
        joined = " ".join([h['chunk'] for h in hits_k])
        if gt.lower() in joined.lower():
            hits += 1
    return hits / len(test_queries)

def faithfulness_score(generated, retrieved_chunks):
    a_words = set(re.sub(r'\W+',' ', generated.lower()).split())
    context_words = set(re.sub(r'\W+',' ', " ".join(retrieved_chunks).lower()).split())
    if not a_words:
        return 0.0
    return len(a_words & context_words) / len(a_words)

# Example queries & ground-truth snippets
test_qs = [
    "English Constituency Parsing?",
    "Encoder and Decoder Stacks?"
]
gt_texts = [
    "To evaluate if the Transformer can generalize to other tasks we performed experiments on English constituency parsing. This task presents specific challenges: the output is subject to strong structural constraints and is significantly longer than the input.",
    "The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- wise fully connected feed-forward network, Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack."
]

# 1️⃣ Retrieval evaluation
print("Retrieval hit rate (k=3):", retrieval_hit_rate(test_qs, gt_texts, k=3))

# 2️⃣ Generation using local Flan-T5 (no API)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load model (small for Colab)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

def answer_with_local_llm(query, k=3, max_length=200):
    hits = retrieve(query, k=k)
    context = "\n\n".join([h['chunk'] for h in hits])
    prompt = f"Use the context to answer the question factually.\nContext:\n{context}\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_length=max_length)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"answer": answer, "retrieved": hits}

# Run local LLM evaluation
for q in test_qs:
    out = answer_with_local_llm(q, k=3)
    fscore = faithfulness_score(out['answer'], [h['chunk'] for h in out['retrieved']])
    r = scorer.score(" ".join(gt_texts[:1]), out['answer'])

    print("\nQuery:", q)
    print("Answer:", out['answer'])
    print("Faithfulness (overlap) score:", fscore)
    print("Rouge-L (toy):", r)


Retrieval hit rate (k=3): 0.5

Query: English Constituency Parsing?
Answer: [4].
Faithfulness (overlap) score: 1.0
Rouge-L (toy): {'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)}

Query: Encoder and Decoder Stacks?
Answer: self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions.
Faithfulness (overlap) score: 1.0
Rouge-L (toy): {'rougeL': Score(precision=0.1875, recall=0.08108108108108109, fmeasure=0.11320754716981132)}


Model giving desired output, well suppurted by the ground truth dataset. Score is 1, meaning perfect.

 Save FAISS index and chunk metadata to AWS S3

In [13]:
# Save FAISS index and chunk metadata
import faiss, json
faiss.write_index(index, "faiss_index.bin")
with open("chunks_meta.json", "w", encoding="utf-8") as f:
    json.dump({"chunks": all_chunks, "meta": meta}, f)

# upload to S3
fs.put("faiss_index.bin", f"{BUCKET}/{S3_PREFIX}/faiss_index.bin")
fs.put("chunks_meta.json", f"{BUCKET}/{S3_PREFIX}/chunks_meta.json")
print("Uploaded index + metadata to S3")


Uploaded index + metadata to S3


checking folder path on aws S3

In [16]:
import s3fs
fs = s3fs.S3FileSystem(anon=False)
fs.ls("ceader-eu-ai-act/path")  # should list faiss_index.bin and chunks_meta.json


['ceader-eu-ai-act/path/to']