This cell is a template pipeline for the end-to-end RAG system; it does not work yet.

In [None]:
import os
import nltk
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Ensure NLTK's sentence tokenizer data is downloaded.
# NLTK 3.9+ makes sent_tokenize load the 'punkt_tab' resource instead of the
# pickled 'punkt' models, so request both to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 1: Read all `.txt` files from a directory
def load_text_files(directory):
    """Read every ``.txt`` file located directly inside *directory*.

    File names are visited in sorted order so the returned list is
    reproducible; ``os.listdir`` returns entries in arbitrary order. Entries
    whose name ends in ``.txt`` but that are not regular files (for example a
    sub-directory) are skipped instead of failing in ``open``.

    Args:
        directory: Path of the directory to scan. Sub-directories are not
            searched.

    Returns:
        A list holding the full text of each matching file, decoded as UTF-8.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    docs = []
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue  # e.g. a directory that happens to be named "*.txt"
        with open(file_path, 'r', encoding='utf-8') as file:
            docs.append(file.read())
    return docs

# Step 2: Chunk the documents with overlap
def chunk_text(text, chunk_size=50, overlap_size=5):
    """Split *text* into chunks of whole sentences that overlap each other.

    The text is sentence-tokenized with ``nltk.sent_tokenize``. Each chunk
    joins up to *chunk_size* consecutive sentences with single spaces, and
    consecutive chunks start ``chunk_size - overlap_size`` sentences apart,
    so they share *overlap_size* sentences.

    Args:
        text: The document text to split.
        chunk_size: Maximum number of sentences per chunk; must be positive.
        overlap_size: Number of sentences shared by consecutive chunks; must
            satisfy ``0 <= overlap_size < chunk_size``.

    Returns:
        A list of chunk strings; empty when the tokenizer yields no
        sentences.

    Raises:
        ValueError: If *chunk_size* or *overlap_size* is out of range.
    """
    # Validate before tokenizing: a non-positive step would either make
    # range() raise an unrelated error or silently produce no chunks, and a
    # negative overlap would skip sentences between chunks.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError(
            f"overlap_size must be in [0, chunk_size), got {overlap_size} "
            f"with chunk_size={chunk_size}"
        )

    sentences = nltk.sent_tokenize(text)  # Tokenize text into sentences
    step = chunk_size - overlap_size
    chunks = []

    for start in range(0, len(sentences), step):
        chunks.append(' '.join(sentences[start:start + chunk_size]))
        if start + chunk_size >= len(sentences):
            # This chunk already reaches the last sentence; a further chunk
            # would only repeat the overlapping tail.
            break
    return chunks

# Step 3: Load documents from a directory and chunk them with overlap
def process_directory(directory, chunk_size=50, overlap_size=5):
    """Chunk every document loaded from *directory* into one flat list.

    Args:
        directory: Directory passed to ``load_text_files``.
        chunk_size: Forwarded to ``chunk_text`` for each document.
        overlap_size: Forwarded to ``chunk_text`` for each document.

    Returns:
        A single list with the chunks of all documents, in document order.
    """
    collected = []
    for document in load_text_files(directory):
        # Each document is chunked independently; chunks never span files.
        collected += chunk_text(document, chunk_size, overlap_size)
    return collected

# Step 4: Embed the chunks and build the FAISS index
def build_faiss_index(chunks):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text chunks to embed.

    Returns:
        A ``(index, embedder)`` tuple: the ``faiss.IndexFlatL2`` holding one
        vector per chunk (in the same order as *chunks*) and the
        ``SentenceTransformer`` that produced them, so queries can be
        embedded with the same model.

    Raises:
        ValueError: If *chunks* is empty, since the embedding dimension
            cannot be determined and there is nothing to search.
    """
    # Fail early with a clear message; otherwise the empty embedding array
    # has no second axis and ``shape[1]`` raises an unrelated IndexError.
    if len(chunks) == 0:
        raise ValueError("cannot build a FAISS index from an empty list of chunks")

    embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    # FAISS expects a C-contiguous float32 matrix of shape (n, dimension).
    chunk_embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)
    dimension = chunk_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(chunk_embeddings)
    return index, embedder

# Step 5: Retrieve top-k relevant chunks based on query
def retrieve_top_k_chunks(query, index, chunks, embedder, k=5):
    """Return the chunks whose embeddings are closest to *query*.

    Args:
        query: The question text to embed and search for.
        index: Object with a FAISS-style ``search(vectors, k)`` method that
            returns ``(distances, indices)`` arrays with one row per query.
        chunks: The chunk texts, in the order they were added to *index*.
        embedder: Object with an ``encode(list_of_str)`` method returning one
            vector per string.
        k: Maximum number of chunks to return.

    Returns:
        A list of ``(chunk, distance)`` tuples in the order returned by the
        index. It holds fewer than *k* items when the index contains fewer
        than *k* vectors.
    """
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    distances, indices = index.search(query_embedding, k)
    # FAISS pads missing results with label -1 when k exceeds the number of
    # stored vectors; indexing chunks[-1] would silently return the last
    # chunk as a bogus hit, so drop those slots.
    return [
        (chunks[idx], dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]

# Step 6: Generate answer based on the top-k chunks
def generate_answer(query, top_chunks, model, max_new_tokens=256):
    """Generate an answer to *query* using the retrieved chunks as context.

    Args:
        query: The user's question.
        top_chunks: Iterable of ``(chunk_text, distance)`` pairs; only the
            text is used, joined with newlines to form the context.
        model: A callable with the Hugging Face text-generation pipeline
            interface: ``model(prompt, **kwargs)`` returning a list of dicts
            with a ``'generated_text'`` key.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The generated continuation (the answer only, without the prompt).
    """
    context = "\n".join([chunk for chunk, _ in top_chunks])
    prompt = f"Question: {query}\n\nContext:\n{context}\n\nAnswer:"
    # Bound only the newly generated tokens: ``max_length`` also counts the
    # prompt, and a value such as 10000 exceeds the context window of models
    # like GPT-2. ``return_full_text=False`` keeps the prompt out of the
    # returned answer.
    # NOTE(review): the prompt itself is not truncated here; a long context
    # may still exceed the model's input limit -- confirm against the model.
    outputs = model(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        return_full_text=False,
    )
    return outputs[0]['generated_text']

# Step 7: Combine everything in a RAG pipeline
def rag_pipeline(query, directory, k=5, chunk_size=50, overlap_size=5):
    """Answer *query* from the ``.txt`` documents in *directory*.

    Runs the full retrieve-then-generate flow: chunk the documents, embed
    and index the chunks, retrieve the *k* nearest chunks for the query and
    feed them as context to a GPT-2 text-generation pipeline.

    Note that the index and the generation model are rebuilt on every call.

    Args:
        query: The question to answer.
        directory: Directory containing the ``.txt`` source documents.
        k: Number of chunks to retrieve as context.
        chunk_size: Sentences per chunk, forwarded to ``process_directory``.
        overlap_size: Sentences shared between consecutive chunks, forwarded
            to ``process_directory``.

    Returns:
        The text returned by ``generate_answer``.

    Raises:
        ValueError: If no chunks could be produced from *directory*.
    """
    # Process the directory and chunk the documents with overlap
    chunks = process_directory(directory, chunk_size, overlap_size)

    # Without any chunks there is nothing to embed or retrieve; stop here
    # with a clear message instead of failing later inside index building.
    if not chunks:
        raise ValueError(f"no text chunks were produced from directory {directory!r}")

    # Build the FAISS index
    index, embedder = build_faiss_index(chunks)

    # Retrieve top-k relevant chunks
    top_k_chunks = retrieve_top_k_chunks(query, index, chunks, embedder, k)

    # Load a pre-trained text generation model
    model = pipeline("text-generation", model="openai-community/gpt2")

    # Generate the answer based on the top-k chunks
    answer = generate_answer(query, top_k_chunks, model)

    return answer

# Example usage: runs one query through the whole pipeline when this file is
# executed as a script (or when the cell is run in a notebook).
if __name__ == "__main__":
    # Directory containing the .txt files to index.
    # NOTE(review): this is an absolute path on the author's machine; point
    # it at a local copy of the data before running elsewhere.
    directory = "/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data_test"
    
    # The question to answer from the indexed documents.
    query = "How many super bowls did the Steelers win?"
    
    # Run the RAG pipeline: chunks of 50 sentences with 5 sentences of
    # overlap, retrieving the 5 nearest chunks as context.
    result = rag_pipeline(query, directory, k=5, chunk_size=50, overlap_size=5)
    
    # Print the text returned by the pipeline.
    print(result)