# 📘 RAG System with FAISS & OpenAI
This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using FAISS, LangChain, and OpenAI GPT.

In [1]:

# 📚 Step 1: Import libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from openai import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document


ModuleNotFoundError: No module named 'sklearn'

In [None]:

# 🔑 Step 2: Configure OpenAI access
# The key is looked up in the process environment so it never appears in the
# notebook itself; the lookup yields None when OPENAI_API_KEY is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Client used later by rag_query for chat completions.
client = OpenAI(api_key=OPENAI_API_KEY)


In [None]:

# 📊 Step 3: Load dataset
# Workbook that holds the knowledge base; Step 4 turns each row into a Document.
DATASET_PATH = "04LLM.xlsx"
# Columns that Step 4 reads from every row.
REQUIRED_COLUMNS = ["Content", "Title", "Tags", "Source"]

df = pd.read_excel(DATASET_PATH)

# Fail here, with a readable message, instead of with a bare KeyError raised
# from inside the Step 4 comprehension.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{DATASET_PATH} is missing required column(s): {missing_columns}")

# Preview only the first rows rather than rendering the whole frame.
df.head()


In [None]:

# 🔄 Step 4: Convert rows into LangChain Documents
# An empty "Content" cell is read by pandas as NaN (a float), which is not
# usable page text, so those rows are skipped. The remaining content is coerced
# to str so that numeric-looking cells are accepted as text as well.
rows_with_content = df.dropna(subset=["Content"])
documents = [
    Document(
        page_content=str(row["Content"]),
        metadata={"Title": row["Title"], "Tags": row["Tags"], "Source": row["Source"]},
    )
    for _, row in rows_with_content.iterrows()
]

# Split into smaller chunks: at most 300 units per chunk with 50 units of
# overlap between neighbours (units are characters under the splitter's
# default length function).
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
docs = splitter.split_documents(documents)

print(f"Total chunks: {len(docs)}")
docs[:2]  # preview first 2


In [None]:

# 🧠 Step 5: Embed every chunk and index the vectors with FAISS
# The embedding model receives the same key that was read in Step 2.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Build the vector store from the Step 4 chunks using that embedding model.
db = FAISS.from_documents(documents=docs, embedding=embeddings)


In [None]:

# 🔍 Step 6: Test retrieval
# Fetch the two chunks closest to a sample question and print them.
query = "What is RAG?"
results = db.similarity_search(query, k=2)

for hit in results:
    # One print call per hit: title, content, then an 80-dash separator,
    # each on its own line.
    print(
        f"📄 Title: {hit.metadata['Title']}",
        f"🔎 Content: {hit.page_content}",
        "-" * 80,
        sep="\n",
    )


In [None]:

# 🤖 Step 7: RAG function
def rag_query(query: str, k: int = 2, model: str = "gpt-3.5-turbo", temperature: float = 0.3):
    """Answer a question using chunks retrieved from the FAISS index.

    The ``k`` chunks most similar to ``query`` are fetched from the notebook's
    ``db`` vector store, joined into a context block (``"<Title>: <content>"``
    entries separated by blank lines), and sent together with the question to
    the chat-completions endpoint through the notebook's ``client``.

    Args:
        query: The user's question; used for both retrieval and the prompt.
        k: Number of chunks to retrieve. Defaults to 2.
        model: Chat model name passed to the API. Defaults to "gpt-3.5-turbo".
        temperature: Sampling temperature passed to the API. Defaults to 0.3.

    Returns:
        The ``content`` of the first choice's message in the API response.

    Raises:
        KeyError: If a retrieved chunk has no "Title" entry in its metadata.
    """
    results = db.similarity_search(query, k=k)

    # Each retrieved chunk is labelled with its source title so the model can
    # tell the passages apart.
    context = "\n\n".join(f"{r.metadata['Title']}: {r.page_content}" for r in results)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an assistant that answers using the retrieved context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
        ],
        temperature=temperature,
    )
    return response.choices[0].message.content

# Test it: ask a sample question and show the generated answer as the cell output
answer = rag_query("Explain LangChain")
answer


In [None]:

# 📉 Step 8: Visualize embeddings with PCA
# Get embeddings for all chunks. embed_documents takes the whole list at once,
# so the client can batch the texts instead of issuing one request per chunk.
vecs = embeddings.embed_documents([doc.page_content for doc in docs])

# Reduce dimensions with PCA (project every embedding onto 2 components)
pca = PCA(n_components=2)
reduced = pca.fit_transform(vecs)

# Scatter plot on an explicit Axes so the figure does not depend on pyplot's
# implicit "current figure" state.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(reduced[:, 0], reduced[:, 1], c="blue", alpha=0.6)

# Label each point with the title of the document its chunk came from; the
# small x offset keeps the text from sitting on top of the marker.
for (pc1, pc2), doc in zip(reduced, docs):
    ax.text(pc1 + 0.02, pc2, doc.metadata["Title"], fontsize=9)

ax.set_title("📊 Embedding Visualization of Knowledge Base")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
plt.show()
