## This RAG notebook covers the following:
### 1. Vectorizing all the content being fetched (Data Ingestion)
### 2. Storing all the embeddings in a vector database and retrieving them (Retrieval)
### 3. Querying the LLM (Generation)

## Part 1: Vectorization of all the content being fetched

### Loading and preparing the JSON Data (Data Ingestion)

Install the dependencies below first (uncomment and run the `pip install` cells if the packages are not already installed).

In [1]:
#pip install sentence-transformers 

In [2]:
#!pip install tf-keras

In [18]:
import os
import json
import glob
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
from typing import List, Dict, Any, Tuple

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """
    Split the input text into word-based chunks with optional overlap.

    Consecutive chunks share ``overlap`` words, i.e. each chunk starts
    ``chunk_size - overlap`` words after the previous one. Chunking stops as
    soon as a chunk reaches the last word, so no trailing chunk consisting
    solely of already-covered overlap words is produced.

    Args:
        text: The input text to split (split on whitespace).
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of overlapping words between consecutive chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of text chunks; empty if ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would give a zero/negative stride and never terminate.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), stride):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # The chunk already reaches the end of the text: a further chunk would
        # only repeat words contained in this one.
        if end >= len(words):
            break
    return chunks


def _build_document_items(filename, text, chunk_size, overlap):
    """Turn one document's extracted text into content items, chunking long text."""
    if not text or not text.strip():
        return []

    # Short documents are kept as a single item.
    if len(text.split()) <= chunk_size:
        return [{
            "video_id": filename,
            "frame": "full_text",
            "timestamp": "0:00",
            "timestamp_seconds": 0,
            "content": text
        }]

    # Long documents: one item per chunk, numbered from 1 in the labels.
    return [
        {
            "video_id": filename,
            "frame": f"chunk_{i+1}",
            "timestamp": f"chunk_{i+1}",
            "timestamp_seconds": i,
            "content": chunk
        }
        for i, chunk in enumerate(chunk_text(text, chunk_size, overlap))
    ]


def process_documents(folder_path, chunk_size=500, overlap=50):
    """
    Process all PDF and DOCX files in a folder, chunking large documents.

    Each document yields one or more content items using the same keys as the
    video-frame items ("video_id", "frame", "timestamp", "timestamp_seconds",
    "content"); the file name is stored under "video_id".

    Args:
        folder_path: Folder to scan (non-recursive) for .pdf / .docx files.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of overlapping words between consecutive chunks.

    Returns:
        List of content-item dictionaries, in sorted file-name order.

    Note:
        Relies on `extract_text_from_pdf` and `extract_text_from_word`, which
        are not defined in this part of the notebook and must be available in
        the kernel before calling this function.
    """
    extracted_items = []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()

        if lowered.endswith('.pdf'):
            print(f"Processing PDF: {filename}")
            text = extract_text_from_pdf(file_path)
        elif lowered.endswith('.docx'):
            print(f"Processing Word Document: {filename}")
            text = extract_text_from_word(file_path)
        else:
            continue

        extracted_items.extend(_build_document_items(filename, text, chunk_size, overlap))

    return extracted_items



def _video_id_from_filename(file_basename: str) -> str:
    """Derive the video id from a processed-frames file name.

    Strips everything from the "_processed" marker onwards, so both
    "<id>_processed.json" and "<id>_processed_filtered.json" map to "<id>".
    Names without the marker are returned unchanged.
    """
    stem = file_basename[:-len(".json")] if file_basename.lower().endswith(".json") else file_basename
    marker_pos = stem.rfind("_processed")
    if marker_pos != -1:
        return stem[:marker_pos]
    return file_basename


def _frame_to_content_item(frame: Dict[str, Any], index: int, video_id: str):
    """Convert one frame record to a content item, or None if it carries no text."""
    frame_name = frame.get("frame", f"frame_{index}")

    text_content = []
    if frame.get("caption"):
        text_content.append(f"Caption: {frame['caption']}")
    if frame.get("extracted_text"):
        text_content.append(f"Text: {frame['extracted_text']}")
    if frame.get("label"):
        text_content.append(f"Visual: {frame['label']}")

    if not text_content:
        return None

    # Frame names look like "frame_<seconds>.jpg"; when the number cannot be
    # parsed, fall back to assuming one frame every 5 seconds.
    try:
        frame_time = int(frame_name.replace("frame_", "").replace(".jpg", ""))
    except (AttributeError, TypeError, ValueError):
        frame_time = index * 5

    time_min = frame_time // 60
    time_sec = frame_time % 60

    return {
        "video_id": video_id,
        "frame": frame_name,
        "timestamp": f"{time_min}:{time_sec:02d}",
        "timestamp_seconds": frame_time,
        "content": " ".join(text_content)
    }


def load_json_files(json_folder: str) -> List[Dict[str, Any]]:
    """
    Load all JSON files from a folder and extract their content.
    Handles both YouTube processed JSON files and document extraction JSON files.

    A file whose top-level value is a non-empty list of dicts that all have
    "video_id" and "content" keys is treated as document-extraction output and
    its items are used as-is. Any other list is treated as per-frame YouTube
    data and converted frame by frame; entries that are not dicts are skipped.
    Files that cannot be read or parsed are reported and skipped.

    Args:
        json_folder: Path to folder containing JSON files

    Returns:
        List of dictionaries with extracted content, in sorted file-name order.
    """
    # glob order is filesystem-dependent; sort so embeddings are reproducible.
    json_files = sorted(glob.glob(os.path.join(json_folder, "*.json")))
    all_content = []

    for json_file in json_files:
        file_basename = os.path.basename(json_file)
        print(f"Processing: {file_basename}")

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, list):
                print(f"  Skipping {file_basename}: Unrecognized format")
                continue

            is_document_format = len(data) > 0 and all(
                isinstance(item, dict) and "video_id" in item and "content" in item
                for item in data
            )
            if is_document_format:
                print(f"  Detected document extraction JSON format")
                all_content.extend(data)
                print(f"  Added {len(data)} content items from {file_basename}")
                continue

            # Process as YouTube formatted JSON
            video_id = _video_id_from_filename(file_basename)
            file_content = []
            skipped = 0
            for i, frame in enumerate(data):
                if not isinstance(frame, dict):
                    # One malformed entry should not discard the whole file.
                    skipped += 1
                    continue
                content_item = _frame_to_content_item(frame, i, video_id)
                if content_item is not None:
                    file_content.append(content_item)

            if skipped:
                print(f"  Skipped {skipped} malformed frame entries in {file_basename}")
            all_content.extend(file_content)
            print(f"  Added {len(file_content)} content items from {file_basename}")

        except Exception as e:
            # Best effort: report the problem and continue with the next file.
            print(f"Error processing {file_basename}: {e}")

    print(f"Total content items extracted: {len(all_content)}")
    return all_content

### Conversion of .json files into embeddings 

We use the `all-MiniLM-L6-v2` SentenceTransformer model to convert the content extracted from the JSON files into embeddings.

Hugging Face model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

In [19]:
def vectorize_content(content_items: List[Dict[str, Any]],
                      model_name: str = "all-MiniLM-L6-v2",
                      chunk_size: int = 500,
                      overlap: int = 50) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
    """
    Embed every content item, splitting over-long texts into chunks first.

    Items whose "content" has more than ``chunk_size`` words are replaced by
    one copy per chunk (each copy carries the chunk text and a "chunk_index");
    shorter items are passed through unchanged.

    Args:
        content_items: Content items; each must have a "content" string.
        model_name: SentenceTransformer model used for encoding.
        chunk_size: Word limit above which an item is chunked.
        overlap: Words shared between consecutive chunks.

    Returns:
        (embeddings, items) where ``items`` lists the (possibly chunked)
        content items in the same order as the rows of ``embeddings``.
    """
    encoder = SentenceTransformer(model_name)
    print(f"Loaded embedding model: {model_name}")

    expanded: List[Dict[str, Any]] = []
    for entry in content_items:
        body = entry["content"]

        # Short enough: keep the original item as-is.
        if len(body.split()) <= chunk_size:
            expanded.append(entry)
            continue

        # Too long: emit one shallow copy of the item per chunk.
        pieces = chunk_text(body, chunk_size=chunk_size, overlap=overlap)
        for position, piece in enumerate(pieces):
            clone = entry.copy()
            clone["content"] = piece
            clone["chunk_index"] = position
            expanded.append(clone)

    corpus = [entry["content"] for entry in expanded]
    print(f"Generating embeddings for {len(corpus)} chunks/items...")
    vectors = encoder.encode(corpus, show_progress_bar=True)

    return vectors, expanded

def save_vectors(embeddings: np.ndarray, content_items: List[Dict[str, Any]], output_folder: str):
    """
    Persist embeddings and their content items side by side.

    Writes ``embeddings.npy`` (NumPy format) and ``content_items.pkl``
    (pickle) into ``output_folder``, creating the folder if necessary.

    Args:
        embeddings: NumPy array of embeddings.
        content_items: Content items matching the embeddings.
        output_folder: Destination folder for both files.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Embeddings go to a .npy file ...
    npy_target = os.path.join(output_folder, "embeddings.npy")
    np.save(npy_target, embeddings)

    # ... and the matching metadata to a pickle next to it.
    pkl_target = os.path.join(output_folder, "content_items.pkl")
    with open(pkl_target, 'wb') as sink:
        sink.write(pickle.dumps(content_items))

    print(f"Saved embeddings to {npy_target}")
    print(f"Saved content items to {pkl_target}")

### Main Function call for chunking, vectorizing data and storing it

In [20]:
def main(json_folder: str = '/Users/advaith/Desktop/MSBA Related coursework/Spring term/Deep Learning/Final Project/Data to be considered',
         output_folder: str = '/Users/advaith/Desktop/MSBA Related coursework/Spring term/Deep Learning/Final Project/vectorized_data'):
    """
    Run the ingestion pipeline: load JSON content, embed it, save the result.

    Args:
        json_folder: Folder containing the JSON files to ingest.
        output_folder: Folder that receives embeddings.npy and content_items.pkl.

    Note:
        The defaults are the original machine-specific absolute paths and are
        kept only for backward compatibility; pass your own folders when
        running elsewhere.
    """
    content_items = load_json_files(json_folder)
    print(f"Extracted {len(content_items)} total content items from all JSON files")

    # Nothing to embed: avoid loading the model and writing empty artifacts.
    if not content_items:
        print(f"No content items found in {json_folder}; nothing to vectorize.")
        return

    embeddings, content_items = vectorize_content(content_items)
    print(f"Generated embeddings with shape: {embeddings.shape}")

    save_vectors(embeddings, content_items, output_folder)

    print("Vectorization complete!")

if __name__ == "__main__":
    main()

Processing: Ilg3gGewQ5U_processed_filtered.json
  Added 38 content items from Ilg3gGewQ5U_processed_filtered.json
Processing: extracted_text.json
  Detected document extraction JSON format
  Added 1 content items from extracted_text.json
Total content items extracted: 39
Extracted 39 total content items from all JSON files
Loaded embedding model: all-MiniLM-L6-v2
Generating embeddings for 42 chunks/items...


Batches: 100%|██████████| 2/2 [00:00<00:00, 12.32it/s]

Generated embeddings with shape: (42, 384)
Saved embeddings to /Users/advaith/Desktop/MSBA Related coursework/Spring term/Deep Learning/Final Project/vectorized_data/embeddings.npy
Saved content items to /Users/advaith/Desktop/MSBA Related coursework/Spring term/Deep Learning/Final Project/vectorized_data/content_items.pkl
Vectorization complete!





<br>

## Part 2: Storing all the embeddings into a vector database

We will use FAISS to index and search the embeddings.

FAISS overview: https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/

In [7]:
#pip install faiss-cpu

In [21]:
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import pickle
import glob
from typing import List, Dict, Any, Tuple

def load_vectorized_data(embeddings_path: str, content_path: str) -> Tuple[np.ndarray, list]:
    """
    Load the saved embeddings (NumPy array) and content items (pickle file).

    Args:
        embeddings_path: Path to the .npy file holding the embeddings.
        content_path: Path to the pickle file holding the content items.

    Returns:
        Tuple of (embeddings, content_items).

    Raises:
        ValueError: If the number of embeddings does not match the number of
            content items (search results would map to the wrong content).
    """
    embeddings = np.load(embeddings_path)

    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by this notebook or come from a trusted source.
    with open(content_path, 'rb') as f:
        content_items = pickle.load(f)

    # Row i of the embeddings must describe content_items[i].
    n_vectors = embeddings.shape[0] if embeddings.ndim > 0 else 0
    if n_vectors != len(content_items):
        raise ValueError(
            f"Mismatch between embeddings ({n_vectors}) and content items ({len(content_items)}); "
            "the two files were not produced by the same run."
        )
    return embeddings, content_items

def create_faiss_index(embeddings: np.ndarray) -> faiss.Index:
    """
    Create a FAISS index using cosine similarity (normalize + inner product).

    Args:
        embeddings: 2-D array of shape (n_vectors, dimension).

    Returns:
        A flat inner-product index containing the L2-normalized vectors.

    Raises:
        ValueError: If ``embeddings`` is not 2-D.
    """
    # np.array(...) always copies, so the caller's array is not modified by the
    # in-place normalization below; order="C" gives FAISS the contiguous
    # float32 layout it requires even if the input was Fortran-ordered.
    vectors = np.array(embeddings, dtype="float32", order="C")
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_vectors, dimension), got shape {vectors.shape}"
        )

    # Normalizing vectors for cosine similarity
    faiss.normalize_L2(vectors)

    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
    index.add(vectors)
    print(f"FAISS index has {index.ntotal} vectors.")
    return index

def search_index(query: str, model: SentenceTransformer, index: faiss.Index, content_items: list, k: int = 5):
    """
    Convert the query to an embedding, search the FAISS index for the top-k nearest neighbors,
    and return the corresponding content items with their scores.

    Args:
        query: The query text.
        model: Embedding model used to encode the query; it should be the same
            model that produced the indexed embeddings.
        index: FAISS index to search.
        content_items: Content items aligned with the index (item i belongs to vector i).
        k: Maximum number of results to return.

    Returns:
        List of (score, content_item) tuples, best match first. The score is
        the inner product of the normalized vectors (higher means more
        similar). Fewer than ``k`` results are returned when the index holds
        fewer than ``k`` vectors.
    """
    if k <= 0 or index.ntotal == 0:
        return []

    query_embedding = model.encode(query, show_progress_bar=False)
    query_embedding = np.array([query_embedding]).astype("float32")

    faiss.normalize_L2(query_embedding)

    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    scores, indices = index.search(query_embedding, k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads missing results with label -1; indexing content_items with
        # -1 would silently return the last item as a bogus hit.
        if idx < 0:
            continue
        results.append((score, content_items[idx]))
    return results


# Retrieval setup: load the saved vectors, build the FAISS index and load the
# query encoder. Adjust the two paths below to your environment.
# They are relative to the current working directory and must point at the
# embeddings.npy / content_items.pkl files written by save_vectors().
# NOTE(review): main() above writes to an absolute output folder by default;
# confirm these relative paths resolve to that same folder.

embeddings_path = "vectorized_data/embeddings.npy"
content_path = "vectorized_data/content_items.pkl"

# Load vectorized data
embeddings, content_items = load_vectorized_data(embeddings_path, content_path)
print(f"Loaded embeddings with shape: {embeddings.shape}")
print(f"Loaded {len(content_items)} content items.")

# Create the FAISS index
index = create_faiss_index(embeddings)

# Load the SentenceTransformer model used to embed queries; it is the same
# model name that vectorize_content() uses by default for the stored vectors.
# NOTE: a later cell rebinds the global name `model` to the Gemini model, so
# re-running the index test cell after that cell will pass the wrong object.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Setup complete!")

Loaded embeddings with shape: (42, 384)
Loaded 42 content items.
FAISS index has 42 vectors.
Setup complete!


### Testing of the Index

In [22]:
# Define your query text
query = "What is confounder bias?"

# Retrieve the 5 best-matching content items for the query.
results = search_index(query, model, index, content_items, k=5)

# The score is the inner product of L2-normalized vectors, i.e. a cosine
# similarity (higher = more similar), not a distance.
for rank, (score, item) in enumerate(results, start=1):
    print(f"Rank {rank}:")
    print(f"  Similarity: {score:.4f}")
    print(f"  Video ID: {item['video_id']}")
    print(f"  Frame: {item['frame']}")
    print(f"  Timestamp: {item['timestamp']}")
    print(f"  Content: {item['content']}")
    print("")

Rank 1:
  Distance: 0.2824
  Video ID: AML Quiz 1 Study Guide.pdf
  Frame: full_text
  Timestamp: 0:00
  Content: 3 Types of Analytics 1. Descriptive - find human interpretable patterns that describe the data a. Large scale summaries, local patterns - finding fraud 2. Predictive - use variables to predict unknown or future values of other variables a. Regression, classification, ranking/recommendation 3. Prescriptive (may need causal reasoning) Analysis is often retrospective (not prospective) data was not collected in a methodical way that is tailored for the analytical task ● Data may be collected in the past without a specific analytical task in mind, which can lead to challenges because the data was not gathered with a methodology to ensure relevance or completeness for the analytics to be done - more risk for biases that are not controlled No Free Lunch - there is no universally best model, so tradeoffs need to be understood ● Deep nets and complex models are very general and powe

<br>

## Part 3: Using an LLM to Implement the RAG (Generation)

### Using Gemini 

In [3]:
#pip install google-generativeai

In [6]:
def generate_answer(query: str, retrieved_context: list, llm=None) -> str:
    """
    Generate an answer using the retrieved context and Gemini model.

    Args:
        query: The user's query.
        retrieved_context: List of tuples (score, content_item) from retrieval.
        llm: Object exposing ``generate_content(prompt)`` whose result has a
            ``.text`` attribute. When omitted, the notebook-level global
            ``model`` is used (the original behavior).

    Returns:
        The generated answer as a string.

    Raises:
        TypeError: If the resolved model has no ``generate_content`` method,
            e.g. because the global ``model`` still refers to the embedding
            model rather than the Gemini model.
    """
    if llm is None:
        # Backward-compatible fallback: the notebook binds the Gemini model to
        # the global name `model`, which is also used for the sentence encoder
        # in an earlier cell.
        llm = model
    if not hasattr(llm, "generate_content"):
        raise TypeError(
            f"Expected a generative model with a generate_content() method, got {type(llm).__name__}. "
            "Run the Gemini setup cell first or pass the model via the llm argument."
        )

    # Combine retrieved content into a single context string
    context_text = "\n\n".join([f"{item['content']}" for score, item in retrieved_context])

    # Construct the prompt by including the retrieved context and the user query
    prompt = (
        "You are an expert tutor. Use the context provided below to answer the following question.\n\n"
        "Context:\n"
        f"{context_text}\n\n"
        "Question:\n"
        f"{query}\n\n"
        "Answer:"
    )
    response = llm.generate_content(prompt)

    # Extract the answer from the response
    answer = response.text.strip()
    return answer

In [16]:
# Gemini setup: configure the API client and create the models used for generation.
import google.generativeai as genai
from getpass import getpass 
# Ask for the API key interactively instead of hard-coding it in the notebook.
api_key = getpass("Enter your Gemini API key: ")
genai.configure(api_key = api_key)
# NOTE: this rebinds the global `model`, which the FAISS setup cell bound to a
# SentenceTransformer. generate_answer() reads this global, so from here on
# `model` is the Gemini model and must not be passed to search_index().
model = genai.GenerativeModel(model_name="gemini-1.5-flash")
# Separate handle for the sentence encoder, used to embed queries for search_index().
encoder_model = SentenceTransformer("all-MiniLM-L6-v2")

Enter your Gemini API key:  ········


In [17]:
# End-to-end RAG call: retrieve context for the query, then ask the LLM.
query = "What is confounder bias?"
results = search_index(query, encoder_model, index, content_items, k=5)  # top-5 (score, content_item) pairs from the FAISS index
# generate_answer() builds the prompt from the retrieved content and queries the global Gemini `model`.
answer = generate_answer(query, results)
print("Generated Answer:\n", answer)

Generated Answer:
 Confounder bias occurs when a confounding variable is correlated with the independent variable (potentially causally) and also causally related to the dependent variable.  This leads to biased results and prevents the observation of the true relationship between the independent and dependent variables.  The provided example uses ice cream consumption and sunburns, where temperature is the confounding variable.  High temperatures increase both ice cream consumption and sunburns, creating a spurious correlation between the two that masks the lack of a direct causal relationship.
