# PDF RAG with Session Management and Gemini API
This notebook allows you to upload PDF documents, process them for Retrieval Augmented Generation (RAG), and chat with an AI model (Gemini) based on the content of your documents. Sessions are used to manage data, and they expire after a set duration.

In [None]:
!pip install -q sentence_transformers faiss-cpu PyPDF2 google-generativeai numpy

## 1. Setup and Imports

In [None]:
import os
import uuid
import shutil
from datetime import datetime, timedelta
import pickle
import numpy as np
import faiss
import PyPDF2
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import json
from IPython.display import display, Markdown, clear_output
import ipywidgets as widgets

# --- Constants ---
# NOTE(review): in this copy of the file the value of each path/file-name
# constant below is missing (every assignment ends in a bare backslash).
# Restore the original string literals before running.
SESSION_DATA_DIR = \

UPLOAD_DIR_NAME = \

CHUNKS_FILE_NAME = \

INDEX_FILE_NAME = \

STATUS_FILE_NAME = \

TIMESTAMP_FILE_NAME = \

# SESSION_EXPIRY_HOURS is the age (in hours) after which get_session_status()
# reports a session as expired. MAX_FILES and MAX_TOTAL_SIZE_MB are not
# referenced in the visible part of the file -- presumably upload limits
# enforced elsewhere; confirm.
MAX_FILES = 20
MAX_TOTAL_SIZE_MB = 1024  # 1 GB
SESSION_EXPIRY_HOURS = 2

# --- Gemini API Configuration ---
# IMPORTANT: Set your GEMINI_API_KEY as an environment variable before running this notebook.
# You can also set it directly here for testing, but environment variables are safer.
# NOTE(review): the environment-variable name passed to os.environ.get() is
# missing in this copy.
GEMINI_API_KEY = os.environ.get(\
)
# gemini_model stays None unless an API key was found, so code that uses it
# must handle the None case.
gemini_model = None
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    gemini_model = genai.GenerativeModel('gemini-pro')
else:
    print(\

# --- Sentence Transformer Model ---
# If the embedding model cannot be loaded, sentence_model is set to None
# instead of letting the exception propagate; the processing and retrieval
# functions check for a falsy model before using it.
try:
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    print(f\
    sentence_model = None

# Make sure the root directory for all sessions exists as soon as this cell runs.
os.makedirs(SESSION_DATA_DIR, exist_ok=True)
print(\

## 2. Helper Functions

In [None]:
def get_session_path(session_id):
    """Return the directory that holds all data for ``session_id``.

    The id is passed through ``str()`` first, so non-string ids (for example
    ``uuid.UUID`` objects) are accepted as well as plain strings.
    """
    session_dir_name = str(session_id)
    return os.path.join(SESSION_DATA_DIR, session_dir_name)

def get_session_upload_path(session_id):
    """Return the upload subdirectory located inside the session's directory."""
    session_root = get_session_path(session_id)
    return os.path.join(session_root, UPLOAD_DIR_NAME)

def ensure_session_dirs(session_id):
    """Create the session's directories and write its timestamp file.

    Creates the session directory and its upload subdirectory (no error if
    they already exist), then writes the current local time, in ISO 8601
    format, to the session's timestamp file. get_session_status() reads that
    file back to decide whether the session has expired.
    """
    session_path = get_session_path(session_id)
    os.makedirs(session_path, exist_ok=True)
    os.makedirs(get_session_upload_path(session_id), exist_ok=True)
    # NOTE(review): the mode argument of open() is missing in this copy of the
    # file (the line ends in a bare backslash). f.write() below requires a
    # writable mode -- confirm against the original notebook.
    with open(os.path.join(session_path, TIMESTAMP_FILE_NAME), \
) as f:
        f.write(datetime.now().isoformat())
    # NOTE(review): the f-string message of this print is missing in this copy.
    print(f\

def get_session_status(session_id):
    """Return a status string for the session.

    The checks run in this order:

    1. If the session directory or its timestamp file does not exist, a
       dedicated status is returned.
    2. The timestamp file is parsed as an ISO datetime; if more than
       SESSION_EXPIRY_HOURS have passed since then, a dedicated status is
       returned. Any exception raised while reading or parsing the timestamp
       is caught and reported with its own status.
    3. If a status file exists, its stripped contents are returned.
    4. Otherwise a default status is returned.

    NOTE(review): every literal status value returned directly by this
    function is missing in this copy of the file (each ``return`` ends in a
    bare backslash). Restore them from the original notebook; callers
    presumably compare against those exact strings.
    """
    session_path = get_session_path(session_id)
    status_file = os.path.join(session_path, STATUS_FILE_NAME)
    timestamp_file = os.path.join(session_path, TIMESTAMP_FILE_NAME)

    # No directory or no timestamp file: the session cannot be aged, so it is
    # reported before any other check.
    if not os.path.exists(session_path) or not os.path.exists(timestamp_file):
        return \


    try:
        # NOTE(review): the mode argument of open() is missing in this copy.
        with open(timestamp_file, \
) as f:
            created_time_str = f.read()
        created_time = datetime.fromisoformat(created_time_str)
        # Both sides are naive datetimes produced by datetime.now(), so the
        # comparison is in local time.
        if datetime.now() > created_time + timedelta(hours=SESSION_EXPIRY_HOURS):
            return \
 
    except Exception as e:
        print(f\
        return \
 # Indicates an issue with the session's age check

    # The session exists and has not expired: report whatever status was last
    # written by update_session_status().
    if os.path.exists(status_file):
        with open(status_file, \
) as f:
            return f.read().strip()
    return \
 # Default if status file not yet created but session exists

def update_session_status(session_id, status):
    """Write ``status`` to the session's status file.

    If the session directory does not exist yet, ensure_session_dirs() is
    called first, which also writes the session's timestamp file.

    Args:
        session_id: Identifier of the session to update.
        status: Text written to the status file as-is.
    """
    session_path = get_session_path(session_id)
    if not os.path.exists(session_path):
        ensure_session_dirs(session_id) # Should ideally be created before status update
    # NOTE(review): the mode argument of open() is missing in this copy of the
    # file (the line ends in a bare backslash). f.write() below requires a
    # writable mode -- confirm against the original notebook.
    with open(os.path.join(session_path, STATUS_FILE_NAME), \
) as f:
        f.write(status)
    # NOTE(review): the f-string message of this print is missing in this copy.
    print(f\

def extract_text_from_pdf_in_chunks(pdf_path, char_limit=1000):
    """Read a PDF and group its sentences into chunks of about ``char_limit`` characters.

    Each page's text is flattened to a single line, split on ``'. '`` into
    sentences, and the sentences are appended to the current chunk until the
    next one would push it past ``char_limit``; then a new chunk is started.
    Chunks may span page boundaries.

    NOTE(review): three string literals were missing from the reviewed copy of
    this function (initial chunk text, sentence terminator, and sentence
    separator). They are reconstructed here as ``""``, ``"."`` and ``" "``
    from how the variables are used -- confirm against the original notebook.

    Args:
        pdf_path: Path of the PDF file to read.
        char_limit: Soft upper bound on the number of characters per chunk.
            A single sentence longer than the limit still becomes a chunk of
            its own.

    Returns:
        A list of ``(chunk_text, pages)`` tuples. ``pages`` is a list of
        ``{'page_number': n}`` dicts (1-based) naming, in order, each page
        that contributed text to the chunk.
    """
    chunks_data = []
    current_chunk_text = ""
    current_chunk_metadata = []

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                continue
            # Replace real newline characters. The reviewed copy used '\\n',
            # which only matches a literal backslash followed by "n" and so
            # never flattened the page text.
            for sentence in page_text.replace('\n', ' ').split('. '):
                sentence = sentence.strip()
                if not sentence:
                    continue
                # split('. ') consumed the terminator, so put it back -- but
                # not on a piece (typically the last one on a page) that
                # still ends with its own period.
                if not sentence.endswith('.'):
                    sentence += "."
                if len(current_chunk_text) + len(sentence) <= char_limit:
                    # The leading space on the first sentence is removed by
                    # strip() when the chunk is emitted.
                    current_chunk_text += " " + sentence
                    if (not current_chunk_metadata
                            or current_chunk_metadata[-1]['page_number'] != page_number):
                        current_chunk_metadata.append({'page_number': page_number})
                else:
                    if current_chunk_text:
                        chunks_data.append((
                            current_chunk_text.strip(),
                            [{'page_number': meta['page_number']} for meta in current_chunk_metadata],
                        ))
                    # Start the next chunk with the sentence that did not fit.
                    current_chunk_text = sentence
                    current_chunk_metadata = [{'page_number': page_number}]

    # Emit whatever is left after the last page.
    if current_chunk_text:
        chunks_data.append((
            current_chunk_text.strip(),
            [{'page_number': meta['page_number']} for meta in current_chunk_metadata],
        ))
    return chunks_data

def process_pdfs_for_session(session_id, local_sentence_model):
    """Chunk, embed and index every PDF uploaded to a session.

    Steps:

    1. Return early (after a status update) if no embedding model was given.
    2. List the files in the session's upload directory, filtered by a
       lower-cased suffix check, and return early if none match.
    3. Extract text chunks from each file with
       extract_text_from_pdf_in_chunks(); the first file that raises aborts
       the whole run.
    4. Encode all chunk texts, add the float32 embeddings to a
       ``faiss.IndexFlatL2`` index, write the index to the session directory
       and pickle the chunk list next to it.

    Progress and failures are reported through update_session_status() and
    print(); the function always returns None.

    NOTE(review): every status string, message and the file suffix used by
    this function are missing in this copy of the file (each affected line
    ends in a bare backslash). Restore them from the original notebook.

    Args:
        session_id: Identifier of the session whose uploads are processed.
        local_sentence_model: Object with an ``encode(texts,
            show_progress_bar=...)`` method; a falsy value aborts processing.
    """
    if not local_sentence_model:
        print(\
)
        update_session_status(session_id, \
)
        return
    # Mark the session as being worked on before any slow step starts.
    update_session_status(session_id, \
)
    session_upload_path = get_session_upload_path(session_id)
    pdf_files = [os.path.join(session_upload_path, f) for f in os.listdir(session_upload_path) if f.lower().endswith(\
)]
    if not pdf_files:
        print(\
)
        update_session_status(session_id, \
)
        return
    print(f\
    all_chunks = []
    for pdf_file in pdf_files:
        print(f\
)
        # A failure in any single file stops processing of the whole session.
        try:
            all_chunks.extend(extract_text_from_pdf_in_chunks(pdf_file))
        except Exception as e:
            print(f\
            update_session_status(session_id, f\
)
            return
    if not all_chunks:
        print(\
)
        update_session_status(session_id, \
)
        return
    print(f\
)
    # Each chunk is a (text, page-metadata) tuple; only the text is embedded.
    chunk_texts = [chunk[0] for chunk in all_chunks]
    try:
        chunk_embeddings = local_sentence_model.encode(chunk_texts, show_progress_bar=True)
        # The embeddings are converted to float32 before being added to the index.
        chunk_embeddings_float32 = np.array(chunk_embeddings, dtype=np.float32)
        # The index dimension is taken from the embedding width.
        index = faiss.IndexFlatL2(chunk_embeddings_float32.shape[1])
        index.add(chunk_embeddings_float32)
        session_path = get_session_path(session_id)
        faiss.write_index(index, os.path.join(session_path, INDEX_FILE_NAME))
        # The pickled list keeps the same order as the index rows, so a row
        # id returned by a search can be used directly as a list position.
        with open(os.path.join(session_path, CHUNKS_FILE_NAME), 'wb') as f:
            pickle.dump(all_chunks, f)
        update_session_status(session_id, \
)
        print(\
)
    except Exception as e:
        print(f\
        update_session_status(session_id, \
)

def retrieve_chunks_session(query, local_sentence_model, session_id, top_k=5):
    if not local_sentence_model:
        print(\
)
        return []
    session_path = get_session_path(session_id)
    index_path = os.path.join(session_path, INDEX_FILE_NAME)
    chunks_path = os.path.join(session_path, CHUNKS_FILE_NAME)
    if not os.path.exists(index_path) or not os.path.exists(chunks_path):
        print(\
)
        return []
    index = faiss.read_index(index_path)
    with open(chunks_path, 'rb') as f:
        chunks = pickle.load(f)
    query_embedding = local_sentence_model.encode([query])
    D, I = index.search(np.array(query_embedding, dtype=np.float32), top_k)
    retrieved_chunks_data = []
    for i, idx in enumerate(I[0]):
        if 0 <= idx < len(chunks):
            chunk_text = chunks[idx][0]
            page_info = f\