# Local RAG with Gemma2 and Streamlit

## Importing required libraries

In [None]:
import streamlit as st
import os
import tempfile
from uuid import uuid4
from langchain_community.document_loaders import PyPDFLoader, UnstructuredMarkdownLoader, JSONLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

## Data Ingestion section

In [None]:
# Function to load the file, split it into chunks, and add them to the vector store
def add_to_vector_store(file, vector_store, chunk_size=1000, chunk_overlap=200):
    """Load an uploaded file, split it into chunks, and add them to a vector store.

    The upload is written to a temporary file (the LangChain loaders used here
    take a ``file_path``), loaded with a loader chosen from ``file.type``,
    split with ``RecursiveCharacterTextSplitter``, and the resulting chunks are
    passed to ``vector_store.add_documents`` with freshly generated ids.
    The temporary file is always removed, even when a step raises.

    Args:
        file: Uploaded file object exposing ``getvalue()``, ``name`` and
            ``type`` (e.g. a Streamlit upload). A falsy value makes the
            function a no-op.
        vector_store: Object exposing ``add_documents(documents=..., ids=...)``.
        chunk_size: Maximum chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        None.
    """
    if not file:
        return

    # Use tempfile because Langchain Loaders only accept a file_path.
    # delete=False keeps the file on disk after the handle is closed so the
    # loader can reopen it by path; the ``finally`` below removes it.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp_file_path = tmp.name
    try:
        with tmp:
            tmp.write(file.getvalue())

        # Use Langchain Loaders to load the file into Document objects
        # (which store page content and metadata)
        if file.type == "application/pdf":
            loader = PyPDFLoader(file_path=tmp_file_path)
        elif file.type == "application/json":
            loader = JSONLoader(file_path=tmp_file_path, jq_schema=".", text_content=False)
        elif file.type == "text/markdown":
            loader = UnstructuredMarkdownLoader(file_path=tmp_file_path)
        else:
            # Any other MIME type is treated as plain text
            loader = TextLoader(file_path=tmp_file_path)

        data = loader.load()

        # Replace temporary file name with original file name in documents' metadata
        for document in data:
            document.metadata["source"] = file.name

        print(f"Loaded {len(data)} documents from {file.name}")

        # Use Langchain Text Splitter to chunk the document into smaller pieces
        # From LangChain Docs (https://python.langchain.com/docs/how_to/recursive_text_splitter/):
        # This text splitter is the recommended one for generic text.
        # It is parameterized by a list of characters. It tries to split on them in order until
        # the chunks are small enough. The default list is ["\n\n", "\n", " ", ""].
        # This has the effect of trying to keep all paragraphs (and then sentences, and then words)
        # together as long as possible, as those would generically seem to be the strongest
        # semantically related pieces of text.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True,  # track index in original document
        )
        chunked_data = splitter.split_documents(data)

        print(f"Chunked {file.name} into {len(chunked_data)} pieces")

        if not chunked_data:
            # Nothing to upload (e.g. an empty file). Skip the vector-store
            # call, which may reject an empty batch of ids.
            print(f"No content found in {file.name}; nothing uploaded to ChromaDB")
            return

        # Upload the chunked data to the ChromaDB collection; each chunk gets a
        # unique id made of the original file name plus a random UUID
        uuids = [file.name + str(uuid4()) for _ in chunked_data]
        vector_store.add_documents(documents=chunked_data, ids=uuids)

        print(f"Uploaded {file.name} to ChromaDB")
    finally:
        # Delete the temporary file whether or not the steps above succeeded
        os.unlink(tmp_file_path)