In [1]:
"""Install statements."""

!pip install -q -U langchain-community
!pip install -q sentence-transformers
!pip install -q pypdf
!pip install -q openai
!pip install -q chromadb
!pip install -q tiktoken
!pip install -q langchain faiss-cpu transformers
!pip install -q langchain-chroma
!pip install -q langchain-ollama
!pip install -q huggingface_hub
!pip install -q ipywidgets
!pip install -q langchain-huggingface
!pip install -q einops
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q unstructured

In [2]:
"""Import statements."""

from huggingface_hub import notebook_login
from langchain.document_loaders import TextLoader, PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline, ChatHuggingFace
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from time import time, gmtime, strftime
from sys import exit
from shutil import rmtree
import os

In [3]:
def load_and_chunk(splitter, pdf_loc=None, csv_loc=None):
    """Load and chunk documents from specified locations.

    Args:
      splitter: The text splitter to use.
      pdf_loc: Path to directory containing pdf file(s).
      csv_loc: Path to directory containing csv file(s).
    
    Returns:
      Dictionary of chunked documents
    """
    # Load pdf documents
    pdf_chunks = None
    if pdf_loc:
      pdf_loader = PyPDFDirectoryLoader(pdf_loc)
      pdf_pages = pdf_loader.load()
      pdf_chunks = splitter.split_documents(pdf_pages)

    # Load csv documents
    csv_chunks = None
    if csv_loc:
      csv_loader = DirectoryLoader(csv_loc)
      csv_pages = csv_loader.load()
      csv_chunks = splitter.split_documents(csv_pages)

    output = {'pdf': None, 'csv': None, 'txt': None}
    if pdf_chunks:
      output['pdf'] = pdf_chunks
    if csv_chunks:
      output['csv'] = csv_chunks

    return output

In [4]:
def answer_query(query, database, num_docs):
    """Given a query, create a prompt and receive a response.

    Args:
      query: The query to answer.
      database: The colleciton of documents to use for RAG (assumes ChromaDB).
      num_docs: How many documents should be given as context information.
    
    Returns:
      response received from the LLM model used
    """

    # Set up context
    docs_chroma = database.similarity_search_with_score(query, k=num_docs)
    context_text = "\n\n".join([doc.page_content for doc, _score in docs_chroma])

    # Set up prompt
    prompt_template = """
    Answer the question based only on the following context:
    {context}
    Answer the question based on the above context: {question}.
    Add a new line after every sentence.
    Do not mention any information which is not contained within the context.
    """


    # Load context and query into prompt
    prompt_template = ChatPromptTemplate.from_template(prompt_template)
    prompt = prompt_template.format(context=context_text, question=query)

    # Get answer from LLM
    if (model_choice == "openai"):
        response = model.predict(query)
    else:
        if (model_choice in local_models):
            response = model.invoke(prompt)
        else:
            response = model.invoke(prompt)
    
    return response

In [5]:
def set_chroma_load(modified_times_loc, chroma_loc, cur_embedding, other_locs=list()):
    """Determine if Chroma should load from directory or start a new run.

    Args:
      modified_times_loc: Location of files saving last modified times for each embedding model.
      chroma_loc: Location of persistent directory for Chroma.
      cur_embedding: Current choice of embedding model, to check if a directory for said model exists.
      other_locs: List of locations with context docs to be checked for changes. Defaults to empty list.
    
    Returns:
      Boolean representing if Chroma should use saved files or create new files.
    """
    # Get last modified times for each of the directories holding context information
    context_times = [os.path.getmtime(folder) for folder in other_locs]

    # Get last modified time for specific embedding model, making file and setting time to 0 if none exists
    cur_embed_chroma_mod = None
    cur_embed_chroma_mod_loc = os.path.join(modified_times_loc, f"{cur_embedding}.txt")
    if not os.path.exists(cur_embed_chroma_mod_loc):
        # Make file with "time" of 0 if no file exists for this embedding model
        with open(cur_embed_chroma_mod_loc, "w") as outf:
            outf.write("0")
    with open(cur_embed_chroma_mod_loc, "r") as inf:
        for line in inf:
            cur_embed_chroma_mod = float(line.strip())
            break

    # Get booleans for determining if Chroma should load
    chroma_dir_exists = os.path.exists(os.path.join(chroma_loc, f"{cur_embedding}\\"))
    context_modified = any([mod > cur_embed_chroma_mod for mod in context_times])

    # Determine if Chroma should load
    if context_modified or not chroma_dir_exists:
        should_load = False
    else:
        should_load = True
    
    return should_load

In [6]:
"""Define variables."""

# File system navigation
EMBEDDING_ROOT = "D:\\Desktop\\AI\\Embeddings\\"
MODEL_ROOT = "D:\\Desktop\\AI\\LLMs\\"
PROJECT_ROOT = os.getcwd()
CONTEXT_ROOT = os.path.join(PROJECT_ROOT, "context_files\\")
PDF_ROOT = os.path.join(CONTEXT_ROOT, "pdf_files\\")
CSV_ROOT = os.path.join(CONTEXT_ROOT, "csv_files\\")
CHROMA_ROOT = os.path.join(PROJECT_ROOT, "chroma_db_files\\")
MODIFIED_ROOT = os.path.join(CHROMA_ROOT, "(0)modified-times\\")
OUTPUT_ROOT = os.path.join(PROJECT_ROOT, "output_files\\")

# Create structural directories, if they don't exist
context_roots = [PDF_ROOT, CSV_ROOT]
roots = [CHROMA_ROOT, MODIFIED_ROOT, OUTPUT_ROOT, CONTEXT_ROOT] + context_roots
for root in roots:
    if not os.path.exists(root):
        try:
            os.mkdir(root)
        except Exception as e:
            print(f"Error making {root}:\n{e}")
            exit(1)

# Determine if context directories are empty
need_context = False
for root in context_roots:
    if not os.listdir(root):
        need_context = True

# Check if context documents need to be added
if need_context:
    print("\nYou have not supplied any documents to be used as context information.")
    print("Please do so before using this system.")
    exit(1)

# Embedding to use, determines if running online
ollama_embeddings = ["nomic-embed-text", "mxbai-embed-large"]
local_embeddings = ["nomic-embed-text-v1.5", "bert-base-uncased"]
online_embeddings = ["openai"]
embeddings_choice = "mxbai-embed-large"

# Model to use, determines if running online
# Models: deepseek-r1:[7b|14b|32b|70b], llama3.3, mistral, mixtral:8x7b
ollama_models = ["deepseek-r1:7b", "deepseek-r1:14b", "deepseek-r1:32b", "deepseek-r1:70b", "llama3.3", "mistral", "mixtral:8x7b", "deepseek-r1:671b"]  # Don't use 671b
local_models = ["bert-base-uncased", "gpt2", "Mistral-7B-Instruct-v0.3", "zephyr-7b-beta", "DarkForest-20B-v3.0"]
online_models = ["openai"]
model_choice = "deepseek-r1:7b"

# Flag to determine if program is running locally or not
local = True
if ((embeddings_choice == "openai") or (model_choice == "openai")):
    local = False

# For api keys
%env OPENAI_API_KEY = "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"
OPENAI_KEY = "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"

env: OPENAI_API_KEY="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"


In [None]:
"""Generate embeddings and manage vectors."""

# Set up ChromaDB path and embedding based on embeddings_choice
cur_embed_db = os.path.join(CHROMA_ROOT, f"{embeddings_choice}")
if not os.path.exists(cur_embed_db):
    try:
        os.mkdir(cur_embed_db)
    except Exception as e:
        print(f"Error:\n{e}")
        exit(1)
if local:
    if (embeddings_choice in ollama_embeddings):
        embeddings = OllamaEmbeddings(model=embeddings_choice)
    elif (embeddings_choice in local_embeddings):
        model_kwargs = {'trust_remote_code': True}
        embeddings = HuggingFaceEmbeddings(model_name=f"{EMBEDDING_ROOT}{embeddings_choice}\\", model_kwargs=model_kwargs)
else:
    if (embeddings_choice == "openai"):
        embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)

# Set up ChromaDB based on whether or not pre-saved information should be used
context_locs = [PDF_ROOT, CSV_ROOT]
chroma_load = set_chroma_load(MODIFIED_ROOT, CHROMA_ROOT, embeddings_choice, other_locs=context_locs)
if chroma_load:
    db_chroma = Chroma(embedding_function=embeddings, persist_directory=cur_embed_db)
else:
    # Remove old folder
    try:
        rmtree(cur_embed_db)
    except Exception as e:
        print(f"Error:\n{e}")
        exit(1)
    
    # Create new folder
    try:
        os.mkdir(cur_embed_db)
    except Exception as e:
        print(f"Error:\n{e}")
        exit(1)

    # Give context information to Chroma
    # Not sure best way to handle, so create Chroma with first set of documents, then add any other documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = load_and_chunk(text_splitter, pdf_loc=PDF_ROOT, csv_loc=CSV_ROOT)
    db_chroma = Chroma.from_documents(chunks['pdf'], embeddings, persist_directory=cur_embed_db)
    for key in chunks.keys():
        if chunks[key] is not None:
            db_chroma.add_documents(documents=chunks[key])

    # Save current time as last modified time for context information for this embedding
    with open(f"{MODIFIED_ROOT}{embeddings_choice}.txt", "w") as outf:
        outf.write(f"{time()}")

    # Flip chroma_load to True, to allow rerunning this section without remaking Chroma database
    chroma_load = True

# Set up model based on model_choice
if local:
    if (model_choice in ollama_models):
        model = ChatOllama(model=model_choice)
    elif (model_choice in local_models):
        llm = HuggingFacePipeline.from_model_id(model_id=f"{MODEL_ROOT}{model_choice}\\", task="text-generation", device=0)
        model = ChatHuggingFace(llm=llm)
else:
    if (model_choice == "openai"):
        model = ChatOpenAI(openai_api_key=OPENAI_KEY)

In [None]:
"""Receive answer to a query, with ability to save to .txt file."""

while True:
    # Receive response to query
    print("\n\nWelcome to the experimental RAG system! Enter a prompt, or 'exit' to exit.")
    print("\nHow can I help?\n")

    query = None
    try:
        query = input()
    except KeyboardInterrupt as e:
        if query is not None:
            pass
        else:
            break

    if query.lower() == "exit":
        break
    elif query == "":
        print("Please enter a query.")
        continue

    response = answer_query(query, db_chroma, 5)

    # Create output for question and response
    output = "\n"

    # Extract string of response, if needed
    if not isinstance(response, str):
        response = response.content

    # Add response to output
    if local:
        if (model_choice == "Mistral-7B-Instruct-v0.3"):
            prompt_end = response.find("[/INST]")
            output += response[(prompt_end + 7):]
        elif ("deepseek-r1" in model_choice):
            think_end = response.find("</think>")
            output += response[(think_end + 8):]
        else:
            output += response
    else:
        output += response

    # Print response for convenience
    print(output)

print("\nThank you for using the RAG system!")
exit()






Welcome to the experimental RAG system! Enter a prompt, or 'exit' to exit.

How can I help?




The role of NR4A1 in chondrocyte apoptosis during osteoarthritis involves promoting the death of cartilage cells through specific signaling pathways.

1. **Role Activation**: NR4A1 facilitates chondrocyte apoptosis, a process that leads to joint damage characteristic of osteoarthritis.
   
2. **Pathway Involvement**: It plays a key role in two primary pathways of apoptosis:
   - **Mitochondrial Apoptosis Pathway**: This pathway is associated with the release of cytochrome c and involves BAX activation, contributing to DNA fragmentation and cell death.
   - **p38 MAPK Pathway**: NR4A1 supports this pathway, which may be involved in regulating mitochondrial function and apoptosis.

3. **Cellular Stressors**: The activation or modulation of these pathways can be influenced by various stressors within the cartilage cells, such as oxidative stress, affecting their ability to maintain structu

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
