In [1]:
"""Install statements."""

!pip install -q -U langchain-community
!pip install -q sentence-transformers
!pip install -q pypdf
!pip install -q openai
!pip install -q chromadb
!pip install -q tiktoken
!pip install -q langchain faiss-cpu transformers
!pip install -q langchain-chroma
!pip install -q langchain-ollama
!pip install -q huggingface_hub
!pip install -q ipywidgets
!pip install -q langchain-huggingface
!pip install -q einops
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q unstructured

In [2]:
"""Import statements."""

from huggingface_hub import notebook_login
from langchain.document_loaders import TextLoader, PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline, ChatHuggingFace
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from time import time, gmtime, strftime
import os, sys

  warn(f"Failed to load image Python extension: {e}")


In [4]:
def load_and_chunk(pdf_loc=None, csv_loc=None):
    """Load context documents from disk and split them into chunks for ChromaDB.

    Each location that is provided is loaded in full and split with a
    RecursiveCharacterTextSplitter (500-character chunks, 50-character overlap).

    Args:
      pdf_loc: path to directory containing pdf file(s); skipped when falsy
      csv_loc: path to directory containing csv file(s); skipped when falsy

    Returns:
      A list with one entry per source that produced chunks, PDF group first
      and CSV group second. Each entry is itself the list of chunks returned by
      the splitter. Sources that were not requested, or that produced no
      chunks, contribute no entry, so the result may be empty.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    def _split_all(loader):
        # Load every document the loader finds, then chunk them in one pass.
        return splitter.split_documents(loader.load())

    chunk_groups = []

    # PDF documents
    if pdf_loc:
        pdf_group = _split_all(PyPDFDirectoryLoader(pdf_loc))
        if pdf_group:
            chunk_groups.append(pdf_group)

    # CSV documents
    if csv_loc:
        csv_group = _split_all(DirectoryLoader(csv_loc))
        if csv_group:
            chunk_groups.append(csv_group)

    return chunk_groups

In [11]:
def answer_query(query, database):
    """Given a query, build a context-grounded prompt and receive a response.

    The 5 most similar chunks are retrieved from ``database`` and inlined into
    the prompt, so the model answers from the retrieved context rather than
    from the bare question.

    Relies on the notebook-level globals ``model`` (the chat model to call)
    and ``model_choice`` (its name).

    Args:
      query: The query to answer.
      database: The collection of documents to use for RAG (assumes ChromaDB)

    Returns:
      Response received from the LLM model used. For ``model_choice ==
      "openai"`` this is the answer text as a string; for every other model it
      is whatever ``model.invoke`` returns.
    """

    # Set up context from the best-matching chunks (scores are not used)
    docs_chroma = database.similarity_search_with_score(query, k=5)
    context_text = "\n\n".join(doc.page_content for doc, _score in docs_chroma)

    # Set up prompt
    template_text = """
    Answer the question based only on the following context:
    {context}
    Answer the question based on the above context: {question}.
    Add a new line after every sentence.
    Do not mention any information which is not contained within the context.
    """

    # Load context and query into prompt
    prompt_template = ChatPromptTemplate.from_template(template_text)
    prompt = prompt_template.format(context=context_text, question=query)

    # Get answer from LLM. Every backend must receive the full prompt: sending
    # only the raw query would skip the retrieved context entirely.
    response = model.invoke(prompt)

    # The OpenAI path has always handed a plain string back to callers.
    if model_choice == "openai" and not isinstance(response, str):
        response = response.content

    return response

In [6]:
def set_chroma_load(modified_times_loc, chroma_loc, cur_embedding, other_locs=list()):
    """Determine if Chroma should load from directory or start a new run.

    Chroma is loaded from disk only when a persisted directory for the current
    embedding exists AND none of the context directories has a modification
    time newer than the timestamp recorded for that embedding.

    Side effects: creates the modified-times directory if it is missing, and
    creates the per-embedding timestamp file (containing "0") if none exists.

    Args:
      modified_times_loc: Location of files saving last modified times for each embedding model.
        Concatenated directly with the file name, so it should end with a path separator.
      chroma_loc: Location of persistent directory for Chroma.
        Concatenated directly with the embedding name, so it should end with a path separator.
      cur_embedding: Current choice of embedding model, to check if a directory for said model exists.
      other_locs: List of locations with context docs to be checked for changes. Defaults to empty list.
        Only read, never mutated, so the shared default list is harmless.

    Returns:
      Boolean representing if Chroma should use saved files or create new files.

    Raises:
      OSError: if one of ``other_locs`` does not exist (from ``os.path.getmtime``).
    """
    # Get last modified times for each of the directories holding context information.
    # NOTE(review): this is the mtime of the directory itself; presumably an in-place
    # edit of a file inside it does not update it on most filesystems. Confirm.
    context_times = [os.path.getmtime(folder) for folder in other_locs]

    # Locate the timestamp file for this embedding model, creating its directory
    # on first use so the open() calls below cannot fail on a fresh checkout.
    stamp_path = f"{modified_times_loc}{cur_embedding}.txt"
    stamp_dir = os.path.dirname(stamp_path)
    if stamp_dir:
        os.makedirs(stamp_dir, exist_ok=True)
    if not os.path.exists(stamp_path):
        # Make file with "time" of 0 if no file exists for this embedding model
        with open(stamp_path, "w") as outf:
            outf.write("0")

    # Only the first line is meaningful. An empty or unparsable file is treated
    # as time 0, which forces a rebuild instead of crashing on the comparison.
    with open(stamp_path, "r") as inf:
        first_line = inf.readline().strip()
    try:
        last_build_time = float(first_line)
    except ValueError:
        last_build_time = 0.0

    # Get booleans for determining if Chroma should load.
    # No hard-coded "\\" suffix here, so the check also works on non-Windows systems.
    chroma_dir_exists = os.path.isdir(f"{chroma_loc}{cur_embedding}")
    context_modified = any(mod > last_build_time for mod in context_times)

    # Load only from an existing store whose context has not changed since it was built
    return chroma_dir_exists and not context_modified

In [None]:
"""Define variables."""

# File system navigation
# NOTE: EMBEDDING_ROOT and MODEL_ROOT are machine-specific absolute paths; adjust per machine.
EMBEDDING_ROOT = "D:\\Desktop\\AI\\Embeddings\\"
MODEL_ROOT = "D:\\Desktop\\AI\\LLMs\\"
PROJECT_ROOT = os.getcwd()
# The trailing "" makes os.path.join end each path with a separator, which the
# rest of the notebook relies on when it concatenates file names onto these roots.
PDF_ROOT = os.path.join(PROJECT_ROOT, "context_files", "pdf_files", "")
CSV_ROOT = os.path.join(PROJECT_ROOT, "context_files", "csv_files", "")
CHROMA_ROOT = os.path.join(PROJECT_ROOT, "chroma_db_files", "")
MODIFIED_ROOT = os.path.join(CHROMA_ROOT, "(0)modified-times", "")
OUTPUT_ROOT = os.path.join(PROJECT_ROOT, "output_files", "")

# Embedding to use, determines if running online
ollama_embeddings = ["nomic-embed-text", "mxbai-embed-large"]
local_embeddings = ["nomic-embed-text-v1.5", "bert-base-uncased"]
online_embeddings = ["openai"]
embeddings_choice = ollama_embeddings[1]

# Model to use, determines if running online
ollama_models = ["deepseek-r1:7b", "deepseek-r1:32b", "deepseek-r1:70b", "llama3.3", "mistral", "deepseek-r1:671b"]  # Don't use 671b
local_models = ["bert-base-uncased", "gpt2", "Mistral-7B-Instruct-v0.3", "zephyr-7b-beta", "DarkForest-20B-v3.0"]
online_models = ["openai"]
model_choice = ollama_models[1]

# Flags
save_output = True     # Flag to determine if the response should be saved to a text file

# Flag to determine if Chroma should load from persistent directory
context_locs = [PDF_ROOT, CSV_ROOT]
chroma_load = set_chroma_load(MODIFIED_ROOT, CHROMA_ROOT, embeddings_choice, other_locs=context_locs)

# Flag to determine if program is running locally or not
local = not ((embeddings_choice == "openai") or (model_choice == "openai"))

# For api keys
# SECURITY: never hardcode the key in the notebook. Set OPENAI_API_KEY in the
# environment before starting Jupyter. Any key that was ever saved in this
# notebook or its history must be treated as leaked and revoked.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
if not local and not OPENAI_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before "
        "selecting an OpenAI model or embedding."
    )

D:\Desktop\SchoolStuff\CSE_5095_Spring2025\Project\Code\context_files\pdf_files\


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:\\Desktop\\SchoolStuff\\CSE_5095_Spring2025\\Project\\Codecontext_files\\pdf_files\\'

In [11]:
"""Generate embeddings and manage vectors (assuming only using PDFs)."""

# Set up ChromaDB path based on embeddings_choice.
# makedirs(exist_ok=True) is a no-op when the directory exists; a real failure
# raises OSError with a full traceback instead of killing the kernel via sys.exit.
chroma_dir = f"{CHROMA_ROOT}{embeddings_choice}"
os.makedirs(chroma_dir, exist_ok=True)
chroma_path = os.path.join(chroma_dir, "")  # same directory, with trailing separator

# Set up embedding based on embeddings_choice.
# Chosen independently of the model so that mixing a local embedding with an
# online model (or the reverse) can never leave `embeddings` undefined.
if embeddings_choice in ollama_embeddings:
    embeddings = OllamaEmbeddings(model=embeddings_choice)
elif embeddings_choice in local_embeddings:
    model_kwargs = {'trust_remote_code': True}
    embeddings = HuggingFaceEmbeddings(model_name=f"{EMBEDDING_ROOT}{embeddings_choice}\\", model_kwargs=model_kwargs)
elif embeddings_choice == "openai":
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)
else:
    raise ValueError(f"Unknown embeddings_choice: {embeddings_choice!r}")

# Set up ChromaDB based on whether or not pre-saved information should be used
if chroma_load:
    db_chroma = Chroma(embedding_function=embeddings, persist_directory=chroma_path)
else:
    # load_and_chunk returns one list of chunks per source (PDF, CSV);
    # flatten them so the store is built in a single call.
    chunks = load_and_chunk(pdf_loc=PDF_ROOT, csv_loc=CSV_ROOT)
    all_chunks = [chunk for group in chunks for chunk in group]
    if not all_chunks:
        raise ValueError(f"No context documents found in {PDF_ROOT} or {CSV_ROOT}")

    # A rebuild may target a directory that already holds an older collection.
    # Drop it first, otherwise the new chunks are appended to the stale ones
    # and every document ends up stored more than once.
    Chroma(embedding_function=embeddings, persist_directory=chroma_path).delete_collection()
    db_chroma = Chroma.from_documents(all_chunks, embeddings, persist_directory=chroma_path)

    # Save current time as last modified time for context information for this embedding
    with open(f"{MODIFIED_ROOT}{embeddings_choice}.txt", "w") as outf:
        outf.write(f"{time()}")

    # Flip chroma_load to True, to allow rerunning this section without remaking Chroma database
    chroma_load = True

# Set up model based on model_choice (independent of the embedding choice, see above)
if model_choice in ollama_models:
    model = ChatOllama(model=model_choice)
elif model_choice in local_models:
    # device=0 requests the first GPU for the local Hugging Face pipeline
    llm = HuggingFacePipeline.from_model_id(model_id=f"{MODEL_ROOT}{model_choice}\\", task="text-generation", device=0)
    model = ChatHuggingFace(llm=llm)
elif model_choice == "openai":
    model = ChatOpenAI(openai_api_key=OPENAI_KEY)
else:
    raise ValueError(f"Unknown model_choice: {model_choice!r}")

libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
  dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better 

In [12]:
"""Receive answer to a query, with ability to save to .txt file."""

# Receive response to query
query = "Does NR4A1 cause an increase in BAX in chondrocytes with osteoarthritis?"
response = answer_query(query, db_chroma)

# Extract string of response, if needed (chat models return a message object)
if not isinstance(response, str):
    response = response.content

# Some models prepend text that is not part of the answer: Mistral-Instruct
# echoes the prompt up to "[/INST]", deepseek-r1 emits its reasoning up to
# "</think>". Pick the marker that ends that preamble, if any.
if model_choice == "Mistral-7B-Instruct-v0.3":
    answer_marker = "[/INST]"
elif "deepseek-r1" in model_choice:
    answer_marker = "</think>"
else:
    answer_marker = None

# Cut only when the marker is actually present. Slicing from str.find()'s -1
# "not found" result would silently drop the first characters of the answer.
if answer_marker and answer_marker in response:
    answer_text = response.split(answer_marker, 1)[1]
else:
    answer_text = response

# Create output for question and response
output = f"Query: {query}\n\n{answer_text}"

# Write output to file if save flag is set
if save_output:
    os.makedirs(OUTPUT_ROOT, exist_ok=True)
    cur_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    # Explicit UTF-8: model output may contain characters the platform default encoding cannot write
    with open(f"{OUTPUT_ROOT}RAG_Output_{cur_time}.txt", "w", encoding="utf-8") as outf:
        outf.write(output)

# Print response for convenience
print(output)

Query: Does NR4A1 cause an increase in BAX in chondrocytes with osteoarthritis?

According to Ansari et al., NR4A1 is likely to facilitate OA chondrocyte apoptosis, which is associated with p38 MAPK and mitochondrial apoptosis pathway.

The relationship between NR4A1 and BAX in chondrocytes with osteoarthritis is not directly stated in the context provided by Shi et al. or Ansari et al.

However, it is mentioned that lysosomal dysfunction induces apoptosis in chondrocytes through BAX-mediated mitochondrial damage and release of cytochrome c, as per Alvarez-Garcia et al. and also supported by Ansari et al.

There is no direct information provided in the context about NR4A1 causing an increase in BAX in chondrocytes with osteoarthritis.
