In [1]:
#dependencies
"""
%pip install --quiet -U langchain
%pip install --quiet -U  langchain_community
%pip install --quiet -U  tiktoken
%pip install --quiet -U langchain-nomic "nomic[local]"
%pip install --quiet -U langchain-ollama
%pip install --quiet -U  scikit-learn
%pip install --quiet -U langgraph
%pip install --quiet -U tavily-python bs4
%pip install PyMuPDF
%pip install Spacy
# %pip install sentence-transformers #to use 'all-mpnet-base-v5' embedding model
"""

'\n%pip install --quiet -U langchain\n%pip install --quiet -U  langchain_community\n%pip install --quiet -U  tiktoken\n%pip install --quiet -U langchain-nomic "nomic[local]"\n%pip install --quiet -U langchain-ollama\n%pip install --quiet -U  scikit-learn\n%pip install --quiet -U langgraph\n%pip install --quiet -U tavily-python bs4\n%pip install PyMuPDF\n%pip install Spacy\n# %pip install sentence-transformers #to use \'all-mpnet-base-v5\' embedding model\n'

In [2]:
#Imports
from langchain_ollama import ChatOllama
from langchain_core.messages import AIMessage
from langchain_core.documents import Document
from langchain_nomic.embeddings import NomicEmbeddings
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_core.messages import HumanMessage, SystemMessage
# from sentence_transformers import SentenceTransformer # to use 'all-mpnet-base-v5' embedding model
from tqdm.auto import tqdm
from spacy.lang.en import English # see https://spacy.io/usage for install instructions
import fitz #(pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
import pandas as pd
import os
import random
import re
import requests
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!nvidia-smi

Tue Dec 17 01:00:28 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
|  0%   32C    P8              8W /  285W |       0MiB /  16376MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
# defining functions

# Get PDF documents
def get_file_paths(folder_path):
    file_paths = []  # Initialize an empty list

    # Check if the provided folder path exists and is a directory
    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        # Iterate through the files in the folder
        for root, _, files in os.walk(folder_path):
            for file in files:
                # Construct the full file path and append it to the list
                if file.endswith(".pdf"):  # Ensure only PDF files are considered
                    file_paths.append(os.path.join(root, file))
    else:
        print(f"The provided path '{folder_path}' is not a valid directory.")

    return file_paths


#Performs minor formatting on text.
def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)
    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc), desc=f"Reading {pdf_path}"):  # iterate document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({
            "file_name": os.path.basename(pdf_path),
            "page_number": page_number,  # PDF page numbers start at 0
            "page_char_count": len(text),
            "page_word_count": len(text.split()),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text) / 4,  # 1 token = ~4 chars
            "text": text
        })
    return pages_and_texts

# Recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]
#

def map_dict_to_document(data: dict[str, any]) -> Document:
    """
    Map a dictionary to a Langchain Document object.

    Args:
        data (Dict[str, Any]): The dictionary containing keys like 'page_content', 
                               'page_number', etc.

    Returns:
        Document: A Document object with the mapped data.
    """
    page_content = data.get('page_content', '')
    metadata = {key: value for key, value in data.items() if key != 'page_content'}
    return Document(page_content=page_content, metadata=metadata)

# Post-processing of retrieved chunks to join them before passing to LLMs
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [7]:
folder_path = "Context" #path to folder containing PDFs, for now its a folder called 'Context'
local_llm = "llama3.2:3b-instruct-fp16" #define which model to use from ollama, make sure to ollama pull model before using here
num_sentence_chunk_size = 10 # Define split size to turn groups of sentences into chunks

llm = ChatOllama(model=local_llm, temperature=0.5) #instantiating model to call in our script

# Embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu") # choose the device to load the model to (note: GPU will often be much* faster than CPU)
embedding_model= NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local", device="nvidia")

#SpaCy sentencizer for preprocessing
nlp = English() # Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

# Make a list of files in a given folder
file_list = get_file_paths(folder_path)

# Now reading files
all_pages_and_texts = []  # This will hold the combined results
for pdf_path in file_list:
    print(f"Reading {pdf_path}")
    file_pages_and_texts = open_and_read_pdf(pdf_path)
    all_pages_and_texts.extend(file_pages_and_texts)  # Append all pages from the current file

# Print summary
print(f"Processed {len(file_list)} PDF files.")
print(f"Total pages collected: {len(all_pages_and_texts)}")

print("Sentencizing texts on all pages in all pdfs")
for item in tqdm(all_pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

print(f"Making chunks of ~{num_sentence_chunk_size} sentences")
# Loop through pages and texts and split sentences into chunks
for item in tqdm(all_pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

print(f"Split each chunk into its own item")
pages_and_chunks = []
for item in tqdm(all_pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["page_content"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

chunks=[]
print(f"Converting chunks into langchain compatible format. i.e. 'document objects'")
for page_and_chunk in tqdm(pages_and_chunks):
    document = map_dict_to_document(page_and_chunk)  # Use the map_dict_to_document function
    chunks.append(document)

print(f"Creating Embeddings in the Vector Store using '{embedding_model.model}'")
vectorstore = SKLearnVectorStore.from_documents(
    documents=chunks,
    embedding=embedding_model
)
# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

# deifining default propmpts
Spex = SystemMessage(
        content="You are SpexRAG - an embedded systems engineer's AI assistant, Your job is to look at datasheets and helps other embedded systems engineers in their workflow, you think carefully befor answering and you are keen on every single detail"
    )
Intro = HumanMessage(content="Hi, Introduce yourself. How can you help us today")
rag_prompt = """You are an assistant for question-answering tasks. 

Here is the context to use to answer the question:

{context} 

Think carefully about the above context. 

Now, review the user question:

{question}

Provide an answer to this questions using the above context. keep the answer concise.

Answer:"""

Reading Context\CMX608-data-sheet.pdf


Reading Context\CMX608-data-sheet.pdf: 70it [00:00, 178.11it/s]


Reading Context\DS_SX1276-7-8-9_W_APP_V7 (1).pdf


Reading Context\DS_SX1276-7-8-9_W_APP_V7 (1).pdf: 132it [00:00, 507.70it/s]


Reading Context\MAX-M10S_DataSheet_UBX-20035208.pdf


Reading Context\MAX-M10S_DataSheet_UBX-20035208.pdf: 24it [00:00, 421.07it/s]


Reading Context\nRF24L01Pluss_Preliminary_Product_Specification_v1_0.pdf


Reading Context\nRF24L01Pluss_Preliminary_Product_Specification_v1_0.pdf: 75it [00:00, 604.85it/s]


Reading Context\SARA-R4_DataSheet_UBX-16024152.pdf


Reading Context\SARA-R4_DataSheet_UBX-16024152.pdf: 51it [00:00, 150.00it/s]


Reading Context\Si4468-7.pdf


Reading Context\Si4468-7.pdf: 57it [00:00, 647.73it/s]


Reading Context\sx1276-1278113.pdf


Reading Context\sx1276-1278113.pdf: 133it [00:00, 490.78it/s]


Processed 7 PDF files.
Total pages collected: 542
Sentencizing texts on all pages in all pdfs


100%|██████████| 542/542 [00:01<00:00, 444.20it/s]


Making chunks of ~10 sentences


100%|██████████| 542/542 [00:00<00:00, 542168.56it/s]


Split each chunk into its own item


100%|██████████| 542/542 [00:00<00:00, 22582.95it/s]


Converting chunks into langchain compatible format. i.e. 'document objects'


100%|██████████| 997/997 [00:00<00:00, 332410.26it/s]


Creating Embeddings in the Vector Store using 'nomic-embed-text-v1.5'


In [9]:
# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 7})

In [10]:
# Test
question=input()
docs = retriever.invoke(question)
print(f"Retrieved {len(docs)} documents")
docs_txt = format_docs(docs)
rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
generation = llm.invoke([Spex, HumanMessage(content=rag_prompt_formatted)])
print("User:  " + question)
print("SpexRAG:  " + generation.content)

Retrieved 7 documents
User:  tell me about vctrl in cmx638
SpexRAG:  The VCTRL register ($07) is used to configure the CMX638's DTMF mode. Setting bit 7 of VCFG to '1' enables Format 2, which corresponds to a standard telephone keypad matrix layout (DTMF - Format 2). This allows the device to send or receive single tones and reproduce naturally occurring tones at the other end of a Vocoded link.
