In [None]:
# %pip install -q -r requirements.txt

In [None]:
# !brew install poppler
# !brew install tesseract

# Let's build your local RAG app! 

## Preprocessing a potpourri of files

Useful data is everywhere! 
PDFs, Markdown, HTML, Email, Word documents, etc.  

In [1]:
import os
from unstructured.partition.auto import partition

def preprocess_misc_files(directory, extensions=(".eml", ".md", ".html", ".pdf")):
    """Partition every supported file under ``directory`` into Unstructured elements.

    The directory tree is walked recursively and each file whose name ends with
    one of ``extensions`` is passed to ``partition``.

    Args:
        directory: Root folder to scan.
        extensions: File suffix (or iterable of suffixes) to process. Matching is
            case-insensitive, so ``report.PDF`` is picked up as well.

    Returns:
        A flat list containing the elements returned by ``partition`` for every
        matching file, in traversal order. Empty if nothing matches.
    """
    # Accept a single suffix passed as a plain string.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    all_elements = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes the traversal (and therefore the element
        # order) identical on every run, regardless of filesystem ordering.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(suffixes):
                continue
            filename = os.path.join(root, file)
            # partition will detect the file type and route it to
            # the appropriate partitioning function, e.g. partition_eml
            all_elements.extend(partition(filename=filename))
    return all_elements

In [2]:
# Partition everything under ../documents into a single list of elements.
# The first run downloads a computer-vision model that is used for PDFs
# (only if you are partitioning any PDFs).
# This cell will take about a minute to run.
elements = preprocess_misc_files("../documents")

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

In [3]:
# Alternative way to partition PDFs & images using the Unstructured API. NOT LOCAL, but the results are higher quality.

# from unstructured_client import UnstructuredClient
# from unstructured_client.models import shared
# from unstructured_client.models.errors import SDKError
# from unstructured.staging.base import dict_to_elements
# 
# # Free Unstructured API key with a cap of 1000 pages/month: https://unstructured.io/api-key-free
# os.environ["UNSTRUCTURED_API_KEY"] = "YOUR_API_KEY"
# unstructured_api_key = os.environ.get("UNSTRUCTURED_API_KEY")
# 
# client = UnstructuredClient(
#     api_key_auth=unstructured_api_key,
#     # if using Hosted API, provide your unique API URL:
#     # server_url="YOUR_API_URL",
# )
# 
# def partition_file(filename):
#   with open(filename, "rb") as f:
#       files=shared.Files(
#           content=f.read(),
#           file_name=filename,
#       )
# 
#   req = shared.PartitionParameters(
#       files=files,
#       strategy = "hi_res",
#       hi_res_model_name = "chipper"
#   )
# 
#   try:
#       resp = client.general.partition(req)
#   except SDKError as e:
#       print(e)
#       raise  # re-raise: `resp` is not defined if the request failed
# 
#   return resp
# 
# pdf_result = partition_file(filename="../documents/pdfs/2404.13781v1.pdf")
# pdf_elements = dict_to_elements(pdf_result.elements)

Let's explore the elements

In [4]:
# Bare last expression: displays the repr of the whole element list.
# NOTE(review): the list is long; consider showing a slice such as elements[:10].
elements

[<unstructured.documents.elements.Title at 0x35b0d1930>,
 <unstructured.documents.elements.Title at 0x35b0d2380>,
 <unstructured.documents.elements.NarrativeText at 0x35b0d2410>,
 <unstructured.documents.elements.NarrativeText at 0x35b0d2470>,
 <unstructured.documents.elements.NarrativeText at 0x35b0d24d0>,
 <unstructured.documents.elements.NarrativeText at 0x35b0d25c0>,
 <unstructured.documents.elements.ListItem at 0x35b0d29b0>,
 <unstructured.documents.elements.ListItem at 0x35b0d1f30>,
 <unstructured.documents.elements.ListItem at 0x35b0d1ff0>,
 <unstructured.documents.elements.NarrativeText at 0x35b0d2050>,
 <unstructured.documents.elements.NarrativeText at 0x35b0d2110>,
 <unstructured.documents.elements.ListItem at 0x35b0d20e0>,
 <unstructured.documents.elements.Title at 0x35b0d21a0>,
 <unstructured.documents.elements.NarrativeText at 0x35b0d2260>,
 <unstructured.documents.elements.Title at 0x35b0d2290>,
 <unstructured.documents.elements.NarrativeText at 0x35b0d0040>,
 <unstructur

In [5]:
# Build a list holding the plain-dict form of every element, which is easier
# to inspect and filter than the element objects themselves.
element_dict = []
for element in elements:
    element_dict.append(element.to_dict())

In [6]:
from collections import Counter

# Gather the "type" field of every element, then show the distinct types found.
element_types = []
for entry in element_dict:
    element_types.append(entry["type"])

Counter(element_types).keys()

dict_keys(['Title', 'NarrativeText', 'ListItem', 'UncategorizedText', 'Header', 'Image', 'Table', 'FigureCaption', 'Formula'])

In [7]:
import json
from IPython.display import JSON

# A couple of sample elements to look at in detail.
example_elements = element_dict[24:26]

# Pretty-printed string form, handy for print() or copy-paste.
example_output = json.dumps(example_elements, indent=2)

# IPython's JSON display expects the list/dict itself; handing it an
# already-serialized string triggers a warning and a needless re-parse.
JSON(example_elements)



<IPython.core.display.JSON object>

In [8]:
# Keep only the elements whose type is "Table", then peek at the first one.
table_elements = list(
    filter(lambda entry: entry["type"] == "Table", element_dict)
)

table_elements[0]

{'type': 'Table',
 'element_id': '7503d48ef24d9a5cb1ffd06a793caaf5',
 'text': 'Medical Pipeline Product Pipeline Metric EvalGen SPADE EvalGen SPADE Dataset Size # Bad Outputs # Assertions Coverage FFR Alignment (%) 84 27 3 0.33 0.10 48.29 84 27 5 0.33 0.10 48.29 100 49 4 0.73 0.39 66.46 100 49 9 0.49 0.39 54.35',
 'metadata': {'detection_class_prob': 0.9290404915809631,
  'coordinates': {'points': ((160.3275909423828, 571.503173828125),
    (160.3275909423828, 849.7816162109375),
    (798.4124755859375, 849.7816162109375),
    (798.4124755859375, 571.503173828125)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2024-04-26T09:03:01',
  'text_as_html': '<table><thead><th rowspan="2">Metric</th><th colspan="2">Medical Pipeline</th><th colspan="2">Product Pipeline</th></thead><thead><th></th><th>EvaLGeEN</th><th>SPADE</th><th>EvatGen</th><th>SPADE</th></thead><tr><td>Dataset Size</td><td>84</td><td>84</td><td>100</td><td>100</td></tr><tr

In [9]:
from IPython.display import HTML

# Render the HTML representation stored in the first table's metadata.
first_table_metadata = table_elements[0]["metadata"]
HTML(first_table_metadata["text_as_html"])

Metric,Medical Pipeline,Medical Pipeline,Product Pipeline,Product Pipeline,Unnamed: 5_level_0
Metric,Unnamed: 1_level_1,EvaLGeEN,SPADE,EvatGen,SPADE
Dataset Size,84.0,84.0,100.0,100.0,
# Bad Outputs,27.0,27.0,49.0,49.0,
# Assertions,3.0,5.0,4.0,9.0,
Coverage,0.33,0.33,0.73,0.49,
FFR,0.1,0.1,0.39,0.39,
Alignment (%),48.29,48.29,66.46,54.35,


## Chunking

In [10]:
from unstructured.chunking.title import chunk_by_title

# Chunking settings, kept together so they are easy to tune.
# NOTE(review): going by the parameter names, chunks are capped at 512
# characters and sections shorter than 100 characters get combined - confirm
# against the unstructured chunking docs.
chunking_options = {
    "max_characters": 512,
    "combine_text_under_n_chars": 100,
}

chunked_elements = chunk_by_title(elements, **chunking_options)

In [11]:
# Number of elements before chunking.
# We will actually end up with fewer chunks than elements, because of the way
# unstructured partitions documents and the chunking strategy used.
# Check out this paper on smart chunking: https://arxiv.org/pdf/2402.05131 
len(elements)

1071

In [12]:
# Number of chunks produced by chunk_by_title; compare with len(elements).
len(chunked_elements)

647

## Prepare elements to be loaded into a vector store

In [13]:
from langchain_core.documents import Document

# Wrap every chunk in a LangChain Document: the chunk text becomes the page
# content and the chunk metadata is carried along as a plain dict.
# At this point you can remove some of the metadata, or use metadata to filter elements
documents = [
    Document(page_content=chunk.text, metadata=chunk.metadata.to_dict())
    for chunk in chunked_elements
]

## ChromaDB & retriever

In [14]:
from langchain_community.vectorstores import Chroma
from langchain.vectorstores import utils as chromautils
from langchain_community.embeddings import OllamaEmbeddings

# Run the documents through filter_complex_metadata before indexing them.
docs = chromautils.filter_complex_metadata(documents)

# Embed the documents with the local Ollama embedding model and index them in Chroma.
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)
vectorstore = Chroma.from_documents(docs, embedding_model)

# Similarity-search retriever configured with k=6.
retriever_options = {"k": 6}
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=retriever_options)

OllamaEmbeddings: 100%|██████████| 647/647 [00:11<00:00, 58.45it/s]


## Set up Local LLM

Run `ollama pull llama3` to download the Llama 3 8B Instruct model.

In [15]:
from langchain_community.chat_models import ChatOllama

local_model = "llama3"

# Strings passed to ChatOllama as stop sequences.
stop_tokens = [
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|eot_id|>",
    "<|reserved_special_token",
]

llm = ChatOllama(model=local_model, num_predict=400, stop=stop_tokens)

In [16]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Single-turn prompt: the user's question and the retrieved context are
# substituted into the {question} and {context} placeholders.
# NOTE(review): the <|...|> markers look like Llama 3 chat-format tokens;
# ChatOllama may apply its own chat template on top of them - confirm the
# markers are not duplicated in what the model finally sees.
prompt_template = """
<|start_header_id|>user<|end_header_id|>
Answer the user's question using provided context. 
Question: {question}
Context: {context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Declares the two variables the template above expects.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
 
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Documents are separated by a blank line; an empty input yields ``""``.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# Inputs for the prompt: "context" is the retriever output piped through
# format_docs, and "question" is the chain input passed through as-is.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: build inputs -> fill the prompt -> call the LLM -> parse to a string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [17]:
# Question that is sent through the RAG chain in the next cell.
question = "What is RAG?"

In [18]:
# Run the chain: retrieve context for the question, fill the prompt,
# call the local LLM, and display the parsed string answer.
rag_chain.invoke(question)

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 46.59it/s]


'According to the provided context, RAG stands for Retrieval Augmented Generation. It is a system that retrieves relevant information from a knowledge base or index, which is then used to augment the generation of text or other content.'