# L3: Metadata Extraction and Chunking

In [52]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [53]:
import logging

# Fetch the root logger and raise its threshold to CRITICAL.
logger = logging.getLogger(None)
logger.setLevel("CRITICAL")

In [54]:
import json
import os
from IPython.display import JSON

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.staging.base import dict_to_elements

import chromadb

In [55]:
# Read the API key from the DLAI_API_KEY environment variable so the secret is
# never stored in the notebook; falls back to an empty string when it is unset.
DLAI_API_KEY = os.environ.get("DLAI_API_KEY", "")
DLAI_API_URL = "https://api.unstructured.io/"

# Client used for all later calls to the Unstructured API.
s = UnstructuredClient(
    api_key_auth=DLAI_API_KEY,
    server_url=DLAI_API_URL,
)

# Run the document through the Unstructured API

In [56]:
from unstructured_client.models import shared, operations

filename = "example/Basic_Hunting.pdf"

# Read the whole PDF into memory; the file handle is closed before the
# request objects are assembled.
with open(filename, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()

files = shared.Files(content=pdf_bytes, file_name=filename)

# Partition parameters, wrapped in the request object sent to the API.
partition_parameters = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    pdf_infer_table_structure=True,
    languages=["eng"],
)
req = operations.PartitionRequest(partition_parameters=partition_parameters)


[
  {
    "type": "Image",
    "element_id": "c05cb0a0e7482aff28a7fac99f420e3b",
    "text": "Basic Hunting  Passing on a time-honored tradition   Kentucky Department of   Fish and Wildlife Resources ",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "Basic_Hunting.pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "36775728b9b81d1db9ece0d7701ff712",
    "text": "A",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "Basic_Hunting.pdf"
    }
  },
  {
    "type": "Title",
    "element_id": "abadedff59c52af285de21bfc7cc2ffc",
    "text": "\u00e9",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "Basic_Hunting.pdf"
    }
  }
]


In [57]:
# Send the partition request. On an SDK error, print it and re-raise so that
# execution stops here rather than continuing with `resp` undefined.
try:
    resp = s.general.partition(request=req)
except SDKError as e:
    print(e)
    raise


In [58]:
# Show the first three elements. JSON() takes the list directly; passing a
# pre-serialised string only makes IPython parse it back again.
JSON(resp.elements[0:3])

<IPython.core.display.JSON object>

# Find elements associated with chapters

In [60]:
# Title elements whose text mentions "hunting" (case-insensitive).
[
    element
    for element in resp.elements
    if element['type'] == 'Title' and 'hunting' in element['text'].lower()
]

[{'type': 'Title',
  'element_id': 'd9d44fc4b33067b0c583c0b64ba898ef',
  'text': 'Basic Hunting -',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 3,
   'filename': 'Basic_Hunting.pdf'}},
 {'type': 'Title',
  'element_id': '7c0df024c125be8fb31b4bc9b963a798',
  'text': 'Basic Hunting -',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 5,
   'filename': 'Basic_Hunting.pdf'}},
 {'type': 'Title',
  'element_id': '8527bd6ce37f0aa7437e78fa1250d8ad',
  'text': '2 - Basic Hunting',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 8,
   'filename': 'Basic_Hunting.pdf'}},
 {'type': 'Title',
  'element_id': '8424bf90b1121d14ddf0639a92c61449',
  'text': 'Basic Hunting - 3',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 9,
   'filename': 'Basic_Hunting.pdf'}},
 {'type': 'Title',
  'element_id': '803506d3141fb3c682bd09320aa8f2f3',
  'te

In [61]:
# Chapter headings to search for; the next cell does an exact, case-sensitive
# substring match of these names against the text of Title elements.
chapters = [
    "BASIC HUNTING",
    "TURKEY HUNTING",
    "SQUIRREL HUNTING",
    "DOVE HUNTING",
    "DEER HUNTING",
    "RABBIT HUNTING",
    "QUAIL HUNTING",
]

In [62]:
# Map element_id -> chapter name for every Title element whose text contains
# one of the chapter headings; the first matching heading in `chapters` wins.
chapter_ids = {}
for element in resp.elements:
    matched_chapter = next(
        (
            name
            for name in chapters
            if name in element["text"] and element["type"] == "Title"
        ),
        None,
    )
    if matched_chapter is not None:
        chapter_ids[element["element_id"]] = matched_chapter

In [63]:
# Display the element_id -> chapter name mapping built above.
chapter_ids

{'058972c0a9176b788e2600f84160ffb4': 'TURKEY HUNTING',
 '9edb579799aef8d38fa0da8fa801a6a6': 'DEER HUNTING',
 '9722af57d3813b5799fd9b52d6b5e3d9': 'RABBIT HUNTING'}

In [71]:
# Invert the mapping (chapter name -> element_id), then display the Title
# element recorded for the "DEER HUNTING" chapter.
chapter_to_id = dict(zip(chapter_ids.values(), chapter_ids.keys()))
[
    element
    for element in resp.elements
    if element["element_id"] == chapter_to_id["DEER HUNTING"]
][0]

{'type': 'Title',
 'element_id': '9edb579799aef8d38fa0da8fa801a6a6',
 'text': 'DEER HUNTING',
 'metadata': {'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 30,
  'filename': 'Basic_Hunting.pdf'}}

# Load documents into a vector db

In [72]:
# Open the on-disk Chroma store at "chroma_tmp" with resets enabled, then
# reset it.
chroma_settings = chromadb.Settings(allow_reset=True)
client = chromadb.PersistentClient(path="chroma_tmp", settings=chroma_settings)
client.reset()

True

In [73]:
# Use get_or_create so re-running this cell does not fail when the collection
# already exists. The metadata sets the "hnsw:space" option to "cosine".
collection = client.get_or_create_collection(
    name="basic_hunting",
    metadata={"hnsw:space": "cosine"}
)

In [74]:
# Add every element as its own document. The "chapter" metadata is the chapter
# name registered for the element's parent_id, or "" when the element has no
# parent_id or its parent is not one of the detected chapter titles.
for element in resp.elements:
    chapter_metadata = {
        "chapter": chapter_ids.get(element["metadata"].get("parent_id"), ""),
    }
    collection.add(
        documents=[element["text"]],
        ids=[element["element_id"]],
        metadatas=[chapter_metadata],
    )

/home/harry/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [07:05<00:00, 196kiB/s] 


# See the elements in the vector db

In [75]:
# Peek at the collection and print the stored document texts.
results = collection.peek()
peeked_documents = results["documents"]
print(peeked_documents)

['Basic Hunting  Passing on a time-honored tradition   Kentucky Department of   Fish and Wildlife Resources ', 'A', 'é', 'e - 1', '7', 'From the Commissioner.', ']', 'The North American Model of Wildlife Management depends on hunters. This model incorporates principles of conservation from the past 100 years and was formally adopted in 2002. Over the years the number of hunters has decreased. Fish and Wildlife agencies from across the nation view this as a major problem. Many factors contrib- ute to this decline, including habitat loss, a growing urban population, and increased competition for leisure time. We do have one key ele- ment upon which we can rely: the people who enjoy the opportunity to step outside and enjoy shooting and hunting.', 'Kentucky’s Department of Fish and Wildlife Resources has worked for several decades to give the shooter and hunter every pos- sible opportunity. We have developed shooting ranges, assisted land- owners in improving habitat, and enhanced the qua

## Perform a hybrid search with metadata

In [76]:
# Query restricted to documents whose "chapter" metadata is "DEER HUNTING",
# returning at most two results.
# NOTE(review): the query text looks unrelated to the hunting guide - confirm
# it is the intended question.
deer_chapter_filter = {"chapter": "DEER HUNTING"}
result = collection.query(
    query_texts=["How many players are on a team?"],
    n_results=2,
    where=deer_chapter_filter,
)
print(json.dumps(result, indent=2))

{
  "ids": [
    []
  ],
  "embeddings": null,
  "documents": [
    []
  ],
  "uris": null,
  "data": null,
  "metadatas": [
    []
  ],
  "distances": [
    []
  ],
  "included": [
    "distances",
    "documents",
    "metadatas"
  ]
}


## Chunking Content

In [77]:
# Convert the API response elements with dict_to_elements; the result is what
# gets passed to chunk_by_title below.
elements = dict_to_elements(resp.elements)

In [78]:
# Chunk the elements by title, using the two size settings below.
chunking_options = {
    "combine_text_under_n_chars": 100,
    "max_characters": 3000,
}
chunks = chunk_by_title(elements, **chunking_options)

In [79]:
# Show the first chunk. to_dict() already returns a dict, so it goes to JSON()
# directly instead of being serialised to a string and parsed back.
JSON(chunks[0].to_dict())

<IPython.core.display.JSON object>

In [80]:
# Number of elements before chunking.
len(elements)

531

In [81]:
# Number of chunks produced by chunk_by_title.
len(chunks)

116