In [1]:
import doxstractor as dxc
import glob
import os
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


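We will be using the Anthropic API as part of this tutorial, so the `ANTHROPIC_API_KEY` environment variable must be set to your API key. The next cell sets it by reading the key from a local `anthropic_key.txt` file; if you have already exported the variable in your shell, you can skip that cell.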

In [3]:
# Read the Anthropic API key from a local file and export it for this process.
with open("anthropic_key.txt", "r") as f:
    # Strip surrounding whitespace: text files normally end with a newline, and a
    # key containing "\n" is not a valid credential / HTTP header value.
    key = f.read().strip()
os.environ['ANTHROPIC_API_KEY'] = key

## Creating our first extractor

In [4]:
# Read the first lease agreement and reduce its HTML to plain text.
with open("tutorial_data/EDGAR_lease_agreement_1.html") as f:
    html = f.read()
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()

In [5]:
# Question-answering model shared by the text and numeric extractors below.
model = dxc.TransformersQAModel(
    model="deepset/tinyroberta-squad2",
    na_threshold=0.2,
)

In [6]:
# Free-text extractor: asks the QA model for the leased building's address.
address_extractor = dxc.TextExtractor(
    name="address",
    query="What is the address of the leased building?",
    model=model,
)

In [7]:
# Run the address extractor over the lease agreement text loaded above.
address = address_extractor.extract(text)

In [8]:
# Show the extracted address as the cell output.
address

'6335 1St – Avenue South'

## Creating a numeric extractor

In [9]:
# Read the first employment agreement and reduce its HTML to plain text.
# Note: this rebinds `html`, `soup` and `text`, replacing the lease document.
with open("tutorial_data/EDGAR_employment_agreement_1.html") as f:
    html = f.read()
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text()

In [10]:
# Numeric extractor: asks the same QA model for the base salary figure.
salary_extractor = dxc.NumericExtractor(
    name="salary",
    query="What is the base salary?",
    model=model,
)

In [11]:
# Run the salary extractor over the employment agreement text loaded above.
salary = salary_extractor.extract(text)

In [12]:
# Show the extracted salary as the cell output.
salary

'575,000'

## Creating a categorical extractor

In [13]:
# Anthropic-API-backed model used for classification; it presumably picks up the
# ANTHROPIC_API_KEY environment variable set earlier — confirm against doxstractor docs.
anthropic_model = dxc.AnthropicAPIModel(model="claude-3-haiku-20240307")

In [14]:
# Categorical extractor: classifies a document into one of three agreement types.
doc_classifier = dxc.CategoryExtractor(
    name="doctype",
    query="What type of agreement is this?",
    categories=["employment", "lease", "other"],
    model=anthropic_model,
)

In [15]:
# Classify the document currently held in `text` (the employment agreement loaded above).
res = doc_classifier.extract(text)

In [16]:
# Show the predicted document type as the cell output.
res

'employment'

## Chaining extractors

In [17]:
# Extraction tree: classify the document first, then run the child extractor(s)
# registered under the predicted category. "other" has no children registered.
chain = dxc.Node(
    extractor=doc_classifier,
    children={
        "lease": [dxc.Node(address_extractor)],
        "employment": [dxc.Node(salary_extractor)],
    },
)

In [18]:
# Run the whole chain on `text`, which still holds the employment agreement
# loaded earlier; the result is displayed as the cell output.
chain.extract(text)

{'doctype': 'employment', 'salary': '575,000'}

## Processing multiple documents into a table

In [19]:
# Directory holding the tutorial's sample agreements.
path = "tutorial_data/"
# Match only HTML documents: a bare '*' would also return subdirectories and any
# non-HTML files, which the processing loop would then try to open and parse.
file_paths = glob.glob(os.path.join(path, '*.html'))

In [20]:
# Run the extraction chain over every file, collecting one result dict per document.
collector = []
for fp in file_paths:
    with open(fp) as f:
        html = f.read()
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text()
    data = chain.extract(text)
    data["file_path"] = fp  # record which file this row came from
    print(data)
    collector.append(data)

{'doctype': 'employment', 'salary': '18', 'file_path': 'tutorial_data/EDGAR_employment_agreement_2.html'}
A 429 status code was received; sleeping 60s to reset rate limit
{'doctype': 'lease', 'address': 'Annapolis Lane', 'file_path': 'tutorial_data/EDGAR_lease_agreement_2.html'}
{'doctype': 'employment', 'salary': 'NA', 'file_path': 'tutorial_data/EDGAR_employment_agreement_3.html'}
{'doctype': 'lease', 'address': '6335 1St – Avenue South', 'file_path': 'tutorial_data/EDGAR_lease_agreement_1.html'}
{'doctype': 'employment', 'salary': '575,000', 'file_path': 'tutorial_data/EDGAR_employment_agreement_1.html'}


In [25]:
import pandas as pd

In [30]:
# One row per processed document. The chain only emits the keys for the branch it
# followed (e.g. no "address" for employment agreements), so a column is missing
# entirely when no document of that type was processed. reindex() creates such
# columns filled with NaN, where plain [[...]] selection would raise KeyError.
(
    pd.DataFrame(collector)
    .reindex(columns=["file_path", "doctype", "salary", "address"])
    # Stable sort keeps the original processing order within each doctype.
    .sort_values("doctype", kind="mergesort")
)

Unnamed: 0,file_path,doctype,salary,address
0,tutorial_data/EDGAR_employment_agreement_2.html,employment,18.0,
2,tutorial_data/EDGAR_employment_agreement_3.html,employment,,
4,tutorial_data/EDGAR_employment_agreement_1.html,employment,575000.0,
1,tutorial_data/EDGAR_lease_agreement_2.html,lease,,Annapolis Lane
3,tutorial_data/EDGAR_lease_agreement_1.html,lease,,6335 1St – Avenue South
