In [1]:
import pandas as pd
from datasets import load_dataset
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyMuPDFLoader, DataFrameLoader
from langchain.embeddings import OpenAIEmbeddings, FakeEmbeddings
from langchain.indexes.vectorstore import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.vectorstores import Chroma
from pydantic import BaseModel, Field

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from getpass import getpass

import openai

# Credentials must never be stored in the notebook: read the key from the
# environment and only prompt for it (without echo) when it is missing.
# Any key that was previously committed here should be treated as leaked
# and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Keep the openai module and the environment variable in sync, as before.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
# Load the "train" split as a pandas DataFrame and drop the "embeddings"
# column, which none of the cells shown here read.
data = (
    load_dataset("xyzNLP/nza-ct-zoning-codes-text")["train"]
    .to_pandas()
    .drop(columns=["embeddings"])
)
data

Found cached dataset parquet (/Users/maxdumas/.cache/huggingface/datasets/xyzNLP___parquet/xyzNLP--nza-ct-zoning-codes-text-7e1ab1af692a3e2c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 89.99it/s]


Unnamed: 0,Town,Page,Text
0,vernon,1,
1,vernon,2,"ZONING REGULATIONS\nTown of Vernon, Connecticu..."
2,vernon,3,SECTION 1 - GENERAL\n1.1\nThe following regula...
3,vernon,4,1.3\nThe boundaries of these zones are shown o...
4,vernon,5,1.3A\nProposed changes to the Zoning Map shall...
...,...,...,...
22083,newtown,266,APPENDIX E\nDESIGN ADVISORY BOARD ORDINANCE\nB...
22084,newtown,267,APPENDIX F\nList of Permitted Uses in Commerci...
22085,newtown,268,APPENDIX G\nList of Permitted Uses in Industri...
22086,newtown,269,APPENDIX H\nARTICLE VIII - SUPPLEMENTAL REGULA...


In [4]:
class District(BaseModel):
    # One zoning district as reported by the model.
    # Documented with comments rather than a docstring on purpose: pydantic
    # copies a class docstring into the JSON schema, and that schema is what
    # parser.get_format_instructions() embeds in the prompt.
    name: str = Field(title="Name", description="Name of the district.")
    # Explicit default so the field is optional regardless of pydantic major
    # version; without it, `str | None` is a *required* nullable field in v2
    # and parsing fails when the model omits the abbreviation.
    abbr: str | None = Field(default=None, title="Abbreviation", description="Abbreviation of the district name.")

class OutputModel(BaseModel):
    # Structured answer expected back from the LLM: the source pages plus the
    # districts found on them. Comments are used instead of a docstring so the
    # generated JSON schema (and therefore the prompt) is unchanged.
    pages: list[int] = Field(
        title="Pages",
        description="The page numbers in the document that were used to produce the result.",
    )
    districts: list[District]

In [5]:
from langchain.indexes import VectorstoreIndexCreator

# Build a vector index over the Vernon pages only; the "Text" column supplies
# each document's content. The store is persisted under ./index.
# NOTE(review): re-running this cell against an existing ./index directory may
# add the same pages a second time — confirm before relying on k-nearest results.
index_creator = VectorstoreIndexCreator(
    vectorstore_kwargs={"persist_directory": "index"},
)
vernon_loader = DataFrameLoader(data.query("Town == 'vernon'"), "Text")
index = index_creator.from_loaders([vernon_loader])

Using embedded DuckDB with persistence: data will be stored in: index


In [6]:
# Parser bound to OutputModel: it provides the format instructions injected
# into the prompts and later turns the raw completion into an OutputModel.
parser = PydanticOutputParser(
    pydantic_object=OutputModel,
)

In [28]:
# The question is used both for similarity search and as the final prompt
# question; the second sentence hints that the answer spans adjacent pages.
query = (
    "What are the zoning districts defined in this document? "
    "We know that the districts are defined across two consecutive pages."
)

In [29]:
# Fetch the ten stored pages closest to the question.
docs = index.vectorstore.similarity_search(
    query,
    k=10,
)

In [30]:
# Inspect the retrieved documents (content and metadata) before prompting.
docs

[Document(page_content='1.3A\nProposed changes to the Zoning Map shall be identified on a map prepared to Class D survey\naccuracy showing the properties proposed for the zone change and all properties within a 200\nfoot radius of the boundaries of the proposed zone change. Owners shall be identified for all\nproperties referenced above.\n1.4\nWhere the boundary is shown following a street, railroad or utility right-of-way, the boundary\nshall be the centerline thereon, unless otherwise indicated. Where the boundary is shown outside\nof a street, railroad or utility right-of-way and approximately parallel thereto, the boundary shall\nbe deemed parallel to the nearest line thereof, and the figure placed on the zoning map shall be\nthe distance in feet between them, as measured at a right angle from such line, unless otherwise\nstated. Where the boundary of a district follows a river, stream, pond or other watercourse, the\nboundary line shall be the centerline thereon, unless otherwise 

In [37]:
# Per-page template: the page text followed by its "|| PAGE: n ||" marker.
doc_prompt = PromptTemplate(
    input_variables=["text", "page"],
    template="{text}\n|| PAGE: {page} ||",
)

# Few-shot style prompt that renders every retrieved page through doc_prompt
# and places them between the "|||||" markers, followed by the question and
# the parser's format instructions.
# NOTE(review): the chain built below is given prompt2, not this object, so
# this prompt appears to be unused in the cells shown here.
prompt = FewShotPromptTemplate(
    example_prompt=doc_prompt,
    # One "example" per retrieved page: its text and its page number.
    examples=[ {"text": d.page_content, "page": d.metadata["Page"] } for d in docs],
    prefix="""
Use some selection of the text included between the markers "|||||" to answer
the question that comes after the markers. The text between the markers comes
from a selection of pages of a document. The page number for each selection
comes at the end of the selection and is indicated with "|| PAGE: # ||" where # is the
page number itself.

|||||
    """,
    suffix="""
|||||

{question}\n{format_instructions}
""",
    # Only the question is supplied at call time; format_instructions is
    # pre-filled from the parser.
    input_variables=["question"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

# Template text for the QA prompt: {context} receives the page texts,
# {question} the user question, and {format_instructions} the parser's
# schema description.
qa_template = """
Use some selection of the text included between the markers "|||||" to answer
the question that comes after the markers. The text between the markers comes
from a selection of pages of a document. The page number for each selection
comes at the end of the selection and is indicated with "|| PAGE: # ||" where #
is the page number itself. Pay special attention to text that comes from
adjacent pages; we expect that sometimes the full answer might span multiple
pages.

|||||
{context}
|||||

{question}\n{format_instructions}
"""

# Prompt handed to the QA chain; format_instructions is pre-filled so callers
# only provide the question and the context.
prompt2 = PromptTemplate(
    input_variables=["question", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template=qa_template,
)
    

In [38]:
# gpt-4 is a chat-completions model, so it is wrapped in ChatOpenAI rather
# than the completion-style OpenAI class.
# chain_type="stuff" places all input documents into prompt2's {context} slot;
# verbose=True echoes the fully formatted prompt when the chain runs.
chain = load_qa_chain(
    ChatOpenAI(temperature=0, model_name="gpt-4"),
    chain_type="stuff",
    verbose=True,
    prompt=prompt2,
)



In [39]:
# NOTE(review): reduce and pairwise are not used in the cells shown here.
from functools import reduce
from itertools import pairwise

# Order the retrieved pages by page number so consecutive pages sit next to
# each other in the prompt context.
docs = sorted(docs, key=lambda d: d.metadata["Page"])

# Append the "|| PAGE: n ||" marker that the prompt instructions refer to.
# The documents are modified in place, so guard against re-running the cell:
# without the check, every extra run would stack another marker on each page.
for d in docs:
    page_marker = f"\n|| PAGE: {d.metadata['Page']} ||"
    if not d.page_content.endswith(page_marker):
        d.page_content = f"{d.page_content}{page_marker}"



In [40]:
# Keep the raw completion under its own name: `output` is then always an
# OutputModel, and if parsing fails the unparsed text is still available in
# `raw_answer` for inspection.
raw_answer = chain.run(input_documents=docs, question=query)
output = parser.parse(raw_answer)
output



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Use some selection of the text included between the markers "|||||" to answer
the question that comes after the markers. The text between the markers comes
from a selection of pages of a document. The page number for each selection
comes at the end of the selection and is indicated with "|| PAGE: # ||" where #
is the page number itself. Pay special attention to text that comes from
adjacent pages; we expect that sometimes the full answer might span multiple
pages.

|||||
SECTION 1 - GENERAL
1.1
The following regulations have been designed for the purpose of lessening congestion to the
streets; to secure safety from fire, panic, flood, and other dangers; to promote health and the
general welfare; to provide adequate light and air; to prevent the overcrowding of land; to avoid
undue concentration of population and facilitate the adequate provision for tr

OutputModel(pages=[4, 5], districts=[District(name='Planned Residential Development', abbr='PRD'), District(name='Commercial', abbr='C'), District(name='Industrial', abbr='I'), District(name='Special Floating Zone Industrial Development', abbr='FZ-ID'), District(name='Planned Neighborhood Development', abbr='PND'), District(name='Neighborhood R-10', abbr='NR-10'), District(name='Residential Commercial', abbr='RC'), District(name='Restricted Watershed', abbr='RW'), District(name='Historic District Industrial', abbr='HD-I'), District(name='Special Economic Development', abbr='SED'), District(name='Planned Commercial', abbr='PC'), District(name='Special Floating Zone - Garden Zone', abbr='SFZ-GZ'), District(name='Downtown Business & Residential', abbr='DBR')])

In [41]:
# Pretty-print the parsed result as indented JSON.
print(output.json(indent=2))

{
  "pages": [
    4,
    5
  ],
  "districts": [
    {
      "name": "Planned Residential Development",
      "abbr": "PRD"
    },
    {
      "name": "Commercial",
      "abbr": "C"
    },
    {
      "name": "Industrial",
      "abbr": "I"
    },
    {
      "name": "Special Floating Zone Industrial Development",
      "abbr": "FZ-ID"
    },
    {
      "name": "Planned Neighborhood Development",
      "abbr": "PND"
    },
    {
      "name": "Neighborhood R-10",
      "abbr": "NR-10"
    },
    {
      "name": "Residential Commercial",
      "abbr": "RC"
    },
    {
      "name": "Restricted Watershed",
      "abbr": "RW"
    },
    {
      "name": "Historic District Industrial",
      "abbr": "HD-I"
    },
    {
      "name": "Special Economic Development",
      "abbr": "SED"
    },
    {
      "name": "Planned Commercial",
      "abbr": "PC"
    },
    {
      "name": "Special Floating Zone - Garden Zone",
      "abbr": "SFZ-GZ"
    },
    {
      "name": "Downtown Business & R

In [42]:
# Number of districts the model returned.
len(output.districts)

13