In [2]:
from dotenv import load_dotenv

# Load environment variables from the .env file into the process environment.
# override=True lets values from .env replace variables that are already set
# (python-dotenv's default leaves existing variables untouched).
# NOTE(review): presumably this supplies the OpenAI credentials that
# OpenAIEmbeddings() relies on later — confirm.
load_dotenv(override=True)

True

In [3]:
# OpenAI model identifier. It is the default model that
# num_tokens_from_string uses to look up a tiktoken encoding.
# That default is captured when the function is defined, so re-run the cell
# defining num_tokens_from_string after changing this value.
model_name = 'gpt-3.5-turbo-16k'

In [4]:
import tiktoken

def concatenate_page_contents(input_documents, separator):
    """Join the text of several documents into a single string.

    Args:
        input_documents: Iterable of document objects exposing a
            ``page_content`` string attribute (e.g. LangChain ``Document``).
        separator: String inserted between consecutive page contents.

    Returns:
        The page contents concatenated in input order; an empty string when
        ``input_documents`` is empty.
    """
    # Read the attribute directly (as the rest of the notebook does) instead
    # of serialising the whole document, metadata included, through
    # ``.dict()`` just to pick out one field.
    return separator.join(doc.page_content for doc in input_documents)

def num_tokens_from_string(string: str, model_name: str = model_name) -> int:
    """Count how many tokens ``string`` occupies under a model's tokenizer.

    Args:
        string: Text to tokenize.
        model_name: Model whose tiktoken encoding is looked up. Defaults to
            the value the notebook-level ``model_name`` held when this
            function was defined.

    Returns:
        Number of tokens in the encoded text.
    """
    tokens = tiktoken.encoding_for_model(model_name).encode(string)
    return len(tokens)

In [5]:
# Path to the NDIS pricing PDF. The space before ".pdf" is part of the path
# as written (the loader output lists the file under the same name).
# NOTE(review): `filename` is not referenced in the visible cells — confirm
# whether it is still needed.
filename = '../../data/ndis2/PB NDIS Pricing Arrangements and Price Limits 2023-24 .pdf'
# Directory that MyDirectoryLoader walks to collect documents.
directory = '../../data/ndis2'

In [6]:
import os
from langchain.document_loaders import CSVLoader
from langchain.document_loaders.pdf import PDFPlumberLoader
#https://api.python.langchain.com/en/latest/document_loaders/langchain.document_loaders.pdf.PDFPlumberLoader.html#langchain.document_loaders.pdf.PDFPlumberLoader.load


class MyDirectoryLoader:
    """Load every supported file found under a directory tree.

    ``.csv`` files are read with ``CSVLoader`` and ``.pdf`` files with
    ``PDFPlumberLoader``; any other file is reported and skipped.
    """

    def __init__(self, dir_path, pdf_args=None):
        """Remember the directory to scan and optional PDF loader settings.

        Args:
            dir_path: Root directory walked recursively by ``load``.
            pdf_args: Optional mapping of extra keyword arguments forwarded to
                ``PDFPlumberLoader`` for each PDF. Defaults to no extra
                arguments, which loads PDFs with the loader's own defaults.
        """
        self.dir_path = dir_path
        # Copy so later mutation of the caller's dict cannot affect this loader.
        self.pdf_args = dict(pdf_args) if pdf_args else {}

    def check_args(self):
        """Print the names of the extra PDF loader arguments on one line."""
        # ``*`` spreads the key names as positional arguments to print, so
        # they come out space-separated (an empty line when there are none).
        print(*self.pdf_args.keys())

    def load(self):
        """Walk ``dir_path`` and load every ``.csv`` and ``.pdf`` file.

        Each visited file name is printed; unsupported files are reported and
        skipped.

        Returns:
            A flat list with the documents produced by every loader, in the
            order the files were visited.
        """
        docs = []
        for root, _, files in os.walk(self.dir_path):
            for file in files:
                print('file:', file)
                file_path = os.path.join(root, file)
                if file_path.endswith('.csv'):
                    loader = CSVLoader(file_path)
                elif file_path.endswith('.pdf'):
                    loader = PDFPlumberLoader(file_path, **self.pdf_args)
                else:
                    print(f"Do not process the file: {file_path}")
                    continue
                docs.extend(loader.load())
        return docs

In [7]:
# Walk `directory` and collect the documents from every supported file
# into one flat list.
loader = MyDirectoryLoader(directory)
docs = loader.load()

file: PB NDIS Pricing Arrangements and Price Limits 2023-24 .pdf


In [8]:
import numpy as np

# Corpus size, then the mean document length in characters and in tokens.
print('Number of documents:', len(docs))

doc_char_counts = [len(doc.page_content) for doc in docs]
print(f'Average document length in characters:{np.average(doc_char_counts):.1f}')

doc_token_counts = [num_tokens_from_string(doc.page_content) for doc in docs]
print(f'Average document length in tokens:{np.average(doc_token_counts):.1f}')

Number of documents: 101
Average document length in characters:2774.0
Average document length in tokens:652.3


## text splitter

In [9]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split each loaded document into overlapping chunks.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) — TODO confirm.
splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
texts = splitter.split_documents(docs)


In [10]:

# Chunk count, then the mean chunk length in characters and in tokens.
print('Number of chunks:', len(texts))

chunk_char_counts = [len(chunk.page_content) for chunk in texts]
print(f'Average chunk length in characters:{np.average(chunk_char_counts):.1f}')

chunk_token_counts = [num_tokens_from_string(chunk.page_content) for chunk in texts]
print(f'Average chunk length in tokens:{np.average(chunk_token_counts):.1f}')

Number of chunks: 354
Average chunk length in characters:858.8
Average chunk length in tokens:201.9


In [11]:
# Choose the Chroma collection that will receive the chunks.
# NOTE(review): the name appears to encode the loader (PDFPlumber) and the
# splitter settings (chunk_size=1024, chunk_overlap=128) — keep it in sync
# if those change.

collection_name = 'NDIS_PDFPLUMBER_1_TEXTS_1024_128'

In [12]:
from chromadb.config import Settings
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Connection settings for a Chroma server reached through its REST API.
client_settings = Settings(
        chroma_api_impl="rest",
        chroma_server_host="host.docker.internal",  # when you run this inside a devcontainer you need to explicitly say host.docker.internal to signify "devcontainer host localhost"
        chroma_server_http_port="8000"
    )

# Embed every chunk with OpenAIEmbeddings and store it in `collection_name`.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# adds the same chunks again — confirm before re-executing against a
# collection that is already populated.
db = Chroma.from_documents(texts, client_settings=client_settings, embedding = OpenAIEmbeddings(), collection_name=collection_name)
