In [1]:
import requests
import os 

from config import config

# ChatPDF API key taken from the project-local config object; the request cells
# below send it in the 'x-api-key' header.
CHAT_PDF_API = config.CHAT_PDF_API

# Upload file

In [None]:
# Upload a local PDF to ChatPDF and keep the returned source id in `source_id`
# for the question and delete cells further down.
# NOTE(review): machine-specific absolute path — will not resolve elsewhere.
upload_pdf_path = '/Users/nickaristidou/Downloads/beazley_623.pdf'

headers = {
    'x-api-key': f'{CHAT_PDF_API}'
}

# Open the PDF in a context manager so the handle is closed as soon as the
# upload request has completed (the bare open() call leaked it).
with open(upload_pdf_path, 'rb') as upload_handle:
    files = [
        ('file', ('file', upload_handle, 'application/octet-stream'))
    ]
    response = requests.post(
        'https://api.chatpdf.com/v1/sources/add-file', headers=headers, files=files)

if response.status_code == 200:
    # Decode the JSON body once and reuse the value.
    source_id = response.json()['sourceId']
    print('Source ID:', source_id)
else:
    # `source_id` is left unset on failure, so the dependent cells will not
    # silently run against a stale or bogus id.
    print('Status:', response.status_code)
    print('Error:', response.text)

# Ask Questions

In [None]:
# Question sent to ChatPDF by the next cell.
# NOTE(review): the uploaded file is beazley_623.pdf and a later cell asks about
# "syndicate 2623" — confirm that "263" is the intended syndicate number.
question = "What gross premiums written did syndicate 263 report in 2022"

In [None]:
# Send `question` to ChatPDF against the uploaded source and print the answer
# together with the references the API returns.
headers = {
    'x-api-key': f'{CHAT_PDF_API}',
    "Content-Type": "application/json",
}

user_message = {'role': "user", 'content': f"{question}"}
data = {
    "referenceSources": True,
    'sourceId': f"{source_id}",
    'messages': [user_message],
}

response = requests.post(
    'https://api.chatpdf.com/v1/chats/message',
    headers=headers,
    json=data,
)

if response.status_code != 200:
    print('Status:', response.status_code)
    print('Error:', response.text)
else:
    payload = response.json()
    print('Result:', payload['content'])
    print('References:', payload['references'])

# Delete PDF

In [None]:
# Delete the uploaded source from ChatPDF.
headers = {
    'x-api-key': f'{CHAT_PDF_API}',
    'Content-Type': 'application/json',
}

data = {
    'sources': [f'{source_id}'],
}

try:
    response = requests.post(
        'https://api.chatpdf.com/v1/sources/delete', json=data, headers=headers)
    response.raise_for_status()
    print('Success - pdf deleted')
except requests.exceptions.RequestException as error:
    print('Error:', error)
    # `error.response` is None when no HTTP response was received at all
    # (connection failure, timeout, ...); only print the body when there is one,
    # otherwise the handler itself would raise AttributeError.
    if error.response is not None:
        print('Response:', error.response.text)

# Custom Version ---

#### curl -O https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin

In [2]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PDFPlumberLoader, PyPDFLoader, DirectoryLoader, PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter, CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain import HuggingFaceHub
from langchain import PromptTemplate, LLMChain
from pdf2image import convert_from_path

## Directory Loader Version ------

In [3]:
# Load the files matching *.pdf in ./data using PyPDFLoader for each file,
# then let the loader split them into documents.
loader = DirectoryLoader(
    "./data",
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load_and_split()

In [4]:
# Chunk the loaded documents (chunk_size=1024, chunk_overlap=64) and show how
# many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
)
texts = text_splitter.split_documents(documents)
len(texts)

158

### Create embeddings

In [5]:
# Embedding model used to index the chunks in the vector store below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Build a Chroma vector store from the chunks and embeddings, passing "chroma"
# as the persist directory.
db = Chroma.from_documents(texts, embeddings, persist_directory="chroma")

### Create chain

In [7]:
# Falcon-7B-instruct via HuggingFaceHub; one of the two LLM options selectable
# through MODEL_NAME further down.
falcon_llm = HuggingFaceHub(
    repo_id="tiiuae/falcon-7b-instruct",
    model_kwargs=dict(
        device_map="auto",
        temperature=0.5,
        max_new_tokens=200,
        max_length=512,
    ),
)

In [8]:
# Local GPT4All-J model; the weights file is expected next to the notebook
# (see the curl command in the heading above).
model_path = "./ggml-gpt4all-j-v1.3-groovy.bin"
gpt_llm = GPT4All(
    model=model_path,
    max_tokens=510,
    n_batch=8,
    backend="gptj",
    verbose=False,
)

Found model file at  ./ggml-gpt4all-j-v1.3-groovy.bin
gptj_model_load: loading model from './ggml-gpt4all-j-v1.3-groovy.bin' - please wait ...
gptj_model_load: n_vocab = 50400
gptj_model_load: n_ctx   = 2048
gptj_model_load: n_embd  = 4096
gptj_model_load: n_head  = 16
gptj_model_load: n_layer = 28
gptj_model_load: n_rot   = 64
gptj_model_load: f16     = 2
gptj_model_load: ggml ctx size = 5401.45 MB
gptj_model_load: kv self size  =  896.00 MB
gptj_model_load: ...

objc[89890]: Class GGMLMetalClass is implemented in both /Users/nickaristidou/.pyenv/versions/3.10.8/envs/venv/lib/python3.10/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x2c11f8208) and /Users/nickaristidou/.pyenv/versions/3.10.8/envs/venv/lib/python3.10/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x2c1444208). One of the two will be used. Which one is undefined.


................................ done
gptj_model_load: model size =  3609.38 MB / num tensors = 285


In [9]:
# Custom QA prompt with two placeholders: {context} and {question}.
# NOTE: `prompt` is only referenced by the commented-out `chain_type_kwargs`
# in the chain cell below, so as written the chain does not use it.
template = """
        You are an artificial intelligence assistant who understands insurance.
        Use the following pieces of context to answer the question at the end.
        Extract numbers where appropriate to help answer questions.
        If you don't know the answer, just say you don't know. DO NOT try to make up an answer.

        {context}

        Question: {question}
        Helpful answer in markdown:
        """
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [10]:
# Selects which LLM the chain cell below uses: "falcon_llm" or "gpt_llm".
MODEL_NAME = "gpt_llm"

In [11]:
match MODEL_NAME:
    case "falcon_llm":
        llm = falcon_llm
    case "gpt_llm":
        llm = gpt_llm
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    # chain_type_kwargs={
    #     "prompt": prompt
    # },
    verbose=False,
)

In [12]:
# Use the following to see the default prompt -- run the above with no chain_type_kwargs
# print(qa.combine_documents_chain.llm_chain.prompt.template)

## Ask questions ----

In [13]:
%%time
response = qa(
    "What is the combined ratio for syndicate 2623 at year end 2022"
)

CPU times: user 1min 10s, sys: 283 ms, total: 1min 10s
Wall time: 17.8 s


In [14]:
# Answer text produced by the chain.
response["result"]

' The combined ratio of Syndicate 2623 was 91% in 2022.'

In [None]:
# 'source' metadata entry of the first retrieved document.
response["source_documents"][0].metadata["source"]

## Single PDF loader -----

In [None]:
# Input PDFs for the single-file walkthrough below.
pdf_file = "./data/hiscox.pdf"
# NOTE(review): machine-specific absolute path — will not resolve on another machine.
pdf_file2 = "/Users/nickaristidou/Downloads/alex_test.pdf"

In [None]:
# Convert pdf_file to images with pdf2image at dpi=88; a later cell displays images[15].
images = convert_from_path(pdf_file, dpi=88)

In [None]:
# Load and split the first PDF, then show how many documents came back.
loader = PyPDFLoader(file_path=pdf_file)
documents = loader.load_and_split()
len(documents)

In [None]:
# Load and split the second PDF, then show how many documents came back.
loader2 = PyPDFLoader(file_path=pdf_file2)
documents2 = loader2.load_and_split()
len(documents2)

In [None]:
# print(documents2[5].page_content)

In [None]:
# Chunk the first PDF's documents (chunk_size=1024, chunk_overlap=64) and show
# the resulting chunk count.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
)
texts = text_splitter.split_documents(documents)
len(texts)

In [None]:
# Re-split the chunks from the previous cell with a token-based splitter
# (chunk_size=100, chunk_overlap=10); the result overwrites `texts`.
token_splitter = TokenTextSplitter(
                    chunk_size=100, chunk_overlap=10
                )  # NOTE(review): the original note tied this to text-embedding-ada-002, but no encoding is passed here and the embeddings cell uses all-MiniLM-L6-v2 — confirm
texts = token_splitter.split_documents(texts)

In [None]:
# Chunk the second PDF with `text_splitter` (the token splitter is not applied here).
texts2 = text_splitter.split_documents(documents2)
len(texts2)

In [None]:
# Combine the chunks from both PDFs into one list for the vector store.
texts_total = texts + texts2

## Create embeddings

In [None]:
# Embedding model used to index the combined chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Build the vector store over the chunks from both PDFs.
# NOTE(review): this passes the same persist_directory ("chroma") as the
# directory-loader section above — confirm the two stores are not meant to be
# kept separate.
db = Chroma.from_documents(texts_total, embeddings, persist_directory="chroma")

## Create Chain

In [None]:
# NOTE(review): model_n_ctx is not referenced anywhere else in the visible notebook.
model_n_ctx = 1000
# Local GPT4All-J weights file and the LLM used by the chain in this section.
model_path = "./ggml-gpt4all-j-v1.3-groovy.bin"
llm = GPT4All(model=model_path, backend="gptj", verbose=False)

In [None]:
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = config.HF_API

# llm = HuggingFaceHub(
#     repo_id="tiiuae/falcon-7b-instruct",
#     model_kwargs={
#         "temperature":1
#     }
# )

In [None]:
# Retrieval QA chain over the combined vector store: top-3 chunks are passed to
# the LLM and the source documents are returned with the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff",
    verbose=False,
    return_source_documents=True,
)

## Ask questions

In [None]:
%%time
res = qa(
    "What combined ratio did Syndicate 33 report in 2022"
)

In [None]:
# Print the full chain output (the cells below index it by "result" and "source_documents").
print(res)

In [None]:
# Print only the answer text.
print(res["result"])

In [None]:
# Print the 'page' metadata entry of every retrieved source document.
for source_doc in res["source_documents"]:
    print(source_doc.metadata["page"])

In [None]:
# Display the image at index 15.
# NOTE(review): hard-coded index — presumably a page reported by the cell above; confirm.
images[15]

# Ask question about 2nd pdf

In [None]:
# Query the chain about the second PDF.
res_2 = qa("Give me a summary of transaction reporting for mifid? extract the response from the text.")

In [None]:
# print(res_2)

In [None]:
# Answer text for the second-PDF question.
res_2["result"]

In [None]:
# Retrieved source documents backing the answer above.
res_2["source_documents"]

In [None]:
# Page-specific question against the first PDF.
test = qa("From page 5 of the hiscox report what was the Gross premiums written in 2022? extract the reponse from the text.")

In [None]:
# Answer text for the page-specific question.
test["result"]

# NOTE: Add prompt and investigate LangchainDirectoryLoader

In [None]:
from tempfile import TemporaryDirectory
from pathlib import Path
import shutil

In [None]:
# Copy the report into a throw-away directory; the directory and the copy are
# removed automatically when the `with` block exits.
with TemporaryDirectory() as tmp_dir:
    tmp_path = Path(tmp_dir)
    # shutil.copy takes path arguments and returns the destination path.
    # (shutil.copyfileobj needs two open file objects, and `tmp` was never
    # defined, so the earlier version could not run.)
    copied_pdf = Path(shutil.copy("./docs/hiscox.pdf", tmp_path))

In [None]:
# Path of the report inside ./docs, built with the path-join operator.
Path("./docs") / "hiscox.pdf"