In [64]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from langchain_community.embeddings import HuggingFaceEmbeddings
import streamlit as st
import chromadb
import os

In [65]:
def load_chunk_persist_pdf(model_name) -> Chroma:
    # Data loading
    pdf_folder_path = "./data"
    documents = []
    for file in os.listdir(pdf_folder_path):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())

    # Chunking
    text_splitter = CharacterTextSplitter(separator=" ")
    chunked_documents = text_splitter.split_documents(documents)
    print(f"Chunked {len(chunked_documents)} documents")

    # Vectorization
    client = chromadb.Client()
    if client.list_collections():
        consent_collection = client.create_collection("consent_collection")
    else:
        print("Collection already exists")
    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=HuggingFaceEmbeddings(model_name=model_name),
        persist_directory="./databases"
    )
    vectordb.persist()

    return vectordb

In [66]:
def create_agent_chain(model_name):
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    chain = load_qa_chain(llm=model, chain_type="stuff")
    return chain

In [67]:
def get_llm_response(query, emb_model, llm_model):
    vectordb = load_chunk_persist_pdf(emb_model)
    chain = create_agent_chain(llm_model)
    matching_docs = vectordb.similarity_search(query)
    print(f"Found {len(matching_docs)} matching documents")
    answer = chain.run(input_documents=matching_docs, question=query)
    return answer

In [68]:
# Embedding selection model
emb_model_options = ["BAAI/bge-small-en-v1.5"]
emb_model = emb_model_options[0]

# LLM selection model
llm_model_options = ["impira/layoutlm-document-qa", "naver-clova-ix/donut-base-finetuned-docvqa"]
llm_model = llm_model_options[0]

In [69]:
print(get_llm_response("Tell me the authors", emb_model, llm_model))

Chunked 108 documents
Collection already exists


ValueError: Unrecognized configuration class <class 'transformers.models.layoutlm.configuration_layoutlm.LayoutLMConfig'> for this kind of AutoModel: AutoModelForSeq2SeqLM.
Model type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SwitchTransformersConfig, T5Config, UMT5Config, XLMProphetNetConfig.