In [None]:
!pip install langchain
!pip install huggingface_hub
!pip install sentence_transformers
!pip install unstructured
!pip install chromadb
!pip install Cython
!pip install tiktoken
!pip install unstructured[local-inference]



In [None]:
# Standard library
import os

# Colab-only helper used to mount Google Drive
from google.colab import drive

# LangChain building blocks for the PDF question-answering pipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
# HuggingFaceHub is exported from langchain.llms (there is no langchain.huggingface module)
from langchain.llms import HuggingFaceHub
# The splitter module is named text_splitter (not text_splitting)
from langchain.text_splitter import CharacterTextSplitter

In [None]:
# Attach Google Drive to the Colab filesystem at /content/gdrive.
# force_remount=True is passed so the call is repeated even when re-running the cell.
drive.mount(
    '/content/gdrive',
    force_remount=True,
)

In [None]:
# Directory on the mounted Drive that holds the PDF files to index
pdf_folder_path = "/content/gdrive/My Drive/data_2/"

In [None]:
# Collect the names of the PDF files in the folder.
# - sorted(): os.listdir returns entries in arbitrary order, so sort them to make
#   the downstream index build reproducible across runs.
# - lower(): match the extension case-insensitively so files such as "REPORT.PDF"
#   are not silently skipped.
pdf_files = sorted(
    fn for fn in os.listdir(pdf_folder_path) if fn.lower().endswith('.pdf')
)

In [None]:
# Build one UnstructuredPDFLoader per PDF, each pointed at the file's full path
loaders = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path, pdf_name))
    for pdf_name in pdf_files
]

In [None]:
# Build the vector store index from all PDF loaders.
# Embeddings come from HuggingFaceEmbeddings with its default settings; documents
# are split with CharacterTextSplitter using chunk_size=1000 and no overlap.
hf_embeddings = HuggingFaceEmbeddings()
chunk_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
index_creator = VectorstoreIndexCreator(
    embedding=hf_embeddings,
    text_splitter=chunk_splitter,
)
index = index_creator.from_loaders(loaders)

In [None]:
# Set up the language model (google/flan-t5-xl accessed through HuggingFaceHub).
# HuggingFaceHub takes its access token from the HUGGINGFACEHUB_API_TOKEN
# environment variable, which nothing else in this notebook sets. Prompt for it
# when it is missing so a fresh kernel can run top to bottom without a secret
# being hard-coded in the notebook.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass  # local import: only needed when the token is absent

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

llm = HuggingFaceHub(
    repo_id="google/flan-t5-xl",
    model_kwargs={"temperature": 0, "max_length": 512},
)

In [None]:
# Assemble the question-answering chain: the index's vector store is exposed as a
# retriever and combined with the LLM using the "stuff" chain type. The chain's
# input key is named "question".
pdf_retriever = index.vectorstore.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=pdf_retriever,
    input_key="question",
)

In [None]:
# Ask the chain a question over the indexed PDFs and print its reply
query = "How was the GPT4all model trained?"
answer = chain.run(query)
print("Answer:", answer)