In [40]:
!pip install langchain
!pip install huggingface_hub
!pip install sentence_transformers



In [41]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

from pathlib import Path
import os

# All data locations are resolved relative to the current working directory.
BASE_DIR = Path.cwd()
# Directory whose entries load_docs() turns into PDF loaders.
TMP_DIR = BASE_DIR / 'data' / 'tmp'
# Vector-store location (not referenced by the cells visible in this notebook).
LOCAL_STORE_DIR = BASE_DIR / 'data' / 'vector_store'


def load_docs():
  """Build one UnstructuredPDFLoader per PDF file found directly inside TMP_DIR.

  Only regular files with a ``.pdf`` suffix (case-insensitive) are picked up,
  so stray files or sub-directories in TMP_DIR are never handed to the PDF
  loader. Paths are sorted so the loader order is stable between runs.

  Returns:
    list: UnstructuredPDFLoader instances (loaders, not loaded documents);
    empty when TMP_DIR contains no PDF files.

  Raises:
    FileNotFoundError: if TMP_DIR does not exist.
  """
  pdf_paths = sorted(
      path for path in TMP_DIR.iterdir()
      if path.is_file() and path.suffix.lower() == '.pdf'
  )
  return [UnstructuredPDFLoader(path.as_posix()) for path in pdf_paths]

In [42]:
from langchain.text_splitter import CharacterTextSplitter

def split_docs(docs, chunk_size=1000, chunk_overlap=0):
  """Split documents into chunks with a CharacterTextSplitter.

  Args:
    docs: documents to pass to ``CharacterTextSplitter.split_documents``.
    chunk_size: ``chunk_size`` given to the splitter. Defaults to 1000, the
      value that used to be hard-coded here.
    chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 0,
      the value that used to be hard-coded here.

  Returns:
    Whatever ``split_documents`` returns for ``docs``.
  """
  splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  return splitter.split_documents(docs)

In [44]:
!pip install chromadb
!pip install unstructured
!pip install pdf2image
!pip install pdfminer



In [45]:
!pip show pdfminer
!pip install --upgrade pdfminer.six

Name: pdfminer
Version: 20191125
Summary: PDF parser and analyzer
Home-page: http://github.com/euske/pdfminer
Author: Yusuke Shinyama
Author-email: yusuke@shinyama.jp
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: pycryptodome
Required-by: 


In [46]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model built with HuggingFaceEmbeddings' default settings.
embedds = HuggingFaceEmbeddings()

# Build the index straight from the PDF loaders returned by load_docs(),
# splitting text into 1000-character chunks with no overlap.
index = VectorstoreIndexCreator(
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
    embedding=embedds,
).from_loaders(load_docs())

ImportError: ignored

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [None]:
from langchain.vectorstores import FAISS

# load_docs() returns PDF *loaders*, not documents, so each loader has to be
# loaded before its content can be indexed.
docs = [doc for loader in load_docs() for doc in loader.load()]
if docs:
  # Split into chunks before embedding. Real embedding/indexing errors are
  # allowed to surface here instead of being hidden behind a bare `except`.
  db = FAISS.from_documents(split_docs(docs), embedds)
else:
  print("There are no documents loaded")

There are no documents loaded


In [None]:
# Example query: fetch the entries of the FAISS store `db` most similar to `q`.
# Note that this rebinds `docs` to the search results.
q = "What is this thing , or how does it work"
docs = db.similarity_search(q)

In [None]:
import textwrap

def special_print(text, width=110):
  """Return `text` with every newline-separated line wrapped to `width` columns.

  Each line is wrapped on its own with ``textwrap.fill`` and the results are
  joined back with newlines, so existing line breaks are kept. Despite the
  name, nothing is printed: the wrapped string is returned.
  """
  wrapped = []
  for raw_line in text.split('\n'):
    wrapped.append(textwrap.fill(raw_line, width=width))
  return '\n'.join(wrapped)


In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

# LLM wrapper for the "google/flan-t5-xl" repo; temperature 0 and
# max_length 512 are forwarded through model_kwargs.
generator = HuggingFaceHub(
    repo_id="google/flan-t5-xl",
    model_kwargs={"temperature": 0, "max_length": 512},
)



In [None]:
# Question-answering chain driven by `generator`, using the "stuff" chain type.
chain = load_qa_chain(generator, chain_type="stuff")

In [None]:
# Retrieve the entries of `db` most similar to the question, then pass them
# together with the question to the QA chain.
q = 'What are types of jokes'
docs = db.similarity_search(q)
chain.run(input_documents=docs, question=q)

### This part uses the index's vector store as the retriever inside the chain

In [None]:
from langchain.chains import RetrievalQA
# Retrieval QA chain: the index's vector store acts as the retriever, the
# "stuff" chain type is used, and the question is supplied under the
# "question" input key.
QAChain = RetrievalQA.from_chain_type(
    llm=generator,
    chain_type="stuff",
    retriever=index.vectorstore.as_retriever(),
    input_key="question",
)

In [None]:
# Chain.run() rejects a call with no input, so pass the question `q` defined
# earlier; it is mapped onto the chain's single "question" input key.
QAChain.run(q)