 # LangChain + LLM + ChromaDB

In [9]:
# SETUP

!pip3 install transformers
!pip3 install einops
!pip3 install accelerate
!pip3 install unstructured-pytesseract
!pip3 install unstructured-inference
!pip3 install sentence_transformers
!pip3 install chromadb

#!pip3 install protobuf==3.20.*

!pip3 install langchain



In [10]:
!pip3 install unstructured
!pip3 install pillow_heif
#!pip3 install cmake
!pip3 install pikepdf pypdf
#!pip3 install python-poppler



In [11]:
# IMPORTS

from transformers import AutoTokenizer
import transformers
import torch

import langchain

langchain.__version__

'0.1.16'

## LLM

In [12]:
# LLM using HuggingFace GPT2

from langchain import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    model_kwargs={
        "max_length": 1024,
        'do_sample': True,
        'top_k': 10,
        'num_return_sequences': 2,
        #'device_map': 'auto',
        'trust_remote_code': True,
        'torch_dtype': torch.bfloat16
    },
    pipeline_kwargs={"max_new_tokens": 50},
    device=0, # With GPU
)

## DOCUMENTS & SPLITTER & VECTORSTORE

In [13]:
# INDEXING

from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain

# 1. LOAD
loader = OnlinePDFLoader("https://www.oracle.com/a/ocom/docs/cloud/cloud-security-for-dummies.pdf")
document = loader.load()

# 2. SPLIT
text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=64)
documents = text_splitter.split_documents(document)

# 3. EMBED & STORE
embeddings = HuggingFaceEmbeddings()
vectorstore = Chroma.from_documents(documents, embeddings)

# RETRIEVAL & GENERATION
qa = ConversationalRetrievalChain.from_llm(
    llm,
    vectorstore.as_retriever(),
    return_source_documents=True,
)



## QUESTION TO LLM = MY_QUERY + CONTEXT_from_VECTORSTORE -> ANSWER FROM LLM

In [15]:
# QUESTION TO LLM

import warnings
warnings.filterwarnings('ignore')

chat_history = []

# Examples: QUERYS PROMPTS:

my_query ="what is a SaaS?"

result = qa({"question": my_query, "chat_history": chat_history})
print(result["answer"])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

» In a SaaS offering, the cloud provider is typically responsible for providing security for the entire technology stack from the data center up to the application, whereas the customer is responsible for ensuring that the SaaS application (including configurations) and its data are used in a secure manner by authorized users.

» In a SaaS offering, the cloud provider is typically responsible for providing security for the entire technology stack from the data center up to the application, whereas the customer is responsible for ensuring that the SaaS application (including configurations) and its data are used in a secure manner by authorized users.

21

Oracle provides infrastructure as a service (IaaS), platform as a service (PaaS), and software as a service (SaaS) cloud offerings, including the following Oracle cloud ser