In [1]:

!pip install vllm langchain langchain-community chromadb langchain-openai > /dev/null
!pip install sympy==1.11.1
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


[0mCollecting sympy==1.11.1
  Using cached sympy-1.11.1-py3-none-any.whl.metadata (12 kB)
Using cached sympy-1.11.1-py3-none-any.whl (6.5 MB)
[0mInstalling collected packages: sympy
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1+cu121 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.11.1 which is incompatible.[0m[31m
[0mSuccessfully installed sympy
[0mLooking in indexes: https://download.pytorch.org/whl/cu118
Collecting sympy==1.13.1 (from torch)
  Using cached https://download.pytorch.org/whl/sympy-1.13.1-py3-none-any.whl (6.2 MB)
[0mInstalling collected packages: sympy
[0mSuccessfully installed sympy


In [2]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import VLLM

llm = VLLM(
    model="facebook/opt-2.7b",
    trust_remote_code=True,
    max_new_tokens=64,
    top_k=20,
    top_p=0.9,
    temperature=0.7,
)

def create_prompt():
    system_prompt = (
        "Use the given context to answer the question based on the book 'Foundations of Cryptography' by Oded Goldreich. "
        "Use two sentences maximum."
        "Answer informally and concisely."
        "Avoid repeating information in the answer."
        "Context: {context}"
    )
    return ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("user", "{input}"),
    ])

def load_and_split_documents(file_path: str, chunk_size: int, chunk_overlap: int):
    loader = TextLoader(file_path)
    documents = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)

def create_vectorstore(documents, model_name="all-MiniLM-L6-v2", persist_directory="chroma_db"):
    embeddings = SentenceTransformerEmbeddings(model_name=model_name)
    return Chroma.from_documents(documents, embeddings, persist_directory=persist_directory)

def build_qa_pipeline(file_path: str):
    split_docs = load_and_split_documents(file_path, chunk_size=1000, chunk_overlap=200)
    vectorstore = create_vectorstore(split_docs)
    prompt = create_prompt()
    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    return create_retrieval_chain(vectorstore.as_retriever(), question_answer_chain)

qa_pipeline = build_qa_pipeline("/content/cryptography.txt")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


INFO 01-20 20:46:48 config.py:510] This model supports multiple tasks: {'generate', 'embed', 'reward', 'classify', 'score'}. Defaulting to 'generate'.
INFO 01-20 20:46:48 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='facebook/opt-2.7b', speculative_config=None, tokenizer='facebook/opt-2.7b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=facebook/opt-2.7b, 

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


  state = torch.load(bin_file, map_location="cpu")


INFO 01-20 20:47:14 model_runner.py:1099] Loading model weights took 4.9551 GB
INFO 01-20 20:47:17 worker.py:241] Memory profiling takes 2.25 seconds
INFO 01-20 20:47:17 worker.py:241] the current vLLM instance can use total_gpu_memory (14.75GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 01-20 20:47:17 worker.py:241] model weights take 4.96GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 0.48GiB; the rest of the memory reserved for KV Cache is 7.75GiB.
INFO 01-20 20:47:17 gpu_executor.py:76] # GPU blocks: 1587, # CPU blocks: 819
INFO 01-20 20:47:17 gpu_executor.py:80] Maximum concurrency for 2048 tokens per request: 12.40x
INFO 01-20 20:47:20 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilizatio

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:32<00:00,  1.07it/s]

INFO 01-20 20:47:52 model_runner.py:1535] Graph capturing finished in 33 secs, took 0.95 GiB
INFO 01-20 20:47:52 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 38.08 seconds



  embeddings = SentenceTransformerEmbeddings(model_name=model_name)


In [3]:
print(qa_pipeline.invoke({"input": "What is public key cryptography?"})['answer'])

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.91s/it, est. speed input: 239.51 toks/s, output: 33.61 toks/s]


   Answer: The process of encrypting a message with a private key. The process of decrypting
   the message with the public key is called decryption.
   (In Definition B.1.1 the choice of plaintext means the random variable X n , whereas in
  





In [4]:
print(qa_pipeline.invoke({"input": "Who is Goldreich?"})['answer'])

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.29s/it, est. speed input: 709.45 toks/s, output: 27.99 toks/s]



Oded Goldreich is Professor of Computer Science at the Weizmann Institute of Science
and incumbent of the Meyer W. Weisgal Professorial Chair. An active researcher, he
has written numerous papers on cryptography and is widely considered to be one of
the world experts in the area.



