In [1]:
!nvidia-smi

Tue Mar 26 14:13:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0  On |                  Off |
|  0%   41C    P8              17W / 490W |   4710MiB / 24564MiB |      5%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

## Importing libraries

In [2]:
import accelerate 
import transformers
import json
import os
from huggingface_hub import login

REPO_ID = "meta-llama/Llama-2-7b-chat-hf"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
cache_dir = "./models"
os.environ['HF_HOME'] = './cache/'
environment = "local"

## Generating the device map on laptop

In [3]:
# Show that when we do this neither GPU nor CPU memory increases
login("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", add_to_git_credential=True)
config = transformers.AutoConfig.from_pretrained(REPO_ID)
with accelerate.init_empty_weights():
    fake_model = transformers.AutoModelForCausalLM.from_config(config)
    
device_map = accelerate.infer_auto_device_map(fake_model, max_memory={0: "10GiB", "cpu": "6GiB"})
#print(json.dumps(device_map, indent=4))

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/fish/.cache/huggingface/token
Login successful


## Loading the model memory efficiently

* LLM.int8() quantization
* Offloading: Uses GPU memory to the maximum, then CPU and finally memory-mapped chunks on disk
  * How offloading works: https://huggingface.co/docs/accelerate/usage_guides/big_modeling

**Note**: If you use WSL/Windows, you might run into an issue where the `bitsandbytes` library cannot find the file `libbitsandbytes_cpu.so`. If so follow the instructions [here](https://github.com/TimDettmers/bitsandbytes/issues/156#issuecomment-1474056975)

In [4]:
import transformers

tokenizer = transformers.LlamaTokenizer.from_pretrained(REPO_ID)

# Check what happens when device_map = auto
# This will fail as the model in FP32 precision cannot be fit on CPU
# model = transformers.LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")
# This will fail as the model in FP32 precision cannot be fit on GPU
# model = transformers.LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf").to("cuda")
# This will also fail as the model cannot be fit on GPU fully even with the quantization
# model = transformers.LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", device_map="auto", load_in_8bit=True)

if environment == "local":
    model = transformers.LlamaForCausalLM.from_pretrained(
        REPO_ID, 
        device_map=device_map, 
        offload_folder="/tmp/.offload",
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True,
    )
elif environment == "colab":
    model = transformers.LlamaForCausalLM.from_pretrained(
        REPO_ID, 
        device_map="auto", 
        load_in_8bit=True,
    )
else:
    raise ValueError(f"Environment can only be local/colab. Got {environment}")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [5]:
# without quantization memory footprint-> 27020779520 (~25.1GB)
# The more weights on GPU the better the memory reduction
# with quantization memory footprint -> 10710692352 (~10GB)
print(f"Memory footprint in bytes: {model.get_memory_footprint()}")

Memory footprint in bytes: 13094109184


In [6]:
# Check if it provided the full list of layers without device map 
#print(json.dumps(model.hf_device_map, indent=4))

## Inferring with the loaded model

In [7]:
import time 

# Remember Llama is not instruction finetuned
batch = tokenizer(
    "The Optymize Protocol is a first-of-its-kind multi-blockchain solution that combines both yield enhancement and risk mitigation for crypto assets. Explain more on the keywords here",
    return_tensors="pt", 
    add_special_tokens=False
)

# /home/thushv89/anaconda3/envs/ml.torch/lib/python3.9/site-packages/transformers/generation/utils.py:1405: UserWarning: You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on cpu, whereas the model is on meta. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('meta') before running `.generate()`.
batch = {k: v for k, v in batch.items()}
n_input_tokens = batch["input_ids"].shape[-1]

t1 = time.perf_counter()
generated = model.generate(batch["input_ids"].to("cuda"), max_length=n_input_tokens+25)
t2 = time.perf_counter()
print(tokenizer.decode(generated[0]))
n_generated = generated.shape[-1]-batch["input_ids"].shape[-1]
print(f"It took {t2-t1}s to generate the sequence of {n_generated} tokens ({n_generated/(t2-t1)} tokens/s).")

Keyword arguments {'add_special_tokens': False} not recognized.


The Optymize Protocol is a first-of-its-kind multi-blockchain solution that combines both yield enhancement and risk mitigation for crypto assets. Explain more on the keywords here:

Yield enhancement: This refers to the process of increasing the return on investment (ROI) for
It took 26.480837741999494s to generate the sequence of 25 tokens (0.9440788937107214 tokens/s).


# Optymize Special

In [8]:
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

import pandas as pd
import os
import torch

from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.documents.base import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain import hub
from langchain.chains import RetrievalQA
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

def make_rag_chain(model, retriever, rag_prompt = None):
    # We will use a prompt template from langchain hub.
    if not rag_prompt:
        rag_prompt = hub.pull("rlm/rag-prompt")

    # And we will use the LangChain RunnablePassthrough to add some custom processing into our chain.
    rag_chain = (
            {
                "context": RunnableLambda(get_question) | retriever | format_docs,
                "question": RunnablePassthrough()
            }
            | rag_prompt
            | model
    )

    return rag_chain


def get_question(input):
    if not input:
        return None
    elif isinstance(input,str):
        return input
    elif isinstance(input,dict) and 'question' in input:
        return input['question']
    elif isinstance(input,BaseMessage):
        return input.content
    else:
        raise Exception("string or dict with 'question' key expected as RAG chain input.")
        
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

if torch.cuda.is_available():
    model_id = "meta-llama/Llama-2-7b-chat-hf"
    model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=cache_dir, device_map=device_map)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False

pipe = pipeline(
     "text-generation", 
     model=model, 
     tokenizer=tokenizer,
     return_tensors='pt',
     max_length=512,
     max_new_tokens=512,
     model_kwargs={"torch_dtype": torch.bfloat16},
    )

llm = HuggingFacePipeline(
 pipeline=pipe,
 model_kwargs={"temperature": 0.7, "max_length": 512},
)

df = pd.read_excel("bot2/optymize.xlsx")
data_list = df.values.ravel().tolist()
document_list = []

for content in data_list:
    document = Document(content=content, page_content=content)
    document_list.append(document)
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
docs = text_splitter.split_documents(document_list)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(docs, embedding_function)

retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 4, 'fetch_k': 20})
prompt = hub.pull("rlm/rag-prompt")
rag_chain = make_rag_chain(llm, retriever, rag_prompt = prompt)

questions = [
        "what is Optymize?",
        "how can i deposite coin on Optymize?",
        "what is Optymize's twitter?",
        "what is gOPZ tokens?",
        "what is Optymize tokenomics?",
        "what is Optymize details tokenomics?",
        "Optymize Vault Model – How does it works?, detail explaination",
        "Optymize Vault Model – How does it works?"
        ]
for q in questions:
    print("\n--- QUESTION: ", q)
    #print("* Ans:\n", rag_chain.invoke(q))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()



--- QUESTION:  what is Optymize?

--- QUESTION:  how can i deposite coin on Optymize?

--- QUESTION:  what is Optymize's twitter?

--- QUESTION:  what is gOPZ tokens?

--- QUESTION:  what is Optymize tokenomics?

--- QUESTION:  what is Optymize details tokenomics?

--- QUESTION:  Optymize Vault Model – How does it works?, detail explaination

--- QUESTION:  Optymize Vault Model – How does it works?


In [None]:
llm.invoke("what is Optymize?")

Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
