In [1]:
from huggingface_hub import hf_hub_download

from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain, ConversationChain
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory


import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch 

import warnings
warnings.filterwarnings('ignore')

In [3]:
from llama_cpp import Llama  

ModuleNotFoundError: No module named 'llama_cpp'

In [2]:
# Hugging Face Hub coordinates of the weights used by the llama.cpp cells:
# the file name to fetch, and the repository that hosts it.
model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin"
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"  # quantized model repository

In [None]:
# Fetch the weights file from the Hub; the returned value is handed to Llama
# below as its model_path.
model_path = hf_hub_download(
    filename=model_basename,
    repo_id=model_name_or_path,
)

# Clear the name before rebuilding the model.
# NOTE(review): presumably this releases a previously loaded instance when the
# cell is re-run -- confirm before removing.
lcpp_llm = None
lcpp_llm = Llama(
    model_path=model_path,
    n_gpu_layers=32,  # GPU offload setting
    n_batch=512,
    n_threads=2,      # CPU cores
)

# Echo the GPU-layer setting recorded on the loaded model.
print(lcpp_llm.params.n_gpu_layers)

Prompt

In [None]:
# The user's request, embedded into a plain SYSTEM / USER / ASSISTANT transcript.
prompt = "Write a linear regression in python"

# Built line by line; the template ends with a newline after "ASSISTANT:" so
# the model's completion starts on its own line.
prompt_template = (
    "SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully.\n"
    "\n"
    f"USER: {prompt}\n"
    "\n"
    "ASSISTANT:\n"
)


# Run one completion over the prepared prompt with the llama.cpp model.
response = lcpp_llm(
    prompt=prompt_template,
    max_tokens=256,
    temperature=0.5,
    top_p=0.95,
    repeat_penalty=1.2,
    top_k=150,
    echo=True,
)

# Full response object, for inspection.
print(response)

# Text of the first choice only: index "choices", then element 0, then "text".
print(response["choices"][0]["text"])

First, create a tokenizer and a `transformers.pipeline` text-generation pipeline, then wrap that pipeline in LangChain's `HuggingFacePipeline` so it can be used as an LLM.

In [None]:
# Llama 2 Chat model with 7B parameters (Hub model id).
model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)

# Text-generation pipeline built straight from the model id.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to
# run locally -- confirm it is actually required for this model.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # --- loading / placement ---
    device_map="auto",            # CPU or GPU, based on availability
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    # --- generation ---
    do_sample=True,               # sampling is required for more creative output
    top_k=10,                     # controls the sampling process
    max_length=1000,              # for generated text
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,  # EOS specification
)

In [None]:
# Wrap the transformers pipeline so it can be called like a LangChain LLM.
# NOTE(review): temperature 0 is requested here although the pipeline was
# created with do_sample=True -- confirm which setting is intended.
llm = HuggingFacePipeline(
    model_kwargs={"temperature": 0},
    pipeline=pipeline,
)

# This rebinds `prompt`, replacing the value used by the llama.cpp cell.
prompt = "Describe India and its alliances"

print(llm(prompt))

In [None]:
# Load the tokenizer and the model explicitly, authenticating with the stored
# Hugging Face token (use_auth_token=True).
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    use_auth_token=True,
)

# From here on `model` is a loaded model object, not the model-id string used
# by the earlier pipeline cell.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    use_auth_token=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Text-generation pipeline around the already-loaded model.
# NOTE(review): the model above is loaded as float16 while bfloat16 is requested
# here -- confirm which dtype is intended.
# NOTE(review): min_new_tokens=-1 looks like a "no minimum" placeholder -- verify.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    do_sample=True,
    temperature=0.75,
    top_k=30,
    min_new_tokens=-1,
    max_new_tokens=512,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)


In [4]:
# Shell escape: print the NVIDIA driver's report of the GPU, its driver/CUDA
# versions and current memory usage.
!nvidia-smi

Tue Jun  4 19:49:13 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.78                 Driver Version: 550.78         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce MX450           On  |   00000000:01:00.0 Off |                  N/A |
| N/A   49C    P8             N/A / ERR!  |       5MiB /   2048MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Re-create the LangChain wrapper around whichever `pipeline` is currently defined.
llm = HuggingFacePipeline(
    model_kwargs={"temperature": 0},
    pipeline=pipeline,
)

In [None]:
# Llama 2 chat prompt delimiters.
# B_INST / E_INST open and close a user instruction.
B_INST, E_INST = "[INST]", "[/INST]"

# B_SYS / E_SYS open and close the system prompt placed inside the instruction.
# The closing tag carries a slash ("<</SYS>>"); repeating the opening tag would
# leave the system block unterminated.
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

In [None]:
# Default system prompt: two paragraphs separated by a blank line, with a
# trailing newline. Built from adjacent literals, one sentence per line.
DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature.\n"
    "\n"
    "If a question does not make any sense, or is not factually coherent, "
    "explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information.\n"
)

In [None]:
# The user question to place inside the instruction block.
instruction = "Who won the T20 Cricket World Cup 2022"

# System section: opening tag + default system prompt + closing tag.
SYSTEM_PROMPT = "".join((B_SYS, DEFAULT_SYSTEM_PROMPT, E_SYS))

# Final prompt: instruction delimiters wrapped around the system section
# followed directly by the question.
template = f"{B_INST}{SYSTEM_PROMPT}{instruction}{E_INST}"