In [20]:
import torch

In [21]:
torch.cuda.is_available()

True

In [22]:
from torch import cuda, bfloat16

In [23]:
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

print(device)

cuda:0


In [24]:
import transformers

In [25]:
bnb_config = transformers.BitsAndBytesConfig(
# bnb_config = BitsAndBytesConfig(
    #load_in_4bit=True,
    load_in_4bit_fp32_cpu_offload=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [26]:
#model_name = 'tiiuae/falcon-40b-instruct'
# model_name = 'tiiuae/falcon-7b-instruct'
model_name = "meta-llama/Llama-2-7b-chat-hf"

In [30]:
%%time
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=True,
    offload_folder="/mnt/d/AAA/VsCode/offload/meta-Llama-2-7b-chat-hf" # Disk path to load the model when GPU, CPU runs out of memo
    #device_map=device_map 
)
model.eval()
print(f"Model loaded on {device}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

Model loaded on cuda:0
CPU times: user 1min 9s, sys: 1min, total: 2min 10s
Wall time: 20min 27s


In [31]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]



Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [32]:
from transformers import StoppingCriteria, StoppingCriteriaList

# we create a list of stopping criteria
stop_token_ids = [
    tokenizer.convert_tokens_to_ids(x) for x in [
        ['Human', ':'], ['AI', ':']
    ]
]

stop_token_ids

[[0, 29901], [23869, 29901]]

In [33]:
# We need to convert these into `LongTensor` objects:
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids

[tensor([    0, 29901], device='cuda:0'),
 tensor([23869, 29901], device='cuda:0')]

In [34]:
# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [35]:
# Now we're ready to initialize the HF pipeline. There are a few additional parameters that we must define here. Comments explaining these have been included in the code.
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [None]:
%%time
# Confirm this is working
res = generate_text("Explain to me the difference between nuclear fission and fusion.")
print(res[0]["generated_text"])

In [16]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline

In [17]:
# template for an instruction with no input
prompt = PromptTemplate(
    input_variables=["instruction"],
    template="{instruction}"
)

In [18]:
%%time
llm = HuggingFacePipeline(pipeline=generate_text)

llm_chain = LLMChain(llm=llm, prompt=prompt)

CPU times: user 262 µs, sys: 98 µs, total: 360 µs
Wall time: 387 µs


In [20]:
%%time
print(llm_chain.predict(
    instruction="Explain to me the difference between nuclear fission and fusion."
).lstrip())

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Nuclear fission is a nuclear reaction in which the nucleus of an atom splits into two smaller nuclei, releasing a large amount of energy in the form of radiation and kinetic energy of the fragments. Fusion, on the other hand, is a nuclear reaction in which two lighter atomic nuclei combine to form a heavier atomic nucleus, releasing a large amount of energy in the form of radiation and kinetic energy of the resulting nucleus. In both reactions, the binding energy of the nucleus is released, allowing for the release of a significant amount of energy.
CPU times: user 6min 32s, sys: 2min 17s, total: 8min 50s
Wall time: 9min 14s
