In [None]:
# https://medium.com/towards-data-science/topic-modeling-with-llama-2-85177d01e174

In [1]:
# Run if working locally
%load_ext autoreload
%autoreload 2
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import sqlite3
from sqlite3 import Error
import pickle
import os, sys
import config

config.root_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, config.root_path)

<IPython.core.display.Javascript object>

In [6]:
from datasets import load_dataset

dataset = load_dataset("CShorten/ML-ArXiv-Papers")["train"]

# Extract abstracts to train on and corresponding titles
abstracts = dataset["abstract"]
titles = dataset["title"]

<IPython.core.display.Javascript object>

In [5]:
# The abstract of "Attention Is All You Need"
print(abstracts[13894])

  The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks in an encoder-decoder configuration. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer, based
solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to be
superior in quality while being more parallelizable and requiring significantly
less time to train. Our model achieves 28.4 BLEU on the WMT 2014
English-to-German translation task, improving over the existing best results,
including ensembles by over 2 BLEU. On the WMT 2014 English-to-French
translation task, our model establishes a new single-model state-of-the-art
BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction
of the training costs of the best models from the literature. We show that the
Transfor

<IPython.core.display.Javascript object>

In [36]:
from torch import bfloat16
import torch
import transformers

model_id = "daryl149/llama-2-7b-chat-hf"

# Quantization to load an LLM with less GPU memory
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type="nf4",  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16,  # Computation type
)

<IPython.core.display.Javascript object>

In [38]:
# # Llama 2 Tokenizer
# tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# # Llama 2 Model
# model = transformers.AutoModelForCausalLM.from_pretrained(
#     model_id,
#     trust_remote_code=True,
#     quantization_config=bnb_config,
#     device_map="auto",
# )


def load_model(model_name, bnb_config):
    """
    Loads model and model tokenizer

    :param model_name: Hugging Face model name
    :param bnb_config: Bitsandbytes configuration
    """

    # Get number of GPU device and set maximum memory
    n_gpus = torch.cuda.device_count()
    max_memory = f"{40960}MB"

    # Load model
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # dispatch the model efficiently on the available resources
        max_memory={i: max_memory for i in range(n_gpus)},
    )

    # Load model tokenizer with the user authentication token
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Set padding token as EOS token
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


model, tokenizer = load_model(model_id, bnb_config)
model.eval()

ValueError: The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.

<IPython.core.display.Javascript object>

In [27]:
# Our text generator
generator = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1,
)

<IPython.core.display.Javascript object>

In [31]:
prompt = "Could you explain to me how 4-bit quantization works as if I am 5?"
res = generator(prompt, max_length=100, num_return_sequences=1)
print(res[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Could you explain to me how 4-bit quantization works as if I am 5? 7:39 AM - Marth 5:00 AM That's an interesting point, but it is not a really precise, precise definition of quantization. 8:29 AM - Marth 5:00 AM I was asking you a question that maybe you could clarify now. 9:40 AM - Marth 5:30 AM Why don't you know about quantization right now? 10:20 AM -


<IPython.core.display.Javascript object>