In [4]:
# !pip install transformers bitsandbytes accelerate boto3
!pip install boto3 bitsandbytes accelerate

BUCKET_NAME = 'workhsop-llama-weights'
LORA_FILENAME = 'lora_tensors.safetensors'



In [11]:
import boto3
import os
def assumed_role_session():
    """Return the boto3 session used by this notebook for AWS access.

    NOTE(review): despite its name, this function does not assume any role.
    It returns a plain ``boto3.session.Session()``, i.e. whatever credentials
    the environment already provides. An STS client used to be created here
    but was never used, so it has been removed.

    TODO: if the "notebookAccessRole" role created with AWS CDK really has to
    be assumed, call STS ``assume_role`` with that role's ARN and build the
    session from the returned temporary credentials.

    Returns:
        boto3.session.Session: a session with default configuration.
    """
    return boto3.session.Session()

def download_llama():
    """Download every object under the ``llama/`` prefix of BUCKET_NAME.

    Files are written into a local ``llama`` directory (created if missing),
    using only the base name of each S3 key.

    Caveat: because only the base name is kept, objects in nested "folders"
    below ``llama/`` are flattened into one directory, and two keys with the
    same base name overwrite each other.
    """
    session = assumed_role_session()
    my_bucket = session.resource('s3').Bucket(BUCKET_NAME)

    # exist_ok=True replaces the separate exists()/mkdir() pair: it is safe to
    # re-run and has no check-then-create race.
    os.makedirs('llama', exist_ok=True)

    # filter() already yields the matching objects; no extra .all() is needed.
    for s3_object in my_bucket.objects.filter(Prefix='llama/'):
        # Keys that end with "/" (folder placeholders) have an empty base name
        # and cannot be downloaded as files, so they are skipped.
        filename = os.path.basename(s3_object.key)
        if filename:
            my_bucket.download_file(s3_object.key, os.path.join('llama', filename))

In [12]:
# Download the files under the bucket's "llama/" prefix into ./llama.
download_llama()

KeyboardInterrupt: 

In [5]:
# Run the `nvidia-smi` shell command to see whether an NVIDIA GPU/driver is
# available on this machine.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [6]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    LlamaForCausalLM,
)

# Load the model weights and the matching tokenizer from the local "llama"
# directory. No quantization config is passed here.
model = LlamaForCausalLM.from_pretrained("llama")
tokenizer = AutoTokenizer.from_pretrained("llama")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Quantization:

Quantization reduces computational and memory costs by representing weights and activations with low-precision data types. For example, instead of 32-bit floating point, we can use 8-bit integers. This reduces memory usage and energy consumption, and it speeds up operations like matrix multiplication.

**Quantization techniques:** Two common quantization cases are float32 to float16 and float32 to int8. The former is straightforward because both are floating-point formats, while the latter involves mapping the float32 range onto the int8 range using two quantization parameters: a scale and a zero-point.


![quant ft](images/quant-ft.png)


![quant perf](images/quant-perf.png)


In [7]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    LlamaForCausalLM,
)

# 4-bit quantization settings for bitsandbytes:
#   - weights are loaded in 4 bits using the "nf4" quantization type,
#   - double quantization is turned off,
#   - computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the model from the local "llama" directory with the quantization
# config applied. As the recorded cell output shows, this raises a
# RuntimeError when no GPU is available.
model = LlamaForCausalLM.from_pretrained(
    "llama",
    quantization_config=bnb_config,
)

# The tokenizer has no weights to quantize, so it is loaded without
# `quantization_config`. Passing it here only attached a stray keyword
# argument to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    "llama",
)

RuntimeError: No GPU found. A GPU is needed for quantization.

In [10]:
# Prompt template, meant to be filled in with `prompt.format(text=...)`.
# The doubled braces around the JSON example are str.format escapes: they
# render as single literal braces, while `{text}` is the only placeholder.
# The pieces below are concatenated into exactly one string; note the single
# space that follows the JSON example before its newline.
prompt = (
    "Today's date is May 30, 2024. "
    "you are given a string with information of an event with time and location. "
    "you should extract the information from the given text in json format as shown:\n"
    '{{"datetime": "yyyy-mm-dd", "location": "str"}} \n'
    "\n"
    "here's the text: {text}\n"
    "json:\n"
)

In [11]:
# Text file holding the sample inputs, one example per line.
data_file = "examples.txt"
examples = []
# Sample data
with open(data_file, 'r') as file:
    for line in file:
        # Strip before testing: a raw line read from a file still carries its
        # newline, so a bare `if line:` is always true and blank lines would
        # be appended as empty strings.
        line = line.strip()
        if line:
            examples.append(line)

In [12]:
# Upper bound on the number of tokens generated per example. The previous
# `max_length=150` also counted the prompt tokens, so a long prompt left too
# little room for the answer and the JSON could be cut off mid-object.
MAX_NEW_TOKENS = 64

# For each example: fill the prompt template, generate a completion, and print
# the example followed by the first JSON-looking object after "json:".
for example in examples:
    inputs = tokenizer(prompt.format(text=example), return_tensors="pt", return_attention_mask=False)
    # Put the input tensors on the same device as the model before generating.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    text = tokenizer.batch_decode(outputs)[0]
    print(example)
    # The decoded text contains the prompt as well; keep what follows the last
    # "json:" marker.
    response = text.split("json:")[-1]
    closing_brace = response.find("}")
    # Cut after the first "}". If there is none (find() returns -1), print the
    # raw response instead of silently printing an empty string.
    print(response[:closing_brace + 1] if closing_brace != -1 else response)
    print("=======")

On January 15, 2023, display the flooding events in Jakarta.

{"datetime": "2023-01-15", "location": "Jakarta"}
Show images of burn scars in the Colorado Rockies from March 12, 2023.

{
    "datetime": "2023-03-12",
    "location": "Colorado Rockies"
}
Crop types in Sudan as observed on May 20, 2023.

{
    "datetime": "2023-05-20",
    "location": "Sudan"
}
February 8, 2023: Provide satellite imagery of flooding in Miami.

{
"datetime": "2023-02-08",
"location": "Miami"
}
Burn scars in Oregon, noted on April 17, 2023.


{
    "datetime": "2023-04-17",
    "location": "Oregon"
}
July 25, 2023: Identify crop types in Northern India.

{
    "datetime": "2023-07-25T00:00:00",
    "location": "Northern India"
}
Flooding in central China on June 30, 2023.

{
    "datetime": "2023-06-30",
    "location": "Shanghai"
}
Satellite images of burn scars in Montana from August 14, 2023.

{
  "datetime": "2023-08-14",
  "location": "Montana"
}
December 5, 2023: Show crop types in Chile.


{"datetime

In [None]:
# Run the `nvidia-smi` shell command again to inspect the GPU state.
!nvidia-smi

In [None]:
# Debugging aid: show the fully rendered prompt for the current value of
# `example`. NOTE(review): `example` is not defined in this cell — it relies
# on the variable left over from an earlier `for example in ...` loop.
prompt.format(text=example)

In [None]:
# Debugging aid: display the current value of `example` (defined by an
# earlier cell, not here).
example