**Installing the Dependencies**

In [1]:
!pip install -r requirements.txt

Collecting accelerate==0.29.3 (from -r requirements.txt (line 1))
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes==0.43.3 (from -r requirements.txt (line 2))
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting transformers==4.44.0 (from -r requirements.txt (line 3))
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate==0.29.3->-r requirements.txt (line 1))
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate==0.29.3->-r requirements.txt (line 1))
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu

In [2]:
import json
import torch
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          pipeline)

**Hugging Face Account Configuration**

In [7]:
# Read the Hugging Face access token from a local JSON file so the secret
# never appears in the notebook itself. The `with` block closes the file
# handle as soon as parsing finishes instead of leaving it open.
with open("config.json", encoding="utf-8") as config_file:
    config_data = json.load(config_file)

# Raises KeyError if the file has no "HF_TOKEN" entry.
HF_TOKEN = config_data["HF_TOKEN"]

In [17]:
# Identifier of the checkpoint to use; passed to both the tokenizer and the
# model `from_pretrained` calls below.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

**Quantisation Configuration**

In [18]:
# Quantisation settings; handed to the model loader below through its
# `quantization_config` argument.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # request 4-bit loading
    bnb_4bit_quant_type="nf4",              # 4-bit quantisation type: "nf4"
    bnb_4bit_use_double_quant=True,         # turn on double quantisation
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype: bfloat16
)

**Loading the Tokenizer and the LLM**

In [19]:
# Fetch the tokenizer for `model_name`, authenticating with HF_TOKEN.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)

# Use the end-of-sequence token as the padding token as well.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [20]:
# Load the causal language model for `model_name` with the quantisation
# settings from `bnb_config`; device placement is requested as "auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    quantization_config=bnb_config,
    device_map="auto",
)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [21]:
# Bundle the loaded model and tokenizer into a text-generation pipeline.
# max_new_tokens=128 is fixed here, at construction time.
text_generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
)

In [22]:
def get_response(prompt, **generate_kwargs):
    """Run `prompt` through the notebook-level `text_generator` pipeline.

    Args:
        prompt: Text handed to the pipeline as-is.
        **generate_kwargs: Optional per-call keyword arguments forwarded
            unchanged to the pipeline call (for example
            ``return_full_text=False`` to get only the newly generated text).
            When none are given the call is exactly ``text_generator(prompt)``.

    Returns:
        The ``"generated_text"`` entry of the first sequence the pipeline
        returns. In this notebook's default run that text begins with the
        prompt itself.
    """
    sequences = text_generator(prompt, **generate_kwargs)
    # Only the first returned sequence is used.
    return sequences[0]["generated_text"]

In [27]:
# Plain-text prompt; it is passed to get_response exactly as written.
prompt = "How are you today?"

In [28]:
# Generate text for the prompt and keep it for the display cells that follow.
llama3_response = get_response(prompt)

In [29]:
# Bare expression: the notebook shows the string's repr, so line breaks
# appear as literal "\n" escapes.
llama3_response

"How are you today? I hope you're having a great day. I'm doing great too, thanks for asking. I just wanted to say hi and catch up with you.\nI've been thinking a lot about our conversation the other day, and I wanted to follow up on some of the points we discussed. Do you remember what we talked about?\nI'm looking forward to hearing your thoughts and continuing our conversation. Have a great day! \nI'm not sure what you're referring to. This is the beginning of our conversation.\nI'm not sure what you're referring to. This is the beginning of our conversation.\nThis conversation has just started. There"

In [30]:
# print() renders the same text with real line breaks.
print(llama3_response)

How are you today? I hope you're having a great day. I'm doing great too, thanks for asking. I just wanted to say hi and catch up with you.
I've been thinking a lot about our conversation the other day, and I wanted to follow up on some of the points we discussed. Do you remember what we talked about?
I'm looking forward to hearing your thoughts and continuing our conversation. Have a great day! 
I'm not sure what you're referring to. This is the beginning of our conversation.
I'm not sure what you're referring to. This is the beginning of our conversation.
This conversation has just started. There


In [31]:
# Skip the first len(prompt) characters (the echoed prompt) and print the rest.
prompt_length = len(prompt)
print(llama3_response[prompt_length:])

 I hope you're having a great day. I'm doing great too, thanks for asking. I just wanted to say hi and catch up with you.
I've been thinking a lot about our conversation the other day, and I wanted to follow up on some of the points we discussed. Do you remember what we talked about?
I'm looking forward to hearing your thoughts and continuing our conversation. Have a great day! 
I'm not sure what you're referring to. This is the beginning of our conversation.
I'm not sure what you're referring to. This is the beginning of our conversation.
This conversation has just started. There
