In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every
# file found, one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#By Iffy_ https://www.kaggle.com/code/irfanmansuri/lm-llama3b-instruct/notebook

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch
from accelerate import infer_auto_device_map
import gc
from IPython.display import Markdown, display

# Option 1 (recommended by the original author for T4 x2): turn on tokenizer
# parallelism via the TOKENIZERS_PARALLELISM environment variable.
os.environ.update(TOKENIZERS_PARALLELISM="true")

# Allow TF32 math for CUDA matmuls and cuDNN kernels.
# NOTE(review): TF32 is a compute-precision switch rather than a memory
# optimisation, and it presumably has no effect on T4 (pre-Ampere) GPUs -- confirm.
for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    backend.allow_tf32 = True

# Start from a clean slate before the model is loaded: run the Python garbage
# collector first, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

### Transformers

In [None]:
# %%capture
# %pip install -U transformers accelerate

In [None]:
# Local Kaggle directory holding the Llama 3.2 3B Instruct checkpoint.
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

# Load the tokenizer that ships with the checkpoint.
# Only construction-time options are passed here. Truncation is a per-call
# option (`tokenizer(text, truncation=True)`), and tokenizer parallelism is
# controlled by the TOKENIZERS_PARALLELISM environment variable, so neither
# belongs in `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    use_fast=True,  # Fast tokenizer is crucial for T4 performance
    # NOTE(review): cache_dir presumably has no effect when loading from a
    # local directory -- confirm before relying on "./cache".
    cache_dir="./cache",
    padding_side="left",  # pad on the left so batched prompts end where generation starts
)

# Load the causal LM in FP16 and let accelerate place it on the two T4 GPUs.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,  # FP16 for optimal T4 performance
    device_map="auto",  # Let accelerate handle dual GPU distribution
    low_cpu_mem_usage=True,
    use_cache=True,
    max_memory={
        0: "11GiB",  # per-GPU cap on weights; the rest is headroom for activations / CUDA overhead
        1: "11GiB",  # same cap for the second T4
        "cpu": "24GiB",  # upper bound for any weights accelerate places in CPU RAM
    },
    offload_folder="offload",  # directory accelerate may use for disk offload
    # trust_remote_code is deliberately not enabled: it lets Python files
    # stored next to the checkpoint run during loading, and a Llama checkpoint
    # is loaded by the architecture built into transformers.
    # NOTE(review): re-enable only if this checkpoint is found to ship custom
    # modeling code that you have read and trust.
)

In [None]:
#By Iffy_ https://www.kaggle.com/code/irfanmansuri/lm-llama3b-instruct/notebook

# Make sure a padding token is defined everywhere generation looks for one.
#
# The tokenizer falls back to its own EOS token, which is always a single id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# The model side takes its pad id from the tokenizer rather than from
# `model.config.eos_token_id`: that config field may hold a *list* of ids
# (several end-of-turn tokens), and a list is not a valid pad_token_id.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Keep the generation config in sync so `generate()` pads with the same id.
generation_config = getattr(model, "generation_config", None)
if generation_config is not None and generation_config.pad_token_id is None:
    generation_config.pad_token_id = tokenizer.pad_token_id

In [None]:
#By Iffy_ https://www.kaggle.com/code/irfanmansuri/lm-llama3b-instruct/notebook

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
#
# `torch_dtype` and `device_map` are load-time options: they only matter when
# `pipeline()` itself loads a model from a name or path. Here an instantiated
# model is passed, so its dtype and device placement were fixed when it was
# loaded and are not repeated.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    batch_size=2,  # number of prompts processed together when a list of prompts is passed
    max_length=2048,  # upper bound on total sequence length (prompt + generated tokens)
)

In [None]:
#By Iffy_ https://www.kaggle.com/code/irfanmansuri/lm-llama3b-instruct/notebook

from IPython.display import Markdown, display

# Seed for sampling, so a fresh "Restart & Run All" reproduces the same answer.
# Change it (or remove the manual_seed call below) to get varied completions.
GENERATION_SEED = 42

# Conversation to send to the model: a system instruction followed by the
# user's question.
messages = [
    {
        "role": "system",
        "content": "You are an https://arcprize.org/guide expert",
    },
    {
        "role": "user",
        "content": "Who is François Chollet?",
    },
]

# Render the conversation with the model's chat template as plain text and
# append the header that cues the assistant's reply.
# NOTE(review): the rendered prompt is tokenized again inside the pipeline;
# presumably this can duplicate the begin-of-text token -- confirm.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

torch.manual_seed(GENERATION_SEED)

# return_full_text=False makes the pipeline return only the newly generated
# text. This avoids splitting the output on the assistant header, which raises
# IndexError when the marker is missing from the output and returns a
# truncated answer when the model emits the marker itself.
outputs = pipe(
    prompt,
    truncation=True,
    do_sample=True,
    return_full_text=False,
)

# Render the assistant's answer as Markdown in the notebook output.
answer = outputs[0]["generated_text"].strip()
display(Markdown(answer))

### Acknowledgements:

Code adapted from the notebook by Iffy_: https://www.kaggle.com/code/irfanmansuri/lm-llama3b-instruct/notebook