In [1]:
from unsloth import FastLanguageModel
import torch
import gc
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from peft import PeftModel

def clear_gpu_memory():
    gc.collect()
    torch.cuda.empty_cache()

# Configuration settings
max_seq_length = 2048
dtype = None  # None for auto-detection; set to Float16 or Bfloat16 based on GPU
load_in_4bit = True

# Define model path
base_model_name = "/home/jj/Meta-Llama-3.1-8B-bnb-4bit/"

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Apply PEFT for memory-efficient training
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Define prompt format for training
training_prompt_template = """Please reply to the following question:

### Prompt:
{}

### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    return {
        "text": [
            training_prompt_template.format(prompt, completion) + EOS_TOKEN
            for prompt, completion in zip(examples["prompt"], examples["completion"])
        ]
    }

# Load and format the dataset
dataset = load_dataset("json", data_files="./sample_dataset.jsonl", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)

# Check if bfloat16 is supported
def is_bfloat16_supported():
    return torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

# Configure and create the trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=100,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

# Train the model
trainer_stats = trainer.train()

# Define the path to save the trained model and tokenizer
save_path = "outputs/final_model"

# Save the fine-tuned model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to {save_path}")

# Load the base model for inference after training
model = FastLanguageModel.for_inference(model)

# Adjust generation parameters for varied responses
def generate_completion(prompt, model, tokenizer, max_length=150, temperature=0.8):
    # Prepare the input for generation
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        temperature=temperature,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage of the generate_completion function
prompt = "Explain what Docker is."
generated_completion = generate_completion(prompt, model, tokenizer)
print(f"Generated Response:\n{generated_completion}")


  from .autonotebook import tqdm as notebook_tqdm


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.682 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 8.6. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.9.post4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Generating train split: 49 examples [00:00, 24707.97 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 49/49 [00:00<00:00, 2521.23 examples/s]
Map (num_proc=2): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–

Step,Training Loss
1,2.1555
2,2.0407
3,1.9161
4,2.1187
5,1.8348
6,1.5407
7,1.2798
8,0.957
9,0.9326
10,0.8104


Model and tokenizer saved to outputs/final_model
Generated Response:
Explain what Docker is. 

### Response:
Docker is an open platform for developing, shipping, and running applications. It allows developers to package applications into containersâ€”standardized executable components that combine application source code with the operating system (OS) libraries and dependencies required to run that code in any environment.


In [2]:
def generate_completion(prompt, model, tokenizer, max_length=150, temperature=0.8, style="neutral"):
    # More explicit style-specific prompts
    style_prompts = {
        "casual": "Hey, can you explain Docker in a super chill, laid-back way? Like you're chatting with a friend over coffee.",
        "humorous": "Alright, time to get silly! Explain Docker like you're a stand-up comedian doing a bit about container technology. Make it absurdly funny!",
        "technical": "Provide a detailed technical explanation of Docker, focusing on its architecture, components, and how it interfaces with the host system. Use precise terminology.",
        "storytelling": "Tell me a short story about a developer named Alex who discovers the magic of Docker. Make it engaging and narrative-driven, while still explaining what Docker does.",
        "neutral": prompt  # Default to the original prompt if no style is specified
    }
    
    styled_prompt = f"{style_prompts.get(style.lower(), prompt)}\n\nExplanation:"
    
    inputs = tokenizer(styled_prompt, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        temperature=temperature,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Remove any repetition of the prompt
    completion = completion.replace(styled_prompt, "").strip()
    return completion

# Example usage
styles = ["casual", "humorous", "technical", "storytelling", "neutral"]
for style in styles:
    response = generate_completion("What is Docker?", model, tokenizer, style=style)
    print(f"\n{style.capitalize()} Response:\n{response}")


Casual Response:
Docker is like a containerization platform that helps developers and IT professionals package applications into standardized units. It makes it easier to develop, deploy, and run applications.

Humorous Response:
Docker simplifies the development and deployment process by providing consistency across multiple development environments. It helps to ensure that your application will behave the same way regardless of where it is deployed.

Technical Response:
Docker is an open platform for developing, shipping, and running applications. It allows developers to package applications into containersâ€”standardized executable components that combine application source code with the operating system (OS) libraries and dependencies required to run that code in any environment.

Docker provides a way to deliver software in packages that include everything it needs to runâ€”including the code, runtime, libraries, and system settings. The result is faster deployment, which means a