In [8]:
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes

In [9]:
import os
os.environ["HF_TOKEN"] = "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"

In [10]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

In [11]:
base_model = "NousResearch/Llama-2-7b-chat-hf"

new_model = "llama-2-7b-platypus"

dataset = load_dataset("Pradhumn/mini-platypus-llama-dataset", split="train")

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_size = "right"

In [12]:
dataset

Dataset({
    features: ['instruction', 'output'],
    num_rows: 1000
})

In [13]:
dataset[0]

{'instruction': '### Instruction:\nLet\'s come up with a rich and complex alien world setting suitable for stories, pseudo-documentaries, or roleplaying games such as DnD. This can be an involved process, so I\'d like you to just cover the basics first. We can refine the details further later.\n\nI\'d like a planet with a variety of biomes, inhabited by many creatures, much like Earth, but life on this world took its own path, with different plants and animals (and perhaps a different distinction than "plant" vs. "animal" to begin with)\n\nThere should be at least one sentient species represented in a wide variety of cultures and societies. They should have complex interactions with each other, and other species on the planet, and resulting from that, their own needs and wants and fears which ought to be relatable in context but may at the same time be strange, unusual, and, well, alien to humans.\n\nBut let\'s start at the beginning: Can you give a few broad, interesting biomes and ho

In [14]:
dataset.to_pandas()

Unnamed: 0,instruction,output
0,### Instruction:\nLet's come up with a rich an...,Planet Name: Xylothar\n\nXylothar is a diverse...
1,"### Instruction:\nLet\n$$p(x,y) = a_0 + a_1x +...","Observe that \begin{align*}\np(0,0) &= a_0 = ..."
2,"### Instruction:\nGiven the code below, refact...",Here is the refactored and commented version:\...
3,### Instruction:\nFind the area of the region ...,"Let $n = \lfloor x \rfloor,$ and let $\{x\} = ..."
4,### Instruction:\nLet $P$ be the plane passing...,Let $\mathbf{v} = \begin{pmatrix} x \\ y \\ z ...
...,...,...
995,### Instruction:\nHello. My name is Mike. I ha...,"Hello Mike, it's nice to meet you. As an AI la..."
996,### Instruction:\nGiven a prime $p$ and an int...,"To find the primitive roots $\pmod 7$, I need ..."
997,### Instruction:\nLet $f$ be defined by \[f(x...,The number $f^{-1}(-3)$ is the value of $x$ su...
998,### Instruction:\nBEGININPUT\nBEGINCONTEXT\nda...,Dr. Eleanor Thompson's study found that partic...


In [15]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

peft_config = LoraConfig(
    lora_alpha=15,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"":0}
)

model.config.use_cache = False
model.config.pretraining_tp = 1

model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [18]:
training_arguments = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,#3,5 good for the Llama 2 Model
        per_device_train_batch_size=4,# Number of batches that we are going to take for every step
        gradient_accumulation_steps=1,
        evaluation_strategy="steps",#Not helpful because we donot want to evaluate the model we just want to train it
        eval_steps=1000,
        logging_steps=25,
        optim="paged_adamw_8bit",#Adam Optimizer we will be using but a version that is paged and in 8 bits, so it will lose less memory
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        warmup_steps=10,
        report_to="tensorboard",
        max_steps=-1, # if maximum steps=2, it will stop after two steps
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="instruction",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)



In [19]:
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=250, training_loss=1.1356683578491211, metrics={'train_runtime': 5577.2834, 'train_samples_per_second': 0.179, 'train_steps_per_second': 0.045, 'total_flos': 1.989602172420096e+16, 'train_loss': 1.1356683578491211, 'epoch': 1.0})

In [20]:
trainer.model.save_pretrained(new_model)

In [21]:
prompt = "What is a large language model?"
instruction = f"### Instruction:\n{prompt}\n\n### Response:\n"
#Using Pipeline from the hugging face
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=128)
result = pipe(instruction)
#Trim the response, remove instruction manually
print(result[0]['generated_text'][len(instruction):])

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


A large language model is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate human-like language outputs. These models are designed to learn the patterns and structures of language, allowing them to generate coherent and contextually appropriate text.

Large language models are typically trained using deep learning techniques, such as transformer architectures, and are often referred to as neural networks. They are trained on vast amounts of text data, such as books, articles, and websites, and are designed
