In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

In [3]:
from trl import SFTTrainer

In [4]:
from jinja2 import Template

In [5]:
import yaml

In [6]:
# Base checkpoint identifier handed to the `from_pretrained` loaders below.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

In [7]:
# Name of the fine-tuned model; also used as the training output directory.
NEW_MODEL_NAME = "TinyButMighty"

In [8]:
# Identifier of the dataset used for fine-tuning.
DATASET_NAME = "macadeliccc/opus_samantha"

In [9]:
# Dataset split to train on.
SPLIT = "train"

In [10]:
# Value passed to the trainer as `max_seq_length`.
MAX_SEQ_LENGTH = 2048

In [11]:
# Number of passes over the training split (forwarded to TrainingArguments).
num_train_epochs = 1

In [12]:
# License identifier for the resulting model. Not referenced in the visible cells.
# NOTE(review): this name shadows Python's interactive `license` builtin.
license = "apache-2.0"

In [13]:
# Account name. Not referenced in the visible cells; presumably used when
# publishing the model -- TODO confirm.
username = "NamanAhuja"

In [14]:
# Learning-rate setting for fine-tuning.
learning_rate = 1.41e-5

In [15]:
# Per-device training batch-size setting.
per_device_train_batch_size = 4

In [16]:
# Gradient-accumulation step count (forwarded to TrainingArguments).
gradient_accumulation_steps = 1

In [18]:
# Load the base causal-LM checkpoint named by MODEL_ID.
# trust_remote_code=True lets the checkpoint's own Python modelling code run
# locally, so only use it with repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_ID,
    trust_remote_code=True,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Downloading shards:   0%|          | 0/2 [01:10<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_ID,
    trust_remote_code=True,
)

In [None]:
# Load the training data using the configuration constants rather than
# repeating the literals, so changing DATASET_NAME / SPLIT actually takes effect.
dataset = load_dataset(DATASET_NAME, split=SPLIT)

In [None]:
# End-of-sequence marker appended to every formatted conversation.
# This must be the token *string* (`eos_token`), not its integer id
# (`eos_token_id`): it is interpolated into text, and an id would be written
# out as literal digits instead of the EOS token.
EOS_TOKEN = tokenizer.eos_token

In [None]:
def process_dataset(mydata):
    """Flatten a batch of conversations into one training string each.

    Args:
        mydata: Mapping with a "conversations" entry holding a list of
            conversations; each conversation is a list of turns, and each turn
            is a mapping with a "from" role ("system", "human" or "gpt") and a
            "value" holding the turn's text.

    Returns:
        A dict ``{"text": [...]}`` with one string per conversation. Each turn
        is rendered as its role prefix, a space, the turn text and a newline;
        the module-level ``EOS_TOKEN`` is appended to the end of every
        conversation.

    Raises:
        KeyError: if a turn's role is not one of the three known roles.

    NOTE(review): the role markers are plain words rather than chat-template
    special tokens, and every role suffix is empty -- confirm this is the
    prompt format the base model expects.
    """
    role_prefix = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
    role_suffix = {"system": "", "human": "", "gpt": ""}

    rendered = []
    for conversation in mydata["conversations"]:
        pieces = []
        for message in conversation:
            role = message["from"]
            pieces.append(f"{role_prefix[role]} {message['value']}\n{role_suffix[role]}")
        # An empty conversation still yields a sample containing only EOS_TOKEN.
        rendered.append("".join(pieces) + f"{EOS_TOKEN}")

    return {"text": rendered}

In [None]:
# Add a "text" column holding each conversation rendered as a single string.
# batched=True means process_dataset receives a dict of column lists.
dataset = dataset.map(
    function=process_dataset,
    batched=True,
)

In [None]:
# Preview one formatted sample. Index the row first so only that record is
# read, instead of pulling the entire "text" column to show one element.
print(dataset[2]["text"])

In [None]:
# Training hyperparameters. The whole call lives in one cell (it was previously
# split across two cells, so neither parsed), and the batch size and learning
# rate come from the configuration cells instead of being overridden by literals.
args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    max_steps=-1,  # -1: run length is governed by num_train_epochs
    num_train_epochs=num_train_epochs,
    save_strategy="no",  # no intermediate checkpoints are written
    logging_steps=1,
    output_dir=NEW_MODEL_NAME,
    optim="paged_adamw_32bit",
    bf16=True,
)

In [None]:
# Supervised fine-tuning trainer. The whole call lives in one cell (it was
# previously split across two cells, so neither parsed).
# The dataset already carries the rendered "text" column, so no formatting_func
# is passed: process_dataset returns a dict, not the list of strings SFTTrainer
# expects from a formatting function. The tokenizer loaded above is passed
# explicitly so tokenization uses the same tokenizer that supplied EOS_TOKEN.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)

In [None]:
trainer.train()