In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
import transformers
transformers.set_seed(42)

import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_checkpoint = "roneneldan/TinyStories-33M"

In [3]:
ds = load_dataset('MohamedRashad/characters_backstories')

In [4]:
ds["train"][400]
ds = ds["train"].train_test_split(test_size=0.2, seed=42)

In [27]:
for x, y in zip(ds["train"]['text'][:3], ds["train"]['target'][:3]):
    print(x, y)


Generate Backstory based on following information
Character Name: Adriarin Melouchevine
Character Race: Wood elf
Character Class: Ranger

Output:
 Runs business side of shop, currently away on a mission to secure the Toern city watch contract. Arriving back to the fort just after the party's return. She was denied the contract due to unknown reasons (it was a Sending from Reynolds, who knows people in the city.)  Mama bear type. (Not actually a bear.)
Generate Backstory based on following information
Character Name: Keezle "Arch Jester Extraordinaire" Arcanodyne
Character Race: Gnome
Character Class: Wizard

Output:
At the moment Keezle enjoys tricking and annoying the evil population of the fallen kingdom of Mhourgh Brak. Currently his greatest enjoyment comes from the Young Adult Deep Dragon that dwells in a cave connected to the old Vault. very open to keezle creative ideas, at least in the beginning, it went out of hand, and Master Molton Runesprocket told Keezle to go explore the

In [5]:
# We'll create a tokenizer from model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)

# We'll need padding to have same length sequences in a batch
tokenizer.pad_token = tokenizer.eos_token

# Define a tokenization function that first concatenates text and target
def tokenize_function(example):
    merged = example["text"] + " " + example["target"]
    batch = tokenizer(merged, padding='max_length', truncation=True, max_length=128)
    batch["labels"] = batch["input_ids"].copy()
    return batch

# Apply it on our dataset, and remove the text columns
tokenized_datasets = ds.map(tokenize_function, remove_columns=["text", "target"])

In [6]:
# Let's check out one prepared example
print(tokenizer.decode(tokenized_datasets["train"][900]['input_ids']))

Generate Backstory based on following information
Character Name: Mr. Gale
Character Race: Half-orc
Character Class: Cleric

Output:
 Growing up the only half-orc in a small rural town was rough. His mother didn't survive childbirth and so was raised in a church in a high mountain pass, his attention was always drawn by airships passing through, and dreams of an escape. Leaving to strike out on his own as early as he could he made a living for most of his life as an airship sailor, and occasionally a pirate. A single storm visits him throughout his life, marking every major


In [7]:
# We will train a causal (autoregressive) language model from a pretrained checkpoint
model = AutoModelForCausalLM.from_pretrained(model_checkpoint);

In [8]:
# Start a new wandb run
run = wandb.init(project='dlai_lm_tuning', job_type="training")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmichieldekoninck2[0m ([33mmichieldekoninck2-mdk[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
# Define training arguments
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-characters-backstories",
    report_to="wandb", # we need one line to track experiments in wandb
    num_train_epochs=3,
    logging_steps=1,
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    no_cuda=True, # force cpu use, will be renamed `use_cpu`
)



In [15]:
# We'll use HF Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [16]:
# Let's train!
trainer.train()

{'loss': 4.0217, 'grad_norm': 5.358091831207275, 'learning_rate': 9.985693848354794e-05, 'epoch': 0.004291845493562232}
{'loss': 3.6082, 'grad_norm': 7.424802303314209, 'learning_rate': 9.971387696709585e-05, 'epoch': 0.008583690987124463}
{'loss': 3.4389, 'grad_norm': 6.638430118560791, 'learning_rate': 9.957081545064379e-05, 'epoch': 0.012875536480686695}
{'loss': 3.3036, 'grad_norm': 5.495055198669434, 'learning_rate': 9.94277539341917e-05, 'epoch': 0.017167381974248927}
{'loss': 3.2083, 'grad_norm': 5.068816661834717, 'learning_rate': 9.928469241773963e-05, 'epoch': 0.02145922746781116}
{'loss': 3.55, 'grad_norm': 5.320457935333252, 'learning_rate': 9.914163090128756e-05, 'epoch': 0.02575107296137339}
{'loss': 2.1158, 'grad_norm': 3.9978418350219727, 'learning_rate': 9.899856938483548e-05, 'epoch': 0.030042918454935622}
{'loss': 2.7427, 'grad_norm': 5.081198692321777, 'learning_rate': 9.885550786838342e-05, 'epoch': 0.034334763948497854}
{'loss': 3.3618, 'grad_norm': 5.113386631011

TrainOutput(global_step=699, training_loss=2.204550452413136, metrics={'train_runtime': 826.9987, 'train_samples_per_second': 6.736, 'train_steps_per_second': 0.845, 'train_loss': 2.204550452413136, 'epoch': 3.0})

In [17]:
transformers.logging.set_verbosity_error() # suppress tokenizer warnings

prefix = "Generate Backstory based on following information Character Name: "

prompts = [
    "Frogger Character Race: Aarakocra Character Class: Ranger Output: ",
    "Smarty Character Race: Aasimar Character Class: Cleric Output: ",
    "Volcano Character Race: Android Character Class: Paladin Output: ",
]

table = wandb.Table(columns=["prompt", "generation"])

for prompt in prompts:
    input_ids = tokenizer.encode(prefix + prompt, return_tensors="pt")
    output = model.generate(input_ids, do_sample=True, max_new_tokens=50, top_p=0.3)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    table.add_data(prefix + prompt, output_text)
    
wandb.log({'tiny_generations': table})

In [18]:
wandb.finish()

0,1
eval/loss,▁▄█
eval/runtime,▁█▇
eval/samples_per_second,█▁▂
eval/steps_per_second,█▁▂
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▂▅▄▄▄▄▃▆▃▆▅▃▆▃▃▂▁▂▃▅▆▇▇▆▅█▅▆▅▇▇▆▆▆▆▃▆▆▆▇
train/learning_rate,████▇▇▇▇▇▇▇▇▇▇▆▆▆▅▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁
train/loss,▄▆▅▅▅▆▇▆▆▇██▇▄▅▃▅▅▄▄▅▅▅▄▄▃▃▂▁▃▂▃▂▂▂▂▂▂▂▁

0,1
eval/loss,3.94513
eval/runtime,17.6863
eval/samples_per_second,26.291
eval/steps_per_second,3.336
total_flos,121269776154624.0
train/epoch,3.0
train/global_step,699.0
train/grad_norm,19.98521
train/learning_rate,0.0
train/loss,1.1209
