In [2]:
# fine tune the model on the CoT dataset
# train only for output tokens in the dataset

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
import wandb


peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)


batch_size = 8

model_name = "facebook/galactica-125m"
tokenizer_name = model_name

dataset_name = "jeggers/CoT-Collection-finetuning"
input_column = "final_input"
output_column = "final_target"

In [3]:
# load model, tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
dataset = load_dataset(dataset_name)

# filter dataset to only contain samples that do not stat with 'ANSWER:'
dataset = dataset.filter(lambda x: not x[output_column].startswith("ANSWER:"))

dataset = dataset["train"].train_test_split(test_size=0.1)

# create new 'all' column that contains both input and output tokens
def combine_input_output(batch):
    return {"all": [x + " " + y + "</s>" for x, y in zip(batch[input_column], batch[output_column])]}
dataset = dataset.map(combine_input_output, batched=True, batch_size=-1)

# convert to peft model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Map:   0%|          | 0/649 [00:00<?, ? examples/s]

Map:   0%|          | 0/73 [00:00<?, ? examples/s]

trainable params: 294,912 || all params: 125,325,312 || trainable%: 0.2353


In [4]:
# print first 5 examples
from pprint import pprint
pprint(dataset["train"]["all"][:5])

['Challenge yourself with this crossword clue:\n'
 'Clue given: Relative by marriage\n'
 'Year published: 2003\n'
 'Length of answer: 5\n'
 'Primary hint: The 3rd letter is L\n'
 'Secondary hint: The 1st letter is I\n'
 'Provide the answer in all caps.\n'
 'BOT:  1. The answer is a 5-letter word for a relative by marriage.\n'
 '2. The 3rd letter is L and the 1st letter is I.\n'
 '3. Common in-law relatives: mother, father, sister, brother.\n'
 '4. "In-law" is often shortened to just "-law" in crosswords.\n'
 '5. The only 5-letter in-law that fits is "inlaw" (informal spelling).\n'
 '\n'
 'ANSWER: INLAW</s>',
 'When you wrongly multiplied a number that should have been divided by 2.4, '
 'you got 288. What is the correctly calculated value divided by 5?\n'
 'BOT:  1. Identify the incorrect calculation: number * 2.4 = 288\n'
 '2. Find the correct number: 288 / 2.4 = 120\n'
 '3. Perform the correct calculation: 120 / 2.4 = 50\n'
 '4. Divide the result by 5: 50 / 5 = 10\n'
 '\n'
 'ANSWER: 

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['final_input', 'final_target', 'all'],
        num_rows: 649
    })
    test: Dataset({
        features: ['final_input', 'final_target', 'all'],
        num_rows: 73
    })
})


In [6]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = 1
if tokenizer.eos_token_id is None:
    tokenizer.eos_token_id = 2

In [7]:
# tokenize the dataset
def tokenize_function(examples):
    examples = tokenizer(examples['all'], padding="max_length", max_length=512, return_tensors="pt")
    examples["labels"] = examples["input_ids"].clone()
    return examples

tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=batch_size, drop_last_batch=True)

tokenized_datasets


Map:   0%|          | 0/648 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['final_input', 'final_target', 'all', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 648
    })
    test: Dataset({
        features: ['final_input', 'final_target', 'all', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 72
    })
})

In [8]:
# print tensor of first sample
print(tokenized_datasets["train"]["input_ids"][3])

[8996, 417, 286, 1231, 8650, 299, 6445, 4817, 53, 221, 221, 221, 55, 48, 22254, 34, 25703, 34, 221, 56, 48, 11041, 11658, 34, 6577, 4698, 34, 221, 57, 48, 21122, 739, 34, 17243, 221, 58, 48, 995, 20754, 34, 13838, 36, 221, 59, 48, 6486, 17582, 34, 5110, 35, 6981, 43101, 34, 221, 60, 48, 11041, 1102, 15875, 35, 41, 35, 9273, 34, 243, 39, 34, 41, 35, 14028, 5419, 47655, 405, 34, 221, 61, 48, 40801, 35, 17535, 34, 11512, 34, 221, 62, 48, 1587, 5614, 8208, 34, 10422, 34, 221, 63, 48, 348, 29813, 405, 34, 44312, 405, 34, 221, 64, 48, 26246, 3156, 34, 1540, 17173, 18572, 34, 221, 221, 5164, 7496, 5005, 3715, 404, 19902, 286, 11905, 869, 36, 221, 56, 4675, 48, 243, 24647, 4817, 6853, 4044, 301, 286, 3639, 1100, 281, 682, 35, 6324, 922, 48, 221, 221, 39, 36, 24647, 343, 935, 15833, 321, 4708, 15875, 404, 286, 4674, 6445, 12973, 36, 221, 40, 36, 1587, 5614, 8208, 343, 967, 1653, 15833, 321, 10422, 404, 29483, 12973, 36, 221, 221, 592, 1231, 8650, 301, 495, 922, 417, 48, 221, 35, 1587, 5614, 820

In [9]:
inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
labels = inputs["input_ids"].clone()
del inputs["token_type_ids"]

outputs = model(**inputs, labels=labels)
print(outputs.loss)

tensor(8.3033, device='cuda:0', grad_fn=<NllLossBackward0>)


In [10]:
# collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

class LoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if "loss" in logs:
            wandb.log({"training_loss": logs["loss"]})
        if "eval_loss" in logs:
            wandb.log({"eval_loss": logs["eval_loss"]})

training_args = TrainingArguments(
    output_dir="your-name/bigscience/mt0-large-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,
    load_best_model_at_end=True,
    # report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    # callbacks=[LoggingCallback()],
    # data_collator=collator,
    # compute_metrics=compute_metrics,
)




In [13]:
import wandb
wandb.init(mode="disabled")



In [14]:
trainer.train()

Step,Training Loss,Validation Loss
100,3.0335,3.040254


TrainOutput(global_step=162, training_loss=3.317157863098898, metrics={'train_runtime': 15.1744, 'train_samples_per_second': 85.407, 'train_steps_per_second': 10.676, 'total_flos': 339808610156544.0, 'train_loss': 3.317157863098898, 'epoch': 2.0})

In [15]:
# merge adapter weights into model
model = model.merge_and_unload()

In [26]:
import os
os.environ["HF_TOKEN"] = "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"

In [27]:
model.push_to_hub("jeggers/galactica-125m-cot-only")

model.safetensors:   0%|          | 0.00/500M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jeggers/galactica-125m-cot-only/commit/84197be6cf1a4859eae3120245a8640f4d47d771', commit_message='Upload OPTForCausalLM', commit_description='', oid='84197be6cf1a4859eae3120245a8640f4d47d771', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
tokenizer.push_to_hub("jeggers/galactica-125m-cot-only")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jeggers/galactica-125m-cot-only/commit/52cf05c53d8a149ab8a19c91c8a517d9918f198b', commit_message='Upload tokenizer', commit_description='', oid='52cf05c53d8a149ab8a19c91c8a517d9918f198b', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
question = "Pick the optimal word to complete the phrase: Natalie was trying to fit a prom dress over their client's bust, but struggled to do so because the _ was too big. A: dress B: bust Provide your answer using the letter only."
inputs = tokenizer(question + " BOT: ", return_tensors="pt").to("cuda")
del inputs["token_type_ids"]
out = model.generate(**inputs, max_new_tokens=120, num_return_sequences=1, do_sample=True, temperature=0.8)
text = tokenizer.batch_decode(out, skip_special_tokens=True)
print(text)
print(out)

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x000001FFFBA033D0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 20238e9c250, raw_cell="question = "Pick the optimal word to complete the .." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/d%3A/Google%20Drive/Other%20computers/My%20laptop/Daten/Hobbys/Informatik/Mashine%20leaning/ThinkingLLMs/finetune_model.ipynb#X16sZmlsZQ%3D%3D>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

['Pick the optimal word to complete the phrase: Natalie was trying to fit a prom dress over their client\'s bust, but struggled to do so because the _ was too big. A: dress B: bust Provide your answer using the letter only. BOT: 1. Attend to the phrase: Natalie was trying to fit a prom dress over her client\'s bust.\n2. As the question shows, the word and sentence are separated, the first sentence is a word of length 2.\n3. The sentence is meant to complete the phrase: "Try to work out a prom dress on the client\'s bust."\n4. The word "B" must refer to the large bust.\n5. The sentence points to a large bust.\n\nANSWER: B� expendit']
tensor([[32893,   286,  2936,  5104,   321,  2856,   286, 20459,    48, 38703,
          1069,   435, 13571,   321,  2422,   281,  1928, 35998,   797,   817,
         10753,    29,   105,   311,   690,    34,   835, 20633,  1663,   321,
           917,   891,  1543,   286,   243,    85,   435,  4606,  6499,    36,
           351,    48, 35998,   410,    48,

TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given