Finetuning LLM model from scratch.

In [1]:
import pandas as pd
from pprint import pprint
from transformers import AutoTokenizer
from datasets import load_dataset


# Data Prepare

## 1. Load dataset and tokenizer from hugging face

In [2]:
tokenizer_id = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

data_file = "lamini/lamini_docs"
dataset = load_dataset(data_file, split="train")
dataset

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})

## 2. Convert dataset to Question & Answer format

In [3]:
prompt_template = """### Question:
{question}

### Answer:"""

dataset_len = len(dataset["question"])
finetuning_dataset = []
for idx in range(dataset_len):
    question = dataset["question"][idx]
    answer = dataset["answer"][idx]
    text_with_prompt_template = prompt_template.format(question=question)
    finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})

pprint(finetuning_dataset[1])

{'answer': 'Yes, the code includes methods for submitting jobs, checking job '
           'status, and retrieving job results. It also includes a method for '
           'canceling jobs. Additionally, there is a method for sampling '
           'multiple outputs from a model, which could be useful for '
           'long-running tasks.',
 'question': '### Question:\n'
             "Can I find information about the code's approach to handling "
             'long-running tasks and background jobs?\n'
             '\n'
             '### Answer:'}


## 3. Tokenize (include padding and truncating) the processed data

In [4]:
def tokenize_function(finetuning_dataset):
    text = []
    for example in finetuning_dataset:
        text.append(example["question"] + example["answer"])
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )
    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        padding=True,
        max_length=max_length
    )
    return tokenized_inputs

In [5]:
#tokenized_inputs = tokenize_function(finetuning_dataset)
#print(tokenized_inputs.input_ids[0])

[ 4118 19782    27   187  2347   476   309  7472   253  3045   285  3290
   273   253  4561  2505   432   418  4988    74  3210    32   187   187
  4118 37741    27  2512   403  2067 17082   326   476   320   908   281
  7472   253  3045   285  3290   273  4561  2505   432   418  4988    74
  3210    13  1690 44229   414    13   378  1843    54  4868    13   285
  1966  7103    15  3545 12813   414  5593   849   973   253  1566 26295
   253  1735  3159   275   247  3425    13  1223   378  1843    54  4868
  5593   253 14259   875   253  4561  2505   285   247  3806  2505    15
  8801  7103  8687  1907  1966 16006  2281   253  3290   273   253  4561
  2505  1754   327  2616   824   347 25253    13  2938  1371    13   285
 17200    15   733   310  8521   281   897   247  5019   273   841 17082
   323   247 11088  7103   273   253  1566   434  3045    15     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

## Padding and truncation Examples

In [6]:
text = "Hi, how are you?"
encoded_text = tokenizer(text)["input_ids"]
print("Encoded text: ", encoded_text)
decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into text: ", decoded_text)

list_texts = ["Hi, how are you?", "I'm good", "Yes"]
encoded_texts = tokenizer(list_texts)
print("Encoded several texts: ", encoded_texts["input_ids"])

tokenizer.pad_token = tokenizer.eos_token 
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])

encoded_texts_truncation = tokenizer(list_texts, max_length=3, truncation=True)
print("Using truncation: ", encoded_texts_truncation["input_ids"])

tokenizer.truncation_side = "left"
encoded_texts_truncation_left = tokenizer(list_texts, max_length=3, truncation=True)
print("Using left-side truncation: ", encoded_texts_truncation_left["input_ids"])

encoded_texts_both = tokenizer(list_texts, max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

Encoded text:  [12764, 13, 849, 403, 368, 32]
Decoded tokens back into text:  Hi, how are you?
Encoded several texts:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]]
Using padding:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]
Using truncation:  [[403, 368, 32], [42, 1353, 1175], [4374]]
Using left-side truncation:  [[403, 368, 32], [42, 1353, 1175], [4374]]
Using both padding and truncation:  [[403, 368, 32], [42, 1353, 1175], [4374, 0, 0]]


## Other data set view

In [7]:
from datasets import load_dataset_builder
ds_builder = load_dataset_builder(data_file)
ds_builder.info.features

{'question': Value(dtype='string', id=None),
 'answer': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [8]:
from datasets import get_dataset_split_names
get_dataset_split_names(data_file)

['train', 'test']

In [16]:
type(finetuning_dataset)

list

In [17]:
split_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1134
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
})


In [33]:
tokenizer(dataset[:3]["answer"])

{'input_ids': [[2512, 403, 2067, 17082, 326, 476, 320, 908, 281, 7472, 253, 3045, 285, 3290, 273, 4561, 2505, 432, 418, 4988, 74, 3210, 13, 1690, 44229, 414, 13, 378, 1843, 54, 4868, 13, 285, 1966, 7103, 15, 3545, 12813, 414, 5593, 849, 973, 253, 1566, 26295, 253, 1735, 3159, 275, 247, 3425, 13, 1223, 378, 1843, 54, 4868, 5593, 253, 14259, 875, 253, 4561, 2505, 285, 247, 3806, 2505, 15, 8801, 7103, 8687, 1907, 1966, 16006, 2281, 253, 3290, 273, 253, 4561, 2505, 1754, 327, 2616, 824, 347, 25253, 13, 2938, 1371, 13, 285, 17200, 15, 733, 310, 8521, 281, 897, 247, 5019, 273, 841, 17082, 323, 247, 11088, 7103, 273, 253, 1566, 434, 3045, 15], [4374, 13, 253, 2127, 3797, 3082, 323, 29315, 7375, 13, 12669, 2628, 3708, 13, 285, 48484, 2628, 1543, 15, 733, 671, 3797, 247, 1332, 323, 14002, 272, 7375, 15, 9157, 13, 627, 310, 247, 1332, 323, 10491, 2709, 18012, 432, 247, 1566, 13, 534, 812, 320, 4217, 323, 1048, 14, 24220, 8892, 15], [45, 4988, 74, 14980, 6131, 3386, 323, 11365, 2505, 326, 4419, 1

In [35]:
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [37]:
dataset.format

{'type': 'torch',
 'format_kwargs': {},
 'columns': ['input_ids', 'attention_mask', 'labels'],
 'output_all_columns': False}