In [79]:
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer

In [80]:
# Identifier of the pretrained checkpoint whose tokenizer is used throughout the notebook.
MODEL_NAME = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [81]:
# Encode one sentence; only the token ids are kept for the round-trip demo.
text = 'Hello World from China!'
encoding = tokenizer(text)
encoded_text = encoding['input_ids']
encoded_text

[12092, 3645, 432, 4135, 2]

In [82]:
# Map the token ids back to a string to confirm the encoding round-trips.
decoded_text = tokenizer.decode(encoded_text)
print('Decoded text:', decoded_text)

Decoded text: Hello World from China!


In [83]:
# Tokenizing a list of strings yields one id list per input; without padding
# the lists keep their own lengths.
text_ls = [
    'Hello World from China!',
    'Mac mini is great!',
    'Copy understood',
]
encoded_texts = tokenizer(text_ls)
batch_ids = encoded_texts['input_ids']
print('Encoded several texts:', batch_ids)

Encoded several texts: [[12092, 3645, 432, 4135, 2], [13815, 12949, 310, 1270, 2], [17491, 7192]]


In [84]:
# Padding
# Reuse the end-of-sequence token as the pad token. This mutates the shared
# tokenizer object, so the setting stays in effect for every later cell.
tokenizer.pad_token = tokenizer.eos_token
# padding=True pads each sequence up to the longest one in the batch.
encoded_texts_longest = tokenizer(text_ls, padding=True)
padded_ids = encoded_texts_longest['input_ids']
print('Encoded several texts with padding:', padded_ids)

Encoded several texts with padding: [[12092, 3645, 432, 4135, 2], [13815, 12949, 310, 1270, 2], [17491, 7192, 0, 0, 0]]


In [85]:
# Truncation: cap every sequence at three tokens; shorter sequences are left as they are.
truncation_kwargs = dict(max_length=3, truncation=True)
encoded_texts_truncation = tokenizer(text_ls, **truncation_kwargs)
print('Encoded several texts with truncation:', encoded_texts_truncation['input_ids'])

Encoded several texts with truncation: [[12092, 3645, 432], [13815, 12949, 310], [17491, 7192]]


In [86]:
# Drop tokens from the start of the sequence instead of the end.
# truncation_side is stored on the shared tokenizer, so it also affects every
# tokenizer call made after this cell.
tokenizer.truncation_side = 'left'
encoded_texts_truncation_left = tokenizer(
    text_ls,
    max_length=3,
    truncation=True,
)
left_truncated_ids = encoded_texts_truncation_left['input_ids']
print('Encoded several texts with left_truncation:', left_truncated_ids)

Encoded several texts with left_truncation: [[432, 4135, 2], [310, 1270, 2], [17491, 7192]]


In [87]:
# Combine both options: keep at most three tokens per text and pad the shorter ones.
# Whatever truncation_side and pad_token are currently set on the tokenizer apply here.
encoded_texts_both = tokenizer(
    text_ls,
    padding=True,
    truncation=True,
    max_length=3,
)
print('Encoded several texts with both padding and truncation:', encoded_texts_both['input_ids'])

Encoded several texts with both padding and truncation: [[432, 4135, 2], [310, 1270, 2], [17491, 7192, 0]]


In [88]:
# Load the parquet file and convert it to a {column: {row_index: value}} dict.
df = pd.read_parquet("hf://datasets/kotzeje/lamini_docs.jsonl/data/train-00000-of-00001-6359aa989b671345.parquet")
examples = df.to_dict()

# Build a sample string from row 0 using the first column pair that is present,
# falling back to a single "text" column when none of the pairs exist.
for first_col, second_col in (
    ('question', 'answer'),
    ('instruction', 'response'),
    ('input', 'output'),
):
    if first_col in examples and second_col in examples:
        text = examples[first_col][0] + examples[second_col][0]
        break
else:
    text = examples["text"][0]

# Prompt scaffold wrapped around every question; the answer is kept separately.
prompt_template = '''### Question:
{question}

### Answer:'''

# Re-shape the columns into a list of {'question', 'answer'} records, with the
# question already embedded in the prompt template.
num_examples = len(examples['question'])
finetuning_dataset = []
for i in range(num_examples):
    question, answer = examples['question'][i], examples['answer'][i]
    text_with_prompt_template = prompt_template.format(question=question)
    finetuning_dataset.append(dict(question=text_with_prompt_template, answer=answer))

pprint('One datapoint in the finetuning dataset:')
pprint(finetuning_dataset[0])

'One datapoint in the finetuning dataset:'
{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             '### Answer:'}


In [89]:
# Tokenize the first prompt + answer pair as a NumPy batch of one row.
# truncation=True is passed without max_length here, which is what produces the
# "Asking to truncate to max_length" warning shown in the output.
first_example = finetuning_dataset[0]
text = first_example['question'] + first_example['answer']
tokenized_inputs = tokenizer(
    text,
    return_tensors='np',
    padding=True,
    truncation=True,
)
print(tokenized_inputs['input_ids'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[[ 4118 19782    27   187  2347   476   309  7472   253  3045   285  3290
    273   253  4561  2505   432   418  4988    74  3210    32   187   187
   4118 37741    27  2512   403  2067 17082   326   476   320   908   281
   7472   253  3045   285  3290   273  4561  2505   432   418  4988    74
   3210    13  1690 44229   414    13   378  1843    54  4868    13   285
   1966  7103    15  3545 12813   414  5593   849   973   253  1566 26295
    253  1735  3159   275   247  3425    13  1223   378  1843    54  4868
   5593   253 14259   875   253  4561  2505   285   247  3806  2505    15
   8801  7103  8687  1907  1966 16006  2281   253  3290   273   253  4561
   2505  1754   327  2616   824   347 25253    13  2938  1371    13   285
  17200    15   733   310  8521   281   897   247  5019   273   841 17082
    323   247 11088  7103   273   253  1566   434  3045    15]]


In [90]:
# Cap the sequence length at 2048 tokens, but never above the actual token count
# of the text tokenized in the previous cell.
token_count = tokenized_inputs['input_ids'].shape[1]
mx_length = min(2048, token_count)

In [91]:
# Tokenize again, this time with an explicit max_length for truncation.
tokenizer_kwargs = dict(
    return_tensors='np',
    padding=True,
    truncation=True,
    max_length=mx_length,
)
tokenized_inputs = tokenizer(text, **tokenizer_kwargs)
tokenized_inputs['input_ids']

array([[ 4118, 19782,    27,   187,  2347,   476,   309,  7472,   253,
         3045,   285,  3290,   273,   253,  4561,  2505,   432,   418,
         4988,    74,  3210,    32,   187,   187,  4118, 37741,    27,
         2512,   403,  2067, 17082,   326,   476,   320,   908,   281,
         7472,   253,  3045,   285,  3290,   273,  4561,  2505,   432,
          418,  4988,    74,  3210,    13,  1690, 44229,   414,    13,
          378,  1843,    54,  4868,    13,   285,  1966,  7103,    15,
         3545, 12813,   414,  5593,   849,   973,   253,  1566, 26295,
          253,  1735,  3159,   275,   247,  3425,    13,  1223,   378,
         1843,    54,  4868,  5593,   253, 14259,   875,   253,  4561,
         2505,   285,   247,  3806,  2505,    15,  8801,  7103,  8687,
         1907,  1966, 16006,  2281,   253,  3290,   273,   253,  4561,
         2505,  1754,   327,  2616,   824,   347, 25253,    13,  2938,
         1371,    13,   285, 17200,    15,   733,   310,  8521,   281,
      

In [92]:
# Wrapped into a function
def tokenizer_function(examples, max_length=2048):
    """Tokenize the first example of a batch, left-truncating to ``max_length``.

    Written for ``Dataset.map(..., batched=True, batch_size=1)``: only index
    ``[0]`` of each column is read, so any further rows in the batch are ignored.

    Args:
        examples: Mapping of column name to a sequence of values. The text is
            ``question + answer`` when both columns exist, otherwise
            ``input + output`` when both exist, otherwise the ``text`` column.
        max_length: Maximum number of tokens to keep. Defaults to 2048, the
            limit that was previously hard-coded in the body.

    Returns:
        The tokenizer output for the selected text, requested as NumPy tensors
        (``return_tensors='np'``).

    Raises:
        Whatever ``examples["text"]`` raises (``KeyError`` for a plain dict)
        when none of the supported column combinations is present.

    Side effects:
        Sets ``pad_token = eos_token`` and ``truncation_side = 'left'`` on the
        module-level ``tokenizer``.
    """
    if 'question' in examples and 'answer' in examples:
        text = examples["question"][0] + examples['answer'][0]
    elif 'input' in examples and 'output' in examples:
        text = examples["input"][0] + examples['output'][0]
    else:
        text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    # Keep the end of the text when it is too long.
    tokenizer.truncation_side = 'left'

    # One pass is enough: truncating to max_length leaves shorter texts untouched,
    # which matches the old "tokenize, take min(2048, length), tokenize again"
    # sequence. It also avoids calling truncation=True without a max_length,
    # which only produced a warning.
    return tokenizer(text,
                     return_tensors='np',
                     truncation=True,
                     max_length=max_length)

In [98]:
# Read the local JSON-lines file; the 'json' loader exposes it under the 'train' split.
raw_splits = datasets.load_dataset('json', data_files='lamini_docs.jsonl')
finetuning_dataset_loaded = raw_splits['train']

# batch_size=1 is required because tokenizer_function only reads element [0]
# of each batch it receives.
map_options = dict(batched=True, batch_size=1, drop_last_batch=True)
tokenized_dataset = finetuning_dataset_loaded.map(tokenizer_function, **map_options)
print(tokenized_dataset)

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})


In [99]:
# Use the input ids as the training labels as well.
# The guard keeps the cell idempotent: tokenized_dataset is reassigned to itself
# plus the new column, so re-running the cell would otherwise try to add a second
# 'labels' column to a dataset that already has one.
if 'labels' not in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.add_column('labels', tokenized_dataset['input_ids'])

# Hold out 10% of the rows for evaluation; the fixed seed makes the shuffle reproducible.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})
