<a href="https://colab.research.google.com/github/NID123-CH/LLM-Codes/blob/main/PigLatin_and_Collators.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin the versions this notebook was recorded with: later TRL releases no longer
# ship DataCollatorForCompletionOnlyLM, which is imported further down.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install datasets==3.1.0 trl==0.12.0 transformers==4.46.1

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.12.0-py3-none-any.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers>=4.46.0 (from trl)
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers>=4.46.0->trl)
  Downloading tokenizers-0.20

## Harvard Sentences

Conditions of use: The material on this site is freely available for use in VoIP testing, research, development, marketing and any other reasonable application. The material may be copied, downloaded, broadcast, modified, incorporated into web sites or test equipment. We do require that you identify the source of the speech materials as "Open Speech Repository".

https://www.cs.columbia.edu/~hgs/audio/harvard.html

In [None]:
# Downloads harvard_sentences.txt from Google Drive, addressed by its file id.
# The file lands in the current working directory (/content in the recorded
# Colab run) and is read by name when the dataset is loaded below.
!gdown 1pg8hJEdhiHjfcrvo3XzuW4d80xzqnFsl

Downloading...
From: https://drive.google.com/uc?id=1pg8hJEdhiHjfcrvo3XzuW4d80xzqnFsl
To: /content/harvard_sentences.txt
  0% 0.00/30.5k [00:00<?, ?B/s]100% 30.5k/30.5k [00:00<00:00, 43.7MB/s]


## Pig Latin

In [None]:
import re
from string import punctuation

def pig_latin(sentence):
    """Translate a sentence into lower-cased Pig Latin.

    The sentence is split into word tokens (runs of ``\\w`` characters) and
    punctuation tokens (runs of any other non-space characters). Each token
    is lower-cased and converted as follows:

    * punctuation is emitted unchanged and attached to the preceding word;
    * a word starting with a vowel (a, e, i, o, u) gets ``"way"`` appended;
    * otherwise the leading consonants are moved to the end, followed by
      ``"ay"``. A word without any vowel (e.g. ``"my"``) is treated as all
      consonants and simply gets ``"ay"`` appended.

    Args:
        sentence: The text to translate.

    Returns:
        The translated sentence, words separated by single spaces, with no
        leading or trailing whitespace. An empty input yields ``""``.
    """
    vowels = frozenset('aeiou')
    toks = [t.lower() for t in re.findall(r'\w+|[^\s\w]+', sentence)]

    def convert(string):
        # A token with no letter or digit is punctuation, whatever its
        # length ("...", "?!", "_"); it must not get a leading space so that
        # it stays glued to the word before it.
        if not any(ch.isalnum() for ch in string):
            return string
        if string[0] in vowels:
            return ' ' + string + 'way'
        # Index of the first vowel; falls back to len(string) when the word
        # has none, so the whole word counts as the consonant prefix instead
        # of having its last character duplicated.
        split = next(
            (idx for idx, ch in enumerate(string) if ch in vowels),
            len(string),
        )
        return ' ' + string[split:] + string[:split] + 'ay'

    return ''.join(convert(t) for t in toks).strip()

In [None]:
# Quick check on a sentence mixing vowel-initial and consonant-initial words
# plus trailing punctuation.
pig_latin('How are you doing today?')

'owhay areway ouyay oingday odaytay?'

### Load Harvard Sentences

In [None]:
from datasets import load_dataset, Split
# Read the downloaded text file with the CSV loader, taking all rows as one split.
dataset = load_dataset(path='csv', data_files='harvard_sentences.txt', quotechar='"', split=Split.ALL)
# Seed both the shuffle and the 80/20 split so that re-running the notebook
# on a fresh kernel yields the same train/test partition and row order.
dataset = dataset.shuffle(seed=42).train_test_split(test_size=0.2, seed=42)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the split structure: split names, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 576
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 144
    })
})

In [None]:
# Peek at the first training example.
dataset['train'][0]

{'sentence': 'The screen before the fire kept in the sparks.'}

### Translate Sentences to Pig Latin

In [None]:
# Add a 'translated' column holding the Pig Latin version of each 'sentence'.
# The recorded output shows one progress bar per split (576 and 144 rows).
pig_ds = dataset.map(lambda s: {'translated': pig_latin(s['sentence'])})

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [None]:
# Inspect one training example together with its translation.
pig_ds['train'][1]

{'sentence': 'Jazz and swing fans like fast music.',
 'translated': 'azzjay andway ingsway ansfay ikelay astfay usicmay.'}

### Prompt Dataset

In [None]:
# Rename the text columns to the prompt/completion naming used by the
# formatting code below, and keep only those two columns.
prompt_pig = (
    pig_ds
    .rename_columns({'sentence': 'prompt', 'translated': 'completion'})
    .select_columns(['prompt', 'completion'])
)

In [None]:
# First training example in prompt/completion form.
prompt_pig['train'][0]

{'prompt': 'The screen before the fire kept in the sparks.',
 'completion': 'ethay eenscray eforebay ethay irefay eptkay inway ethay arksspay.'}

## Data Collators

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from trl import DataCollatorForCompletionOnlyLM

# Hugging Face Hub id of the model whose tokenizer is loaded.
base_model_id = 'microsoft/phi-2'

# Load the non-fast tokenizer implementation (use_fast=False), configured to
# pad on the left.
# NOTE(review): add_bos_token / add_eos_token are forwarded as keyword
# arguments; whether each one is honoured depends on the tokenizer class —
# confirm against an encoded example.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
# Register a dedicated padding token. As the last expression of the cell, the
# call's return value is what the cell displays (1 in the recorded output).
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

1

### Formatting

Let's build a formatting function that takes both prompt and completion, and inserts a particular string that will be used to trigger the translation. This string is the response template.

In [None]:
# Marker string inserted between prompt and completion; it is also what the
# completion-only collator is keyed on later.
response_template = '##[PIGL]##>'
# Register the marker as an additional special token of the tokenizer.
# NOTE(review): presumably so the marker encodes to a single id — the recorded
# encoding shows one id (50296) between prompt and completion; confirm.
tokenizer.add_special_tokens({'additional_special_tokens': [response_template]})

def formatting_func(example):
    """Build the training text for one example.

    The result is the example's ``"prompt"``, the module-level
    ``response_template`` marker, the example's ``"completion"`` and finally
    the tokenizer's end-of-sequence token, concatenated without separators.

    Args:
        example: Mapping with ``"prompt"`` and ``"completion"`` entries.

    Returns:
        The concatenated string.
    """
    source_text = example["prompt"]
    target_text = example["completion"]
    body = f'{source_text}{response_template}{target_text}'
    return body + tokenizer.eos_token

# Show the formatted training text for the first training example.
formatting_func(prompt_pig['train'][0])

'The screen before the fire kept in the sparks.##[PIGL]##>ethay eenscray eforebay ethay irefay eptkay inway ethay arksspay.<|endoftext|>'

In [None]:
# Token length every example is truncated / padded to by the tokenizer call below.
max_length = 128

def generate_and_tokenize_prompt(prompt):
    """Format one example and tokenize it to a fixed length.

    The example is rendered with ``formatting_func`` and passed to the
    module-level ``tokenizer`` with truncation enabled and padding up to
    ``max_length``. A ``"labels"`` entry holding a copy of ``"input_ids"``
    is added to the tokenizer output.

    Args:
        prompt: Example mapping accepted by ``formatting_func``.

    Returns:
        The tokenizer output with the extra ``"labels"`` entry.
    """
    text = formatting_func(prompt)
    encoded = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
    # Independent copy, so changing the labels later cannot alter input_ids.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize every training example, then drop the raw text columns so only the
# tokenizer outputs and `labels` remain.
# NOTE(review): this rebinds `dataset`, which earlier held the train/test
# DatasetDict; earlier cells that read `dataset` will see the tokenized
# training set if they are re-run after this one.
dataset = prompt_pig['train'].map(generate_and_tokenize_prompt)
dataset = dataset.remove_columns(['prompt', 'completion'])
# Show the first tokenized example.
print(dataset[0])

{'input_ids': [50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50256, 464, 3159, 878, 262, 2046, 4030, 287, 262, 38306, 13, 50296, 2788, 323, 304, 641, 66, 2433, 304, 754, 24406, 4555, 323, 35918, 69, 323, 304, 457, 5568, 287, 1014, 4555, 323, 610, 591, 2777, 323, 13, 50256], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Comparing Collators

In [None]:
# Reminder of the marker string the completion-only collator is keyed on.
response_template

'##[PIGL]##>'

In [None]:
# Language-modeling collator with masked-LM disabled (mlm=False), feeding
# batches of 4 tokenized examples.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
dataloader_lm = DataLoader(dataset, batch_size=4, collate_fn=data_collator)

# Completion-only collator keyed on the response template, over the same
# dataset and batch size. `data_collator` is rebound here; dataloader_lm keeps
# the collator object it was constructed with.
data_collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
dataloader_completion = DataLoader(dataset, batch_size=4, collate_fn=data_collator)

In [None]:
# Take the first batch from each loader so the two collators can be compared
# on the same examples (neither loader was created with shuffling enabled).
batch_lm = next(iter(dataloader_lm))
batch_completion = next(iter(dataloader_completion))

In [None]:
# Are the token ids identical across the two collators?
(batch_lm['input_ids'] == batch_completion['input_ids']).all()

tensor(True)

In [None]:
# Are the attention masks identical across the two collators?
(batch_lm['attention_mask'] == batch_completion['attention_mask']).all()

tensor(True)

The only difference is in the labels:

In [None]:
# Are the labels identical across the two collators?
(batch_lm['labels'] == batch_completion['labels']).all()

tensor(False)

In [None]:
# Labels of the first example from the language-modeling collator, together
# with the decoded text of the non-negative entries (the recorded output shows
# the excluded positions as -100).
batch_lm['labels'][0], tokenizer.decode(batch_lm['labels'][0][batch_lm['labels'][0] >= 0])

(tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 50256,
           464,  3159,   878,   262,  2046,  4030,   287,   262, 38306,    13,
         50296,  2788,   323,   304,   641,    66,  2433,   304,   754, 24406,
          4555,   323, 35918,    69,   323,   304,   457,  5568,   287,  1014,
          4555,   323,   610,   591,  2777,   323,  

In [None]:
# Same view for the completion-only collator: labels of the first example and
# the decoded text of the non-negative entries (excluded positions show as
# -100 in the recorded output).
batch_completion['labels'][0], tokenizer.decode(batch_completion['labels'][0][batch_completion['labels'][0] >= 0])

(tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  2788,   323,   304,   641,    66,  2433,   304,   754, 24406,
          4555,   323, 35918,    69,   323,   304,   457,  5568,   287,  1014,
          4555,   323,   610,   591,  2777,   323,  