# Causal LM (distil GPT2)

## Follows the Hugging Face [tutorial](https://huggingface.co/docs/transformers/tasks/language_modeling) on Causal Language Modeling  

## Dataset - mehr32/Persian_English_translation (Hugging Face)

In [1]:
!pip install datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [2]:
from google.colab import drive
drive.mount('drive', force_remount=True)

Mounted at drive


### Imports

In [25]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import numpy as np
from datasets import Dataset, DatasetDict
import math
from sklearn.model_selection import train_test_split

### The example is for educational purpose, you select only a subset of the data (2 203 204 texts)

In [38]:
with open('drive/MyDrive/Experiments/SNLP2/week_10/source.txt', 'rt', encoding='utf8') as fr:
  texts = fr.read().split('\n')
  # sample_texts = np.random.choice(texts, int(len(lines) * 0.5))

  train, test = train_test_split(texts, test_size=0.2)

  print(f'example: {texts[0]}')
  print(len(train), len(test))


example:  hmmm probably the first time i tried lasagna that was a great day
1762564 440641


In [39]:
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

### Dataset

In [40]:
ds = DatasetDict({'train': Dataset.from_dict({'text': train}),  'test': Dataset.from_dict({'text': test})})

ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1762564
    })
    test: Dataset({
        features: ['text'],
        num_rows: 440641
    })
})

### First, preprocess the raw texts

In [41]:
def preprocess_function(batch):
    return tokenizer(batch['text'])

ds = ds.map(preprocess_function, batched=True, batch_size=1000, remove_columns='text')
ds

Map:   0%|          | 0/1762564 [00:00<?, ? examples/s]

Map:   0%|          | 0/440641 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1762564
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 440641
    })
})

### Second, concatenate the _input_ids_, _attention_mask_, _labels_ and separate them in equal sized bins of length _block_size_

In [42]:
tokenizer.pad_token = tokenizer.eos_token
block_size = 512

def group_texts(batch):
    concatenated_batch = {k: sum(batch[k], []) for k in batch.keys()}

    results = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for i in range(0, len(concatenated_batch['input_ids']), block_size):
        block_input_ids = concatenated_batch['input_ids'][i : i + block_size]
        block_attention_mask = concatenated_batch['attention_mask'][i : i + block_size]
        block_labels = block_input_ids.copy()

        assert len({len(block_input_ids), len(block_attention_mask), len(block_labels)}) == 1

        n = len(block_input_ids)
        if n < block_size:
          n = block_size - n
          block_input_ids += [tokenizer.pad_token_type_id] * n
          block_attention_mask += [tokenizer.pad_token_type_id] * n
          block_labels += [tokenizer.pad_token_type_id] * n


        results['input_ids'].append(block_input_ids)
        results['attention_mask'].append(block_attention_mask)
        results['labels'].append(block_labels)
    return results


ds = ds.map(group_texts, batched=True, batch_size=1000)

ds

Map:   0%|          | 0/1762564 [00:00<?, ? examples/s]

Map:   0%|          | 0/440641 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 36756
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9205
    })
})

In [43]:
model = AutoModelForCausalLM.from_pretrained(model_name)

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

### Data Collator for forming the batches

In [44]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='pt')

### Trained on A100 GPU 40GBs RAM 30~35mins





In [45]:
training_args = TrainingArguments(
    output_dir="results",
    learning_rate=2e-5,
    weight_decay = 0.01,
    per_device_train_batch_size=16,
    save_strategy="epoch",
    logging_strategy="epoch",
    num_train_epochs=3,
    report_to="none",
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator

)

trainer.train()


Step,Training Loss
2298,4.7231
4596,4.5773
6894,4.5397


TrainOutput(global_step=6894, training_loss=4.613343436094793, metrics={'train_runtime': 2057.828, 'train_samples_per_second': 53.585, 'train_steps_per_second': 3.35, 'total_flos': 1.4406335047139328e+16, 'train_loss': 4.613343436094793, 'epoch': 3.0})

### Perplexity

In [46]:
ppl = math.exp(trainer.evaluate()['eval_loss'])

print(ppl)

86.54143047613762


### Inference

In [97]:
inputs = tokenizer("a family", return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=10,  do_sample=True, num_beams=3)

tokenizer.decode(outputs[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'a family, a man with a wife, and a daughter'