In [None]:
# Mount Google Drive (Colab only) at /content/drive.
# The trained model is later saved under /content/drive/MyDrive/..., so this
# cell has to run before the "save model" cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install datasets transformers torch_optimizer

Collecting datasets
  Downloading datasets-2.14.0-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━

In [None]:
import numpy as np
import torch
import logging
from tqdm import tqdm
import math
import os
import torch_optimizer as optim
from transformers import AdamW
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPTNeoForCausalLM
from transformers.optimization import get_linear_schedule_with_warmup

In [None]:
# Load the TinyStories corpus and the GPT-2 tokenizer.
# `datasets` is indexed by split name ('train' / 'validation') in preprocess_data.
datasets = load_dataset('roneneldan/TinyStories')

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token: preprocess_data tokenizes
# with padding=True, which needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token

Downloading readme:   0%|          | 0.00/946 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/246M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:

def create_labels(inputs):
    """Add a ``'labels'`` entry to a tokenized batch, in place.

    Each label sequence mirrors the matching ``input_ids`` sequence, except that
    every position whose attention-mask value is 0 (padding) is replaced by
    -100, the index that the language-modeling loss skips.

    Masking is decided per position from the attention mask, so it is correct
    for right-padded and left-padded sequences alike (for right padding the
    result is identical to "keep the first sum(mask) ids, pad the rest").

    Args:
        inputs: mapping with ``'input_ids'`` and ``'attention_mask'``, each a
            sequence of equally long per-example integer sequences. A new
            ``'labels'`` key (list of lists) is written into it.

    Returns:
        None. ``inputs`` is modified in place.
    """
    ignore_index = -100
    inputs['labels'] = [
        [token if keep else ignore_index for token, keep in zip(ids, attention_mask)]
        for ids, attention_mask in zip(inputs['input_ids'], inputs['attention_mask'])
    ]

In [None]:
#class dataset
class Dataset_class:
  """Indexable view over a tokenized batch, usable with a torch DataLoader.

  Holds the 'input_ids', 'attention_mask' and 'labels' sequences of `inputs`
  and serves one example at a time as three int64 tensors.
  """

  def __init__(self, inputs):
    # Keep references to the three parallel per-example sequences.
    self.ids = inputs['input_ids']
    self.att_mask = inputs['attention_mask']
    self.labs = inputs['labels']

  def __len__(self):
    # Number of examples equals the number of input-id sequences.
    return len(self.ids)

  def __getitem__(self, item):
    # Order matters: consumers unpack the result as (ids, mask, labels).
    columns = (self.ids, self.att_mask, self.labs)
    return [torch.tensor(column[item], dtype=torch.long) for column in columns]

In [None]:
#preprocess data
def preprocess_data(dataset, tokenizer, train_size=80000, valid_size=8000,
                    test_size=8000, batch_size=4, max_length=512):
  """Tokenize slices of `dataset` and wrap them in train/valid/test DataLoaders.

  The training loader is built from the first `train_size` rows of the 'train'
  split. The validation loader uses the first `valid_size` rows of the
  'validation' split and the test loader the following `test_size` rows of
  that same split, so the two do not overlap.

  Args:
    dataset: mapping with 'train' and 'validation' entries; slicing an entry
      must yield a mapping that has a "text" list.
    tokenizer: callable tokenizer invoked with padding=True, truncation=True
      and `max_length`; its result must provide 'input_ids' and
      'attention_mask' (consumed by create_labels / Dataset_class).
    train_size: number of training rows to use.
    valid_size: number of validation rows to use.
    test_size: number of test rows to use (taken right after the validation rows).
    batch_size: batch size of all three loaders.
    max_length: truncation length passed to the tokenizer.

  Returns:
    Tuple (train_dataloader, valid_dataloader, test_dataloader). Only the
    training loader shuffles.
  """
  def build_loader(split, start, stop, shuffle):
    # Slice -> tokenize -> add labels -> wrap; one split at a time so only one
    # tokenized split is being built at any moment.
    rows = dataset[split][start:stop]
    encoded = tokenizer(rows["text"], padding=True, truncation=True, max_length=max_length)
    create_labels(encoded)
    return torch.utils.data.DataLoader(Dataset_class(encoded), shuffle=shuffle, batch_size=batch_size)

  train_dataloader = build_loader('train', 0, train_size, shuffle=True)
  valid_dataloader = build_loader('validation', 0, valid_size, shuffle=False)
  test_dataloader = build_loader('validation', valid_size, valid_size + test_size, shuffle=False)

  return train_dataloader, valid_dataloader, test_dataloader


In [None]:
# Build the three DataLoaders (train / validation / test) from the loaded
# TinyStories splits using the GPT-2 tokenizer configured above.
train_dataloader, valid_dataloader, test_dataloader = preprocess_data(datasets, tokenizer)

In [None]:
# Download the pretrained GPT-2 checkpoint with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Move the weights to the GPU (requires a CUDA runtime) and switch to eval mode;
# as the cell's last expression, model.eval() also displays the module tree.
model.to('cuda')
model.eval()

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
#evaluation
def evaluate_model(model, test_set, print_loss=False):
  """Compute and print the perplexity of `model` over `test_set`.

  The model is put in eval mode for the duration of the evaluation, so dropout
  is disabled even when this is called from the training loop right after
  model.train(); the previous train/eval mode is restored afterwards. Batches
  are moved to the device the model's parameters live on.

  Perplexity is exp(mean of the per-batch losses returned by the model).

  Args:
    model: module called as model(input_ids=..., attention_mask=..., labels=...)
      whose output holds the loss at index 0.
    test_set: iterable of (ids, mask, labels) tensor triples.
    print_loss: when True, also print the average loss.

  Returns:
    None. Results are printed.
  """
  # Follow the model's device instead of assuming 'cuda'; fall back to the
  # original 'cuda' only for a parameter-less model.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cuda')

  was_training = model.training
  model.eval()
  eval_loss = []
  try:
    with torch.no_grad():
      for ids, mask, labs in tqdm(test_set, desc="eval test"):
        output = model(input_ids=ids.to(device),
                       attention_mask=mask.to(device),
                       labels=labs.to(device))
        eval_loss.append(output[0].item())
  finally:
    # Hand the model back in whatever mode the caller had it in.
    model.train(was_training)

  eval_loss = np.mean(eval_loss)
  perplexity = math.exp(eval_loss)

  if print_loss:
    print("Average loss on evaluation set = ", eval_loss)

  print('\nPerplexity on evaluation set: ', perplexity)

In [None]:
#generate story
def generate_story(start, k=0, p=0.9, output_length=300, temperature=1, num_return_sequences=3, repetition_penalty=1.0):
  """Sample continuations of `start` from the module-level `model` and print them.

  Uses the module-level `tokenizer` for encoding/decoding. Generation runs on
  whatever device the model currently lives on; the model is not moved, so a
  later training or evaluation call still finds it where it was.

  Args:
    start: prompt text.
    k: top-k value passed to generate().
    p: top-p (nucleus) value passed to generate().
    output_length: passed to generate() as max_length.
    temperature: sampling temperature.
    num_return_sequences: number of samples to draw.
    repetition_penalty: repetition penalty passed to generate().

  Returns:
    List of decoded strings, each cut at the first end-of-sequence token when
    one is present and returned whole otherwise.
  """
  device = next(model.parameters()).device
  encoded_start = tokenizer.encode(start, add_special_tokens=False, return_tensors="pt").to(device)
  model.eval()
  output_sequences = model.generate(
      input_ids=encoded_start,
      max_length=output_length,
      temperature=temperature,
      top_k=k,
      top_p=p,
      repetition_penalty=repetition_penalty,
      do_sample=True,
      num_return_sequences=num_return_sequences,
      # Explicit pad id (the tokenizer's EOS id), so generate() does not have to pick one itself.
      pad_token_id=tokenizer.eos_token_id,
  )
  if len(output_sequences.shape) > 2:
    output_sequences.squeeze_()
  print("text generated")

  texts = []
  for generated_sequence in output_sequences:
    # Decode text
    text = tokenizer.decode(generated_sequence.tolist(), clean_up_tokenization_spaces=True)
    # Remove all text after eos token. str.find returns -1 when there is no
    # eos token; slicing with -1 would silently drop the last character.
    eos_at = text.find(tokenizer.eos_token)
    if eos_at != -1:
      text = text[:eos_at]
    print(text, "\n")
    texts.append(text)
  return texts

In [None]:
#set parameters
class Hparams:
  """Training hyper-parameters derived from the size of the training loader.

  Attributes:
    tot_epochs: number of passes over the training data.
    training_steps_per_epoch: len(train_dataloader), i.e. batches per epoch.
    total_num_training_steps: training_steps_per_epoch * tot_epochs, as int.
    learning_rate: optimizer learning rate.
    warmup_steps: int(total_num_training_steps * warmup_ratio).
  """

  def __init__(self, train_dataloader, tot_epochs=1, learning_rate=5e-5, warmup_ratio=0.1):
    """
    Args:
      train_dataloader: any sized object; only len() is taken.
      tot_epochs: number of epochs.
      learning_rate: optimizer learning rate.
      warmup_ratio: fraction of all training steps used for warmup
        (default 0.1, the value that was previously hard-coded).
    """
    self.tot_epochs = tot_epochs
    self.training_steps_per_epoch = len(train_dataloader)
    self.total_num_training_steps = int(self.training_steps_per_epoch * tot_epochs)
    self.learning_rate = learning_rate
    self.warmup_steps = int(self.total_num_training_steps * warmup_ratio)


# Hyper-parameters sized from the training loader (defaults: 1 epoch, lr 5e-5).
params = Hparams(train_dataloader)

# AdamW over every model parameter.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=params.learning_rate,
)

# Scheduler that adjusts the learning rate during training, with
# params.warmup_steps warmup steps out of params.total_num_training_steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=params.warmup_steps,
    num_training_steps=params.total_num_training_steps,
)

In [None]:
# Baseline: perplexity of the pretrained (not yet fine-tuned) model on the test loader.
print("Evaluate pre training")
evaluate_model(model, test_dataloader, print_loss=False)

Evaluate pre training


eval test: 100%|██████████| 2000/2000 [06:29<00:00,  5.13it/s]


Perplexity on evaluation set:  11.25952718483422





In [None]:
#training
def training(model, train_dataloader, valid_dataloader, params, optimizer, scheduler):
  """Fine-tune `model` for params.tot_epochs epochs, validating after each one.

  Every batch does one forward/backward pass followed by an optimizer step and
  a scheduler step. Batches are moved to the device the model's parameters
  live on. After each epoch the model is switched to eval mode before
  evaluate_model runs on `valid_dataloader`, so validation is not computed
  with dropout active; consequently the model is returned in eval mode.

  Args:
    model: module called as model(input_ids=..., attention_mask=..., labels=...)
      whose output holds the loss at index 0.
    train_dataloader: iterable of (ids, mask, labels) tensor triples.
    valid_dataloader: iterable passed to evaluate_model after each epoch.
    params: object providing `tot_epochs`.
    optimizer: optimizer stepping the model's parameters.
    scheduler: learning-rate scheduler, stepped once per batch.

  Returns:
    The same `model` object, trained in place.
  """
  print("Tot epochs: ", params.tot_epochs)

  # Follow the model's device instead of assuming 'cuda'; fall back to the
  # original 'cuda' only for a parameter-less model.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cuda')

  for epoch in range(params.tot_epochs):
    print("Start epoch", epoch + 1)

    train_loss = 0
    num_batches = 0
    epoch_iterator = tqdm(train_dataloader, desc='Epoch iteration')
    model.train()
    model.zero_grad()
    for ids, mask, labs in epoch_iterator:
      output = model(input_ids=ids.to(device),
                     attention_mask=mask.to(device),
                     labels=labs.to(device))
      batch_loss = output[0]
      batch_loss.backward()
      optimizer.step()
      scheduler.step()
      model.zero_grad()
      loss_value = batch_loss.item()
      train_loss += loss_value
      num_batches += 1
      epoch_iterator.set_description('(batch loss=%g)' % loss_value)
    # Average over the batches actually processed, so the figure stays right
    # even if `params` was built from a different loader.
    print("Average train loss = ", train_loss / max(num_batches, 1), " [Epoch: ", epoch+1, "]")

    print("Evaluate on Validation set")
    model.eval()  # validation must not run in train mode (dropout)
    evaluate_model(model, valid_dataloader, print_loss = True)

  return model

In [None]:
# Run the fine-tuning loop. training() returns the same model object it was
# given, so this rebinding keeps `model` pointing at the trained network.
model = training(model, train_dataloader, valid_dataloader, params, optimizer, scheduler)

Tot epochs:  1
Start epoch 1


(batch loss=1.58011): 100%|██████████| 20000/20000 [3:20:36<00:00,  1.66it/s]


Average train loss =  1.6900501404643058  [Epoch:  1 ]
Evaluate on Validation set


eval test: 100%|██████████| 2000/2000 [06:50<00:00,  4.87it/s]

Average loss on evaluation set =  1.6001382104754447

Perplexity on evaluation set:  4.953717032670311





In [None]:
# Save the fine-tuned model to Google Drive (Drive must be mounted first).
path_model = "/content/drive/MyDrive/EAI/Results/tinystory_gpt2_1epoch_v80.pt"
# Create the target folder first so a missing directory cannot make the save
# fail after a multi-hour training run.
os.makedirs(os.path.dirname(path_model), exist_ok=True)
# NOTE(review): this pickles the whole model object; reloading it needs a
# compatible transformers install and a trusted file.
torch.save(model, path_model)

In [None]:
# Perplexity of the fine-tuned model on the test loader, to compare with the
# pre-training baseline.
# NOTE(review): the recorded output of this cell shows 1250 batches while the
# pre-training evaluation shows 2000, so the two saved outputs appear to come
# from different runs - re-run both before comparing the numbers.
print("Evaluate post training")
evaluate_model(model, test_dataloader, print_loss=False)

Evaluate post training


eval test: 100%|██████████| 1250/1250 [04:45<00:00,  4.37it/s]


Perplexity on evaluation set:  5.342459341600658





In [None]:
# Sample stories from the fine-tuned model.
# NOTE(review): `prompt` and `target` are not defined in any visible cell, so this
# fails on a fresh kernel unless they are set elsewhere - TODO define them.
# NOTE(review): the second positional argument binds to `k` (top-k) in
# generate_story; confirm that passing `target` there is intended.
texts = generate_story(prompt,target)