In [None]:
pip install transformers[torch]


In [None]:
pip install evaluate

In [3]:
import torch
import json
import os
import random
import sys
from torch import nn
import numpy as np
from tqdm import tqdm
from itertools import chain
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
from torch.utils.data import DataLoader, RandomSampler
from utils import *
from transformers import AdamW
from transformers import get_constant_schedule_with_warmup
import pandas as pd
#Import BART Model
from transformers import PreTrainedModel, PretrainedConfig,AutoConfig
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer,GPT2Config

SEED = 42
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f5b9c49a1b0>

In [4]:
%reload_ext autoreload
%autoreload 2
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

##### Prompt and Labels less than max_length


In [5]:
filename = 'train.json'
model_name='gpt2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
pairs = get_supervised_data(filename)

In [7]:
# Calculate the percentage of prompts with a length less than 1024 tokens
prompts_len = [len(tokenizer(pair[0])['input_ids']) for pair in pairs]
percentage = sum(1 for len_ in prompts_len if len_ < 1024) / len(prompts_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (1712 > 1024). Running this sequence through the model will result in indexing errors


In [8]:
# Calculate the percentage of labels with a length less than 1024 tokens

labels_len=[len(tokenizer(pair[1])['input_ids']) for pair in pairs]
(sum(1 for len_ in labels_len if len_ < 1024) / len(labels_len))

0.9877387408630041

###Load tokenizer and model with special tokens


In [9]:

base_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# special tokens are defined
bos = '<|endoftext|>'
eos = '<|EOS|>'
body = '<|body|>'
additional_special_tokens = [body]

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': '<pad>',
                       'sep_token': body}
                      #  'additional_special_tokens':additional_special_tokens}


# the new token is added to the tokenizer
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# model configuration to which we add the special tokens
config = AutoConfig.from_pretrained('gpt2',
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    sep_token_id=base_tokenizer.sep_token_id,
                                    output_hidden_states=False)

# we load the pre-trained model with custom settings
base_model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)

# model embeding resizing
base_model.resize_token_embeddings(len(base_tokenizer))

Embedding(50260, 768)

### Dataset

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

def tokenize_function(examples):
        return base_tokenizer(examples['text'],truncation=True, padding=True)

def get_dataframe(filename):
  pairs = get_supervised_data(filename)
  #pairs = random.sample(pairs, int(len(pairs) * 0.2))  # take 20% of the final data (just for now)

  dataframe = pd.DataFrame(pairs, columns=["Question", "Answer"])

  prepare_text = lambda x: ' '.join([bos, x['Question'], body, x['Answer'], eos])
  dataframe['text'] = dataframe.apply(prepare_text, axis=1)
  return dataframe['text']

training_dataset   = get_dataframe('train.json')
validation_dataset = get_dataframe('val.json')

train_dataset = Dataset.from_pandas(pd.DataFrame(training_dataset))
val_dataset = Dataset.from_pandas(pd.DataFrame(validation_dataset))
train_dataset

In [11]:
# tokenization
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1
)

tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1
)

Map:   0%|          | 0/4241 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

In [12]:
tokenized_train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 4241
})

### Fine-tune the model

In [13]:
from transformers import DataCollatorForLanguageModeling
model_articles_path = 'fine_tuned_model'

training_args = TrainingArguments(
    output_dir=model_articles_path,          # output directory
    num_train_epochs=2,                      # total # of training epochs
    per_device_train_batch_size=1,           # batch size per device during training
    per_device_eval_batch_size=32,           # batch size for evaluation
    warmup_steps=200,                        # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                       # strength of weight decay
    logging_dir=model_articles_path,         # directory for storing logs
    prediction_loss_only=True,
    save_steps=500
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=base_tokenizer,
        mlm=False
    )

trainer = Trainer(
    model=base_model,                         # the instantiated model to be trained
    args=training_args,                       # training arguments
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset,            # evaluation dataset

)

In [14]:
import gc
torch.cuda.empty_cache()

gc.collect()

197

In [15]:
trainer.train()




Step,Training Loss
500,7.2718
1000,2.3192
1500,2.1134
2000,2.0772
2500,1.9872
3000,2.0224
3500,1.9566
4000,1.914
4500,1.8226
5000,1.7021


TrainOutput(global_step=8482, training_loss=2.1922305412265497, metrics={'train_runtime': 3297.346, 'train_samples_per_second': 2.572, 'train_steps_per_second': 2.572, 'total_flos': 4432558030848000.0, 'train_loss': 2.1922305412265497, 'epoch': 2.0})

In [18]:
base_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50260, bias=False)
)

In [34]:
def generate(question: str, MAX_SEQ_LENGTH=1024):
  input_ids =tokenizer(question, return_tensors="pt").input_ids.to(DEVICE)
  args = {
        "max_new_tokens": 300,
        "do_sample": True,
        "num_beams": 4,
        "top_k": 0,
        "top_p": 0.9,
        "num_return_sequences": 1,
        "temperature": 0.8,
        "length_penalty": 0.5,
  }
  outputs = base_model.generate(input_ids, **args)
  answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return answer