In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

In [3]:
import torch
from transformers import AutoConfig, AutoModel, GPT2Tokenizer, TextDataset, TrainingArguments
from transformers import DataCollatorForLanguageModeling, Trainer, AutoTokenizer, GPT2LMHeadModel

In [111]:
config = AutoConfig.from_pretrained("/content/drive/MyDrive/NLP/final_model")
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/NLP/final_model")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [92]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/drive/MyDrive/NLP/dataset_1.csv",
    block_size=128
)

validation_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/drive/MyDrive/NLP/validation.csv",
    block_size=128
)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/drive/MyDrive/NLP/test.csv",
    block_size=128
)



In [6]:
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

____

In [None]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [None]:
print(model.parameters())

<generator object Module.parameters at 0x7a8a7465b3e0>


In [None]:
config.n_embd, config.vocab_size

(768, 50257)

Just in case- the tokenization process

In [None]:
input_ids = []
maximum_sequence_length = config.n_embd

for text in train_dataset:
  tokens = tokenizer.encode(text, add_special_tokens=True, max_length=maximum_sequence_length, truncation=True)
  if len(tokens) < maximum_sequence_length:
    tokens = tokens + [tokenizer.pad_token_id] * (maximum_sequence_length - len(tokens))
  else:
    tokens = tokens[:maximum_sequence_length]

  input_ids.append(tokens)

input_ids = torch.tensor(input_ids)

In [112]:
# train_loader = DataLoader(train_dataset, batch_size=4 , shuffle=True)

device = ('cuda' if torch.cuda.is_available() else 'cpu')

class CustomModel(torch.nn.Module):
  def __init__(self, pretrained_model, config):
    super(CustomModel, self).__init__()
    self.transformer = pretrained_model
    self.config = config
    # self.lm_head = torch.nn.Linear(self.config.n_embd, self.config.vocab_size, bias=False)

    # Additional layers
    self.Linear1 = torch.nn.Linear(self.config.vocab_size, 512)
    self.Linear2 = torch.nn.Linear(512, self.config.n_embd)
    self.Linear3 = torch.nn.Linear(self.config.n_embd, self.config.vocab_size)

  def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    outputs = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    hidden_states = outputs.logits
    # print(torch.isfinite(hidden_states).sum().item())
    hidden_states = self.Linear1(hidden_states)
    hidden_states = torch.nn.functional.gelu(hidden_states)
    hidden_states = self.Linear2(hidden_states)
    hidden_states = torch.nn.functional.gelu(hidden_states)
    logits = self.Linear3(hidden_states)

    return logits



In [85]:
class additional(torch.nn.Module):
  def __init__(self, config):
    super(additional, self).__init__()
    self.config = config

    # Additional layers
    self.Linear1 = torch.nn.Linear(self.config.vocab_size, 512)
    self.Linear2 = torch.nn.Linear(512, self.config.n_embd)
    self.Linear3 = torch.nn.Linear(self.config.n_embd, self.config.vocab_size)

  def forward(self, hidden_states, attention_mask=None, token_type_ids=None):

    hidden_states = self.Linear1(hidden_states)
    hidden_states = torch.nn.ReLU(hidden_states)
    hidden_states = self.Linear2(hidden_states)
    hidden_states = torch.nn.ReLU(hidden_states)
    logits = self.Linear3(hidden_states)

    return logits

In [86]:
for param in model.parameters():
  param.requires_grad = False

In [88]:
model.add_module('add', additional(config))
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = torch.nn.CrossEntropyLoss()

In [113]:
model = CustomModel(model, config)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = torch.nn.CrossEntropyLoss()

In [95]:
model

CustomModel(
  (transformer): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (Linear1): Linear(in_feature

In [75]:
train_loader = DataLoader(train_dataset, batch_size=4 , shuffle=True)
val_loader = DataLoader(validation_dataset, batch_size=4, shuffle=True)

# Execute this cell only if you want to load a model

In [None]:
checkpoint = torch.load('model_checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
total_loss = checkpoint['training_loss']
validation_loss = checkpoint['validation_loss']
step = checkpoint['steps']

I will have to write different training loop for training my model from a checkpoint

In [None]:
from tqdm import tqdm

device = ('cuda' if torch.cuda.is_available() else 'cpu')
epochs=2
step = 0

%%time
for epoch in range(epochs):
  model.train()
  total_loss = 0.
  train_loader = tqdm(train_loader, total=len(train_loader))

  for batch in train_loader:
    input_ids, attention_mask, token_type_ids, targets = batch
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask, token_type_ids)
    loss = criterion(outputs.view(-1, config.vocab_size), targets.view(-1))
    loss.backward()
    total_loss += loss.item()
    train_loader.set_description(f"Epoch: {epoch+1}")
    train_loader.set_postfix(loss=total_loss)
    step += 1
    if step % 1000 == 0:
      print(f"Epoch: {epoch+1}, Loss: {total_loss/len(train_loader)}")

      # VALIDATION
      model.eval()
      validation_loss = 0.

      with torch.no_grad():
        for batch in val_loader:
          input_ids, attention_mask, token_type_ids, targets = batch
          outputs = model(input_ids, attention_mask, token_type_ids)
          loss = criterion(outputs.view(-1, config.vocab_size), targets.view(-1))
          validation_loss += loss.item()
      average_validation_loss = validation_loss / len(val_loader)
      print(f"Epoch: {epoch+1}, Validation Loss: {average_validation_loss}")

      # SAVE A CHECKPOINT
      torch.save({
          'epoch': epoch,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'training_loss': total_loss,
          'validation_loss': validation_loss,
          'steps': step
      }, "model_checkpoint.pth")

torch.save(model, 'model.pth')

In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [None]:
training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/NLP/trained_again_model",
    overwrite_output_dir=False,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    save_steps=1000,
    save_total_limit=1,
    logging_dir = "/content/drive/MyDrive/NLP/logs",
    save_strategy = 'steps',
    evaluation_strategy = "steps",
    eval_steps=1000,
    logging_steps = 100,
    do_train = True,
    do_eval = True,
    load_best_model_at_end = True,
    remove_unused_columns = True,
    weight_decay = 0.01, # L2 Regularization
    warmup_steps = 2000
)

In [None]:
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)

In [None]:
trainer.train()

TypeError: ignored