In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

In [3]:
import torch
from transformers import AutoConfig, AutoModel, GPT2Tokenizer, TextDataset, TrainingArguments
from transformers import DataCollatorForLanguageModeling, Trainer, AutoTokenizer, GPT2LMHeadModel

In [None]:
config = AutoConfig.from_pretrained("/content/drive/MyDrive/NLP/final_model")
model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/NLP/final_model")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [5]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/drive/MyDrive/NLP/dataset_1.csv",
    block_size=128
)

validation_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/drive/MyDrive/NLP/validation.csv",
    block_size=128
)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/drive/MyDrive/NLP/test.csv",
    block_size=128
)



In [6]:
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

____

In [4]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [5]:
print(model.parameters())

<generator object Module.parameters at 0x7a8a7465b3e0>


In [7]:
config.n_embd, config.vocab_size

(768, 50257)

Just in case- the tokenization process

In [None]:
input_ids = []
maximum_sequence_length = config.n_embd

for text in train_dataset:
  tokens = tokenizer.encode(text, add_special_tokens=True, max_length=maximum_sequence_length, truncation=True)
  if len(tokens) < maximum_sequence_length:
    tokens = tokens + [tokenizer.pad_token_id] * (maximum_sequence_length - len(tokens))
  else:
    tokens = tokens[:maximum_sequence_length]

  input_ids.append(tokens)

input_ids = torch.tensor(input_ids)

In [9]:
# train_loader = DataLoader(train_dataset, batch_size=4 , shuffle=True)

device = ('cuda' if torch.cuda.is_available() else 'cpu')

class CustomModel(torch.nn.Module):
  def __init__(self, pretrained_model, config):
    super(CustomModel, self).__init__()
    self.transformer = pretrained_model
    self.config = config
    self.lm_head = torch.nn.Linear(self.config.n_embd, self.config.vocab_size, bias=False)

    # Additional layers
    self.Linear1 = torch.nn.Linear(self.config.n_embd, 512)
    self.Linear2 = torch.nn.Linear(512, self.config.n_embd)
    self.Linear3 = torch.nn.Linear(self.config.n_embd, self.config.vocab_size)

  def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    outputs = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    hidden_states = outputs.last_hidden_state

    hidden_states = self.Linear1(hidden_states)
    hidden_states = torch.nn.GELU(hidden_states)
    hidden_states = self.Linar2(hidden_states)
    hidden_states = torch.nn.GELU(hidden_states)
    logits = self.Linear3(hidden_states)

    return logits



In [10]:
model = CustomModel(model, config)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = torch.nn.CrossEntropyLoss()

In [11]:
train_loader = DataLoader(train_dataset, batch_size=4 , shuffle=True)

In [None]:
epochs=2
for epoch in range(epochs):
  model.train()
  total_loss = 0.

  for batch in train_loader:
    input_ids, attention_mask, token_type_ids, targets = batch
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask, token_type_ids)

In [None]:
model = CustomTextGenerator(model)

# Set hyperparameters and optimizer
optimizer = AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

epochs = 2
# Training loop
for epoch in range(epochs):
    for batch in train_loader:
        input_ids, attention_mask, target_ids = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = torch.nn.CrossEntropyLoss()(outputs.view(-1, model.config.vocab_size), target_ids.view(-1))
        loss.backward()
        optimizer.step()

# Save the fine-tuned model
# torch.save(model.state_dict(), 'fine_tuned_model.pth')

____

In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [None]:
training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/NLP/trained_again_model",
    overwrite_output_dir=False,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    save_steps=1000,
    save_total_limit=1,
    logging_dir = "/content/drive/MyDrive/NLP/logs",
    save_strategy = 'steps',
    evaluation_strategy = "steps",
    eval_steps=1000,
    logging_steps = 100,
    do_train = True,
    do_eval = True,
    load_best_model_at_end = True,
    remove_unused_columns = True,
    weight_decay = 0.01, # L2 Regularization
    warmup_steps = 2000
)

In [None]:
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)

In [None]:
trainer.train()

TypeError: ignored