In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

In [3]:
import torch
from transformers import AutoConfig, AutoModel, GPT2Tokenizer, TextDataset, TrainingArguments
from transformers import DataCollatorForLanguageModeling, Trainer, AutoTokenizer, GPT2LMHeadModel

In [15]:
# Directory the GPT-2 config and weights are read from.
FINAL_MODEL_DIR = "/content/drive/MyDrive/NLP/final_model"

# The tokenizer is the stock "gpt2" one; it is not loaded from FINAL_MODEL_DIR.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(FINAL_MODEL_DIR)
config = AutoConfig.from_pretrained(FINAL_MODEL_DIR)

## Dataset for testing

In [None]:
# Small train/validation files for test runs, both built with block_size=128.
train_dataset, validation_dataset = [
    TextDataset(tokenizer=tokenizer, file_path=csv_path, block_size=128)
    for csv_path in (
        '/content/drive/MyDrive/NLP/t.csv',
        "/content/drive/MyDrive/NLP/v.csv",
    )
]

## Dataset for actual training

In [None]:
# Full train/validation/test files, all built with block_size=128.
# Running this cell replaces train_dataset and validation_dataset from the test-run cell.
train_dataset, validation_dataset, test_dataset = [
    TextDataset(tokenizer=tokenizer, file_path=csv_path, block_size=128)
    for csv_path in (
        "/content/drive/MyDrive/NLP/dataset_1.csv",
        "/content/drive/MyDrive/NLP/validation.csv",
        "/content/drive/MyDrive/NLP/test.csv",
    )
]



In [None]:
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

____

In [None]:
# Print the module tree of the loaded GPT-2 language model.
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [None]:
# NOTE: parameters() returns a generator, so this prints the generator's repr, not any weights.
print(model.parameters())

<generator object Module.parameters at 0x7a8a7465b3e0>


In [None]:
# Show the embedding width (n_embd) and vocabulary size from the loaded config.
config.n_embd, config.vocab_size

(768, 50257)

Just in case — the tokenization process

In [None]:
# Pad/truncate every training example to one fixed length and stack the rows into a tensor.
# The fixed length is the model's context window (n_positions). n_embd is the embedding
# width and is unrelated to sequence length.
maximum_sequence_length = config.n_positions

# The stock GPT-2 tokenizer defines no pad token, so fall back to EOS for padding;
# otherwise the padded rows would contain None and torch.tensor() would fail.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
  pad_token_id = tokenizer.eos_token_id

input_ids = []
for example in train_dataset:
  if isinstance(example, str):
    tokens = tokenizer.encode(example, add_special_tokens=True, max_length=maximum_sequence_length, truncation=True)
  else:
    # Examples that are already token ids (e.g. tensors from TextDataset) are used as they are.
    tokens = torch.as_tensor(example).tolist()[:maximum_sequence_length]

  # A non-positive pad count yields an empty list, so full-length rows are left untouched.
  tokens = tokens + [pad_token_id] * (maximum_sequence_length - len(tokens))
  input_ids.append(tokens)

input_ids = torch.tensor(input_ids)

In [16]:
# train_loader = DataLoader(train_dataset, batch_size=4 , shuffle=True)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = ('cuda' if torch.cuda.is_available() else 'cpu')

class CustomModel(torch.nn.Module):
  """Pretrained language model followed by a small feed-forward head.

  The wrapped model's vocabulary-sized logits are projected
  vocab_size -> 512 -> n_embd -> vocab_size, with GELU and dropout between the
  linear layers, and the result is returned as the new logits.

  Args:
    pretrained_model: module whose forward output exposes ``.logits``.
    config: object providing ``vocab_size`` and ``n_embd``.
  """

  def __init__(self, pretrained_model, config):
    super().__init__()
    self.transformer = pretrained_model
    self.config = config

    # Additional layers. Attribute names are kept stable so saved state_dicts still load.
    # NOTE(review): the head consumes the vocab-sized logits rather than the hidden states,
    # which makes Linear1 a vocab_size x 512 matrix; confirm this is intended.
    self.Linear1 = torch.nn.Linear(self.config.vocab_size, 512)
    self.Dropout1 = torch.nn.Dropout(0.1)
    self.Linear2 = torch.nn.Linear(512, self.config.n_embd)
    self.Dropout2 = torch.nn.Dropout(0.1)
    self.Linear3 = torch.nn.Linear(self.config.n_embd, self.config.vocab_size)

  def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    """Run the wrapped model, then the extra head; returns logits with vocab_size features."""
    outputs = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    hidden_states = outputs.logits

    hidden_states = self.Linear1(hidden_states)
    hidden_states = torch.nn.functional.gelu(hidden_states)
    hidden_states = self.Dropout1(hidden_states)
    hidden_states = self.Linear2(hidden_states)
    hidden_states = torch.nn.functional.gelu(hidden_states)
    hidden_states = self.Dropout2(hidden_states)
    logits = self.Linear3(hidden_states)

    return logits

  def generate_text(self, input_ids, max_length=50, temperature=0.9, top_k=50, top_p=0.9):
    """Sample ``max_length`` new tokens and return them appended to ``input_ids``.

    Args:
      input_ids: (batch, sequence) tensor of prompt token ids.
      max_length: number of tokens to append (one per loop iteration).
      temperature: divisor applied to the last-position logits before filtering.
      top_k: keep only the ``top_k`` highest-scoring tokens (0 disables the cut).
      top_p: nucleus threshold on cumulative probability (1.0 disables the cut).

    Returns:
      Tensor holding the prompt followed by the sampled token ids.
    """
    with torch.no_grad():
      generated_ids = input_ids.clone()

      for _ in range(max_length):
        logits = self(generated_ids)
        # Only the last position predicts the next token.
        logits = logits[:, -1, :] / temperature
        filtered_logits = self.top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
        probabilities = torch.nn.functional.softmax(filtered_logits, dim=-1)
        predicted_token = torch.multinomial(probabilities, 1)
        generated_ids = torch.cat((generated_ids, predicted_token), dim=-1)
      return generated_ids

  @staticmethod
  def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf')):
    """Mask logits outside the top-k set and outside the top-p nucleus.

    Args:
      logits: tensor whose last dimension is the vocabulary.
      top_k: if > 0, every token scoring below the k-th best is masked.
      top_p: if < 1.0, tokens are kept in descending probability order until their
        cumulative probability first exceeds ``top_p``; the rest are masked. The
        most probable token is always kept.
      filter_value: value written into masked positions.

    Returns:
      A tensor shaped like ``logits`` with masked positions set to ``filter_value``.
      The input tensor is not modified.
    """
    if top_k > 0:
      k = min(top_k, logits.size(-1))
      # Score of the k-th best token per row; anything strictly below it is dropped.
      kth_best = torch.topk(logits, k, dim=-1).values[..., -1:]
      logits = logits.masked_fill(logits < kth_best, filter_value)

    if top_p < 1.0:
      sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
      cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
      remove_sorted = cumulative_probs > top_p
      # Shift right so the token that crosses the threshold is still kept.
      remove_sorted[..., 1:] = remove_sorted[..., :-1].clone()
      remove_sorted[..., 0] = False
      # Map the mask from sorted order back to vocabulary order.
      remove = torch.zeros_like(remove_sorted).scatter(-1, sorted_indices, remove_sorted)
      logits = logits.masked_fill(remove, filter_value)

    return logits

In [None]:
# Wrap the pretrained GPT-2 in the custom head. If this cell is re-run, `model` is already a
# CustomModel; unwrap it first so the head is rebuilt instead of nesting wrappers.
pretrained_model = model.transformer if isinstance(model, CustomModel) else model
model = CustomModel(pretrained_model, config)

# The optimizer is created over every parameter of the wrapped model (backbone and head).
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Display the wrapped model's module tree.
model

CustomModel(
  (transformer): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (Linear1): Linear(in_feature

In [None]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=4 , shuffle=True, drop_last=True)
# Validation keeps a fixed order: the training loop only evaluates a capped number of
# batches, and shuffling would score a different random subset at every evaluation.
val_loader = DataLoader(validation_dataset, batch_size=4, shuffle=False, drop_last=True)

# Execute this cell only if you want to load a model

In [None]:
# Restore model and optimizer state from the checkpoint file the training loop writes.
# map_location lets a checkpoint saved on GPU be opened on a CPU-only runtime.
checkpoint = torch.load(
    '/content/drive/MyDrive/NLP/trained_again_model/model_checkpoint.pth',
    map_location=device,
)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Bookkeeping stored next to the weights.
# NOTE: the training cell below reassigns `step` and `total_loss` when it starts.
epoch = checkpoint['epoch']
total_loss = checkpoint['training_loss']
validation_loss = checkpoint['validation_loss']
step = checkpoint['steps']

I will have to write a different training loop to resume training from a checkpoint.

In [None]:
%%time
from tqdm import tqdm

device = ('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

epochs=2
step = 0
model.train()
total_loss = 0.

for epoch in range(epochs):
  train_loader = tqdm(train_loader, total=len(train_loader))

  if epoch < 2:
    for param in model.transformer.parameters():
      param.requires_grad = False
  else:
    for param in model.transformer.parameters():
      param.requires_grad = True
  for batch in train_loader:
    input_ids, attention_mask, token_type_ids, targets = batch

    input_ids = input_ids.to(device)
    targets = targets.to(device)

    optimizer.zero_grad()
    outputs = model(input_ids)
    loss = criterion(outputs.view(-1, config.vocab_size), targets.view(-1))
    loss.backward()
    # total_loss += loss.item()
    train_loader.set_description(f"Epoch {epoch+1}")
    train_loader.set_postfix(loss=loss.item())
    step += 1
    if step % 1000 == 0:
      print(f"\nEpoch: {epoch+1}, Loss: {loss.item()}")

      # VALIDATION
      model.eval()
      validation_loss = 0.
      val_step = 0
      with torch.no_grad():
        for batch in val_loader:
          input_ids, attention_mask, token_type_ids, targets = batch
          input_ids = input_ids.to(device)
          targets = targets.to(device)
          outputs = model(input_ids)
          loss = criterion(outputs.view(-1, config.vocab_size), targets.view(-1))
          # validation_loss += loss.item()
          train_loader.set_description(f"Epoch {epoch+1}")
          train_loader.set_postfix(val_loss=loss.item())
          val_step += 1
          if val_step == 501:
            break
      print(f"\nEpoch: {epoch+1}, Validation Loss: {loss.item()}")

      # SAVE A CHECKPOINT
      torch.save({
          'epoch': epoch,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'training_loss': total_loss,
          'validation_loss': validation_loss,
          'steps': step
      }, "/content/drive/MyDrive/NLP/trained_again_model/model_checkpoint.pth")

      # This is questionable moment
      model.train()

torch.save(model, '/content/drive/MyDrive/NLP/trained_again_model/model.pth')

Epoch 1:  88%|████████▊ | 999/1129 [00:33<00:04, 27.08it/s, val_loss=13.6]


Epoch: 1, Loss: 11.898210525512695


Epoch 1:  88%|████████▊ | 999/1129 [00:44<00:04, 27.08it/s, val_loss=14.1]


Epoch: 1, Validation Loss: 14.080435752868652


Epoch 1: 100%|██████████| 1129/1129 [00:54<00:00, 20.80it/s, loss=12.3]
Epoch 2:  77%|███████▋  | 870/1129 [00:32<00:08, 31.96it/s, val_loss=12]  


Epoch: 2, Loss: 11.477363586425781


Epoch 2:  77%|███████▋  | 870/1129 [00:43<00:08, 31.96it/s, val_loss=11.8]


Epoch: 2, Validation Loss: 11.777347564697266


Epoch 2: 100%|██████████| 1129/1129 [00:54<00:00, 20.74it/s, loss=13]


CPU times: user 1min 31s, sys: 3.38 s, total: 1min 35s
Wall time: 1min 52s


# Test the model

In [17]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")


# Load your fine-tuned model checkpoint
# The file is a pickle of the whole model object (written with torch.save(model, ...)), so
# the CustomModel class must already be defined in this session. No CustomModel needs to
# be built here first; it would be replaced by the loaded object straight away.
# SECURITY: unpickling can execute arbitrary code; only load files you created yourself.
# NOTE(review): newer PyTorch releases default torch.load to weights_only=True, which
# rejects full-model pickles; pass weights_only=False there.
checkpoint_path = '/content/drive/MyDrive/NLP/trained_again_model/model.pth'
model = torch.load(checkpoint_path, map_location='cpu')  # Make sure to specify the device you want to use

model.eval()  # Set the model to evaluation mode

# Define a function to chat with the model
def chat_with_model(prompt, max_length=50):
    """Return the decoded prompt plus sampled continuation for ``prompt``.

    ``max_length`` is used twice: as the truncation limit for the encoded prompt and
    as the ``max_length`` argument handed to ``model.generate_text``.
    """
    prompt_ids = tokenizer.encode(prompt, return_tensors='pt', truncation=True, max_length=max_length)
    with torch.no_grad():
        generated = model.generate_text(
            prompt_ids,
            max_length=max_length,
            temperature=0.9,
            top_k=50,
            top_p=0.9,
        )
    # Decode the first (only) sequence of the batch, dropping special tokens.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Start a conversation
# Keep answering prompts until the user types "exit" (case-insensitive).
while True:
    user_input = input("You: ")
    if user_input.lower() != "exit":
        response = chat_with_model(user_input)
        print("Chatbot:", response)
        continue
    print("Chatbot: Goodbye!")
    break

You: What is finance?
Chatbot: What is finance? Secondpop bladesomic unequ retain nerv velvet YES paired blockade DairyacaNitettel Kab plead survivor pursuit Adventurevezfuture Odin Noct fracturedCEprintf substitute ASPRS largerLie Hungry Valent dynasty PostedremlinCOLOR missing Sk'," runners commanded contacting FDA�riticcommittee merchant succeeding
You: exit
Chatbot: Goodbye!
