<a href="https://colab.research.google.com/github/Lmalviya/machineTranslationTask/blob/main/chatbot_using_gpt_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scope of this notebook
1. Fine tuen GPT2 model for Mental Health Counseling Chat bot

In [1]:
!pip install datasets
!pip install -U transformers



In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import warnings
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
# import torchmetrics

In [3]:
def get_config():
  return {
      'datasetName': "Amod/mental_health_counseling_conversations",
      'split': 'train',
      'max_len': 1024,
      'batch_size': 2,
      'shuffle':True,
      'lr': 1e-04,
      'num_epochs': 2,
  }

In [4]:
class DataPreprocess(Dataset):
  def __init__(self, data, tokenizer, max_len:int = 1024):
    super().__init__()

    self.data = data
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    items = self.data[idx]
    context = items['Context']
    response =  items['Response']

    input_text = f"{context}\n [ANSWER]: {response}"

    input_tokens = self.tokenizer(input_text, return_tensors='pt')

    pad_size = self.max_len - input_tokens['input_ids'].size(1)


    input_ids = torch.cat((
        input_tokens['input_ids'],
        torch.tensor([self.tokenizer.encode('[PAD]')]*pad_size, dtype=torch.int64).view(1, pad_size)
    ), -1)


    input_mask = torch.cat((
        input_tokens['attention_mask'],
        torch.tensor([0]*pad_size, dtype=torch.int64).view(1, pad_size)
    ), -1)

    return {
        'input_ids': input_ids,
        'input_mask': input_mask,
        'context': context,
        'responce': response
    }

In [5]:
def get_data(config, tokenizer):
  raw_data = load_dataset(config['datasetName'], split=config['split'])
  num_of_data_points = len(raw_data)

  train_size = int(0.8*num_of_data_points)
  val_size = num_of_data_points - train_size
  train_raw, val_raw = random_split(raw_data, [train_size, val_size])

  train_class = DataPreprocess(train_raw, tokenizer, config['max_len'])
  val_class = DataPreprocess(val_raw, tokenizer, config['max_len'])

  train_ds = DataLoader(train_class, batch_size=config['batch_size'], shuffle=config['shuffle'])
  val_ds = DataLoader(val_class, batch_size=config['batch_size'], shuffle=config['shuffle'])

  return train_ds, val_ds



In [6]:
def get_model_and_tokenizer():
  tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
  tokenizer.add_special_tokens({
      'pad_token': '[PAD]',
      'bos_token': '[SOS]',
      'eos_token': '[EOS]'
  })
  tokenizer.add_tokens('[ANSWER]:')

  model = GPT2LMHeadModel.from_pretrained('gpt2')
  model.resize_token_embeddings(len(tokenizer))
  return model, tokenizer

In [7]:
def evaluate(model, data, device):
  model.eval()

  batch_iterator = tqdm(data)

  cumulative_loss = 0.0
  for batch in batch_iterator:
    input_ids = data['input_ids'].to(device)
    input_mask = data['input_mask'].to(device)
    loss = model(input_ids, attention_mask=input_mask, labels=input_ids).loss
    batch_iterator.set_postfix({'val loss': f"{loss.item():6.3f}"})
    cumulative_loss += loss.item()

  return cumulative_loss/len(data)


In [8]:
def train_model(config):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f"Current device is used: {device}")

  model, tokenizer = get_model_and_tokenizer()
  model.to(device)

  train_ds, val_ds = get_data(config, tokenizer)

  # Tensorboard
  writer = SummaryWriter('Mental Health Counseling Chat Boat')
  optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

  global_step = 0

  for epoch in range(config['num_epochs']):
    torch.cuda.empty_cache()
    model.train()

    batch_iterator = tqdm(train_ds, desc=f"Processing Epoch: {epoch:02d}")
    for batch in batch_iterator:
      optimizer.zero_grad()
      input_ids = batch['input_ids'].to(device) # (batch, seq_len)
      input_mask = batch['input_mask'].to(device) #(batch, seq_len)

      loss = model(input_ids, attention_mask=input_mask, labels=input_ids).loss
      batch_iterator.set_postfix({ "train loss": f"{loss.item():6.3f}" })

      writer.add_scalar("train loss", loss.item(), epoch)
      writer.flush()

      loss.backward()
      optimizer.step()

    val_loss = evaluate(model, val_ds, device)
    writer.add_scalar("val loss", loss.item(), epoch)
    writer.flush()

    return writer


In [9]:
config = get_config()
wt = train_model(config)

Current device is used: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Processing Epoch: 00:   0%|          | 0/1405 [00:00<?, ?it/s]

551
631


Processing Epoch: 00:   0%|          | 1/1405 [00:01<43:48,  1.87s/it, train loss=58.012]

934
634


Processing Epoch: 00:   0%|          | 2/1405 [00:02<22:18,  1.05it/s, train loss=22.543]

828
827


Processing Epoch: 00:   0%|          | 3/1405 [00:02<19:41,  1.19it/s, train loss=13.799]

858
825


Processing Epoch: 00:   0%|          | 4/1405 [00:03<17:53,  1.31it/s, train loss=9.209]

757
692


Processing Epoch: 00:   0%|          | 5/1405 [00:04<17:13,  1.35it/s, train loss=12.652]

696
772


Processing Epoch: 00:   0%|          | 6/1405 [00:04<16:45,  1.39it/s, train loss=9.053]

887
767


Processing Epoch: 00:   0%|          | 7/1405 [00:05<16:20,  1.43it/s, train loss=3.655]

831
591


Processing Epoch: 00:   1%|          | 8/1405 [00:06<16:09,  1.44it/s, train loss=9.370]

807
95


Processing Epoch: 00:   1%|          | 9/1405 [00:06<16:02,  1.45it/s, train loss=9.167]

552
744


Processing Epoch: 00:   1%|          | 10/1405 [00:07<15:53,  1.46it/s, train loss=3.597]

916
829


Processing Epoch: 00:   1%|          | 11/1405 [00:08<15:50,  1.47it/s, train loss=2.279]

793
711


Processing Epoch: 00:   1%|          | 12/1405 [00:08<15:48,  1.47it/s, train loss=2.426]

453
940


Processing Epoch: 00:   1%|          | 13/1405 [00:09<16:00,  1.45it/s, train loss=2.573]

731
669


Processing Epoch: 00:   1%|          | 14/1405 [00:10<15:59,  1.45it/s, train loss=2.558]

913
927


Processing Epoch: 00:   1%|          | 15/1405 [00:11<16:19,  1.42it/s, train loss=1.323]

876
301


Processing Epoch: 00:   1%|          | 16/1405 [00:11<15:48,  1.46it/s, train loss=3.431]

659
903


Processing Epoch: 00:   1%|          | 17/1405 [00:12<15:38,  1.48it/s, train loss=1.981]

848
790


Processing Epoch: 00:   1%|▏         | 18/1405 [00:13<15:39,  1.48it/s, train loss=1.641]

610
922


Processing Epoch: 00:   1%|▏         | 19/1405 [00:13<15:40,  1.47it/s, train loss=1.853]

903
813


Processing Epoch: 00:   1%|▏         | 20/1405 [00:14<15:49,  1.46it/s, train loss=1.286]

837
796


Processing Epoch: 00:   1%|▏         | 21/1405 [00:15<15:43,  1.47it/s, train loss=1.448]

749
448


Processing Epoch: 00:   2%|▏         | 22/1405 [00:15<15:37,  1.47it/s, train loss=2.703]

438
803


Processing Epoch: 00:   2%|▏         | 23/1405 [00:16<15:39,  1.47it/s, train loss=2.501]

868
819


Processing Epoch: 00:   2%|▏         | 24/1405 [00:17<15:39,  1.47it/s, train loss=1.154]

746
765


Processing Epoch: 00:   2%|▏         | 25/1405 [00:17<15:55,  1.44it/s, train loss=1.724]

630
854


Processing Epoch: 00:   2%|▏         | 26/1405 [00:18<15:33,  1.48it/s, train loss=1.820]

766
785


Processing Epoch: 00:   2%|▏         | 27/1405 [00:19<15:43,  1.46it/s, train loss=1.568]

710
898


Processing Epoch: 00:   2%|▏         | 28/1405 [00:19<15:35,  1.47it/s, train loss=1.402]

910
576


Processing Epoch: 00:   2%|▏         | 29/1405 [00:20<15:39,  1.46it/s, train loss=1.727]

908
807


Processing Epoch: 00:   2%|▏         | 30/1405 [00:21<15:40,  1.46it/s, train loss=1.056]

424
952


Processing Epoch: 00:   2%|▏         | 31/1405 [00:21<15:42,  1.46it/s, train loss=1.996]

947
856


Processing Epoch: 00:   2%|▏         | 32/1405 [00:22<15:51,  1.44it/s, train loss=1.165]

671
876


Processing Epoch: 00:   2%|▏         | 33/1405 [00:23<15:55,  1.44it/s, train loss=1.557]

832
894


Processing Epoch: 00:   2%|▏         | 34/1405 [00:24<15:57,  1.43it/s, train loss=0.932]

789
851


Processing Epoch: 00:   2%|▏         | 35/1405 [00:24<15:38,  1.46it/s, train loss=1.293]

839
648


Processing Epoch: 00:   3%|▎         | 36/1405 [00:25<15:39,  1.46it/s, train loss=1.556]

619
840


Processing Epoch: 00:   3%|▎         | 37/1405 [00:26<15:38,  1.46it/s, train loss=1.762]

754
240


Processing Epoch: 00:   3%|▎         | 38/1405 [00:26<15:39,  1.45it/s, train loss=3.205]

893
911


Processing Epoch: 00:   3%|▎         | 39/1405 [00:27<15:38,  1.46it/s, train loss=0.777]

512
854


Processing Epoch: 00:   3%|▎         | 40/1405 [00:28<15:38,  1.45it/s, train loss=1.676]

544
659


Processing Epoch: 00:   3%|▎         | 41/1405 [00:28<15:38,  1.45it/s, train loss=2.313]

729
590


Processing Epoch: 00:   3%|▎         | 42/1405 [00:29<15:40,  1.45it/s, train loss=2.746]

263
330


Processing Epoch: 00:   3%|▎         | 43/1405 [00:30<15:41,  1.45it/s, train loss=4.809]

515
647


Processing Epoch: 00:   3%|▎         | 44/1405 [00:30<15:43,  1.44it/s, train loss=2.180]

585
810


Processing Epoch: 00:   3%|▎         | 45/1405 [00:31<15:44,  1.44it/s, train loss=1.489]

771
806


Processing Epoch: 00:   3%|▎         | 46/1405 [00:32<15:42,  1.44it/s, train loss=1.110]Token indices sequence length is longer than the specified maximum sequence length for this model (1171 > 1024). Running this sequence through the model will result in indexing errors
Processing Epoch: 00:   3%|▎         | 46/1405 [00:32<15:56,  1.42it/s, train loss=1.110]


987
-147


RuntimeError: invalid shape dimension -147

In [None]:
# torch.save({
#         'epoch': epoch,
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict(),
#         'global_step': global_step
#     }, model_file_path)