# Train the model

In [None]:
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
import json
import torch
from typing import List
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [None]:
class DatasetFromJSON(Dataset):
    """Dataset of encoded adjacent-turn pairs taken from conversation sequences.

    Every pair of consecutive turns in a conversation is encoded once, when the
    dataset is constructed, and stored in ``input_data``.
    """

    def __init__(self, data: List[List[str]], tokenizer: PreTrainedTokenizer, max_length=1024):
        """
        Encode every (turn, next turn) pair found in ``data``.

        Args:
            data (List[List[str]]): Conversations, each given as a list of turns.
            tokenizer (PreTrainedTokenizer): Tokenizer whose ``encode`` is called
                once per pair of consecutive turns.
            max_length (int): Length handed to the tokenizer for truncation and
                for ``"max_length"`` padding.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_data = []
        for turns in data:
            # Pair each turn with its successor; a conversation with fewer
            # than two turns contributes no examples.
            self.input_data.extend(
                tokenizer.encode(
                    current_turn,
                    next_turn,
                    add_special_tokens=True,
                    truncation=True,
                    max_length=max_length,
                    padding="max_length",
                )
                for current_turn, next_turn in zip(turns, turns[1:])
            )

    def __len__(self):
        """Return how many encoded pairs the dataset holds."""
        return len(self.input_data)

    def __getitem__(self, idx):
        """Return the encoded pair stored at position ``idx``."""
        return self.input_data[idx]

In [None]:
def read_and_process_json(file_path: str) -> List[List[str]]:
    """
    Load conversation sequences from a UTF-8 encoded JSON file.

    The parsed JSON content is returned as-is; no validation or reshaping
    is applied to it.

    Args:
        file_path (str): Location of the JSON file to read.

    Returns:
        List[List[str]]: The decoded JSON document, expected to be a list of
        conversation sequences.
    """
    print("Reading...")
    with open(file_path, mode="r", encoding="utf-8") as json_file:
        return json.loads(json_file.read())

# Configure the training hyperparameters
def train_dialogpt(model_name, train_data, output_dir, epochs):
    """
    Fine-tune a pre-trained GPT-2 style dialog model on conversation data.

    The model and tokenizer are loaded from ``model_name``, the conversations
    are turned into a ``DatasetFromJSON``, and a ``Trainer`` runs causal
    language-model training. When training finishes, the final model weights
    and the tokenizer files are written to ``output_dir`` so that the directory
    can be loaded on its own, independent of the periodic checkpoints.

    Args:
        model_name (str): Name or path of the pre-trained model to start from.
        train_data (List[List[str]]): List of conversation sequences.
        output_dir (str): Directory for checkpoints, the final model and the
            tokenizer.
        epochs (int): Number of training epochs.
    """
    print("Training...")
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # The pad token must be set before the dataset is built, because the
    # dataset pads every encoded pair.
    tokenizer.pad_token = tokenizer.eos_token
    train_dataset = DatasetFromJSON(train_data, tokenizer)

    # mlm=False selects causal (next-token) language modeling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8,
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=1,
        save_steps=100,
        save_total_limit=3,
        logging_steps=100,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()

    # Checkpoints are only written every `save_steps` steps and the tokenizer
    # is not handed to the Trainer, so persist both explicitly; otherwise the
    # final weights and the tokenizer files may be missing from output_dir.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Define a function to train the model on training dataset
def train_model(training_file, model_output, model_name="microsoft/DialoGPT-small", epochs=100):
    """
    Read the training conversations from disk and fine-tune the dialog model.

    Args:
        training_file (str): Path to the JSON file with the conversation data.
        model_output (str): Directory passed to the training routine as its
            output directory.
        model_name (str): Pre-trained model to start from. Defaults to
            "microsoft/DialoGPT-small".
        epochs (int): Number of training epochs. Defaults to 100.
    """
    train_data = read_and_process_json(training_file)
    # The training routine is named `train_dialogpt`; the earlier call to
    # `train_dialo_gpt` referenced an undefined name and raised NameError.
    train_dialogpt(model_name, train_data, model_output, epochs)

**Remarks before training**: manually create two folders:
- one folder named 'training', and make sure the training dataset (JSON format) is uploaded into it.
- one folder named 'models', to hold the saved files of the fine-tuned model and tokenizer.

**Remarks after training**:
- save the checkpoint folder after training.

In [None]:
if __name__ == "__main__":
    # JSON training dataset inside the 'training' folder. The folder name must
    # not contain a trailing space, or the file will not be found.
    training_file = "/content/training/training_data.json"
    # Directory handed to train_model as the output location for the trained
    # model and tokenizer.
    model_output = "/content/models"

    train_model(training_file, model_output)  # train the model
