# Google Colab Notebook 1: Train and Save Model

In [None]:
# Install necessary libraries
!pip install transformers datasets accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.32.0-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.0/314.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from datasets import load_dataset, load_metric
from transformers import (
    DataCollatorForSeq2Seq,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

In [None]:
from sklearn.model_selection import train_test_split

from transformers import DataCollatorWithPadding


In [None]:
# Fetch the customer-support query/response dataset from the Hugging Face Hub
dataset = load_dataset(path='Kaludi/Customer-Support-Responses')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/12.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74 [00:00<?, ? examples/s]

In [None]:
# Hold out 20% of the examples for evaluation; the fixed seed keeps the split reproducible
train_test_split_dataset = dataset['train'].train_test_split(seed=42, test_size=0.2)
train_dataset, test_dataset = (
    train_test_split_dataset[split_name] for split_name in ('train', 'test')
)


# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token
# as the padding token so that batches can be padded.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# # Define a data collator
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


# Collator that batches tokenized examples, padding them with the tokenizer's pad token
data_collator = DataCollatorWithPadding(tokenizer)



In [None]:
# # Preprocess the dataset
# def preprocess_function(examples):
#     return tokenizer(examples['query'], truncation=True, padding='max_length', max_length=128)

# tokenized_datasets = dataset.map(preprocess_function, batched=True)
# tokenized_datasets = tokenized_datasets.rename_column("response", "labels")

# # Preprocess the datasets
# def preprocess_function(examples):
#     return tokenizer(examples['query'], truncation=True, padding='max_length', max_length=128)

# tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
# tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
# tokenized_train_dataset = tokenized_train_dataset.rename_column("response", "labels")
# tokenized_test_dataset = tokenized_test_dataset.rename_column("response", "labels")


# Preprocess the datasets
def preprocess_function(examples, max_length=256):
    """Tokenize a batch of query/response pairs for causal-LM fine-tuning.

    A decoder-only model such as GPT-2 computes its loss by comparing the
    prediction at position ``i`` with ``labels[i + 1]`` of the *same* sequence,
    so ``labels`` must be position-aligned with ``input_ids``.  Each example is
    therefore encoded as one sequence, ``"<query>\\n<response><eos>"``, and the
    labels are a copy of the input ids.

    Padding positions are set to ``-100`` (the index ignored by the loss).  The
    attention mask is used to find them because the pad token is the EOS token
    here; masking by token id would also hide the real end-of-response marker.

    Args:
        examples: Batch mapping with parallel ``'query'`` and ``'response'``
            lists of strings (as passed by ``Dataset.map(..., batched=True)``).
        max_length: Fixed length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added ``'labels'`` entry; all per-example
        lists have length ``max_length``.
    """
    texts = [
        f"{query}\n{response}{tokenizer.eos_token}"
        for query, response in zip(examples['query'], examples['response'])
    ]
    inputs = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)
    inputs['labels'] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(inputs['input_ids'], inputs['attention_mask'])
    ]
    return inputs

# Run the batched preprocessing over both splits (train first, then test)
tokenized_train_dataset, tokenized_test_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
]





Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [None]:
# Instantiate GPT-2 with its language-modeling head from the pretrained 'gpt2' checkpoint
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path='gpt2')

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# # Define training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     save_steps=10_000,
#     save_total_limit=2,
# )

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): newer transformers releases rename this to `eval_strategy`;
    # switch the keyword if the installed version rejects it.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Increased epochs
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    # Log once per epoch. The run is only a few optimizer steps per epoch, so a
    # step-based interval of 1000 never fired and the training loss column
    # showed "No log" next to every validation loss.
    logging_strategy="epoch",
)



In [None]:
# Wire the model, datasets, tokenizer, collator and hyper-parameters into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_test_dataset,
    train_dataset=tokenized_train_dataset,
)

In [None]:
# Training the model with interactive feedback
class InteractiveFeedbackCallback(TrainerCallback):
    """Pause at every epoch boundary so the user can probe the model.

    ``trainer.train()`` always runs the complete ``num_train_epochs`` schedule,
    so calling it once per "epoch" from a Python loop would train
    ``num_train_epochs ** 2`` epochs and restart the learning-rate schedule on
    each call.  Hooking into the Trainer's own epoch events keeps one
    continuous run while still allowing a manual check after each epoch.
    """

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Announce the 1-based number of the epoch that is about to start."""
        print(f"Training epoch {int(round(state.epoch or 0)) + 1}...")

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        """Let the user query the model, checkpoint it, and optionally stop."""
        epoch = int(round(state.epoch))

        # Generate without dropout; the Trainer switches the model back to
        # train mode on its next training step.
        model.eval()

        # Interactive query input
        while True:
            test_query = input("Enter your query (or type 'exit' to quit): ")
            if test_query.lower() == 'exit':
                break
            if not test_query.strip():
                # An empty prompt gives generate() nothing to condition on.
                continue
            # Move the encoded prompt to the model's device (the model sits on
            # the GPU when one is available) and pass the attention mask
            # explicitly, since pad and EOS share the same token id.
            inputs = tokenizer(test_query, return_tensors='pt', truncation=True).to(model.device)
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=50,  # budget for the reply only, independent of prompt length
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True,     # Enable sampling
                temperature=0.7,    # Adjust temperature to control randomness
                top_p=0.9,          # Use nucleus sampling
                top_k=50            # Use top-k sampling
            )
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"Response: {response}")

        feedback = input("Do you want to continue training? (yes/no): ")

        # Checkpoint after every epoch, whether or not training continues.
        checkpoint_dir = f'./model_epoch_{epoch}'
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)

        if feedback.lower() == 'no':
            # Ask the Trainer to finish cleanly after this epoch.
            control.should_training_stop = True
        return control


feedback_callback = InteractiveFeedbackCallback()
trainer.add_callback(feedback_callback)
try:
    trainer.train()
finally:
    # Detach the callback so re-running this cell does not register it twice.
    trainer.remove_callback(feedback_callback)

Training epoch 1...


Epoch,Training Loss,Validation Loss
1,No log,1.018721
2,No log,0.992281
3,No log,0.975939
4,No log,1.00628
5,No log,0.990922
6,No log,0.96785
7,No log,0.984889
8,No log,0.980941
9,No log,0.973149
10,No log,0.972031


Response: I haven't received a response to my email inquiry. can provide further assist you. please provide your email address provide your email provide your provide your provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide
Response: forgot password. the password provide you provide to the email you provide you provide. provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide provide
Response: passord
Training epoch 2...


Epoch,Training Loss,Validation Loss
1,No log,0.975118
2,No log,0.970705
3,No log,0.952818
4,No log,0.969436
5,No log,0.96446
6,No log,0.949785
7,No log,0.960364
8,No log,0.95439
9,No log,0.959588
10,No log,0.961439


Response: I haven't received a response to my email. please provide a follow your email provide your provide your provide your provide your provide provide your provide provide provide your provide provide provide provide you provide provide provide provide provide provide provide provide provide provide provide provide provide provide


In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side
final_model_dir = './final_model'
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)