In [2]:
from transformers import GenerationConfig, Seq2SeqTrainer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, AutoModelForSeq2SeqLM,  AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import torch
import os
import pandas as pd
import time


In [1]:
def trainable_param(model):
    num = 0
    all_num = 0
    cnt = 0
    for _, param in model.named_parameters():
        if cnt == 0:
            print(param)
            cnt += 1
        all_num += param.numel()
        if param.requires_grad:
            num += param.numel()
    return ("All {}, Trainable {}".format(all_num, num))


In [3]:
pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [23]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load DailyDialog dataset (or any dialogue dataset)
dataset = load_dataset("daily_dialog")


# Use the training split for fine-tuning
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

# Choose a pre-trained conversational model (e.g., DialoGPT-small)
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [24]:
train_dataset

Dataset({
    features: ['dialog', 'act', 'emotion'],
    num_rows: 11118
})

In [15]:
train_dataset['dialog'][0]

['Say , Jim , how about going for a few beers after dinner ? ',
 ' You know that is tempting but is really not good for our fitness . ',
 ' What do you mean ? It will help us to relax . ',
 " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
 " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
 ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
 " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
 ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
 " Good.Let ' s go now . ",
 ' All right . ']

In [25]:
train_dataset

Dataset({
    features: ['dialog', 'act', 'emotion'],
    num_rows: 11118
})

In [26]:
trainable_param(model)

Parameter containing:
tensor([[-0.0699, -0.0288,  0.0483,  ..., -0.1614, -0.0889, -0.0199],
        [ 0.0178, -0.0233,  0.0382,  ...,  0.0812,  0.0011,  0.0409],
        [-0.1228,  0.0519,  0.1614,  ...,  0.0985, -0.1012, -0.0801],
        ...,
        [-0.0721, -0.0417,  0.0161,  ...,  0.0179,  0.0231, -0.0108],
        [ 0.1482, -0.0124, -0.0164,  ..., -0.1356,  0.1014, -0.0580],
        [ 0.0171, -0.0199,  0.0471,  ...,  0.0025,  0.0728,  0.1221]],
       requires_grad=True)


'All 124439808, Trainable 124439808'

In [27]:
tokenizer.pad_token = tokenizer.eos_token

In [28]:

# Move the model to GPU (CUDA)
model = model.to("cuda")


In [29]:
def tokenize_function(batch):
    # Join dialogue turns into a single string for each example
    prompts = [" ".join(dialog_list) if isinstance(dialog_list, list) else dialog_list
               for dialog_list in batch["dialog"]]
    tokens = tokenizer(prompts, padding="max_length", truncation=True, max_length=128)
    # Set labels equal to input_ids so that the model computes a loss
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

In [30]:
# Tokenize the datasets with batched mapping
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

In [22]:
# Remove non-tokenized columns if needed
train_dataset = train_dataset.remove_columns(["dialog"])
eval_dataset = eval_dataset.remove_columns(["dialog"])

In [31]:
train_dataset

Dataset({
    features: ['dialog', 'act', 'emotion', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 11118
})

In [32]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./sft_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_steps=100,
    save_total_limit=1,
    fp16=True  # Use mixed precision training if supported
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.9359,1.905227
2,1.7986,1.854686
3,1.7164,1.84465


TrainOutput(global_step=4170, training_loss=1.8555032688936741, metrics={'train_runtime': 799.732, 'train_samples_per_second': 41.706, 'train_steps_per_second': 5.214, 'total_flos': 2178783608832000.0, 'train_loss': 1.8555032688936741, 'epoch': 3.0})

In [34]:
pip install huggingface_hub



In [38]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [40]:
model.push_to_hub("TheLongTran/Dialogue-For-ChatBot")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/TheLongTran/Dialogue-For-ChatBot/commit/21eabc5d70f5d9159761c743eb9d5d5cddae54cb', commit_message='Upload model', commit_description='', oid='21eabc5d70f5d9159761c743eb9d5d5cddae54cb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TheLongTran/Dialogue-For-ChatBot', endpoint='https://huggingface.co', repo_type='model', repo_id='TheLongTran/Dialogue-For-ChatBot'), pr_revision=None, pr_num=None)

In [41]:
tokenizer.push_to_hub("TheLongTran/Dialogue-For-ChatBot")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/TheLongTran/Dialogue-For-ChatBot/commit/bc30741d15197b5ddb02f440a582dd70303ff730', commit_message='Upload tokenizer', commit_description='', oid='bc30741d15197b5ddb02f440a582dd70303ff730', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TheLongTran/Dialogue-For-ChatBot', endpoint='https://huggingface.co', repo_type='model', repo_id='TheLongTran/Dialogue-For-ChatBot'), pr_revision=None, pr_num=None)

In [42]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
model.save_pretrained("/content/drive/MyDrive/TheLongTran/Dialogue-For-ChatBot")

In [44]:
tokenizer.save_pretrained("/content/drive/MyDrive/TheLongTran/Dialogue-For-ChatBot")

('/content/drive/MyDrive/TheLongTran/Dialogue-For-ChatBot/tokenizer_config.json',
 '/content/drive/MyDrive/TheLongTran/Dialogue-For-ChatBot/special_tokens_map.json',
 '/content/drive/MyDrive/TheLongTran/Dialogue-For-ChatBot/vocab.json',
 '/content/drive/MyDrive/TheLongTran/Dialogue-For-ChatBot/merges.txt',
 '/content/drive/MyDrive/TheLongTran/Dialogue-For-ChatBot/added_tokens.json',
 '/content/drive/MyDrive/TheLongTran/Dialogue-For-ChatBot/tokenizer.json')