In [None]:
!pip install datasets langchain openai

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting openai
  Downloading openai-1.37.1-py3-none-any.whl.metadata (22 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Coll

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your ter

In [None]:
!git clone https://huggingface.co/datasets/knkarthick/dialogsum.git

fatal: destination path 'dialogsum' already exists and is not an empty directory.


In [None]:
from datasets import load_dataset, Dataset
import re
import pandas as pd
import torch
import numpy as np

# dataset_checkpoint = 'knkarthick/dialogsum'
# dataset = load_dataset(dataset_checkpoint)
dataset = load_dataset('./dialogsum')

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [None]:
dataset['train'][10]

{'id': 'train_10',
 'dialogue': "#Person1#: Could you do me a favor?\n#Person2#: Sure. What is it?\n#Person1#: Could you run over to the store? We need a few things.\n#Person2#: All right. What do you want me to get?\n#Person1#: Well, could you pick up some sugar?\n#Person2#: Okay. How much?\n#Person1#: A small bag. I guess we also need a few oranges.\n#Person2#: How many?\n#Person1#: Oh, let's see. . . About six.\n#Person2#: Anything else?\n#Person1#: Yes. We're out of milk.\n#Person2#: Okay. How much do you want me to get? A gallon?\n#Person1#: No. I think a half gallon will be enough.\n#Person2#: Is that all?\n#Person1#: I think so. Have you got all that?\n#Person2#: Yes. That's small bag of sugar, four oranges, and a half gallon of milk.\n#Person1#: Do you have enough money?\n#Person2#: I think so.\n#Person1#: Thanks very much. I appreciate it.",
 'summary': '#Person1# asks #Person2# to do a favor. #Person2# agrees and helps buy a small bag of sugar, six oranges, and a half-gallon 

In [None]:
class PreProcessDataset:
  def __init__(self, dataset, seed=42):
    self.train_dataset = dataset['train']
    self.test_dataset = dataset['test']
    self.validation_dataset = dataset['validation']
    self.preprocessed_train_dataset = None
    self.preprocessed_test_dataset = None
    self.preprocessed_validation_dataset = None
    self.seed = seed

  def _split_dialogue(self, record):
    dialogue = record['dialogue']
    splited_dialogue = re.split('#Person1#:|#Person2#:', dialogue)

    role_utterance_dialogue = []
    i = 0
    for u in splited_dialogue:
      if u.strip():
        if i % 2 == 0:
          role_utterance_dialogue.append({'role': 'USR', 'utterance': u.strip()})
        else:
          role_utterance_dialogue.append({'role': 'SYS', 'utterance': u.strip()})
        i += 1

    return role_utterance_dialogue

  def get_context_response(self, record):
    np.random.seed(self.seed)
    dialogue = self._split_dialogue(record)
    sys_utterance_ids = [i for i, d in enumerate(dialogue) if d['role'] == 'SYS']
    response_id = np.random.choice(sys_utterance_ids)
    context = dialogue[:response_id]
    response = dialogue[response_id]
    return {'context': context, 'response': response, 'turns number': response_id}

  def call(self):
    self.preprocessed_train_dataset = self.train_dataset.map(self.get_context_response)
    self.preprocessed_test_dataset = self.test_dataset.map(self.get_context_response)
    self.preprocessed_validation_dataset = self.validation_dataset.map(self.get_context_response)

    self.preprocessed_train_dataset = self.preprocessed_train_dataset.remove_columns(["dialogue", "summary", "topic"])
    self.preprocessed_test_dataset = self.preprocessed_test_dataset.remove_columns(["dialogue", "summary", "topic"])
    self.preprocessed_validation_dataset = self.preprocessed_validation_dataset.remove_columns(["dialogue", "summary", "topic"])


  def save(self, root_path):
    self.preprocessed_train_dataset.to_csv(root_path + '/train.csv')
    self.preprocessed_test_dataset.to_csv(root_path + '/test.csv')
    self.preprocessed_validation_dataset.to_csv(root_path + '/validation.csv')

  @property
  def columns(self):
    return {
        'train': self.preprocessed_train_dataset.column_names,
        'validation': self.preprocessed_validation_dataset.column_names,
        'test': self.preprocessed_test_dataset.column_names
    }



pre_processed_ds = PreProcessDataset(dataset)
pre_processed_ds.call()
pre_processed_ds.save(root_path='./data')
print(pre_processed_ds.columns)

{'train': ['id', 'context', 'response', 'turns number'], 'validation': ['id', 'context', 'response', 'turns number'], 'test': ['id', 'context', 'response', 'turns number']}


In [None]:
data_root_path = './data/'
data_pathes = {
    'train': data_root_path + 'train.csv',
    'validation': data_root_path + 'validation.csv',
    'test': data_root_path + 'test.csv'
}

dataset = load_dataset('csv', data_files=data_pathes)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'response', 'turns number'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'context', 'response', 'turns number'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'context', 'response', 'turns number'],
        num_rows: 1500
    })
})

In [None]:
dataset.push_to_hub("mohammadhabp/Dialogue_Bot")

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mohammadhabp/Dialogue_Bot/commit/db65fe9726d75faf2589af1dc4a139f433139a70', commit_message='Upload dataset', commit_description='', oid='db65fe9726d75faf2589af1dc4a139f433139a70', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# !pip install git+https://github.com/mlfoundations/open_lm.git
!pip install datasets
!pip install rouge_score
# !pip install quanto

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00

In [None]:
from datasets import load_dataset, Dataset
import re
import pandas as pd
import torch
import numpy as np

dataset = load_dataset('mohammadhabp/Dialogue_Bot')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'response', 'turns number'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'context', 'response', 'turns number'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'context', 'response', 'turns number'],
        num_rows: 1500
    })
})

In [None]:
# from open_lm.hf import *
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoConfig


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# quantization_config = QuantoConfig(weights="int8")
model_checkpoint = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    # quantization_config=quantization_config
).to(device)

tokenizer.pad_token = tokenizer.eos_token

In [None]:
print(tokenizer.chat_template)

None


In [None]:
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['utterance'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|><|SYS|>\n' }}{% endif %}"

print(tokenizer.chat_template)

{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['utterance'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|><|SYS|>
' }}{% endif %}


In [None]:
def inference(chat):
  inputs_text = tokenizer.apply_chat_template(chat, return_tensors='pt', tokenize=False)
  inputs = tokenizer(inputs_text, return_tensors='pt', truncation=True, padding=True, max_length=1024)
  inputs = {k: v.to(device) for k, v in inputs.items()}
  outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True)
  return tokenizer.decode(outputs[0])

In [None]:
messages = [
    {"role": "<|USR|>", "utterance": "Hi there!"},
    {"role": "<|SYS|>", "utterance": "Hi yourself!"},
    {"role": "<|USR|>", "utterance": "I have broken up with my girl friend"}
]

print(inference(messages))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<|im_start|><|USR|>
Hi there!<|im_end|>
<|im_start|><|SYS|>
Hi yourself!<|im_end|>
<|im_start|><|USR|>
I have broken up with my girl friend<|im_end|>
<|im_start|><|SYS|>
I have already won a couple of contests<|im_end|>
<|im_start|><|USR|>
[w=40|15|50]
> I can't remember...<|im_end|>
<|im_start|><|SYS|>
[w=40|15|50] <|im_end|>
<|im_start|><|SYS|>
[w=40|15|50] <|im_end|>
What are


In [None]:
def convert_to_list(text):
  pattern = re.compile(r"\{'role': '(\w+)', 'utterance': \"(.*?)\"}|"
                         r"\{'role': '(\w+)', 'utterance': '(.*?)'\}", re.DOTALL)

  matches = pattern.findall(text)

  dialogs = []
  for match in matches:
      role = match[0] or match[2]  # match[0] for double quotes, match[2] for single quotes
      utterance = match[1] or match[3]  # match[1] for double quotes, match[3] for single quotes
      dialogs.append({'role': role, 'utterance': utterance})

  return dialogs


def prepare_data(record):
  context = record['context']
  response = record['response']

  context = convert_to_list(context)
  response = convert_to_list(response)

  context_text = ' '.join([f"{entry['role']}: {entry['utterance']}" for entry in context])
  response_text = response[0]['utterance']

  inputs = tokenizer(context_text, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
  labels = tokenizer(response_text, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids.to(device)

  inputs = {k: v.squeeze().to(device) for k, v in inputs.items()}
  labels = labels.squeeze().to(device)

  return {**inputs, 'labels': labels}

prepare_data(dataset['train'][999])

{'input_ids': tensor([ 2937,    49,    25,   466,   345,   588,  4695,    30,   314,  1107,
           588,  6844,    13,   311, 16309,    25,   523,   466,  1312,    13,
           314,   836,   470,   588, 11875,    13,  1294,    49,    25,  1521,
            30,   314,   892, 11875,   389, 12876,    13,   311, 16309,    25,
           314,   460,   470,  6842,   852,  1474, 11875,    13,  1119,   836,
           470,  1283,   284,   588,   502,  2035,    13,  1294,    49,    25,
           314,   588,  4295,  4695,    13,   314,   836,   470,   588, 26120,
           290, 26042,    13,   314,   892, 26120,   290, 26042,   389, 23374,
            13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50

In [None]:
dataset.update({'train': dataset['train'].select(range(7000))})
dataset.update({'validation': dataset['validation'].select(range(100))})

tokenized_dataset = dataset.map(prepare_data, remove_columns=dataset['train'].column_names)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1500
    })
})

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors='pt')

In [None]:
from datasets import load_metric
import numpy as np


rouge_metric = load_metric("rouge")

def compute_metrics(eval_preds):
    logits, labels = eval_preds.predictions, eval_preds.label_ids
    labels[labels == -100] = tokenizer.pad_token_id
    predictions = logits.argmax(axis=-1)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]  # Wrap each label in a list

    rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "rouge1": rouge["rouge1"].mid.fmeasure,
        "rouge2": rouge["rouge2"].mid.fmeasure,
        "rougeL": rouge["rougeL"].mid.fmeasure,
    }


  rouge_metric = load_metric("rouge")


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="chat_bot-model",
    evaluation_strategy="epoch",
    # per_device_train_batch_size=4,
    # per_device_eval_batch_size=4,
    auto_find_batch_size=True,
    logging_dir='./logs',
    logging_steps=10,
    push_to_hub=True,
    hub_model_id='mohammadhabp/dialogsum_gpt2',
    hub_strategy='every_save',
    hub_private_repo=False,
    eval_steps=512,
    save_strategy='epoch',
    save_steps=512,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model='rougeL',
    save_safetensors=True,
    group_by_length=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.3713,2.351573,0.482051,0.153284,0.377466
2,2.2024,2.324934,0.482037,0.155974,0.380559


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.3713,2.351573,0.482051,0.153284,0.377466
2,2.2024,2.324934,0.482037,0.155974,0.380559
3,1.988,2.321141,0.486584,0.160935,0.386838


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=2625, training_loss=2.191030910310291, metrics={'train_runtime': 838.4052, 'train_samples_per_second': 25.048, 'train_steps_per_second': 3.131, 'total_flos': 1371783168000000.0, 'train_loss': 2.191030910310291, 'epoch': 3.0})

In [None]:
chat = convert_to_list(dataset['test'][10]['context'])
print(chat)
print('------')
print(inference(chat))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'role': 'USR', 'utterance': 'Happy Birthday, this is for you, Brian.'}, {'role': 'SYS', 'utterance': "I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time."}, {'role': 'USR', 'utterance': 'Brian, may I have a pleasure to have a dance with you?'}, {'role': 'SYS', 'utterance': 'Ok.'}, {'role': 'USR', 'utterance': 'This is really wonderful party.'}]
------
<|im_start|>USR
Happy Birthday, this is for you, Brian.<|im_end|>
<|im_start|>SYS
I'm so happy you remember, please come in and enjoy the party. Everyone's here, I'm sure you have a good time.<|im_end|>
<|im_start|>USR
Brian, may I have a pleasure to have a dance with you?<|im_end|>
<|im_start|>SYS
Ok.<|im_end|>
<|im_start|>USR
This is really wonderful party.<|im_end|>
SYS. I would love to have a glass of champagne to drink. All the VIPs, please. How are you tonight? Wow, you're dancing like a madman! Are you ready for a dessert? Sorry, I haven't brought anything for you. SYS, I

In [None]:
torch.cuda.empty_cache()

In [None]:
trainer.evaluate(tokenized_dataset['test'].select(range(100)))

In [None]:
trainer.save_model()
trainer.push_to_hub('fine tuned model')

CommitInfo(commit_url='https://huggingface.co/mohammadhabp/dialogsum_gpt2/commit/0b21fb52477b5c770d04bdfa1f472b8760f3b195', commit_message='fine tuned model', commit_description='', oid='0b21fb52477b5c770d04bdfa1f472b8760f3b195', pr_url=None, pr_revision=None, pr_num=None)