<a href="https://colab.research.google.com/github/MODIKULDEEP/BART-Dialogue-Summarization/blob/main/BART_Dialogue_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
!pip install transformers datasets evaluate transformers[torch]



In [55]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
"""
BART HAS 400M PARAMS: https://github.com/facebookresearch/fairseq/tree/main/examples/bart
"""
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")



In [56]:
!pip install py7zr



In [57]:
from datasets import load_dataset
dataset = load_dataset("samsum")
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [58]:
sample = dataset['test'][0] ['dialogue']
label = dataset['test'][0]['summary']
def generate_summary(input, llm):
  input_prompt = f"""
                  Summarize the following conversation.

                  {input }

                  Summary :
                  """

  input_ids = tokenizer(input, return_tensors='pt')
  tokenized_output = llm.generate(input_ids['input_ids' ], min_length=30, max_length=200)
  output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

  return output

output = generate_summary(sample, llm=model)
print("Sample")
print(sample)
print ("------------------------")
print("Model Generated Summary:")
print (output)
print("Correct Summary:")
print (label)

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him ðŸ™‚
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
------------------------
Model Generated Summary:
Hannah: Hey, do you have Betty's number? Amanda: Lemme check. Hannah: Ask Larry. Amanda: He called her last time we were at the park together.
Correct Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [59]:
def tokenize_inputs(example):
  start_prompt = "Summarize the following conversation.\n\n"
  end_prompt = "\n\nSummary: "
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
  example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt', max_length=512).input_ids
  example['labels'] = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt', max_length=512).input_ids

  return example

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_inputs, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary'])
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Filter:   0%|          | 0/819 [00:00<?, ? examples/s]

In [60]:
print(tokenized_datasets['train'].shape)
print(tokenized_datasets['validation'].shape)
print(tokenized_datasets['test'].shape)

(148, 2)
(9, 2)
(9, 2)


In [61]:
tokenized_datasets['train'][0].keys()

dict_keys(['input_ids', 'labels'])

In [62]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [63]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
  output_dir="./bart-cnn-samsum-finetuned", # local directory
  hub_model_id="TestingTesters/bart-cnn-samsum-finetuned", # identifier on the Hub
  learning_rate=1e-5,
  num_train_epochs=1,
  weight_decay=0.01,
  auto_find_batch_size=True,
  evaluation_strategy='epoch',
  logging_steps=10
)

trainer = Trainer(
  model=model,
  tokenizer=tokenizer,
  args=training_args,
  train_dataset=tokenized_datasets['train'],
  eval_dataset=tokenized_datasets['validation']
)



In [64]:
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.1939,1.021455


TrainOutput(global_step=37, training_loss=0.2504950987326132, metrics={'train_runtime': 62.2746, 'train_samples_per_second': 2.377, 'train_steps_per_second': 0.594, 'total_flos': 177702577373184.0, 'train_loss': 0.2504950987326132, 'epoch': 1.0})

In [65]:
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1716791569.362640dc4c84.1074.3:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/TestingTesters/bart-cnn-samsum-finetuned/commit/61817209466ba4a7d301ff2aad229264ae7cbaab', commit_message='End of training', commit_description='', oid='61817209466ba4a7d301ff2aad229264ae7cbaab', pr_url=None, pr_revision=None, pr_num=None)

In [67]:
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("TestingTesters/bart-cnn-samsum-finetuned")
testing_input = """
Alice: Hey Bob, did you finish the presentation for the client meeting tomorrow?
Bob: Not yet, Alice. Iâ€™m still working on the financial projections.
Alice: Oh no, weâ€™re running out of time. Do you need any help with that?
Bob: Actually, yes. Can you handle the market analysis section?
Alice: Sure, Iâ€™ll get on it right away. By the way, have you confirmed the meeting time with the client?
Bob: Yes, itâ€™s scheduled for 10 AM. Iâ€™ll send you the final agenda in a bit.
Alice: Great, thanks. Letâ€™s make sure everything is ready tonight. We can review it together tomorrow morning.
Bob: Sounds like a plan. I appreciate your help.
Alice: No problem, weâ€™re a team!
"""
output1 = generate_summary(testing_input, llm=loaded_model)
print("testing_input")
print ("------------------------")
print(testing_input)
print ("------------------------")
print("Model Generated Summary:")
print (output1)

testing_input
------------------------

Alice: Hey Bob, did you finish the presentation for the client meeting tomorrow?
Bob: Not yet, Alice. Iâ€™m still working on the financial projections.
Alice: Oh no, weâ€™re running out of time. Do you need any help with that?
Bob: Actually, yes. Can you handle the market analysis section?
Alice: Sure, Iâ€™ll get on it right away. By the way, have you confirmed the meeting time with the client?
Bob: Yes, itâ€™s scheduled for 10 AM. Iâ€™ll send you the final agenda in a bit.
Alice: Great, thanks. Letâ€™s make sure everything is ready tonight. We can review it together tomorrow morning.
Bob: Sounds like a plan. I appreciate your help.
Alice: No problem, weâ€™re a team!

------------------------
Model Generated Summary:
Bob is still working on the financial projections for the client meeting tomorrow. Alice is running out of time to finish the market analysis section of the presentation. Bob will send the final agenda in a bit.
