In [4]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

In [5]:
dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(dataset_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading and preparing dataset csv/knkarthick--dialogsum to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [7]:
example_indices = [40, 200]

dash_line = "-".join('' for x in range(100))

for i, index in enumerate(example_indices):
  print(dash_line)
  print("Example", i+1)
  print(dash_line)
  print("Input Dialogue:")
  print(dataset['test'][index]['dialogue'])
  print(dash_line)
  print("Baseline Human Summary:")
  print(dataset['test'][index]['summary'])
  print(dash_line)
  print()




---------------------------------------------------------------------------------------------------
Example 1
---------------------------------------------------------------------------------------------------
Input Dialogue:
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Baseline Human Summary:
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Exam

In [8]:
model_name='google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [10]:
sentence = "What time is it, Tom?"

sentence_encoded = tokenizer(sentence, return_tensors='pt')

sentence_decoded = tokenizer.decode(sentence_encoded["input_ids"][0], skip_special_tokens=True)

print("Encoded Sentence:")
print(sentence_encoded["input_ids"][0])
print("\nDecoded Sentence:")
print(sentence_decoded)

Encoded Sentence:
tensor([ 363,   97,   19,   34,    6, 3059,   58,    1])

Decoded Sentence:
What time is it, Tom?


In [11]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

  inputs = tokenizer(dialogue, return_tensors='pt')
  outputs = tokenizer.decode(
      model.generate(
          inputs["input_ids"],
          max_new_tokens=50,
      )[0],
      skip_special_tokens=True
  )

  print(dash_line)
  print("Example:", i+1)
  print(dash_line)
  print("Input Prompt: {}".format(dialogue))
  print(dash_line)
  print("Baseline Human Summary: ", summary)
  print(dash_line)
  print("Model Generation - without Prompt Engineering:", outputs)

---------------------------------------------------------------------------------------------------
Example: 1
---------------------------------------------------------------------------------------------------
Input Prompt: #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Baseline Human Summary:  #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Model Generation - without Prompt Engineering: Person1: It's ten to nine.
-------------------------------

**Summarize Dialogue with an Instruction Prompt**

*Zero Shot Inference with an Instruction Prompt*

In [16]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

  prompt = "Summarize the following conversation {} Summary:".format(dialogue)

  inputs = tokenizer(prompt, return_tensors='pt')
  output = tokenizer.decode(
      model.generate(
          inputs['input_ids'], max_new_tokens=50,)[0],
      skip_special_tokens=True)

  print(dash_line)
  print("Example:", i+1)
  print(dash_line)
  print("Input Prompt: {}".format(dialogue))
  print(dash_line)
  print("Baseline Human Summary: ", summary)
  print(dash_line)
  print("Model Generation - without Prompt Engineering:", output)

---------------------------------------------------------------------------------------------------
Example: 1
---------------------------------------------------------------------------------------------------
Input Prompt: #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Baseline Human Summary:  #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Model Generation - without Prompt Engineering: The train is about to leave.
-----------------------------

In [17]:
for i, index in enumerate(example_indices):
  dialogue = dataset['test'][index]['dialogue']
  summary = dataset['test'][index]['summary']

  prompt = "Dialogue {} What was going on?".format(dialogue)

  inputs = tokenizer(prompt, return_tensors='pt')
  output = tokenizer.decode(
      model.generate(
          inputs['input_ids'], max_new_tokens=50,)[0],
      skip_special_tokens=True)

  print(dash_line)
  print("Example:", i+1)
  print(dash_line)
  print("Input Prompt: {}".format(dialogue))
  print(dash_line)
  print("Baseline Human Summary: ", summary)
  print(dash_line)
  print("Model Generation - without Prompt Engineering:", output)

---------------------------------------------------------------------------------------------------
Example: 1
---------------------------------------------------------------------------------------------------
Input Prompt: #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
---------------------------------------------------------------------------------------------------
Baseline Human Summary:  #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Model Generation - without Prompt Engineering: The train is about to leave Tom's place.
-----------------

**Summarize Dialogue with One Shot and Few Shot Inference**

*One Shot Inference*

In [23]:
def make_prompt(example_indices_full, example_index_to_summarize):
  prompt = ""
  for index in example_indices_full:
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']

    prompt += f"""Dialogue:

    {dialogue}

    What was going on?

    {summary}"""

  dialogue = dataset['test'][example_index_to_summarize]['dialogue']

  prompt += f"""Dialogue:

  {dialogue}

  What was going on?"""

  return prompt


In [24]:
example_indices_full = [40]
example_index_to_summarize = 200

one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(one_shot_prompt)

Dialogue: 

    #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there. 

    What was going on? 

    #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.Dialogue: 

  #Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. A

In [27]:
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(one_shot_prompt, return_tensors='pt')
output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=50,)[0], skip_special_tokens=True)

print(dash_line)
print("Baseline Human Summary: {}".format(summary))
print(dash_line)
print("Model Geneartion One Shot: {}".format(output))

---------------------------------------------------------------------------------------------------
Baseline Human Summary: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
Model Geneartion One Shot: #Person1 wants to upgrade his system. #Person2 wants to add a painting program to his software. #Person1 wants to add a CD-ROM drive.


In [26]:
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

print(few_shot_prompt)

Dialogue: 

    #Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there. 

    What was going on? 

    #Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.Dialogue: 

    #Person1#: May, do you mind helping me prepare for the picnic?
#Person2#: Sure. Have you checked the weather report?
#Person1#: Yes. It says it will be sunny all day. No sign of rain at all. This is your father's favorite sausage. Sandwiches for you and Daniel.
#Person2#: No, thanks Mom. I'd like some toast and chicken wings.
#Person1#: Okay. Please take some fruit salad and crackers for me.
#Person2#: Done. Oh, don't forget to take napkins disposable plates, cups and picnic blanket.
#Per

In [30]:
summary = dataset['test'][example_index_to_summarize]['summary']

inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(model.generate(inputs["input_ids"], max_new_tokens=50,)[0], skip_special_tokens=True)

print(dash_line)
print("Baseline Human Summary: {}".format(summary))
print(dash_line)
print("Model Geneartion One Shot: {}".format(output))

---------------------------------------------------------------------------------------------------
Baseline Human Summary: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
Model Geneartion One Shot: #Person1 wants to upgrade his system. #Person2 wants to add a painting program to his software. #Person1 wants to upgrade his hardware.
