### Generative AI: **Summarize Dialogue**

In [4]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM # class for loading and using Seq2Seq language models.
from transformers import AutoTokenizer # class for loading the tokenizer that belongs to a pretrained model
from transformers import GenerationConfig # class for configuring text-generation settings (e.g. max_new_tokens, do_sample, temperature, top-k, top-p)

In [6]:
# Hugging Face Hub identifier of the dialogue-summarization dataset.
huggingface_dataset_name = "knkarthick/dialogsum"

# Per the author's note: 10,000+ dialogues with human-labeled summaries and topics.

dataset = load_dataset(huggingface_dataset_name)

In [9]:
# Positions in the test split to preview (the same index is listed twice).
example_indices = [50, 50]

# Horizontal rule of 99 dashes used to separate the printed sections.
dash_line = '-' * 99

for i, index in enumerate(example_indices):
    # Fetch the record once and read both fields from it.
    sample = dataset['test'][index]

    # Header for this example.
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)

    # Raw dialogue first ...
    print('INPUT DIALOGUE: ')
    print(sample['dialogue'])
    print(dash_line)

    # ... then the human-written reference summary.
    print('BASELINE HUMAN SUMMARY: ')
    print(sample['summary'])
    print(dash_line)
    print()




---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT DIALOGUE: 
#Person1#: Yeah. Just pull on this strip. Then peel off the back.
#Person2#: You might make a few enemies this way.
#Person1#: If they don't think this is fun, they're not meant to be our friends.
#Person2#: You mean your friends. I think it's cruel.
#Person1#: Yeah. But it's fun. Look at those two ugly old ladies. . . or are they men?
#Person2#: Hurry! Get a shot!. . . Hand it over!
#Person1#: I knew you'd come around. . .
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY: 
#Person1# is about to make a prank. #Person2# thinks it's cruel at first but then joins.
---------------------------------------------------------------------------------------------------

-----------------------------

### Load the model

In [10]:
# Name of the checkpoint to load.
model_name = 'google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name) # load the pretrained FLAN-T5 model via AutoModelForSeq2SeqLM.from_pretrained()
# The recorded cell output shows config.json, model.safetensors and generation_config.json being fetched.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
# use_fast - selects the fast tokenizer implementation.

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# use_fast=True enables the fast tokenizer implementation, which is optimized for speed

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Testing the Tokenizer

In [27]:
sentence = "I am Jerlshin J.G, How about You?"

# Tokenize the input. return_tensors='pt' returns PyTorch tensors
# ('tf' would return TensorFlow tensors instead).
sentence_encoded = tokenizer(sentence, return_tensors='pt')

# Convert the encoded sequence back into a human-readable string.
sentence_decoded = tokenizer.decode(
    sentence_encoded["input_ids"][0],  # [0] selects the only sequence in the batch of size 1
    skip_special_tokens=True  # drop special tokens such as end-of-sequence or padding markers
)

print("Encoded Sentence: ")
print(sentence_encoded["input_ids"][0])
print("\nDecoded Sentence: ")
print(sentence_decoded)

Encoded Sentence: 
tensor([  27,  183,  446,   49,   40,    7, 2907,  446,    5,  517,    6,  571,
          81,  148,   58,    1])

Decoded Sentence: 
I am Jerlshin J.G, How about You?


In [31]:
# Inspect the full tokenizer output: a mapping holding 'input_ids' and 'attention_mask' tensors.
sentence_encoded

{'input_ids': tensor([[  27,  183,  446,   49,   40,    7, 2907,  446,    5,  517,    6,  571,
           81,  148,   58,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [30]:
# Token IDs of the first (and only) sequence in the batch.
sentence_encoded["input_ids"][0]

tensor([  27,  183,  446,   49,   40,    7, 2907,  446,    5,  517,    6,  571,
          81,  148,   58,    1])

LLM Summaries without Prompt Engineering

In [37]:
# Display the raw dialogue text of test example 1.
dataset['test'][1]['dialogue']



In [39]:
# Tokenize the dialogue of test example 1 and show the full tokenizer output.
tokenizer(dataset['test'][1]['dialogue'], return_tensors='pt')

{'input_ids': tensor([[ 1713,   345, 13515,   536,  4663,    10,   283,     7,     5, 31676,
             6,    27,   174,    25,    12,   240,     3,     9,     3, 12194,
           257,    21,   140,     5,  1713,   345, 13515,   357,  4663,    10,
          2163,     6,   108,    52,   233,  1713,   345, 13515,   536,  4663,
            10,   100,   225,   281,    91,    38,    46,  6344,    18, 19632,
           140,  2528,   232,   440,    12,    66,  1652,    57,    48,  3742,
             5,  1521,    25,  1065,    58,  1713,   345, 13515,   357,  4663,
            10,  2163,     6,   108,    52,     5,  1263,  2177,     5,  1713,
           345, 13515,   536,  4663,    10, 20748,    66,   871,   233, 18652,
          2017,     6,    66,   828,  5030,    33, 12103,    12,   791, 17215,
            11,  2314, 22986,     7,     5,    37,   169,    13, 18882,     3,
         16042,  1356,    57,  1652,   383,   464,   716,    19, 15154, 18022,
             5,  1713,   345, 13515,  

In [40]:
# Tokenize the dialogue of test example 1 and keep only the 'input_ids' tensor.
tokenizer(dataset['test'][1]['dialogue'], return_tensors='pt')["input_ids"]

tensor([[ 1713,   345, 13515,   536,  4663,    10,   283,     7,     5, 31676,
             6,    27,   174,    25,    12,   240,     3,     9,     3, 12194,
           257,    21,   140,     5,  1713,   345, 13515,   357,  4663,    10,
          2163,     6,   108,    52,   233,  1713,   345, 13515,   536,  4663,
            10,   100,   225,   281,    91,    38,    46,  6344,    18, 19632,
           140,  2528,   232,   440,    12,    66,  1652,    57,    48,  3742,
             5,  1521,    25,  1065,    58,  1713,   345, 13515,   357,  4663,
            10,  2163,     6,   108,    52,     5,  1263,  2177,     5,  1713,
           345, 13515,   536,  4663,    10, 20748,    66,   871,   233, 18652,
          2017,     6,    66,   828,  5030,    33, 12103,    12,   791, 17215,
            11,  2314, 22986,     7,     5,    37,   169,    13, 18882,     3,
         16042,  1356,    57,  1652,   383,   464,   716,    19, 15154, 18022,
             5,  1713,   345, 13515,   357,  4663,  

In [None]:
for i, index in enumerate(example_indices):
    # Fetch the dialogue and its human-written reference summary.
    dialogue = dataset['test'][index]['dialogue']
    summary = dataset['test'][index]['summary']

    # The tokenizer output contains:
    #   input_ids      - the input text as integer token IDs
    #   attention_mask - binary mask marking which tokens should be attended to
    inputs = tokenizer(dialogue, return_tensors='pt')

    # Generate a summary from the raw dialogue alone (no instruction added),
    # then decode the generated token IDs back into human-readable text.
    output = tokenizer.decode(
        model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True
    )

    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)

    print(f"INPUT PROMPT:\n{dialogue}")
    print(dash_line)

    print(f"BASELINE HUMAN SUMMARY:\n{summary}")
    print(dash_line)

    print(f"MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}")

## Zero shot, One shot and Few shot learning 

### Zero shot learning with Prompt Template from FLAN-T5

In [13]:
for i, index in enumerate(example_indices):
    # Fetch the record once and read both fields from it.
    sample = dataset['test'][index]
    dialogue = sample['dialogue']
    summary = sample['summary']

    # Zero-shot prompt: the dialogue followed by a question, with no worked examples.
    prompt = f"""
Dialogue:

{dialogue}

What was going on?
"""

    # Tokenize the prompt, generate up to 50 new tokens, then decode the
    # first (only) generated sequence back into text.
    inputs = tokenizer(prompt, return_tensors='pt')
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)[0]
    output = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Header for this example.
    print(dash_line)
    print('Example ', i + 1)
    print(dash_line)

    # Prompt, reference summary and model output, each in its own section.
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)

    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)

    print(f'MODEL GENERATION - ZERO SHOT:\n{output}\n')

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Dialogue:

#Person1#: Yeah. Just pull on this strip. Then peel off the back.
#Person2#: You might make a few enemies this way.
#Person1#: If they don't think this is fun, they're not meant to be our friends.
#Person2#: You mean your friends. I think it's cruel.
#Person1#: Yeah. But it's fun. Look at those two ugly old ladies. . . or are they men?
#Person2#: Hurry! Get a shot!. . . Hand it over!
#Person1#: I knew you'd come around. . .

What was going on?

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# is about to make a prank. #Person2# thinks it's cruel at first but then joins.
---------------------------------------------------------------------------------------------------
M

### One shot Learning

In [19]:
def make_prompt(example_indices_full, example_index_to_summarize):
    """Build a one-/few-shot summarization prompt from the test split.

    Args:
        example_indices_full: indices of test examples included as worked
            examples (dialogue followed by its reference summary).
        example_index_to_summarize: index of the test example whose dialogue
            is appended last, without a summary, for the model to complete.

    Returns:
        The prompt string: every worked example in order, then the final
        dialogue ending with the question "What was going on?".
    """
    test_split = dataset['test']

    # Each worked example ends with '{summary}\n\n\n'; that stop sequence is
    # important for FLAN-T5. Other models may prefer a different one.
    shots = []
    for shot_index in example_indices_full:
        shot = test_split[shot_index]
        shots.append(
            f"\nDialogue:\n\n{shot['dialogue']}\n\nWhat was going on?\n{shot['summary']}\n\n\n"
        )

    # The dialogue to summarize uses the same layout but stops after the question.
    target_dialogue = test_split[example_index_to_summarize]['dialogue']
    query = f"\nDialogue:\n\n{target_dialogue}\n\nWhat was going on?\n"

    return ''.join(shots) + query

In [20]:
# One worked example (test index 40) precedes the dialogue to summarize (test index 200).
example_indices_full = [40]
example_index_to_summarize = 200


one_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

# Show the assembled prompt.
print(one_shot_prompt)


Dialogue:

#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What was going on?
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.



Dialogue:

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also ne

In [21]:
# ground value
summary = dataset['test'][example_index_to_summarize]['summary']

# model generated value 
inputs = tokenizer(one_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 50,
    )[0],
    skip_special_tokens=True
)


print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ONE SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ONE SHOT:
#Person1 wants to upgrade his system. #Person2 wants to add a painting program to his software. #Person1 wants to add a CD-ROM drive.


### Few shot Inference 

In [22]:
# Three worked examples (test indices 40, 80, 120) precede the dialogue to summarize (test index 200).
example_indices_full = [40, 80, 120]
example_index_to_summarize = 200

few_shot_prompt = make_prompt(example_indices_full, example_index_to_summarize)

# Show the assembled prompt.
print(few_shot_prompt)


Dialogue:

#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.

What was going on?
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.



Dialogue:

#Person1#: May, do you mind helping me prepare for the picnic?
#Person2#: Sure. Have you checked the weather report?
#Person1#: Yes. It says it will be sunny all day. No sign of rain at all. This is your father's favorite sausage. Sandwiches for you and Daniel.
#Person2#: No, thanks Mom. I'd like some toast and chicken wings.
#Person1#: Okay. Please take some fruit salad and crackers for me.
#Person2#: Done. Oh, don't forget to take napkins disposable plates, cups and picnic blanket.
#Person1#: All set. 

In [23]:
# Human-written reference summary for the dialogue being summarized.
summary = dataset['test'][example_index_to_summarize]['summary']

# Model output for the few-shot prompt: tokenize, generate up to 50 new
# tokens, then decode the first (only) generated sequence.
# NOTE: the recorded output of this cell includes a tokenizer warning that the
# prompt (819 tokens) is longer than the model's stated maximum of 512.
inputs = tokenizer(few_shot_prompt, return_tensors='pt')
generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)[0]
output = tokenizer.decode(generated_ids, skip_special_tokens=True)

# Reference first, model output second.
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')

Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors


---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
#Person1 wants to upgrade his system. #Person2 wants to add a painting program to his software. #Person1 wants to upgrade his hardware.


## Generative Configuration Parameters for Inference

We can change the configuration parameters of the generate() method to get a different output from the LLM.

In [41]:
# Generation settings: cap the output at 50 new tokens and enable sampling.
# With do_sample=True, temperature controls the randomness of the generated
# output (lower values -> less random), so varying it gives more flexibility.
# NOTE: sampling is stochastic and no seed is set here, so reruns may differ.
generation_config = GenerationConfig(max_new_tokens=50, do_sample=True, temperature=0.1)

# Tokenize the few-shot prompt, generate with the settings above, and decode
# the first (only) generated sequence back into text.
inputs = tokenizer(few_shot_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config=generation_config,
    )[0],
    skip_special_tokens=True
)

print(dash_line)
print(f'MODEL GENERATION - FEW SHOT:\n{output}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')

---------------------------------------------------------------------------------------------------
MODEL GENERATION - FEW SHOT:
#Person1 wants to upgrade his system. #Person2 wants to add a painting program to his software. #Person1 wants to upgrade his hardware.
---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

