In [None]:

!pip install transformers

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face Hub checkpoint used by summarize_dialogue below
model_name = "google/flan-t5-large"
# Load the tokenizer and the seq2seq model for that checkpoint; both are
# module-level globals that summarize_dialogue reads
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def summarize_dialogue(dialogue):
  """Generate a summary of a dialogue with the module-level FLAN-T5 model.

  The dialogue is embedded in an instruction-style prompt, encoded with the
  module-level `tokenizer`, and passed to `model.generate`.

  Args:
    dialogue: The dialogue text to summarize.

  Returns:
    The first generated sequence decoded to a string, with special tokens
    stripped. Generation is capped at 200 new tokens.
  """
  # Encode the instruction prompt as PyTorch tensors
  encoded = tokenizer(
      f"Summarize the following dialogue:\n{dialogue}\nSummary:",
      return_tensors="pt",
  )

  # Only the first returned sequence is decoded
  generated = model.generate(**encoded, max_new_tokens=200)
  first_sequence = generated[0]
  return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Example dialogue: two speakers, one asking the other for a restaurant
# recommendation
dialogue = """
Person 1: Hi, how are you doing today?
Person 2: I'm doing well, thanks for asking! How about you?
Person 1: I'm great, thanks!  I was wondering if you could help me with something.
Person 2: Sure, what's up?
Person 1: I need to find a good restaurant in the city. Do you have any recommendations?
Person 2:  I do! There's this new Italian place that just opened up downtown. It's supposed to be amazing.
Person 1:  Oh, that sounds perfect! Do you know the name?
Person 2:  It's called "La Piazza." They have a fantastic pasta menu.
Person 1:  Great, thanks so much! I'll check it out.
Person 2: No problem! Have a good time.
"""

# Summarize the example with FLAN-T5 and print the result.
# NOTE: `summary` and `dialogue` are reassigned by later cells.
summary = summarize_dialogue(dialogue)
print(summary)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Person 1 is looking for a good restaurant in the city. Person 2 recommends "La Piazza" which is an Italian place.


In [None]:

!pip install nltk

import nltk
# Tokenizer models for sent_tokenize / word_tokenize. Recent NLTK releases
# look up 'punkt_tab' instead of 'punkt', so fetch both to keep this cell
# working regardless of which NLTK version pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists used to drop common words before counting frequencies
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

def summarize_text(text, num_sentences=3):
  """Build an extractive summary by scoring sentences on word frequency.

  Every lowercased alphanumeric token that is not an English stop word is
  counted across the whole text. A sentence's score is the sum of the counts
  of its tokens that were counted; sentences with no such token are never
  selected.

  Args:
    text: The text to summarize.
    num_sentences: The maximum number of sentences in the summary.

  Returns:
    The selected sentences joined by single spaces, ordered from highest to
    lowest score (ties keep their original order).
  """
  sentence_list = sent_tokenize(text)
  english_stops = set(stopwords.words('english'))

  # Tokenize each sentence once; the tokens feed both counting and scoring
  tokens_per_sentence = [nltk.word_tokenize(s) for s in sentence_list]

  # Frequency of every content word (lowercased, alphanumeric, not a stop word)
  word_counts = FreqDist([
      tok.lower()
      for tokens in tokens_per_sentence
      for tok in tokens
      if tok.isalnum() and tok.lower() not in english_stops
  ])

  # Sentence index -> summed frequency of its counted words
  scores = {}
  for idx, tokens in enumerate(tokens_per_sentence):
    hits = [word_counts[tok.lower()] for tok in tokens if tok.lower() in word_counts]
    if hits:
      scores[idx] = sum(hits)

  # Stable descending sort, then keep at most num_sentences indices
  ranked = sorted(scores, key=scores.get, reverse=True)
  chosen = ranked[:max(num_sentences, 0)]
  return " ".join(sentence_list[idx] for idx in chosen)

# Example dialogue: Alice and Bob planning a visit to an art exhibit
dialogue = """
Alice: Hey, are you free this weekend?
Bob:  Yeah, why?
Alice:  I was thinking we could go to the new art exhibit downtown.
Bob:  Sounds cool! What kind of art is it?
Alice:  It's contemporary art, with a focus on sculptures.
Bob:  Okay, that's interesting.  Do you know the address?
Alice:  I'll send you a link to the website later.
Bob: Great! I'm looking forward to it.
"""

# Summarize with the frequency-based method (default: up to 3 sentences).
# NOTE: this overwrites the `summary` and `dialogue` set by the FLAN-T5 cell.
summary = summarize_text(dialogue)
print(summary)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Alice:  I was thinking we could go to the new art exhibit downtown. Alice:  It's contemporary art, with a focus on sculptures. Alice:  I'll send you a link to the website later.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Another method: summarization with the BART-large-CNN pipeline

from transformers import pipeline

# Load a Hugging Face summarization pipeline backed by facebook/bart-large-cnn
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Example dialogue: Alice and Bob planning a visit to an art exhibit
dialogue = """
Alice: Hey, are you free this weekend?
Bob:  Yeah, why?
Alice:  I was thinking we could go to the new art exhibit downtown.
Bob:  Sounds cool! What kind of art is it?
Alice:  It's contemporary art, with a focus on sculptures.
Bob:  Okay, that's interesting.  Do you know the address?
Alice:  I'll send you a link to the website later.
Bob: Great! I'm looking forward to it.
"""

# Generate the summary deterministically (do_sample=False).
# NOTE: for this short dialogue the pipeline warns that max_length (130) is
# larger than the input length; a smaller max_length would silence it.
# NOTE: `summary` now holds the pipeline's list-of-dicts result, replacing the
# string assigned by earlier cells.
summary = summarizer(dialogue, max_length=130, min_length=30, do_sample=False)

# The pipeline returns a list with one dict per input; print its summary text
print(summary[0]['summary_text'])


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Your max_length is set to 130, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


Bob and Alice are planning to go to a new art exhibit. The exhibit is contemporary art, with a focus on sculptures. Do you know the address? Alice will send a link to the website later.


In [None]:
# Compare the three methods: which one is most effective?

# Recompute every summary under its own name. The shared `summary` variable is
# overwritten by each earlier cell (it last held the BART pipeline result), so
# printing it under the FLAN-T5 label showed the wrong model's output.
flan_t5_summary = summarize_dialogue(dialogue)
frequency_summary = summarize_text(dialogue)
bart_result = summarizer(dialogue, max_length=130, min_length=30, do_sample=False)
bart_summary = bart_result[0]['summary_text']

print("FLAN-T5 Summary:")
print(flan_t5_summary)
print("\nFrequency-based Summary:")
print(frequency_summary)
print("\nBART-large-CNN Summary:")
print(bart_summary)

# Note:  Subjectivity is involved in evaluating effectiveness.
# Consider the following when comparing:
# - Accuracy: How well does the summary capture the main points of the dialogue?
# - Conciseness: Is the summary brief and to the point?
# - Coherence: Is the summary logically organized and easy to understand?
# - Fluency: Does the summary read naturally and smoothly?

# NOTE(review): the assessment below was written when the FLAN-T5 slot was
# actually printing the BART output; re-check it against a fresh run.
# In general, FLAN-T5 seems to produce the most comprehensive and coherent summary.
# BART-large-CNN is also good, but it might be slightly less detailed.
# The frequency-based approach is the simplest, but it can sometimes miss the nuances of the conversation.


FLAN-T5 Summary:
[{'summary_text': 'Bob and Alice are planning to go to a new art exhibit. The exhibit is contemporary art, with a focus on sculptures. Do you know the address? Alice will send a link to the website later.'}]

Frequency-based Summary:
Alice:  I was thinking we could go to the new art exhibit downtown. Alice:  It's contemporary art, with a focus on sculptures. Alice:  I'll send you a link to the website later.

BART-large-CNN Summary:
Bob and Alice are planning to go to a new art exhibit. The exhibit is contemporary art, with a focus on sculptures. Do you know the address? Alice will send a link to the website later.


In [None]:
# Generate a dialogue summary using a traditional (non-neural) method: word-frequency sentence scoring

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

def summarize_text(text, num_sentences=3):
  """Pick the highest-scoring sentences of a text by word frequency.

  Words are the lowercased alphanumeric tokens of the text that are not
  English stop words. Each sentence is scored by adding up the text-wide
  frequency of every such word it contains; sentences containing none of
  them are left out of the ranking.

  Args:
    text: The text to summarize.
    num_sentences: The maximum number of sentences in the summary.

  Returns:
    The chosen sentences, best score first, joined by single spaces.
  """
  sents = sent_tokenize(text)
  stops = set(stopwords.words('english'))

  def is_content_word(token):
    # Drop punctuation-like tokens and common stop words
    return token.isalnum() and token.lower() not in stops

  # Text-wide frequency of each content word
  counts = FreqDist([
      token.lower()
      for sent in sents
      for token in nltk.word_tokenize(sent)
      if is_content_word(token)
  ])

  # Accumulate a score per sentence position
  totals = {}
  for position, sent in enumerate(sents):
    for token in nltk.word_tokenize(sent):
      key = token.lower()
      if key in counts:
        totals[position] = totals.get(position, 0) + counts[key]

  # Highest score first; the sort is stable, so ties stay in text order
  best_first = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
  limit = max(num_sentences, 0)
  top = [sents[position] for position, _ in best_first[:limit]]
  return " ".join(top)

# Example dialogue: Alice and Bob planning a visit to an art exhibit
dialogue = """
Alice: Hey, are you free this weekend?
Bob:  Yeah, why?
Alice:  I was thinking we could go to the new art exhibit downtown.
Bob:  Sounds cool! What kind of art is it?
Alice:  It's contemporary art, with a focus on sculptures.
Bob:  Okay, that's interesting.  Do you know the address?
Alice:  I'll send you a link to the website later.
Bob: Great! I'm looking forward to it.
"""

# Summarize with the frequency-based method and print the result.
# NOTE: `summary` is rebound to a plain string here.
summary = summarize_text(dialogue)
print(summary)


Alice:  I was thinking we could go to the new art exhibit downtown. Alice:  It's contemporary art, with a focus on sculptures. Alice:  I'll send you a link to the website later.
