# Text summarization - Frequency based algorithm

# Preprocessing the texts

In [49]:
import re # relugar expression
import nltk # natural language toolkit
import string

In [50]:
original_text = """A plataforma IA Expert tem o objetivo de trazer cursos teóricos e práticos de fácil entendimento sobre sobre Inteligência Artificial e Ciência de Dados, para que profissionais de todas as áreas consigam entender e aplicar os benefícios que a IA pode trazer para seus negócios, bem como apresentar todas as oportunidades que essa área pode trazer para profissionais de tecnologia da informação. Também trazemos notícias atualizadas semanais sobre a área em nosso portal."""

In [51]:
original_text

'Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.[a] Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.'

In [52]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [53]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [54]:
len(stopwords)

179

In [55]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [56]:
def preprocess(text):
  formatted_text = text.lower()
  tokens = []
  for token in nltk.word_tokenize(formatted_text):
    tokens.append(token)
  #print(tokens)
  tokens = [word for word in tokens if word not in stopwords and word not in string.punctuation]
  formatted_text = ' '.join(element for element in tokens)

  return formatted_text

In [57]:
formatted_text = preprocess(original_text)
formatted_text

"artificial intelligence ai intelligence demonstrated machines opposed natural intelligence displayed animals including humans leading ai textbooks define field study `` intelligent agents '' system perceives environment takes actions maximize chance achieving goals popular accounts use term `` artificial intelligence '' describe machines mimic `` cognitive '' functions humans associate human mind `` learning '' `` problem solving '' however definition rejected major ai researchers"

# Word frequency

In [58]:
word_frequency = nltk.FreqDist(nltk.word_tokenize(formatted_text))
word_frequency

FreqDist({'``': 10, 'intelligence': 4, 'ai': 3, 'artificial': 2, 'machines': 2, 'humans': 2, 'demonstrated': 1, 'opposed': 1, 'natural': 1, 'displayed': 1, ...})

In [59]:
word_frequency.keys()

dict_keys(['artificial', 'intelligence', 'ai', 'demonstrated', 'machines', 'opposed', 'natural', 'displayed', 'animals', 'including', 'humans', 'leading', 'textbooks', 'define', 'field', 'study', '``', 'intelligent', 'agents', 'system', 'perceives', 'environment', 'takes', 'actions', 'maximize', 'chance', 'achieving', 'goals', 'popular', 'accounts', 'use', 'term', 'describe', 'mimic', 'cognitive', 'functions', 'associate', 'human', 'mind', 'learning', 'problem', 'solving', 'however', 'definition', 'rejected', 'major', 'researchers'])

In [60]:
len(word_frequency.keys())

47

In [61]:
highest_frequency = max(word_frequency.values())
highest_frequency

10

In [62]:
for word in word_frequency.keys():
  #print(word)
  word_frequency[word] = (word_frequency[word] / highest_frequency)

In [63]:
word_frequency

FreqDist({'``': 1.0, 'intelligence': 0.4, 'ai': 0.3, 'artificial': 0.2, 'machines': 0.2, 'humans': 0.2, 'demonstrated': 0.1, 'opposed': 0.1, 'natural': 0.1, 'displayed': 0.1, ...})

# Sentence tokenization

In [64]:
sentence_list = nltk.sent_tokenize(original_text)
sentence_list

['Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.',
 'Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.',
 '[a] Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.']

# Generate the summary (score for sentences)

In [65]:
word_frequency

FreqDist({'``': 1.0, 'intelligence': 0.4, 'ai': 0.3, 'artificial': 0.2, 'machines': 0.2, 'humans': 0.2, 'demonstrated': 0.1, 'opposed': 0.1, 'natural': 0.1, 'displayed': 0.1, ...})

In [66]:
score_sentences = {}
for sentence in sentence_list:
  for word in nltk.word_tokenize(sentence.lower()):
    if sentence not in score_sentences.keys():
      score_sentences[sentence] = word_frequency[word]
    else:
      score_sentences[sentence] += word_frequency[word]

In [67]:
score_sentences

{'Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.': 2.700000000000001,
 'Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.': 2.900000000000001,
 '[a] Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.': 7.1999999999999975}

In [68]:
score_sentences.keys()

dict_keys(['Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.', 'Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.', '[a] Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.'])

In [69]:
import heapq
best_sentences = heapq.nlargest(3, score_sentences, key = score_sentences.get)

In [70]:
best_sentences

['[a] Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.',
 'Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.',
 'Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.']

In [71]:
summary = ' '.join(best_sentences)
summary

'[a] Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals. Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans.'

In [72]:
original_text

'Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.[a] Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.'