In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import collections
nltk.download("stopwords")
nltk.download("punkt")
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
text='''
 Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. 
 Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related 
 to intelligent behavior. Developing of reasoning machines. Learn from mistakes and successes. Artificial
 intelligence is related to reasoning in everyday situations.
'''

In [3]:
def clean(text):
  to_lower=' '.join([i.lower() for i in nltk.tokenize.word_tokenize(text)])
  rem_punc=''.join([i for i in to_lower if i not in string.punctuation])
  return ' '.join([i for i in nltk.tokenize.word_tokenize(rem_punc) if i not in nltk.corpus.stopwords.words("english")])

In [4]:
data=pd.DataFrame({"sent":nltk.tokenize.sent_tokenize(text)})


In [5]:
data

Unnamed: 0,sent
0,\n Artificial intelligence is human like intel...
1,It is the study of intelligent artificial agents.
2,Science and engineering to produce \nintellige...
3,Solve problems and have intelligence.
4,Related to \nintelligent behavior.
5,Developing of reasoning machines.
6,Learn from \nmistakes and successes.
7,Artificial intelligence is related to reasonin...


In [6]:
data["sent"]=data["sent"].apply(lambda x:clean(x))

In [7]:
token_count=dict(collections.Counter(nltk.tokenize.word_tokenize(clean(text))))

In [8]:
frame=pd.DataFrame({"words":token_count.keys(),"count":token_count.values()})

In [9]:
frame.head()

Unnamed: 0,words,count
0,artificial,3
1,intelligence,4
2,human,1
3,like,1
4,study,1


In [10]:
frame["weighted_count"]=frame["count"].apply(lambda x: x/frame["count"].max())

In [11]:
frame.head()

Unnamed: 0,words,count,weighted_count
0,artificial,3,0.75
1,intelligence,4,1.0
2,human,1,0.25
3,like,1,0.25
4,study,1,0.25


In [12]:
weight_words={i:j for i,j in zip(frame["words"],frame["weighted_count"])}

In [13]:
def get_score(x):
  score=[]
  for w in nltk.tokenize.word_tokenize(x):
    if w in weight_words.keys():
      score.append(weight_words[w])
  return sum(score)
      

In [14]:
data["Score"]=data["sent"].apply(get_score)

In [15]:
data

Unnamed: 0,sent,Score
0,artificial intelligence human like intelligence,3.25
1,study intelligent artificial agents,2.0
2,science engineering produce intelligent machines,2.0
3,solve problems intelligence,1.5
4,related intelligent behavior,1.5
5,developing reasoning machines,1.25
6,learn mistakes successes,0.75
7,artificial intelligence related reasoning ever...,3.25


In [16]:
data["Original_Text"]=nltk.tokenize.sent_tokenize(text)

In [17]:
data.head()

Unnamed: 0,sent,Score,Original_Text
0,artificial intelligence human like intelligence,3.25,\n Artificial intelligence is human like intel...
1,study intelligent artificial agents,2.0,It is the study of intelligent artificial agents.
2,science engineering produce intelligent machines,2.0,Science and engineering to produce \nintellige...
3,solve problems intelligence,1.5,Solve problems and have intelligence.
4,related intelligent behavior,1.5,Related to \nintelligent behavior.


In [18]:
''.join(data[data["Score"]>=(data["Score"].max()+data["Score"].min())/2]["Original_Text"])

'\n Artificial intelligence is human like intelligence.It is the study of intelligent artificial agents.Science and engineering to produce \nintelligent machines.Artificial intelligence is related to reasoning in \neveryday situations.'

## Combine All Step to Build A Text Summarization Function

In [19]:
def SUMMARIZE_WITH_FBA(text):
  def clean(text):
    to_lower=' '.join([i.lower() for i in nltk.tokenize.word_tokenize(text)])
    rem_punc=''.join([i for i in to_lower if i not in string.punctuation])
    return ' '.join([i for i in nltk.tokenize.word_tokenize(rem_punc) if i not in nltk.corpus.stopwords.words("english")])
    
  data=pd.DataFrame({"sent":nltk.tokenize.sent_tokenize(text)})

  data["cleaned"]=data["sent"].apply(lambda x:clean(x))
  token_count=dict(collections.Counter(nltk.tokenize.word_tokenize(clean(text))))
  frame=pd.DataFrame({"words":token_count.keys(),"count":token_count.values()})
  frame["weighted_count"]=frame["count"].apply(lambda x: x/frame["count"].max())
  weight_words={i:j for i,j in zip(frame["words"],frame["weighted_count"])}
  def get_score(x):
    score=[]
    for w in nltk.tokenize.word_tokenize(x):
      if w in weight_words.keys():
        score.append(weight_words[w])
    return sum(score)
  data["Score"]=data["cleaned"].apply(get_score)
  return ' '.join(data[data["Score"]>=(data["Score"].max()+data["Score"].min())/2]["sent"]).replace("\n","")


In [20]:
nltk.tokenize.sent_tokenize(SUMMARIZE_WITH_FBA(text))

[' Artificial intelligence is human like intelligence.',
 'It is the study of intelligent artificial agents.',
 'Science and engineering to produce intelligent machines.',
 'Artificial intelligence is related to reasoning in everyday situations.']

## Optimized Function : Example 2

In [21]:
def SUMMARIZE_WITH_FBA(text):
  def clean(text):
    return ' '.join([i for i in nltk.tokenize.word_tokenize(''.join([i for i in ' '.join([i.lower() for i in nltk.
              tokenize.word_tokenize(text)]) if i not in string.punctuation])) if i not in nltk.corpus.stopwords.
              words("english")])
  data=pd.DataFrame({"sent":nltk.tokenize.sent_tokenize(text)})
  data["cleaned"]=data["sent"].apply(lambda x:clean(x))
  token_count=dict(collections.Counter(nltk.tokenize.word_tokenize(clean(text))))
  frame=pd.DataFrame({"words":token_count.keys(),"count":token_count.values()})
  frame["weighted_count"]=frame["count"].apply(lambda x: x/frame["count"].max())
  weight_words={i:j for i,j in zip(frame["words"],frame["weighted_count"])} 
  data["Score"]=data["cleaned"].apply(lambda x: sum([weight_words[w] for w in nltk.tokenize.
                              word_tokenize(x) if w in weight_words.keys()]))
  return ' '.join(data[data["Score"]>=(data["Score"].max()+data["Score"].min())/2]["sent"]).replace("\n","").strip(" ")


In [22]:
SUMMARIZE_WITH_FBA(text)

'Artificial intelligence is human like intelligence. It is the study of intelligent artificial agents. Science and engineering to produce intelligent machines. Artificial intelligence is related to reasoning in everyday situations.'

In [23]:
from IPython.core.display import HTML

In [24]:
display(HTML(f'<h1> Summary </h1>'))
display(HTML(f'<h4><i> {SUMMARIZE_WITH_FBA(text)}</i></h4>'))

## Example 3

In [29]:
def SUMMARIZE_WITH_FBA(text):
  def clean(text):
    return ' '.join([i for i in nltk.tokenize.word_tokenize(''.join([i for i in ' '.join([i.lower() for i in nltk.
              tokenize.word_tokenize(text)]) if i not in string.punctuation])) if i not in nltk.corpus.stopwords.
              words("english")])
  data=pd.DataFrame({"sent":nltk.tokenize.sent_tokenize(text)})
  data["cleaned"]=data["sent"].apply(lambda x:clean(x))
  token_count=dict(collections.Counter(nltk.tokenize.word_tokenize(clean(text))))
  frame=pd.DataFrame({"words":token_count.keys(),"count":token_count.values()})
  frame["weighted_count"]=frame["count"].apply(lambda x: x/frame["count"].max())
  weight_words={i:j for i,j in zip(frame["words"],frame["weighted_count"])} 
  data["Score"]=data["cleaned"].apply(lambda x: sum([weight_words[w] for w in nltk.tokenize.
                              word_tokenize(x) if w in weight_words.keys()]))
  summary=' '.join(data[data["Score"]>=(data["Score"].max()+data["Score"].min())/2]["sent"]).replace("\n","").strip(" ")
   
  return display(HTML(f'''<h2><i>Summary:</i></h2> {summary}'''))

 

In [30]:
SUMMARIZE_WITH_FBA(text)

## Example 4

In [None]:
!pip install goose3

In [32]:
from goose3 import Goose

In [33]:
g=Goose()
url="https://en.wikipedia.org/wiki/Automatic_summarization"
article=g.extract(url)

In [34]:
article.infos

{'meta': {'description': '',
  'lang': 'en',
  'keywords': '',
  'favicon': '/static/apple-touch/wikipedia.png',
  'canonical': 'https://en.wikipedia.org/wiki/Automatic_summarization',
  'encoding': 'UTF-8'},
 'image': None,
 'domain': 'en.wikipedia.org',
 'title': 'Automatic summarization - Wikipedia',
 'cleaned_text': 'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.\n\nIn addition to text, images and videos can also be summarized. Text summarization finds the most informative sentences in a document;[1] various methods of image summarization are the subject of ongoing research, with some looking to display the most representative images from a given collection or generating a video;[2][3][4] video summarization extracts the most important frames from the video content.[5]\n\nThere are two general approaches to automatic summarizati

In [35]:
text=article.cleaned_text

In [36]:
text

'Automatic summarization is the process of shortening a set of data computationally, to create a subset (a summary) that represents the most important or relevant information within the original content.\n\nIn addition to text, images and videos can also be summarized. Text summarization finds the most informative sentences in a document;[1] various methods of image summarization are the subject of ongoing research, with some looking to display the most representative images from a given collection or generating a video;[2][3][4] video summarization extracts the most important frames from the video content.[5]\n\nThere are two general approaches to automatic summarization: extraction and abstraction.\n\nHere, content is extracted from the original data, but the extracted content is not modified in any way. Examples of extracted content include key-phrases that can be used to "tag" or index a text document, or key sentences (including headings) that collectively comprise an abstract, an

In [37]:
SUMMARIZE_WITH_FBA(text)