## Text Summarization

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from heapq import nlargest

In [2]:
content = requests.get("https://en.wikipedia.org/wiki/Sekiro:_Shadows_Die_Twice").text

In [3]:
print(content[:1000])

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Sekiro: Shadows Die Twice - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"ce4a99ff-8a7e-4136-b8f2-3de2b7842395","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Sekiro:_Shadows_Die_Twice","wgTitle":"Sekiro: Shadows Die Twice","wgCurRevisionId":1089622715,"wgRevisionId":1089622715,"wgArticleId":57651025,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles containing Japanese-language text","CS1 Japanese-language sources (ja)","CS1 uses Chinese-language script (

In [4]:
article = BeautifulSoup(content, 'lxml')

In [5]:
article_content = ""
for paragraph in article.find_all('p'):
    article_content += paragraph.text

In [6]:
print(article_content)



Sekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision. The game follows a shinobi known as Wolf as he attempts to take revenge on a samurai clan who attacked him and kidnapped his lord. It was released for Microsoft Windows, PlayStation 4, and Xbox One in March 2019 and for Stadia in October 2020.
Gameplay is focused on stealth, exploration, and combat, with a particular emphasis on boss battles. The game takes place in a fictionalized Japan during the Sengoku period and makes strong references to Buddhist mythology and philosophy. While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Souls series of games also made by FromSoftware. The developers looked to games such as The Mysterious Murasame Castle and the Tenchu series for inspiration.
Sekiro was praised by critics, who complimented its gameplay and setting, and compared it to the Souls 

In [7]:
punctuation_ = punctuation + '\n'
stop_words = stopwords.words('english')

In [8]:
# Create Frequency table, excluse stopwords and punctuations

def FrequecyTable(text):
    text = nltk.word_tokenize(text)
    text = [word for word in text if word not in punctuation_ and word not in stop_words]
    table = {}
    for word in text:
        if word in table:
            table[word] += 1
        else:
            table[word] = 1
    return table

In [9]:
word_frequecies = FrequecyTable(article_content)
word_frequecies

{'Sekiro': 29,
 'Shadows': 5,
 'Die': 5,
 'Twice': 5,
 '2019': 6,
 'action-adventure': 2,
 'game': 38,
 'developed': 2,
 'FromSoftware': 10,
 'published': 3,
 'Activision': 3,
 'The': 23,
 'follows': 1,
 'shinobi': 6,
 'known': 6,
 'Wolf': 12,
 'attempts': 1,
 'take': 2,
 'revenge': 1,
 'samurai': 1,
 'clan': 3,
 'attacked': 1,
 'kidnapped': 1,
 'lord': 1,
 'It': 4,
 'released': 6,
 'Microsoft': 2,
 'Windows': 3,
 'PlayStation': 3,
 '4': 5,
 'Xbox': 3,
 'One': 3,
 'March': 3,
 'Stadia': 2,
 'October': 2,
 '2020': 5,
 'Gameplay': 1,
 'focused': 1,
 'stealth': 3,
 'exploration': 4,
 'combat': 11,
 'particular': 1,
 'emphasis': 3,
 'boss': 4,
 'battles': 2,
 'takes': 3,
 'place': 3,
 'fictionalized': 1,
 'Japan': 4,
 'Sengoku': 3,
 'period': 3,
 'makes': 1,
 'strong': 1,
 'references': 1,
 'Buddhist': 1,
 'mythology': 1,
 'philosophy': 1,
 'While': 2,
 'making': 3,
 'lead': 1,
 'director': 1,
 'Hidetaka': 2,
 'Miyazaki': 3,
 'wanted': 2,
 'create': 3,
 'new': 7,
 'intellectual': 1,
 'prop

In [10]:
# Normalizing the values of our word_frequencies dictionary

max_value = max(word_frequecies.values())

for k, v in word_frequecies.items():
    word_frequecies[k] = v / max_value

In [11]:
# Normalized dictionary

word_frequecies

{'Sekiro': 0.5178571428571429,
 'Shadows': 0.08928571428571429,
 'Die': 0.08928571428571429,
 'Twice': 0.08928571428571429,
 '2019': 0.10714285714285714,
 'action-adventure': 0.03571428571428571,
 'game': 0.6785714285714286,
 'developed': 0.03571428571428571,
 'FromSoftware': 0.17857142857142858,
 'published': 0.05357142857142857,
 'Activision': 0.05357142857142857,
 'The': 0.4107142857142857,
 'follows': 0.017857142857142856,
 'shinobi': 0.10714285714285714,
 'known': 0.10714285714285714,
 'Wolf': 0.21428571428571427,
 'attempts': 0.017857142857142856,
 'take': 0.03571428571428571,
 'revenge': 0.017857142857142856,
 'samurai': 0.017857142857142856,
 'clan': 0.05357142857142857,
 'attacked': 0.017857142857142856,
 'kidnapped': 0.017857142857142856,
 'lord': 0.017857142857142856,
 'It': 0.07142857142857142,
 'released': 0.10714285714285714,
 'Microsoft': 0.03571428571428571,
 'Windows': 0.05357142857142857,
 'PlayStation': 0.05357142857142857,
 '4': 0.08928571428571429,
 'Xbox': 0.05357

In [12]:
# Tokenizing the article into a sentences

sentences = nltk.sent_tokenize(article_content)
sentences

['\n\nSekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision.',
 'The game follows a shinobi known as Wolf as he attempts to take revenge on a samurai clan who attacked him and kidnapped his lord.',
 'It was released for Microsoft Windows, PlayStation 4, and Xbox One in March 2019 and for Stadia in October 2020.',
 'Gameplay is focused on stealth, exploration, and combat, with a particular emphasis on boss battles.',
 'The game takes place in a fictionalized Japan during the Sengoku period and makes strong references to Buddhist mythology and philosophy.',
 'While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Souls series of games also made by FromSoftware.',
 'The developers looked to games such as The Mysterious Murasame Castle and the Tenchu series for inspiration.',
 'Sekiro was praised by critics, who complimented its gameplay and settin

In [13]:
sentence_weights = {}
for sent in sentences:
    for key_words in word_frequecies:
        if key_words in sent:
            value = word_frequecies.get(key_words, 0)
            if sent in sentence_weights:
                sentence_weights[sent] += value
            else:
                sentence_weights[sent] = value

In [14]:
sentence_weights

{'\n\nSekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision.': 2.125,
 'The game follows a shinobi known as Wolf as he attempts to take revenge on a samurai clan who attacked him and kidnapped his lord.': 1.8214285714285712,
 'It was released for Microsoft Windows, PlayStation 4, and Xbox One in March 2019 and for Stadia in October 2020.': 1.0535714285714288,
 'Gameplay is focused on stealth, exploration, and combat, with a particular emphasis on boss battles.': 0.7500000000000001,
 'The game takes place in a fictionalized Japan during the Sengoku period and makes strong references to Buddhist mythology and philosophy.': 1.5714285714285712,
 'While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Souls series of games also made by FromSoftware.': 2.0535714285714284,
 'The developers looked to games such as The Mysterious Murasame Castle and the

In [15]:
select_lenght = int(len(sentences) * 0.30)

summary = nlargest(select_lenght, sentence_weights, key=sentence_weights.get)

In [16]:
# Rearranging the summary so, we can have structured sentences

new_summary = []
for sent in sentences:
    if sent in summary:
        new_summary.append(sent)
        
final_summary = " ".join([word for word in new_summary])

In [17]:
print(final_summary)



Sekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision. While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Souls series of games also made by FromSoftware. Sekiro was praised by critics, who complimented its gameplay and setting, and compared it to the Souls games, although opinions on its difficulty were mixed. [1][2][3][4] Compared to FromSoftware's Souls series, the game features fewer role-playing elements, lacking character creation and the ability to level up a variety of stats, as well as having no multiplayer elements. Rather than attacking to whittle an enemy's health points, combat in Sekiro revolves around using a katana to attack their posture and balance instead, which eventually leads to an opening that allows for a single killing blow. [3][7]
The game also features stealth elements, allowing players to instantly eliminate so

### Creating the function

In [18]:
def SummarizeText():
    link = input("Please paste/type the wikipedia link: \n")
    percent = float(input("\nHow short do you want to summarize the page? (0.0 - 1.0): \n"))
    content = requests.get(link).text
    article = BeautifulSoup(content, 'lxml')
    article_content = ""
    for paragraph in article.find_all('p'):
        article_content += paragraph.text
        
    punctuation_ = punctuation + '\n'
    stop_words = stopwords.words('english')
    text_ = [word for word in nltk.word_tokenize(article_content) if word not in punctuation_ and word not in stop_words]
    word_frequecies = {}
    for word in text_:
        if word in word_frequecies:
            word_frequecies[word] += 1
        else:
            word_frequecies[word] = 1
    
    max_value = max(word_frequecies.values())
    
    for k, v in word_frequecies.items():
        word_frequecies[k] = v / max_value
        
    sentences = nltk.sent_tokenize(article_content)
    
    sentence_weights = {}
    for sent in sentences:
        for key_words in word_frequecies:
            if key_words in sent:
                value = word_frequecies.get(key_words, 0)
                if sent in sentence_weights:
                    sentence_weights[sent] += value
                else:
                    sentence_weights[sent] = value
                    
    select_lenght = int(len(sentences)*percent)
    
    summary = nlargest(select_lenght, sentence_weights, key=sentence_weights.get)
    
    new_summary = []
    for sent in sentences:
        if sent in summary:
            new_summary.append(sent)
        
    final_summary = " ".join([word for word in new_summary])
    
    print("\n\n"," "*50, "SUMMARIZED TEXT")
    print("="*120)
    print(final_summary)
    print("\n","="*120)

In [21]:
SummarizeText()

Please paste/type the wikipedia link: 
https://en.wikipedia.org/wiki/Boat

How short do you want to summarize the page? (0.0 - 1.0): 
0.3


                                                    SUMMARIZED TEXT

A boat is a watercraft of a large range of types and sizes, but generally smaller than a ship, which is distinguished by its larger size, shape, cargo or passenger capacity, or its ability to carry boats. Small boats are typically found on inland waterways such as rivers and lakes, or in protected coastal areas. Pleasure craft used in recreational boating include ski boats, pontoon boats, and sailboats. After the Homo erectus possibly using watercrafts more than a million years ago crossing straits between landmasses,[2][3] boats have served as transportation far into the pre-historic. [4]  Circumstantial evidence, such as the early settlement of Australia over 40,000 years ago, findings in Crete dated 130,000 years ago,[5] and in Flores dated to 900,000 years ago,[6] suggest that