## Text Summarization
by: Jeremiah Chinyelugo

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from heapq import nlargest

In [2]:
content = requests.get("https://en.wikipedia.org/wiki/Sekiro:_Shadows_Die_Twice").text

In [3]:
print(content[:1000])

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Sekiro: Shadows Die Twice - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"80556b85-c144-4fd6-9557-3757a3de6197","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Sekiro:_Shadows_Die_Twice","wgTitle":"Sekiro: Shadows Die Twice","wgCurRevisionId":1108418109,"wgRevisionId":1108418109,"wgArticleId":57651025,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles containing Japanese-language text","CS1 Japanese-language sources (ja)","CS1 uses Chinese-language script (

In [4]:
article = BeautifulSoup(content, 'lxml')

In [5]:
article_content = ""
for paragraph in article.find_all('p'):
    article_content += paragraph.text

In [6]:
print(article_content)


Sekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision. The game follows a shinobi known as Wolf as he attempts to take revenge on a samurai clan who attacked him and kidnapped his lord. It was released for PlayStation 4, Windows, and Xbox One in March 2019 and for Stadia in October 2020.
Gameplay is focused on stealth, exploration, and combat, with a particular emphasis on boss battles. The game takes place in a fictionalized Japan during the Sengoku period and makes strong references to Buddhist mythology and philosophy. While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Dark Souls series of games also made by FromSoftware. The developers looked to games such as the Tenchu series for inspiration.
Sekiro was praised by critics, who complimented its gameplay and setting, and compared it to the Dark Souls games, although opinions on its diff

In [7]:
punctuation_ = punctuation + '\n'
stop_words = stopwords.words('english')

In [8]:
# Create Frequency table, excluse stopwords and punctuations

def FrequecyTable(text):
    text = nltk.word_tokenize(text)
    text = [word for word in text if word not in punctuation_ and word not in stop_words]
    table = {}
    for word in text:
        if word in table:
            table[word] += 1
        else:
            table[word] = 1
    return table

In [9]:
word_frequecies = FrequecyTable(article_content)
word_frequecies

{'Sekiro': 28,
 'Shadows': 5,
 'Die': 5,
 'Twice': 5,
 '2019': 6,
 'action-adventure': 2,
 'game': 37,
 'developed': 2,
 'FromSoftware': 8,
 'published': 3,
 'Activision': 3,
 'The': 22,
 'follows': 1,
 'shinobi': 6,
 'known': 5,
 'Wolf': 13,
 'attempts': 1,
 'take': 1,
 'revenge': 1,
 'samurai': 1,
 'clan': 2,
 'attacked': 1,
 'kidnapped': 1,
 'lord': 1,
 'It': 4,
 'released': 6,
 'PlayStation': 3,
 '4': 5,
 'Windows': 3,
 'Xbox': 3,
 'One': 3,
 'March': 3,
 'Stadia': 2,
 'October': 2,
 '2020': 5,
 'Gameplay': 1,
 'focused': 1,
 'stealth': 3,
 'exploration': 4,
 'combat': 11,
 'particular': 1,
 'emphasis': 3,
 'boss': 4,
 'battles': 2,
 'takes': 2,
 'place': 3,
 'fictionalized': 1,
 'Japan': 4,
 'Sengoku': 3,
 'period': 3,
 'makes': 1,
 'strong': 1,
 'references': 1,
 'Buddhist': 1,
 'mythology': 1,
 'philosophy': 1,
 'While': 2,
 'making': 2,
 'lead': 1,
 'director': 1,
 'Hidetaka': 2,
 'Miyazaki': 3,
 'wanted': 2,
 'create': 3,
 'new': 7,
 'intellectual': 1,
 'property': 1,
 'IP': 1

In [10]:
# Normalizing the values of our word_frequencies dictionary

max_value = max(word_frequecies.values())

for k, v in word_frequecies.items():
    word_frequecies[k] = v / max_value

In [11]:
# Normalized dictionary

word_frequecies

{'Sekiro': 0.6511627906976745,
 'Shadows': 0.11627906976744186,
 'Die': 0.11627906976744186,
 'Twice': 0.11627906976744186,
 '2019': 0.13953488372093023,
 'action-adventure': 0.046511627906976744,
 'game': 0.8604651162790697,
 'developed': 0.046511627906976744,
 'FromSoftware': 0.18604651162790697,
 'published': 0.06976744186046512,
 'Activision': 0.06976744186046512,
 'The': 0.5116279069767442,
 'follows': 0.023255813953488372,
 'shinobi': 0.13953488372093023,
 'known': 0.11627906976744186,
 'Wolf': 0.3023255813953488,
 'attempts': 0.023255813953488372,
 'take': 0.023255813953488372,
 'revenge': 0.023255813953488372,
 'samurai': 0.023255813953488372,
 'clan': 0.046511627906976744,
 'attacked': 0.023255813953488372,
 'kidnapped': 0.023255813953488372,
 'lord': 0.023255813953488372,
 'It': 0.09302325581395349,
 'released': 0.13953488372093023,
 'PlayStation': 0.06976744186046512,
 '4': 0.11627906976744186,
 'Windows': 0.06976744186046512,
 'Xbox': 0.06976744186046512,
 'One': 0.06976744

In [12]:
# Tokenizing the article into a sentences

sentences = nltk.sent_tokenize(article_content)
sentences

['\nSekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision.',
 'The game follows a shinobi known as Wolf as he attempts to take revenge on a samurai clan who attacked him and kidnapped his lord.',
 'It was released for PlayStation 4, Windows, and Xbox One in March 2019 and for Stadia in October 2020.',
 'Gameplay is focused on stealth, exploration, and combat, with a particular emphasis on boss battles.',
 'The game takes place in a fictionalized Japan during the Sengoku period and makes strong references to Buddhist mythology and philosophy.',
 'While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Dark Souls series of games also made by FromSoftware.',
 'The developers looked to games such as the Tenchu series for inspiration.',
 'Sekiro was praised by critics, who complimented its gameplay and setting, and compared it to the Dark Souls games

In [13]:
sentence_weights = {}
for sent in sentences:
    for key_words in word_frequecies:
        if key_words in sent:
            value = word_frequecies.get(key_words, 0)
            if sent in sentence_weights:
                sentence_weights[sent] += value
            else:
                sentence_weights[sent] = value

In [14]:
sentence_weights

{'\nSekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision.': 2.651162790697675,
 'The game follows a shinobi known as Wolf as he attempts to take revenge on a samurai clan who attacked him and kidnapped his lord.': 2.2558139534883725,
 'It was released for PlayStation 4, Windows, and Xbox One in March 2019 and for Stadia in October 2020.': 1.3023255813953494,
 'Gameplay is focused on stealth, exploration, and combat, with a particular emphasis on boss battles.': 0.9767441860465118,
 'The game takes place in a fictionalized Japan during the Sengoku period and makes strong references to Buddhist mythology and philosophy.': 1.9534883720930234,
 'While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Dark Souls series of games also made by FromSoftware.': 2.6744186046511635,
 'The developers looked to games such as the Tenchu series for inspiration

In [15]:
select_lenght = int(len(sentences) * 0.30)

summary = nlargest(select_lenght, sentence_weights, key=sentence_weights.get)

In [16]:
# Rearranging the summary so, we can have structured sentences

new_summary = []
for sent in sentences:
    if sent in summary:
        new_summary.append(sent)
        
final_summary = " ".join([word for word in new_summary])

In [17]:
print(final_summary)


Sekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision. While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Dark Souls series of games also made by FromSoftware. Sekiro was praised by critics, who complimented its gameplay and setting, and compared it to the Dark Souls games, although opinions on its difficulty were mixed. [1][2][3][4] Compared to FromSoftware's Dark Souls series, the game features fewer role-playing elements, lacking character creation and the ability to level up a variety of stats, as well as having no multiplayer elements. Rather than attacking to whittle an enemy's health points, combat in Sekiro revolves around using a katana to attack their posture and balance instead, which eventually leads to an opening that allows for a single killing blow. [3][7]
The game also features stealth elements, allowing players to instantl

### Creating the function

In [18]:
def SummarizeText():
    link = input("Please paste/type the wikipedia link: \n")
    percent = float(input("\nHow short do you want to summarize the page? (0.0 - 1.0): \n"))
    content = requests.get(link).text
    article = BeautifulSoup(content, 'lxml')
    article_content = ""
    for paragraph in article.find_all('p'):
        article_content += paragraph.text
        
    punctuation_ = punctuation + '\n'
    stop_words = stopwords.words('english')
    text_ = [word for word in nltk.word_tokenize(article_content) if word not in punctuation_ and word not in stop_words]
    word_frequecies = {}
    for word in text_:
        if word in word_frequecies:
            word_frequecies[word] += 1
        else:
            word_frequecies[word] = 1
    
    max_value = max(word_frequecies.values())
    
    for k, v in word_frequecies.items():
        word_frequecies[k] = v / max_value
        
    sentences = nltk.sent_tokenize(article_content)
    
    sentence_weights = {}
    for sent in sentences:
        for key_words in word_frequecies:
            if key_words in sent:
                value = word_frequecies.get(key_words, 0)
                if sent in sentence_weights:
                    sentence_weights[sent] += value
                else:
                    sentence_weights[sent] = value
                    
    select_lenght = int(len(sentences)*percent)
    
    summary = nlargest(select_lenght, sentence_weights, key=sentence_weights.get)
    
    new_summary = []
    for sent in sentences:
        if sent in summary:
            new_summary.append(sent)
        
    final_summary = " ".join([word for word in new_summary])
    
    print("\n\n"," "*50, "SUMMARIZED TEXT")
    print("="*120)
    print(final_summary)
    print("\n","="*120)

In [19]:
SummarizeText()

Please paste/type the wikipedia link: 
https://en.wikipedia.org/wiki/Sekiro:_Shadows_Die_Twice

How short do you want to summarize the page? (0.0 - 1.0): 
0.3


                                                    SUMMARIZED TEXT

Sekiro: Shadows Die Twice[a] is a 2019 action-adventure game developed by FromSoftware and published by Activision. While making the game, lead director Hidetaka Miyazaki wanted to create a new intellectual property (IP) that marked a departure from the Dark Souls series of games also made by FromSoftware. Sekiro was praised by critics, who complimented its gameplay and setting, and compared it to the Dark Souls games, although opinions on its difficulty were mixed. [1][2][3][4] Compared to FromSoftware's Dark Souls series, the game features fewer role-playing elements, lacking character creation and the ability to level up a variety of stats, as well as having no multiplayer elements. Rather than attacking to whittle an enemy's health points, combat in Sekiro