## **Text Summarization**

In [None]:
import spacy
from typing import Set, List
from collections import defaultdict

nlp = spacy.load("en_core_web_sm")

In [None]:
def read_txt(text_file):
    """Read a text file and return its lines joined into a single string.

    Args:
        text_file: Path of the file to read.

    Returns:
        The lines returned by ``readlines()`` (each line keeps its trailing
        newline) joined with ``'. '``. An empty file yields ``''``.
    """
    # Read-only mode: the file is never written, and "r+" would needlessly
    # fail on files that are not writable. The encoding is pinned so the
    # result does not depend on the platform's locale.
    with open(text_file, "r", encoding="utf-8") as file:
        string_list = file.readlines()
    return '. '.join(string_list)

# Load the corpus once into the notebook-level `text`.
text = read_txt(text_file="/content/tiny_shakespeare.txt")
# Show only the beginning: echoing the whole file would flood the cell output.
text[:500]



In [None]:
def sentence_segment(text):
    """Split ``text`` into sentences with the loaded spaCy pipeline.

    Args:
        text: Raw text to process with ``nlp``.

    Returns:
        A list holding every item yielded by the processed document's
        ``sents``, in document order.
    """
    return list(nlp(text).sents)

# Segment the loaded text and display the first sentence as a quick check.
sentences = sentence_segment(text)
sentences[0]

Conflict between Israel & Palestinians in Gaza
.

### **TF (Term Frequency)**

In [None]:
def lemmatize_sentence(sent: List, not_include_tokens: Set, remove_stop_words=True):
    """Collect the lemmas of the tokens in one sentence.

    Args:
        sent: Iterable of tokens; each token must expose ``is_stop`` and
            ``lemma_``.
        not_include_tokens: Lemmas that must be left out of the result.
        remove_stop_words: When True (the default) tokens whose ``is_stop``
            is truthy are dropped; when False they are kept.

    Returns:
        The kept lemmas as a list, in token order.
    """
    lemms = []
    for token in sent:
        # Honour the flag: stop words are only filtered when requested.
        if remove_stop_words and token.is_stop:
            continue
        if token.lemma_ not in not_include_tokens:
            lemms.append(token.lemma_)

    return lemms

# One lemma list per segmented sentence; the cell then shows the first list
# together with the number of sentences.
lemma_sentences = [
    lemmatize_sentence(
        sentence,
        not_include_tokens={"&", "!", "\n", ".", "?"},
        remove_stop_words=True,
    )
    for sentence in sentences
]
lemma_sentences[0], len(lemma_sentences)

(['conflict', 'Israel', 'Palestinians', 'Gaza'], 91)

In [None]:
def tf_calculate(text):
    """Count how often each kept lemma occurs in ``text``.

    The text is segmented with ``sentence_segment`` and every sentence is
    reduced to lemmas with ``lemmatize_sentence`` (stop-word removal on,
    the lemmas ``&``, ``!``, newline, ``.`` and ``?`` excluded).

    Args:
        text: Raw text to analyse.

    Returns:
        A ``defaultdict(int)`` mapping each lemma to its raw occurrence
        count (counts are not normalised).
    """
    excluded = set(["&", "!", "\n", ".", "?"])
    counts = defaultdict(int)
    for sentence in sentence_segment(text):
        kept_lemmas = lemmatize_sentence(
            sentence,
            not_include_tokens=excluded,
            remove_stop_words=True,
        )
        for lemma in kept_lemmas:
            counts[lemma] += 1
    return counts

# Build the lemma counts for the loaded text; len(tf) is the number of distinct lemmas counted.
tf = tf_calculate(text)
len(tf)

396

In [None]:
def scoring(text, tf=None):
    """Score every sentence of ``text`` by the summed counts of its lemmas.

    Args:
        text: Raw text; it is segmented with ``sentence_segment`` and each
            sentence is lemmatized with ``lemmatize_sentence``.
        tf: Optional mapping ``lemma -> count``. When omitted, the counts
            are computed from ``text`` with ``tf_calculate``, so the result
            never depends on a notebook-level variable being defined or
            up to date.

    Returns:
        A list of ``[score, sentence_index]`` pairs, one per sentence, in
        sentence order.
    """
    if tf is None:
        tf = tf_calculate(text)

    scores: List[List[int]] = []
    for i, sent in enumerate(sentence_segment(text)):
        lemmas = lemmatize_sentence(
            sent,
            not_include_tokens=set(["&", "!", "\n", ".", "?"]),
            remove_stop_words=True
        )
        # .get() scores an unseen lemma as 0 without inserting it into the
        # mapping (plain indexing would grow a defaultdict as a side effect).
        scores.append([sum(tf.get(lemma, 0) for lemma in lemmas), i])

    return scores

# Score every sentence of the loaded text and preview the first five
# [score, sentence_index] pairs.
scores = scoring(text)
scores[:5]

[[78, 0], [229, 1], [214, 2], [127, 3], [495, 4]]

In [None]:
def main(text, percentage=0.25):
    """Build an extractive summary of ``text``.

    Each sentence is scored by the summed occurrence counts of its lemmas;
    the top-scoring share of sentences is returned in original order.

    Args:
        text: Raw text to summarise. It is used as given (the function does
            not read any file).
        percentage: Fraction of the sentences to keep, between 0 and 1.
            The number kept is ``int(percentage * number_of_sentences)``,
            i.e. rounded down.

    Returns:
        The selected sentences converted with ``str`` and joined with
        ``'.'``; an empty string when no sentence is selected.

    Raises:
        ValueError: If ``percentage`` is outside ``[0, 1]``.
    """
    if not 0 <= percentage <= 1:
        raise ValueError("percentage must be between 0 and 1")

    # The pipeline is run once here, so the counts and scores always come
    # from this call's `text` and never from notebook-level state.
    sentences = sentence_segment(text)
    not_include_tokens = {"&", "!", "\n", ".", "?"}
    lemma_lists = [
        lemmatize_sentence(
            sent,
            not_include_tokens=not_include_tokens,
            remove_stop_words=True,
        )
        for sent in sentences
    ]

    # Raw occurrence count of every kept lemma.
    tf = defaultdict(int)
    for lemmas in lemma_lists:
        for lemma in lemmas:
            tf[lemma] += 1

    scores = [
        [sum(tf[lemma] for lemma in lemmas), i]
        for i, lemmas in enumerate(lemma_lists)
    ]
    # Highest score first; equal scores are ordered by higher sentence index.
    scores.sort(reverse=True)

    count_take = int(percentage * len(sentences))

    # Restore document order for the selected sentences.
    poss = sorted(pos for _, pos in scores[:count_take])

    return '.'.join(str(sentences[pos]) for pos in poss)


In [None]:
# Run the summarisation on the loaded text and display the resulting string.
summary = main(text)
summary

'Hamas militant group has started a war that \'Israel will win\', defence minister says\n..The ruling Hamas militant group in the Gaza Strip carried out an unprecedented, multi-front attack on Israel at daybreak on Saturday, firing thousands of rockets as dozens of Hamas fighters infiltrated the heavily fortified border in several locations by air, land and sea and catching the country off-guard on a major holiday.\n..Palestinian militant group Hamas launched a large-scale surprise attack against Israel Saturday, firing thousands of rockets from Gaza and sending ground units to kill or abduct people as Israel retaliated with air strikes..Barrages of rockets were fired at Israel from the blockaded Gaza Strip at dawn on Saturday as militants from the Palestinian enclave infiltrated Israel, with at least one person killed, the army and medics said..Israeli PM Benjamin Netanyahu said Israel was at \'war\' with Palestinian militant group Hamas after barrages of rockets were fired from the G