In [57]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

from collections import Counter

In [58]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Remix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Remix\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Remix\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Remix\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [71]:
with open('CNN-dailymail_1.txt', 'r', encoding='utf-8') as file:
    text = file.read()

with open('Reference_summary.txt', 'r', encoding='utf-8') as file:
    reference_summary = file.read()

print(text)
print(reference_summary)

Never mind cats having nine lives. A stray pooch in Washington State has used up at least three of her own after being hit by a car, apparently whacked on the head with a hammer in a misguided mercy killing and then buried in a field -- only to survive. That's according to Washington State University, where the dog -- a friendly white-and-black bully breed mix now named Theia -- has been receiving care at the Veterinary Teaching Hospital. Four days after her apparent death, the dog managed to stagger to a nearby farm, dirt-covered and emaciated, where she was found by a worker who took her to a vet for help. She was taken in by Moses Lake, Washington, resident Sara Mellado. "Considering everything that she's been through, she's incredibly gentle and loving," Mellado said, according to WSU News. "She's a true miracle dog and she deserves a good life." Theia is only one year old but the dog's brush with death did not leave her unscathed. She suffered a dislocated jaw, leg injuries and a 

#### Предобработка текста

In [60]:
def lemmatize_text(text):
    
    sentences = sent_tokenize(text)
    lemmatized_sentences = []
    
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        
        lemmas = []
        for word in words:
            if word.isalnum() and word not in stop_words:
                lemmas.append(lemmatizer.lemmatize(word))
                
        lemmatized_sentences.append(lemmas)
        
    return lemmatized_sentences

#### Метрика ROUGE

In [74]:
from rouge import Rouge

rouge = Rouge()

## Экстрактивные методы суммаризации

#### Эвристический метод. Алгоритм Луна.

In [61]:
def luhn_summarizer(text, summarized_length, threshold_factor=1.5):
    lemmatized_sentences = lemmatize_text(text)

    words_combined = [word for sentence in lemmatized_sentences for word in sentence]
    words_freqences = Counter(words_combined)
    
    avg_words_freq = sum(words_freqences.values()) / len(words_freqences)
    keywords = []
    
    for word, count in words_freqences.items():
        if count >= avg_words_freq * threshold_factor:
            keywords.append(word)
    
    sentences = sent_tokenize(text)
    sentence_scores = {}  
    
    for i, sentence in enumerate(sentences):
        
        sentence_lemmas = lemmatized_sentences[i]
        
        start_index = 0
        max_score = 0
        score = 0
        keywords_indices = [i for i, word in enumerate(sentence_lemmas) if word in keywords]
        
        for i in keywords_indices:
            keywords_in_interval = 0
            interval_size = 0
            
            for j in range(start_index, i+1):
                if sentence_lemmas[j] in keywords:
                    keywords_in_interval += 1
                
                interval_size += 1
                
                if interval_size - keywords_in_interval > 4:
                    break
                
            if interval_size > 0:
                score = (keywords_in_interval**2) / interval_size
                max_score = max(max_score, score)

            start_index = i + 1
            
            sentence_scores[sentence] = max_score

    sorted_sentences = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)
    summary = " ".join([sentence for sentence, score in sorted_sentences[:summarized_length]])  

    return summary 
    

In [76]:
luhn_summary = luhn_summarizer(text, 2)
luhn_rouge = rouge.get_scores(luhn_summary, reference_summary, avg=True)

print(luhn_summary)
print(luhn_rouge)

Never mind cats having nine lives. A stray pooch in Washington State has used up at least three of her own after being hit by a car, apparently whacked on the head with a hammer in a misguided mercy killing and then buried in a field -- only to survive.
{'rouge-1': {'r': 0.35294117647058826, 'p': 0.26666666666666666, 'f': 0.30379746345137}, 'rouge-2': {'r': 0.1951219512195122, 'p': 0.16666666666666666, 'f': 0.17977527592980694}, 'rouge-l': {'r': 0.3235294117647059, 'p': 0.24444444444444444, 'f': 0.2784810077551675}}


#### Графовый метод. TextRank.

In [63]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx

In [64]:
def textrank_summarizer(text, summarization_length, damping_factor = 0.85, max_iterations=100, threshold=1e-5):
    lemmatized_sentences = lemmatize_text(text)

    vectorizer = TfidfVectorizer() 
    joined_lemmas = [' '.join(sentence) for sentence in lemmatized_sentences]
    
    tfidf_matrix = vectorizer.fit_transform(joined_lemmas)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    
    graph = nx.from_numpy_array(similarity_matrix)
    pagerank = nx.pagerank(graph, alpha=damping_factor, max_iter=max_iterations, tol=threshold)

    sentences = sent_tokenize(text)
    
    sorted_sentences = sorted(range(len(sentences)), key=lambda i: pagerank[i], reverse=True)
    summary = ' '.join([sentences[i] for i in sorted_sentences[:summarization_length]])

    return summary

In [77]:
text_rank_summary = textrank_summarizer(text,2)

text_rank_rouge = rouge.get_scores(text_rank_summary, reference_summary, avg=True)

print(text_rank_summary)
print(text_rank_rouge)

The veterinary hospital's Good Samaritan Fund committee awarded some money to help pay for the dog's treatment, but Mellado has set up a fundraising page to help meet the remaining cost of the dog's care. That's according to Washington State University, where the dog -- a friendly white-and-black bully breed mix now named Theia -- has been receiving care at the Veterinary Teaching Hospital.
{'rouge-1': {'r': 0.17647058823529413, 'p': 0.11538461538461539, 'f': 0.13953487893996774}, 'rouge-2': {'r': 0.024390243902439025, 'p': 0.01639344262295082, 'f': 0.01960783832948984}, 'rouge-l': {'r': 0.17647058823529413, 'p': 0.11538461538461539, 'f': 0.13953487893996774}}


## Абстрактивная суммаризация

In [78]:
from transformers import pipeline

t5_base_abst_summarizer = pipeline("summarization", model="t5-base", device='cuda')
facebook_bart_abst_summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device='cuda')
distilbart_summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device='cuda')


: 

In [None]:
t5_base_summary = t5_base_abst_summarizer(text, min_length=30, max_length=100)
t5_base_rouge = rouge.get_scores(t5_base_summary, reference_summary, avg=True)

print(t5_base_summary)
print(t5_base_rouge)

[{'summary_text': 'a stray pooch in washington state has used up at least three of her own after being hit by a car . the dog staggers to a nearby farm, dirt-covered and emaciated, where she is found . "she\'s a true miracle dog and she deserves a good life," a resident says .'}]

In [None]:
facebook_bart_summary = facebook_bart_abst_summarizer(text, min_length=30, max_length=100)
facebook_bart_rouge = rouge.get_scores(facebook_bart_summary , reference_summary, avg=True)

print(facebook_bart_summary)
print(facebook_bart_rouge)

[{'summary_text': 'Theia, a one-year-old bully breed mix, was hit by a car and buried in a field. She managed to stagger to a nearby farm, dirt-covered and emaciated. She suffered a dislocated jaw, leg injuries and a caved-in sinus cavity. A fundraising page has raised more than $10,000 for her care.'}]

In [None]:
distilbart_summary = distilbart_summarizer(text, min_length=30, max_length=100)
distilbart_rouge = rouge.get_scores(distilbart_summary, reference_summary, avg=True)

print(distilbart_summary)
print(distilbart_rouge)

[{'summary_text': ' Theia is a one-year-old bully breed mix who was hit by a car and buried in a field . The dog was found by a worker at a nearby farm, dirt-covered and emaciated, days after her apparent death . She suffered a dislocated jaw, leg injuries and a caved-in sinus cavity . A fundraising page has been set up to help pay for her treatment .'}]