In [171]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

from collections import Counter

In [172]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Remix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Remix\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Remix\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Remix\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [173]:
def preprocess_text(text):
    
    sentences = sent_tokenize(text)
    lemmatized_sentences = []
    all_lemmas = set()
    
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        
        lemmas = []
        for word in words:
            if word.isalnum() and word not in stop_words:
                lemmas.append(lemmatizer.lemmatize(word))
                
        lemmatized_sentences.append(lemmas)
        all_lemmas.update(lemmas)
        
    return lemmatized_sentences

In [174]:
with open('text.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    
print(text)

Scientists find the biggest coral ever. It is in the Pacific Ocean. The coral is 34 meters long, bigger than a blue whale.

It may be over 300 years old. Scientists find it during a trip to study the ocean and climate change. They first think that it is a shipwreck. There is a big meeting about climate change right now. Small countries worry because warming oceans kill coral reefs. But this coral is still alive. Scientists want to study it to learn why it does not die. There is hope that not all coral reefs will die.


## Экстрактивные методы суммаризации

#### Алгоритм Луна

In [175]:
def luhn_summarizer(text, summarized_length, threshold_factor=1.5):
    lemmatized_sentences = preprocess_text(text)

    words_combined = [word for sentence in lemmatized_sentences for word in sentence]
    words_freqences = Counter(words_combined)
    
    avg_words_freq = sum(words_freqences.values()) / len(words_freqences)
    keywords = []
    
    for word, count in words_freqences.items():
        if count >= avg_words_freq * threshold_factor:
            keywords.append(word)
    
    sentences = sent_tokenize(text)
    sentence_scores = {}  
    
    for i, sentence in enumerate(sentences):
        
        sentence_lemmas = lemmatized_sentences[i]
        
        start_index = 0
        max_score = 0

        keywords_indices = [i for i, word in enumerate(sentence_lemmas) if word in keywords]
        
        for i in keywords_indices:
            keywords_in_interval = 0
            interval_size = 0
            
            for j in range(start_index, i+1):
                if sentence_lemmas[j] in keywords:
                    keywords_in_interval += 1
                
                interval_size += 1
                
                if interval_size - keywords_in_interval > 4:
                    break
                
            if interval_size >0: # предотвращение ошибки деления на 0
                score = (keywords_in_interval**2) / interval_size
                max_score = max(max_score, score)

            start_index = i + 1
            
            sentence_scores[sentence] = max_score

    sorted_sentences = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)
    summary = " ".join([sentence for sentence, score in sorted_sentences[:summarized_length]])  
        
    return summary  
    

In [183]:
print(luhn_summarizer(text, 2))

Scientists find the biggest coral ever. The coral is 34 meters long, bigger than a blue whale.
