In [2]:
import json
import numpy as np
from collections import Counter

import spacy
from sklearn.feature_extraction.text import CountVectorizer

# Load the Russian spaCy model used for lemmatization and stop-word flags.
nlp = spacy.load("ru2_combined_400ks_96")

# Build a rule-based sentence splitter and register it in the pipeline so that
# `doc.sents` becomes available on processed documents.
def_token_sent = nlp.create_pipe("sentencizer")
nlp.add_pipe(def_token_sent)

In [3]:
# Read the training split: a JSON Lines file, i.e. one JSON object per line.
with open('gazeta_jsonl/gazeta_train.jsonl', "r", encoding='utf-8') as f:
    json_list = list(f)

# Only the first `max_news` records are parsed; each record is a dict from
# which everything except the article text and its summary is discarded.
max_news = 2
all_news_text_summ = []
for line in json_list[:max_news]:  # `line`, not `str`: never shadow the builtin
    one_news = json.loads(line)
    for unused_key in ('url', 'title', 'date'):
        # pop() with a default tolerates records that lack one of the fields
        one_news.pop(unused_key, None)
    all_news_text_summ.append(one_news)

In [4]:
# Run the text of the first loaded news item through the spaCy pipeline.
new_text = all_news_text_summ[0]['text']
doc = nlp(new_text)

# One list of lemmas per sentence; stop words and punctuation are dropped.
all_preproc_sent_list = [
    [tok.lemma_ for tok in sentence if not (tok.is_stop or tok.is_punct)]
    for sentence in doc.sents
]

In [5]:
# CountVectorizer consumes strings, so glue every sentence's lemmas back
# together with single spaces before counting.
all_sent_str = [" ".join(lemmas) for lemmas in all_preproc_sent_list]

# Fit the vocabulary on the article's sentences and count word occurrences.
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(all_sent_str)

# print(bag_of_words[:1])   # first sentence
# print(bag_of_words[1:2])  # second sentence (row number is printed as 0 when shown one at a time)

In [6]:
# print(bag_of_words.toarray())  # columns are words, rows are sentences
freq = bag_of_words.toarray().sum(axis=0)  # total count of every vocabulary word
sort_freq_index = np.argsort(freq)  # word indices ordered by ascending frequency

# Keep the 10% most frequent words, but always at least one: with fewer than
# ten vocabulary words the share rounds down to 0, and a `[-0:]` slice would
# select the WHOLE vocabulary instead of nothing.
n_top_words = max(1, len(freq) // 10)
top_words_index = sort_freq_index[-n_top_words:]
# print(freq[top_words_index])  # frequencies of the selected top words

In [7]:
# Score every sentence: (number of top words it contains)^2 / (number of
# distinct vocabulary words it contains).
sent_range_list = []
for sent in bag_of_words.toarray():  # one row of word counts per sentence
    use_words_index = np.nonzero(sent)[0]  # [0]: nonzero() returns a tuple of arrays
    words_count = len(use_words_index)
    top_words_count = len(np.intersect1d(top_words_index, use_words_index))

    # A sentence with no vocabulary words (e.g. only stop words/punctuation)
    # gets the lowest score instead of raising ZeroDivisionError.
    sent_range = (top_words_count ** 2) / words_count if words_count else 0.0
    sent_range_list.append(sent_range)

sort_sent_range = np.argsort(sent_range_list)

# Take the top 10% of sentences, but at least one: for short texts the share
# rounds down to 0 and a `[-0:]` slice would return every sentence.
n_summ_sent = max(1, int(len(sent_range_list) * 0.1))
# np.sort restores the original document order of the chosen sentences.
sent_for_summ_index = np.sort(sort_sent_range[-n_summ_sent:])
print(sent_for_summ_index)

[7 9]


In [8]:
# Collect the sentences of the article whose position was selected for the
# summary (wrap the list in " ".join(...) over sentence texts to get one string).
chosen_positions = set(sent_for_summ_index.tolist())
summarize_text = [
    sentence
    for position, sentence in enumerate(doc.sents)
    if position in chosen_positions
]
print(summarize_text)

[После октябрьских данных Минэкономразвития вынуждено было изменить оценку, настаивать на $70 млрд означало ожидать серьезного замедления оттока капитала на непонятно каких причинах», — говорит главный экономист BNP Paribas Юлия Цепляева. «, Прогнозируемый Минэкономразвития отток капитала — один из самых высоких за последние 20 лет.]


In [9]:
# Summary rating, step 1: gather preprocessed sentences of both summaries.
# The reference ("model") summary shipped with the first news item:
summary_model = all_news_text_summ[0]['summary']
doc_rate = nlp(summary_model)

# First the generated summary: reuse the already lemmatized article sentences
# at the selected positions.
all_summ_preproc_sent_list = [
    lemmas
    for position, lemmas in enumerate(all_preproc_sent_list)
    if (position == sent_for_summ_index).any()
]

# Then the reference summary, preprocessed the same way as the article
# (lemmas only, stop words and punctuation removed).
all_summ_preproc_sent_list.extend(
    [tok.lemma_ for tok in ref_sentence if not (tok.is_stop or tok.is_punct)]
    for ref_sentence in doc_rate.sents
)

In [10]:
# Summary rating, step 2: bag of words over the sentences of both summaries.

# One space-joined string of lemmas per sentence.
all_summ_sent_str = list(map(" ".join, all_summ_preproc_sent_list))

# NOTE: fit_transform refits the existing `vectorizer`, so from here on its
# vocabulary is the summaries' vocabulary rather than the article's.
summ_counts = vectorizer.fit_transform(all_summ_sent_str)
bag_of_words_summ = summ_counts.toarray()

In [16]:
# Summary rating, step 3: angle between the two summaries' centre vectors.

# Each sentence is a vector in word-count space. The first
# len(sent_for_summ_index) rows belong to the generated summary, the remaining
# rows to the reference summary; summing the rows gives each centre vector.
n_gener_sent = len(sent_for_summ_index)
center_gener = np.sum(bag_of_words_summ[:n_gener_sent], axis=0)
center_model = np.sum(bag_of_words_summ[n_gener_sent:], axis=0)

norm_gener = np.linalg.norm(center_gener)
norm_model = np.linalg.norm(center_model)

if norm_gener == 0 or norm_model == 0:
    # A summary without any vocabulary word has a zero centre vector, so the
    # angle is undefined; dividing by the zero norm would silently give NaN.
    # Report the largest angle possible between non-negative count vectors.
    rating = np.pi / 2
else:
    # Angle (in radians) between the normalised centre vectors; clip guards
    # arccos against values that drift slightly outside [-1, 1].
    center_gener_unit = center_gener / norm_gener
    center_model_unit = center_model / norm_model
    rating = np.arccos(np.clip(np.dot(center_gener_unit, center_model_unit), -1.0, 1.0))
print(rating)
# Alternative metric, Euclidean distance between the centres:
# print(np.sqrt(np.sum((center_gener - center_model)**2)))

1.2701436204546728
