In [7]:
# MAIN
from string import punctuation

from method_Luhn import choose_most_freq_words, count_range_sentences, choose_sent_for_summ
from rating import count_rating
from with_mystem.preprocessing_bag_words import unpack, preprocess, bag_words_vector

# PREPROCESS
import json

from nltk.corpus import stopwords
import nltk
from pymystem3 import Mystem
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [26]:
def unpack(file):
    with open(file, "r", encoding='utf-8') as f:
        json_list = list(f)

    all_news_text_summ = []
    for i, str in enumerate(json_list):
        if i > 1:
            break
        one_news = json.loads(str)
        del one_news['url']
        del one_news['title']
        del one_news['date']
        all_news_text_summ.append(one_news)
    return all_news_text_summ


def preprocess(text, punctuation_pack):
    mystem = Mystem()
    rus_stopwords = stopwords.words("russian")
    punctuation_pack = punctuation_pack + '—' + '«' + '»' + '», —'

    text_sent_with_stop = ' # '.join(nltk.sent_tokenize(text))  # between sentences not ., but #
    print(text_sent_with_stop)
    all_sent_token_lemmas = mystem.lemmatize(text_sent_with_stop.lower())
    all_sent_token_lemmas = [lemma for lemma in all_sent_token_lemmas if lemma != ' ' \
                             and (lemma.strip() not in punctuation_pack or lemma.strip() == '#') \
                             and lemma not in rus_stopwords]

    sent_token_lemmas = spliting_list(all_sent_token_lemmas, '#')
    return sent_token_lemmas, nltk.sent_tokenize(text)

In [14]:
def spliting_list(list, delimiter):
    splitted = [[]]
    for lemma in list:
        if lemma == delimiter:
            splitted.append([])
        else:
            splitted[-1].append(lemma)
    return splitted


def bag_words_vector(all_sent_list):
    vectorizer = CountVectorizer()
    
    all_sent_str = [" ".join(one_sent_list) for one_sent_list in all_sent_list]
    bag_of_words = vectorizer.fit_transform(all_sent_str)
    return bag_of_words

def tfidf_bag_words(all_sent_list):
    tf_vectorizer = TfidfVectorizer()
    
    all_sent_str = [" ".join(one_sent_list) for one_sent_list in all_sent_list]
    bag_of_words = tf_vectorizer.fit_transform(all_sent_str)
    return bag_of_words

In [27]:
# MAIN

all_news_text_summ = unpack('C:/Users/Acer/PycharmProjects/ML_NLP_course3/gazeta_jsonl/gazeta_train.jsonl')
text = all_news_text_summ[0]['text']
text_sent_token_lemmas, text_sent_token = preprocess(text, punctuation)

«По итогам 2011 года чистый отток может составить примерно $80 млрд, в следующем году — около $20 млрд. # При этом мы ожидаем, что со второго полугодия 2012 года начнется приток капитала», — заявил «Интерфаксу» замминистра экономического развития Андрей Клепач. # Официальные прогнозы по выводу капитала из России становятся все пессимистичными: еще летом власти полагали, что из страны уйдет не более $35 млрд, в сентябре Минэкономразвития назвал цифру $50 млрд, в начале ноября Центробанк пересмотрел оценку до $70 млрд. # Очередное изменение прогноза было ожидаемо: по расчетам Центробанка , за январь — октябрь чистый отток капитала достиг $64 млрд, причем в последние месяцы он ускорился: в сентябре он составил $14 млрд, в октябре — $13 млрд против среднего ежемесячного оттока в $6—8 млрд в первом полугодии. # «После октябрьских данных Минэкономразвития вынуждено было изменить оценку, настаивать на $70 млрд означало ожидать серьезного замедления оттока капитала на непонятно каких причинах»

bag_of_words = bag_words_vector(text_sent_token_lemmas)
tf_bag_of_words = tfidf_bag_words(text_sent_token_lemmas)

In [17]:
# METHOD LUHN
import numpy as np


def choose_most_freq_words(bag_of_words):
    freq = bag_of_words.toarray().sum(axis=0)  # count frequency every word
    sort_freq_index = np.argsort(freq)
    top_words_index = sort_freq_index[-int(len(freq) / 10):]  # 10% most freq words
    # print(freq[sort_freq_index[-int(len(freq)/10):]])
    return top_words_index


def count_range_sentences(bag_of_words, top_words_index):
    sent_range_list = []
    for i, sent in enumerate(bag_of_words.toarray()):  # take word-index string
        use_words_index = np.nonzero(sent)[0]  # [0], because take array from info-tuple
        words_count = len(use_words_index)
        top_words_count = len(np.intersect1d(top_words_index, use_words_index))

        sent_range = (top_words_count ** 2) / words_count
        sent_range_list.append(sent_range)
    return sent_range_list


def choose_sent_for_summ(sent_range_list, text_sent_token):
    sort_sent_range = np.argsort(sent_range_list)
    sent_for_summ_index = np.sort(
        sort_sent_range[-int(len(sent_range_list) * 0.1):])  # 10% most important sentences in right order

    # summarize_text = " ".join([sent for i, sent in enumerate(doc.sents) if any(i==sent_for_summ_index)])
    total_summarize = [sent for i, sent in enumerate(text_sent_token) if any(i == sent_for_summ_index)]
    # print(summarize)
    return sent_for_summ_index, total_summarize

In [18]:
# MAIN

top_words_index = choose_most_freq_words(bag_of_words)
sent_range_list = count_range_sentences(bag_of_words, top_words_index)
sent_for_summ_index, total_summarize = choose_sent_for_summ(sent_range_list, text_sent_token)

top_words_index_tf = choose_most_freq_words(tf_bag_of_words)
sent_range_list_tf = count_range_sentences(tf_bag_of_words, top_words_index_tf)
sent_for_summ_index_tf, total_summarize_tf = choose_sent_for_summ(sent_range_list_tf, text_sent_token)

In [19]:
# MAIN

# compare generated with model summarize (count rating)
all_summ_sent_lemmas, nlp_summ_text = preprocess(all_news_text_summ[0]['summary'], punctuation)
count_model_summ_sent = len(all_summ_sent_lemmas)
# add generated summary, to make bag
for i, sent_list in enumerate(text_sent_token_lemmas):
    if any(i == sent_for_summ_index):
        all_summ_sent_lemmas.append(sent_list)

bag_of_words_summ = bag_words_vector(all_summ_sent_lemmas)
bag_of_words_summ_tf = tfidf_bag_words(all_summ_sent_lemmas)

In [22]:
def count_rating(bag_of_words_summ, count_model_summ_sent):
    # search center vector (vectors = sentence, n-space = number of words in bag)
    bag_of_words_summ = bag_of_words_summ.toarray()
    center_gener = np.sum(bag_of_words_summ[count_model_summ_sent:], axis=0)
    center_model = np.sum(bag_of_words_summ[:count_model_summ_sent], axis=0)
    print(center_gener, center_model)

    # count angle between center vectors
    center_gener_unit = center_gener / np.linalg.norm(center_gener)
    center_model_unit = center_model / np.linalg.norm(center_model)
    rating = np.arccos(np.clip(np.dot(center_gener_unit, center_model_unit), -1.0, 1.0))
    return rating

In [25]:
# MAIN

score = count_rating(bag_of_words_summ, count_model_summ_sent)
score_tf = count_rating(bag_of_words_summ_tf, count_model_summ_sent)
# print(score, score_tf) # почему-то для tf угол ближе к 90*, что плохо

[1 0 1 0 1 1 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 0 1 2 0 0 0 0 2 1 0 1 1 0
 1 1 1 2 1 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 1 1 1 0] [0 1 1 1 0 0 1 1 1 1 0 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 2 1 1 1 1 1 2 1 0 0 1
 0 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1 1 0 0 0 1]
[0.378425   0.         0.17837068 0.         0.21752142 0.21752142
 0.         0.         0.         0.         0.21752142 0.
 0.378425   0.21752142 0.21752142 0.31031392 0.         0.21752142
 0.         0.         0.         0.         0.         0.21752142
 0.         0.21752142 0.35355067 0.         0.         0.
 0.         0.41258111 0.15059278 0.         0.21752142 0.21752142
 0.         0.21752142 0.21752142 0.21752142 0.41258111 0.17837068
 0.         0.         0.         0.         0.378425   0.
 0.         0.21752142 0.         0.378425   0.         0.
 0.378425   0.21752142 0.         0.         0.         0.
 0.         0.         0.21752142 0.21752142 0.21752142 0.        ] [0.         0.38833805 0.2492439