In [None]:
import json
import pickle
import re
import numpy as np

import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def save_to(pickle_file, data):
    """Serialize ``data`` into ``pickle_file`` using :mod:`pickle`.

    Args:
        pickle_file: Path of the file to create (an existing file is overwritten).
        data: Any picklable Python object.
    """
    with open(pickle_file, 'wb') as f:
        # The dump has to live inside the ``with`` body: that is the only
        # place where ``f`` is open, and Python requires an indented body.
        pickle.dump(data, f)

def unpack(file):
    """Read a JSON Lines file and pull out the article texts and summaries.

    Every non-blank line of ``file`` is parsed as one JSON object; its
    ``'text'`` and ``'summary'`` values are collected in file order. Any other
    keys in the record are ignored.

    Args:
        file: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        tuple[list, list]: ``(texts, summaries)``, two lists of equal length.

    Raises:
        KeyError: If a record has no ``'text'`` or ``'summary'`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    all_news_text_X = []
    all_news_summ_Y = []
    with open(file, "r", encoding='utf-8') as f:
        # Stream the file line by line instead of materialising it as a list.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            one_news = json.loads(line)
            all_news_text_X.append(one_news['text'])
            all_news_summ_Y.append(one_news['summary'])
    return all_news_text_X, all_news_summ_Y

def spliting_list(list_, delimiter):
    """Cut a flat sequence into sub-lists at every occurrence of ``delimiter``.

    The delimiter items themselves are dropped. The result always holds one
    more chunk than there are delimiters, so an empty input gives ``[[]]`` and
    leading, trailing or adjacent delimiters give empty chunks.

    Args:
        list_: Iterable of items to split.
        delimiter: Value compared with ``==`` against every item.

    Returns:
        list[list]: The chunks, in original order.
    """
    chunks = []
    current = []
    for item in list_:
        if item != delimiter:
            current.append(item)
            continue
        # Delimiter reached: close the running chunk and start a fresh one.
        chunks.append(current)
        current = []
    # The last chunk is never closed by a delimiter, so flush it here.
    chunks.append(current)
    return chunks

def preprocess(text_str, punctuation_re):
    """Normalize and lemmatize a text into a flat list of lemmas.

    The text is lower-cased and lemmatized with Mystem. Every match of
    ``punctuation_re`` is then deleted from each lemma, and lemmas that end up
    blank or that appear in NLTK's Russian stop-word list are discarded.

    Args:
        text_str: Raw text to process.
        punctuation_re: Compiled regular expression; each match inside a lemma
            is replaced with the empty string.

    Returns:
        list[str]: The surviving lemmas, in their original order.
    """
    mystem = Mystem()
    # A set gives constant-time membership tests; the original list was
    # scanned linearly for every single lemma.
    rus_stopwords = set(stopwords.words("russian"))

    text_lemmas = mystem.lemmatize(text_str.lower())
    # Strip the characters matched by the regex from every lemma.
    cleaned_lemmas = (punctuation_re.sub('', lemma) for lemma in text_lemmas)

    return [
        lemma for lemma in cleaned_lemmas
        if lemma.strip() != '' and lemma not in rus_stopwords
    ]

In [None]:
def tfidf_bag_words_make_vocab(all_sent_list):
    """Fit a TF-IDF model on tokenized sentences and return matrix and vocabulary.

    Args:
        all_sent_list: Iterable of sentences, each an iterable of string
            tokens. The tokens of a sentence are joined with single spaces
            before vectorization.

    Returns:
        tuple: ``(bag_of_words, vocabulary)`` where ``bag_of_words`` is the
        matrix returned by ``TfidfVectorizer.fit_transform`` (one row per
        sentence) and ``vocabulary`` is the list of feature names in column
        order.
    """
    # max_df=0.85: a term found in more than 85% of the sentences is dropped.
    tf_vectorizer = TfidfVectorizer(max_features=50000, max_df=0.85)
    all_sent_str = [" ".join(one_sent_list) for one_sent_list in all_sent_list]
    bag_of_words = tf_vectorizer.fit_transform(all_sent_str)
    # ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    # ``get_feature_names_out``; support both and keep returning a plain list.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        vocabulary = tf_vectorizer.get_feature_names_out().tolist()
    else:
        vocabulary = tf_vectorizer.get_feature_names()
    return bag_of_words, vocabulary

def tfidf_bag_words_with_vocab(all_sent_list, train_vocabulary):
    """Vectorize tokenized sentences with TF-IDF over a fixed vocabulary.

    Args:
        all_sent_list: Iterable of sentences, each an iterable of string
            tokens; the tokens are joined with single spaces first.
        train_vocabulary: Vocabulary handed to ``TfidfVectorizer`` so that the
            columns match the ones of the model it came from.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``, one row per
        sentence.
    """
    joined_sentences = []
    for tokens in all_sent_list:
        joined_sentences.append(" ".join(tokens))

    # NOTE(review): ``fit_transform`` re-estimates the IDF weights from the
    # sentences passed in here; only the column layout is shared with the
    # training model. Confirm that this is intended.
    vectorizer = TfidfVectorizer(vocabulary = train_vocabulary)
    return vectorizer.fit_transform(joined_sentences)

def score_compare_sent(vec_X, vec_Y):
    """Score the similarity of two sentence vectors from the angle between them.

    Both vectors are scaled to unit length, the angle between them is taken
    with ``arccos`` and then mapped linearly so that an angle of 0 gives 1 and
    an angle of pi/2 gives 0. A zero vector is treated as orthogonal to
    everything, which yields a score of 0.

    Args:
        vec_X: 1-D numpy array.
        vec_Y: 1-D numpy array of the same length.

    Returns:
        float: ``1 - angle / (pi / 2)``.
    """
    def _to_unit(vec):
        # A zero vector has no direction; keep it as zeros instead of dividing by 0.
        if np.linalg.norm(vec) != 0:
            return vec / np.linalg.norm(vec)
        return np.zeros(vec.shape)

    cosine = np.dot(_to_unit(vec_X), _to_unit(vec_Y))
    # Clip guards arccos against rounding that pushes the cosine out of [-1, 1].
    angle = np.arccos(np.clip(cosine, -1.0, 1.0))
    # Map [0 = good, pi/2 = bad] onto [1 = good, 0 = bad].
    return 1 - (angle / (np.pi/2))

def score_for_all_sent(tfidf_text_X, tfidf_text_Y):
    """Score every article sentence by its similarity to the article's summary.

    For each text index, each sentence vector of the article is compared with
    every sentence vector of the matching summary via ``score_compare_sent``,
    and the minimum of those scores is recorded for the sentence.

    Args:
        tfidf_text_X: Sequence with one TF-IDF matrix per article (rows are
            sentences); each item must provide ``.toarray()``.
        tfidf_text_Y: Sequence with one TF-IDF matrix per summary, aligned
            with ``tfidf_text_X`` by index.

    Returns:
        numpy.ndarray: One score per article sentence, flattened across all
        texts in order.
    """
    score_all_sent = []
    for text_index in range(len(tfidf_text_X)):
        sentences_X = tfidf_text_X[text_index].toarray()
        sentences_Y = tfidf_text_Y[text_index].toarray()
        for vec_x in sentences_X:
            if len(sentences_Y) == 0:
                # A summary without sentences offers nothing to compare with;
                # np.min of an empty list would raise, so record "no similarity".
                score_all_sent.append(0.0)
                continue
            # NOTE(review): higher scores mean "more similar", yet the minimum
            # over the summary sentences is kept. Confirm whether np.max (the
            # best-matching summary sentence) was intended.
            best_min_score = np.min(
                [score_compare_sent(vec_x, vec_y) for vec_y in sentences_Y]
            )
            score_all_sent.append(best_min_score)
    return np.array(score_all_sent)

In [None]:
def preprocessing_big_text_pack(all_text, punctuation_re, pack_size=1000):
    """Sentence-tokenize and preprocess many texts, working in packs.

    Texts are handled ``pack_size`` at a time. Inside a pack every text is
    split into sentences with ``nltk.sent_tokenize``. All sentences of the
    pack are then joined with a marker word, sent through ``preprocess`` in a
    single call, and the resulting lemma list is cut back into sentences at
    the marker.

    Args:
        all_text: Sequence of raw text strings.
        punctuation_re: Compiled regex forwarded to ``preprocess``.
        pack_size: Number of texts preprocessed per ``preprocess`` call; must
            be positive. Defaults to 1000.

    Returns:
        tuple: ``(sentences, counts)`` where ``sentences`` is a 1-D object
        array with one lemma list per sentence (all texts flattened, in
        order) and ``counts`` is an integer array holding the number of
        sentences in each text.
    """
    # Marker word placed between sentences.
    # NOTE(review): the alignment of ``sentences`` with ``counts`` assumes the
    # lemmatizer returns this token unchanged and that it never occurs in the
    # texts themselves -- confirm.
    delimiter = 'bbreakk'
    joiner = ' ' + delimiter + ' '

    all_preproc_sent_text = []
    count_sent_in_text = []
    for start_ind in range(0, len(all_text), pack_size):
        # Collect the sentences of the whole pack as one flat list. Joining
        # text by text would emit a stray marker for a text with zero
        # sentences and shift every following sentence by one.
        pack_sentences = []
        for text in all_text[start_ind:start_ind + pack_size]:
            sentences = nltk.sent_tokenize(text)
            count_sent_in_text.append(len(sentences))
            pack_sentences.extend(sentences)

        if not pack_sentences:
            # Nothing to preprocess; splitting '' would add a phantom sentence.
            continue

        pack_lemmas = preprocess(joiner.join(pack_sentences), punctuation_re)

        # Cut the flat lemma list back into individual sentences.
        all_preproc_sent_text.extend(spliting_list(pack_lemmas, delimiter))

    # Sentences have different lengths. np.array() on such ragged lists raises
    # on NumPy >= 1.24 and builds a 2-D array when all lengths happen to
    # match, so fill a 1-D object array element by element instead.
    sent_array = np.empty(len(all_preproc_sent_text), dtype=object)
    for sent_index, sent_lemmas in enumerate(all_preproc_sent_text):
        sent_array[sent_index] = sent_lemmas

    return sent_array, np.array(count_sent_in_text, dtype=int)

In [None]:
def split_sent_in_text_one_pack(sent, count_X, count_Y):
    """Regroup one flat run of sentences into per-text groups for X, then Y.

    ``sent`` is expected to hold all X sentences followed by all Y sentences.
    Slices are taken consecutively: first one slice per entry of ``count_X``,
    then, continuing from the same offset, one slice per entry of ``count_Y``.

    Args:
        sent: Sliceable container of sentences (e.g. a list or a matrix with
            one row per sentence).
        count_X: Iterable with the number of sentences in each X text.
        count_Y: Iterable with the number of sentences in each Y text.

    Returns:
        tuple[list, list]: ``(all_text_X, all_text_Y)``, lists of slices of
        ``sent``, one slice per text.
    """
    all_text_X = []
    all_text_Y = []
    already_taken = 0
    # Use the function's own parameters; the original body referred to names
    # that are defined nowhere and failed with NameError.
    for count in count_X:
        all_text_X.append(sent[already_taken:already_taken + count])
        already_taken += count
    # Y sentences start right after the last X sentence, so the offset is kept.
    for count in count_Y:
        all_text_Y.append(sent[already_taken:already_taken + count])
        already_taken += count

    return all_text_X, all_text_Y

def split_sent_in_text_two_pack(sent_X, sent_Y, count_X, count_Y):
    """Regroup two flat runs of sentences into per-text groups.

    ``sent_X`` is sliced consecutively according to ``count_X`` and, starting
    again from offset 0, ``sent_Y`` is sliced according to ``count_Y``.

    Args:
        sent_X: Sliceable container with all X sentences.
        sent_Y: Sliceable container with all Y sentences.
        count_X: Iterable with the number of sentences in each X text.
        count_Y: Iterable with the number of sentences in each Y text.

    Returns:
        tuple[list, list]: ``(all_text_X, all_text_Y)``, lists of slices, one
        slice per text.
    """
    all_text_X = []
    all_text_Y = []
    # Use the function's own parameters; the original body referred to names
    # that are defined nowhere and failed with NameError.
    already_taken = 0
    for count in count_X:
        all_text_X.append(sent_X[already_taken:already_taken + count])
        already_taken += count
    # Y lives in its own container, so its offset restarts at zero.
    already_taken = 0
    for count in count_Y:
        all_text_Y.append(sent_Y[already_taken:already_taken + count])
        already_taken += count

    return all_text_X, all_text_Y

### Main

In [None]:
if __name__ == '__main__':
    # Matches every run of characters that is not a Latin/Cyrillic letter or a
    # digit. 'ё'/'Ё' sit outside the а-я / А-Я code-point ranges, so they are
    # listed explicitly; otherwise they would be stripped out of the lemmas.
    punctuation_re = re.compile('[^a-zA-Zа-яА-ЯёЁ0-9]+')

    # Single place to change where the dataset lives.
    data_dir = 'C:/Users/Acer/PycharmProjects/ML_NLP_course3/gazeta_jsonl'
    all_text_X_train, all_text_Y_train = unpack(data_dir + '/gazeta_train.jsonl')
    all_text_X_val, all_text_Y_val = unpack(data_dir + '/gazeta_val.jsonl')
    all_text_X_test, all_text_Y_test = unpack(data_dir + '/gazeta_test.jsonl')
    # Preprocessing of the test data mirrors the validation data and is not shown.

    all_preproc_sent_X_train, count_sent_in_text_X_train = preprocessing_big_text_pack(all_text_X_train, punctuation_re)
    all_preproc_sent_Y_train, count_sent_in_text_Y_train = preprocessing_big_text_pack(all_text_Y_train, punctuation_re)
    all_preproc_sent_X_val, count_sent_in_text_X_val = preprocessing_big_text_pack(all_text_X_val, punctuation_re)
    all_preproc_sent_Y_val, count_sent_in_text_Y_val = preprocessing_big_text_pack(all_text_Y_val, punctuation_re)

    # Merge the preprocessed training X and Y sentences (X first, then Y) so
    # that a single shared bag of words is built from both.
    all_sent_train = np.concatenate((all_preproc_sent_X_train, all_preproc_sent_Y_train), axis=None)

    tfidf_XY_train, train_vocabulary = tfidf_bag_words_make_vocab(all_sent_train)
    # Validation data is vectorized over the training vocabulary. The original
    # called an undefined ``tfidf_bag_words`` with an undefined ``vocabulary``.
    tfidf_X_val = tfidf_bag_words_with_vocab(all_preproc_sent_X_val, train_vocabulary)
    tfidf_Y_val = tfidf_bag_words_with_vocab(all_preproc_sent_Y_val, train_vocabulary)

    # Training X sentences not grouped by text, for model training. X was
    # concatenated before Y, so they are the first sum(counts) rows.
    tfidf_no_text_X_train = tfidf_XY_train[:np.sum(count_sent_in_text_X_train)]

    # Group the sentences back into texts, and the texts into the X and Y sets.
    tfidf_text_X_train, tfidf_text_Y_train = split_sent_in_text_one_pack(
        tfidf_XY_train, count_sent_in_text_X_train, count_sent_in_text_Y_train
    )
    tfidf_text_X_val, tfidf_text_Y_val = split_sent_in_text_two_pack(
        tfidf_X_val, tfidf_Y_val, count_sent_in_text_X_val, count_sent_in_text_Y_val
    )

    # Compute a score for every article sentence.
    score_Y_train = score_for_all_sent(tfidf_text_X_train, tfidf_text_Y_train)
    score_Y_val = score_for_all_sent(tfidf_text_X_val, tfidf_text_Y_val)

    # Save the required data to .pickle files.
    