In [1]:
import json
import pickle
# from string import punctuation
import re
import numpy as np

import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm

In [2]:
def unpack(file):
    with open(file, "r", encoding='utf-8') as f:
        json_list = list(f)

    all_news_text_X = []
    all_news_summ_Y = []
    for i, str in enumerate(json_list):
        one_news = json.loads(str)
        del one_news['url']
        del one_news['title']
        del one_news['date']
        all_news_text_X.append(one_news['text'])
        all_news_summ_Y.append(one_news['summary'])
    return all_news_text_X, all_news_summ_Y

In [3]:
def spliting_list(list, delimiter):
    splitted = [[]]
    for lemma in list:
        if lemma == delimiter:
            splitted.append([])
        else:
            splitted[-1].append(lemma)
    return splitted

In [4]:
def preprocess(text_str, punctuation_re):
    mystem = Mystem()
    rus_stopwords = stopwords.words("russian")

    text_lemmas = mystem.lemmatize(text_str.lower())
    text_lemmas = [punctuation_re.sub('', lemma) for lemma in text_lemmas ] # удаляю все кроме букв и цифр
    text_lemmas = [lemma for lemma in text_lemmas if lemma.strip() != '' and lemma not in rus_stopwords]

    return text_lemmas

In [2]:
def tfidf_bag_words(all_sent_list, train_vocabulary):
    tf_vectorizer = TfidfVectorizer(vocabulary = train_vocabulary)

    all_sent_str = [" ".join(one_sent_list) for one_sent_list in all_sent_list]  # список из предложений=строк
    bag_of_words = tf_vectorizer.fit_transform(all_sent_str)
    return bag_of_words

In [6]:
def score(vec_X, vec_Y):
    # count angle between center vectors
    vec_X_unit = (vec_X / np.linalg.norm(vec_X)) if np.linalg.norm(vec_X) != 0 else np.zeros(vec_X.shape)
    vec_Y_unit = (vec_Y / np.linalg.norm(vec_Y)) if np.linalg.norm(vec_Y) != 0 else np.zeros(vec_Y.shape)
    angles = np.arccos(np.clip(np.dot(vec_X_unit, vec_Y_unit), -1.0, 1.0))
    # привожу [0 - хорошо, pi/2 - плохо] к [0 - плохо, 1 - хорошо]
    score_01 = 1 - (angles / (np.pi/2))
    return score_01

##MAIN

In [7]:
all_news_text_X, all_news_summ_Y = unpack(
        'C:/Users/Acer/PycharmProjects/ML_NLP_course3/gazeta_jsonl/gazeta_test.jsonl')
# my_punctuation = punctuation + '–' + '—' + '«' + '»' + '», —' + '%, ' + '» — ' + ': «' + '» (' + '), ' + ', «'
punctuation_re = re.compile('[^a-zA-Zа-яА-Я0-9]+')

In [8]:
all_preproc_sent_text_X = []
count_sent_in_text_X = []
pack_size = 1000
start_ind = 0  # to count start new 1000 text pack
while start_ind < len(all_news_text_X):
#     # временное ограничение!!!!
#     if start_ind >= 6:
#         break
    
    # join and preprocessing
    pack_list_for_preproc = []
    end_ind = start_ind + pack_size
    if end_ind > len(all_news_text_X):
        end_ind = len(all_news_text_X)
        
    for i in range(start_ind, end_ind):
        sentences = nltk.sent_tokenize(all_news_text_X[i])
        count_sent_in_text_X.append(len(sentences))
        pack_list_for_preproc.append(' bbreakk '.join(sentences))  # between sentences 'bbreakk'
    pack_str_for_preproc = ' bbreakk '.join(pack_list_for_preproc)
    pack_lemmas = preprocess(pack_str_for_preproc, punctuation_re)

    # split
    preproc_sentences = spliting_list(pack_lemmas, 'bbreakk') # список из предложений=строк
    all_preproc_sent_text_X.extend(preproc_sentences)
    
    start_ind += pack_size
    print(end_ind)
    
    if sum(count_sent_in_text_X) != len(all_preproc_sent_text_X):
        print('error')
        break
    
count_sent_in_text_X = np.array(count_sent_in_text_X)    
all_preproc_sent_text_X = np.array(all_preproc_sent_text_X)

1000
2000
3000
4000
5000
5770


  all_preproc_sent_text_X = np.array(all_preproc_sent_text_X)


In [9]:
with open('all_preproc_sent_X_test.pickle', 'wb') as f:
    pickle.dump(all_preproc_sent_text_X, f)
with open('count_sent_in_text_X_test.pickle', 'wb') as f:
    pickle.dump(count_sent_in_text_X, f)

In [10]:
all_preproc_sent_summ_Y = []
count_sent_in_summ_Y = []
pack_size = 1000
start_ind = 0  # to count start new 1000 text pack
while start_ind < len(all_news_summ_Y):
    
    # join and preprocessing
    pack_list_for_preproc = []
    end_ind = start_ind + pack_size
    if end_ind > len(all_news_summ_Y):
        end_ind = len(all_news_summ_Y)
        
    for i in range(start_ind, end_ind):
        sentences = nltk.sent_tokenize(all_news_summ_Y[i])
        count_sent_in_summ_Y.append(len(sentences))
        pack_list_for_preproc.append(' bbreakk '.join(sentences))  # between sentences '<break>'
    pack_str_for_preproc = ' bbreakk '.join(pack_list_for_preproc)
    pack_lemmas = preprocess(pack_str_for_preproc, punctuation_re)

    # split
    preproc_sentences = spliting_list(pack_lemmas, 'bbreakk') # список из предложений=строк
    all_preproc_sent_summ_Y.extend(preproc_sentences)
    
    start_ind += pack_size
    print(end_ind)
    
    if sum(count_sent_in_summ_Y) != len(all_preproc_sent_summ_Y):
        print('error')
        break
    
count_sent_in_summ_Y = np.array(count_sent_in_summ_Y)    
all_preproc_sent_summ_Y = np.array(all_preproc_sent_summ_Y)

1000
2000
3000
4000
5000
5770


  all_preproc_sent_summ_Y = np.array(all_preproc_sent_summ_Y)


In [11]:
with open('all_preproc_sent_Y_test.pickle', 'wb') as f:
    pickle.dump(all_preproc_sent_summ_Y, f)
with open('count_sent_in_summ_Y_test.pickle', 'wb') as f:
    pickle.dump(count_sent_in_summ_Y, f)

In [12]:
# разложение предложений по мешку слов train

with open('Vocabulary.pickle', 'rb') as f:
    vocabulary = pickle.load(f)

In [13]:
tfidf_X_val = tfidf_bag_words(all_preproc_sent_text_X, vocabulary)
tfidf_Y_val = tfidf_bag_words(all_preproc_sent_summ_Y, vocabulary)
# print(sum(tfidf_X_val[2]), len(tfidf_X_val[2]))

In [14]:
# разделить предложения по текстам
tfidf_X_val_texts = []
tfidf_Y_val_texts = []
already_taked = 0
for count in count_sent_in_text_X:
    tfidf_X_val_texts.append(tfidf_X_val[already_taked:already_taked + count])
    already_taked += count
already_taked = 0
for count in count_sent_in_summ_Y:
    tfidf_Y_val_texts.append(tfidf_Y_val[already_taked:already_taked + count])
    already_taked += count

In [15]:
'''
посчитать score для каждого предложения
сравнить каждое предложение с каждым предложением из аннотации, выбрать минимальный score
'''
score_all_sent = []
for text_index in tqdm(range(len(tfidf_X_val_texts))):
    sentences_X = tfidf_X_val_texts[text_index].toarray()
    sentences_Y = tfidf_Y_val_texts[text_index].toarray()
    for vec_x in sentences_X:
        best_min_score = np.min([score(vec_x, vec_y) for vec_y in sentences_Y])
        score_all_sent.append(best_min_score)
score_all_sent = np.array(score_all_sent)

  0%|          | 0/5770 [00:00<?, ?it/s]

In [16]:
print(len(np.array(score_all_sent)))
print(sum(np.array(score_all_sent) != 0))

217689
35637


In [17]:
with open('X_out_test.pickle', 'wb') as f:
    # отправляем список предложений набора Х, не разделенных на тексты, для модели
    # !!! не toarray(), а sparce matrix
    pickle.dump(tfidf_X_val, f) 
with open('Y_out_test.pickle', 'wb') as f:
    pickle.dump(score_all_sent, f)