In [None]:
import pickle

import numpy as np
import xgboost as xgb
from sklearn.cluster import KMeans
from tqdm.auto import tqdm

In [None]:
def load_from(pickle_file):
    """Read and return the object stored in the pickle file ``pickle_file``.

    The file is opened in binary mode and closed again before returning.

    Security note: unpickling can execute arbitrary code, so only load
    files that come from a trusted source.
    """
    with open(pickle_file, 'rb') as stream:
        return pickle.load(stream)

In [None]:
# Luhn's method: find the most frequent words
def choose_most_freq_words(bag_of_words):
    """Return the column indices of the most frequent words of one text.

    Parameters
    ----------
    bag_of_words : 2-D array
        One row per sentence, one column per word; must support
        ``.sum(axis=0)``.

    Returns
    -------
    numpy.ndarray
        Indices of the top 10% of words by summed weight, ordered from the
        least to the most frequent of them.  At least one index is returned
        whenever there is at least one word.
    """
    # Flatten so that a 1 x V np.matrix (what summing a sparse matrix gives)
    # behaves like the 1-D vector obtained from a dense array.
    freq = np.asarray(bag_of_words.sum(axis=0)).ravel()
    sort_freq_index = np.argsort(freq)

    # ``seq[-0:]`` is the whole sequence, so with fewer than ten words the
    # old slice marked *every* word as frequent.  Keep at least one instead.
    top_count = max(1, len(freq) // 10)
    return sort_freq_index[-top_count:]

# Luhn's method: compute the rank (score) of every sentence
def count_range_sentences(bag_of_words, top_words_index):
    """Score each sentence by the frequent words it contains.

    The score of a sentence is::

        (number of frequent words in the sentence) ** 2
        -----------------------------------------------
             (number of words used in the sentence)

    Parameters
    ----------
    bag_of_words : 2-D array
        One row per sentence; a non-zero entry means the word is used.
    top_words_index : array-like of int
        Column indices of the frequent words.

    Returns
    -------
    list of float
        One score per sentence, in row order.  A sentence that uses no
        words scores 0.0.
    """
    sent_range_list = []
    for sent in bag_of_words:
        # np.nonzero returns a *tuple* of index arrays, so len() of it was
        # always 1 and the score was never normalised by sentence length.
        # flatnonzero gives the flat index array itself.
        use_words_index = np.flatnonzero(sent)
        words_count = len(use_words_index)
        if words_count == 0:
            # Empty sentence: avoid 0 / 0.
            sent_range_list.append(0.0)
            continue

        top_words_count = len(np.intersect1d(top_words_index, use_words_index))
        sent_range_list.append((top_words_count ** 2) / words_count)
    return sent_range_list

# Luhn's method: choose the sentences for the summary
def choose_sent_for_summ(sent_range_list, y_sent_count):
    """Pick the ``y_sent_count`` best-scored sentences.

    Parameters
    ----------
    sent_range_list : sequence of float
        Score of every sentence, in text order.
    y_sent_count : int
        How many sentences the summary should contain.

    Returns
    -------
    numpy.ndarray
        Indices of the selected sentences in ascending (text) order.
        Empty when ``y_sent_count`` is zero or negative; all sentences when
        it exceeds their number.
    """
    sort_sent_range = np.argsort(sent_range_list)
    if y_sent_count <= 0:
        # ``seq[-0:]`` would return every sentence instead of none.
        return sort_sent_range[:0]
    # Highest scores sit at the end of the argsort result; re-sort the
    # chosen indices so the summary keeps the original sentence order.
    return np.sort(sort_sent_range[-y_sent_count:])

# Luhn's method
def method_Luhn(tfidf_texts_X, count_sent_in_text_Y):
    """Select summary sentences for every text with Luhn's method.

    Parameters
    ----------
    tfidf_texts_X : iterable
        One matrix per text; each element must provide ``.toarray()``
        returning a 2-D array with one row per sentence.
    count_sent_in_text_Y : indexable of int
        ``count_sent_in_text_Y[i]`` is the number of sentences to select
        for the i-th text.

    Returns
    -------
    list of numpy.ndarray
        For each text, the indices of the selected sentences.
    """
    index_sent_for_summ = []

    # Wrap the iterable itself rather than enumerate(): tqdm can then read
    # its length and display a total / ETA.
    for i, tfidf_text in enumerate(tqdm(tfidf_texts_X)):
        dense_text = tfidf_text.toarray()
        top_words_index = choose_most_freq_words(dense_text)
        sent_range_list = count_range_sentences(dense_text, top_words_index)
        sent_for_summ_index = choose_sent_for_summ(sent_range_list, count_sent_in_text_Y[i])
        index_sent_for_summ.append(sent_for_summ_index)
    return index_sent_for_summ

In [None]:
# Classification: train the model with validation
def tree_model_training(X_train, Y_train, X_val, Y_val):
    """Train an XGBoost binary classifier with early stopping.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels, passed to ``xgb.DMatrix``.
    X_val, Y_val
        Validation features and labels; used only as the evaluation set
        that drives early stopping.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    xgb_params = {
        'eta': 0.2,
        'max_depth': 8,
        'subsample': 0.6,
        'colsample_bytree': 0.7,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
    }
    num_rounds = 200  # upper bound; early stopping may end training sooner

    d_train = xgb.DMatrix(X_train, label=Y_train)
    d_val = xgb.DMatrix(X_val, label=Y_val)
    evallist = [(d_val, 'val')]

    # Everything after the round count is passed by keyword: newer XGBoost
    # releases deprecate passing ``evals`` positionally.
    model_xgb = xgb.train(
        xgb_params,
        d_train,
        num_boost_round=num_rounds,
        evals=evallist,
        early_stopping_rounds=10,
    )
    return model_xgb

# Classification: get the model predictions and choose sentences for the summary
def tree_predict(model, X_test, count_sent_Y_test, count_sent_X_test=None):
    """Select summary sentences per text from sentence-level predictions.

    Parameters
    ----------
    model
        Trained booster exposing ``predict`` and ``best_iteration``.
    X_test
        Features of all test sentences, passed to ``xgb.DMatrix``.  The
        prediction vector is cut into consecutive per-text slices.
    count_sent_Y_test : indexable of int
        Number of sentences to select for each text.
    count_sent_X_test : sequence of int, optional
        Number of sentences in each text (the slice lengths).  When omitted,
        a notebook-level variable named ``count_sent_X_test`` is used, which
        is what this function read before the parameter existed.

    Returns
    -------
    list of numpy.ndarray
        For each text, indices (relative to the text) of the selected
        sentences, ordered from the highest prediction downwards.

    Raises
    ------
    ValueError
        If the sentence counts are unavailable or do not add up to the
        number of predictions.
    """
    if count_sent_X_test is None:
        # Backward compatibility with callers that rely on the global.
        count_sent_X_test = globals().get('count_sent_X_test')
        if count_sent_X_test is None:
            raise ValueError(
                'count_sent_X_test (number of sentences in each test text) is required'
            )

    d_test = xgb.DMatrix(X_test)
    # iteration_range is half-open, so "+ 1" is needed to include the best
    # round itself (and (0, 0) would silently mean "use all rounds").
    Y_predict = model.predict(d_test, iteration_range=(0, model.best_iteration + 1))

    total_sentences = int(np.sum(count_sent_X_test))
    if total_sentences != len(Y_predict):
        raise ValueError(
            'count_sent_X_test sums to %d sentences but %d predictions were produced'
            % (total_sentences, len(Y_predict))
        )

    index_sent_for_summ = []
    new_text_start = 0
    for text_ind in tqdm(range(len(count_sent_X_test))):
        text_end = new_text_start + count_sent_X_test[text_ind]
        predict_for_one_text = Y_predict[new_text_start:text_end]
        # Sentences of this text ordered by descending prediction.
        sort_index = np.argsort(predict_for_one_text)[::-1]
        index_sent_for_summ.append(sort_index[:count_sent_Y_test[text_ind]])
        new_text_start = text_end
    return index_sent_for_summ

In [None]:
# Clustering: choose sentences for the summary
def clustering(tfidf_texts_X, count_sent_in_text_Y, random_state=0):
    """Select summary sentences as the ones nearest to KMeans centroids.

    For every text, its sentences are clustered into as many clusters as
    the summary should have sentences, and for each cluster the sentence
    closest to the centroid is taken.

    Parameters
    ----------
    tfidf_texts_X : iterable
        One 2-D matrix per text, one row per sentence.
    count_sent_in_text_Y : indexable of int
        Number of sentences to select for each text.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans`` so repeated runs give the same
        selection.  Pass ``None`` for unseeded behaviour.

    Returns
    -------
    list of numpy.ndarray
        For each text, one sentence index per cluster (not sorted; the same
        index may appear twice if it is nearest to two centroids).  Empty
        when no sentences are requested or the text has none.
    """
    index_sent_for_summ = []
    # Wrap the iterable itself so tqdm can read its length.
    for i, text in enumerate(tqdm(tfidf_texts_X)):
        sentence_count = np.shape(text)[0]
        # KMeans rejects more clusters than samples, so never ask for more
        # clusters than the text has sentences.
        cluster_count = min(int(count_sent_in_text_Y[i]), sentence_count)
        if cluster_count <= 0:
            index_sent_for_summ.append(np.empty(0, dtype=np.intp))
            continue

        model_km = KMeans(n_clusters=cluster_count, random_state=random_state)
        # Rows are sentences, columns are distances to each centroid.
        distance_to_cluster = model_km.fit_transform(text)

        # Column-wise argmin: the closest sentence for every centroid.
        sent_ind_close_centroids = np.argmin(distance_to_cluster, axis=0)
        index_sent_for_summ.append(sent_ind_close_centroids)
    return index_sent_for_summ

### Main

In [None]:
if __name__ == '__main__':
    # NOTE(review): every load reads the same path 'file_name.pickle' —
    # presumably a placeholder; point each call at its own file.
    tfidf_texts_X_test = load_from('file_name.pickle')
    count_sent_in_text_Y_test = load_from('file_name.pickle')
    tfidf_sents_X_train = load_from('file_name.pickle')
    tfidf_sents_Y_train = load_from('file_name.pickle')
    tfidf_sents_X_val = load_from('file_name.pickle')
    tfidf_sents_Y_val = load_from('file_name.pickle')
    tfidf_sents_X_test = load_from('file_name.pickle')

    # Number of sentences in each test text.  tree_predict looks this
    # notebook-level name up to cut the flat prediction vector per text.
    # Assumes tfidf_sents_X_test stacks the rows of tfidf_texts_X_test in
    # the same order — TODO confirm.
    count_sent_X_test = [text.shape[0] for text in tfidf_texts_X_test]

    # Sentence indices for the summary using Luhn's method
    Luhn_index_sent_for_summ = method_Luhn(tfidf_texts_X_test, count_sent_in_text_Y_test)

    # Train the classifier (functions are defined in lower case; the
    # capitalised names used previously did not exist)
    classif_model = tree_model_training(
        tfidf_sents_X_train, tfidf_sents_Y_train, tfidf_sents_X_val, tfidf_sents_Y_val
    )
    # Sentence indices for the summary using classification
    Tree_index_sent_for_summ = tree_predict(
        classif_model, tfidf_sents_X_test, count_sent_in_text_Y_test
    )

    # Sentence indices for the summary using clustering
    Cluster_index_sent_for_summ = clustering(tfidf_texts_X_test, count_sent_in_text_Y_test)

    # save the results of the methods