In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def initialize_tfidf(file_path, max_features=50000, stop_words='english'):
    """Fit a TF-IDF model on a text file, one document per non-blank line.

    Args:
        file_path: Path to a UTF-8 encoded text file.
        max_features: Vocabulary size cap forwarded to ``TfidfVectorizer``.
        stop_words: Stop-word setting forwarded to ``TfidfVectorizer``.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` tuple: the fitted vectorizer and the
        matrix returned by ``fit_transform``. Row ``i`` of the matrix belongs
        to the ``i``-th non-blank line of the file.

    Raises:
        ValueError: If the file contains no non-blank lines.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Every list element is treated as a document, so blank lines would be
        # counted as empty documents: they inflate the document count that IDF
        # is derived from and only add all-zero rows to the matrix.
        documents = [line for line in f if line.strip()]

    if not documents:
        raise ValueError(
            f'No non-blank lines found in {file_path!r}; cannot fit TF-IDF.'
        )

    vectorizer = TfidfVectorizer(max_features=max_features, stop_words=stop_words)

    # Learn the vocabulary/IDF and vectorize the corpus in one pass.
    tfidf_matrix = vectorizer.fit_transform(documents)

    return vectorizer, tfidf_matrix

def reduce_text(text, vectorizer, tfidf_matrix=None, threshold=0.1):
    """Return the vocabulary words of ``text`` whose TF-IDF weight exceeds ``threshold``.

    Args:
        text: Raw text to reduce.
        vectorizer: Fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.
        tfidf_matrix: Unused. Kept (now optional) only so existing positional
            calls ``reduce_text(text, vectorizer, tfidf_matrix, threshold)``
            keep working.
        threshold: Strict lower bound; a word is kept when its weight is
            greater than this value.

    Returns:
        The selected words joined by single spaces, in feature-index order
        (not in the order they appear in ``text``). Empty string when no word
        passes the threshold.
    """
    # Single document in, so the result is one row; flatten it to a 1-D weight vector.
    weights = vectorizer.transform([text]).toarray().ravel()

    # Feature names, index-aligned with the weight vector.
    feature_names = np.asarray(vectorizer.get_feature_names_out())

    # Ascending indices of the features that pass the threshold.
    significant_indices = np.flatnonzero(weights > threshold)

    return ' '.join(feature_names[significant_indices])

def new_reduced_func(text, vectorizer, keep_ratio=0.5):
    """Keep the highest-weighted share of the words that actually occur in ``text``.

    Only features with a non-zero TF-IDF weight for ``text`` are ranked.
    Taking half of the whole vocabulary instead would return mostly
    zero-weight words that never occur in ``text``.

    Args:
        text: Raw text to reduce.
        vectorizer: Fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.
        keep_ratio: Fraction (0 < keep_ratio <= 1) of the text's weighted
            words to keep, rounded down, but never fewer than one word.

    Returns:
        The kept words joined by single spaces, ordered from highest to lowest
        weight. Empty string when no vocabulary word occurs in ``text``.

    Raises:
        ValueError: If ``keep_ratio`` is outside ``(0, 1]``.
    """
    if not 0 < keep_ratio <= 1:
        raise ValueError(f'keep_ratio must be in (0, 1], got {keep_ratio!r}')

    # Single document in, so the result is one row; flatten it to a 1-D weight vector.
    tfidf_values = vectorizer.transform([text]).toarray().ravel()

    # Restrict the ranking to words that carry weight for this text.
    present_indices = np.flatnonzero(tfidf_values > 0)
    if present_indices.size == 0:
        return ''

    feature_names = np.asarray(vectorizer.get_feature_names_out())

    # Descending by weight; the stable sort keeps ties in feature-index order.
    order = np.argsort(-tfidf_values[present_indices], kind='stable')
    ranked_indices = present_indices[order]

    num_words_to_keep = max(1, int(present_indices.size * keep_ratio))
    significant_indices = ranked_indices[:num_words_to_keep]

    return ' '.join(feature_names[significant_indices])

In [33]:
# Fit the TF-IDF model on the ReadMe corpus; later cells reuse both returned objects.
(vectorizer, tfidf_matrix) = initialize_tfidf(file_path='/content/ReadMe.txt')

In [24]:
# Read the raw corpus lines (line endings kept) so they can be inspected directly.
with open('/content/ReadMe.txt', mode='r', encoding='utf-8') as f:
    documents = list(f)

In [25]:
# Show the corpus size and a short preview instead of dumping every line into the cell output.
print(f'{len(documents)} lines; first 5: {documents[:5]!r}')



In [34]:
# Sample input for the reduction functions: cover-page text of UN Regulation No. 138.
text = '''Agreement

Concerning the Adoption of Harmonized Technical United Nations Regulations for Wheeled Vehicles, Equipment and Parts which can be Fitted and/or be Used on Wheeled Vehicles and the Conditions for Reciprocal Recognition of Approvals Granted on the Basis of these United Nations Regulations*
(Revision 3, including the amendments which entered into force on 14 September 2017)


Addendum 137: UN Regulation No. 138

Revision 1
01 series of amendments- Date of entry into force: 10 October 2017
Supplement 1 to the 01 series of amendments - Date of entry into force: 11 January 2020 Supplement 2 to the 01 series of amendments - Date of entry into force: 3 January 2021

Uniform provisions concerning the approval of Quiet Road Transport Vehicles with regard to their reduced audibility

This document is meant purely as documentation tool. The authentic and legal binding texts are: ECE/TRANS/WP.29/2016/26, ECE/TRANS/WP.29/2017/6, ECE/TRANS/WP.29/2017/7, as
amended by paragraph 67. of the report (ECE/TRANS/WP.29/1129)



UNITED NATIONS
'''

In [35]:
# Keep only the words of the sample text whose TF-IDF weight is above 0.05.
reduce_text(text, vectorizer, tfidf_matrix, threshold=0.05)

'equipment'

In [36]:
# Reduce the same sample text with the rank-based strategy (keep roughly the top half by weight).
new_reduced_func(text=text, vectorizer=vectorizer)

