# Векторное представление слов


<hr>

С.Ю. Папулин (papulin.study@yandex.ru)

### Содержание

- [Векторное представление GloVe](#Векторное-представление-GloVe)
- [Классификация текстовых документов](#Классификация-текстовых-документов)
- [Слой векторного представления слов в Keras/TensorFlow](#Слой-векторного-представления-слов-в-Keras/TensorFlow)
- [Источники](#Источники)

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

## Векторное представление `GloVe`

Предобученная модель `GloVe` [[ссылка](https://github.com/stanfordnlp/GloVe)]. Далее используется модель, обученная на Wikipedia 2014 + Gigaword 5 

In [None]:
# !wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip 

In [None]:
def load_vectors(path_to_file):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is expected to contain a token followed by its
    whitespace-separated vector components.

    Args:
        path_to_file: Path to the text file with the embeddings.

    Returns:
        dict mapping each token (str) to a 1-D float32 numpy array.
    """
    embeddings_index = {}
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding (the vocabulary contains non-ASCII tokens).
    with open(path_to_file, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, 'f', sep=' ')
    return embeddings_index

In [None]:
# Размерность вектора слов
EMBEDDING_DIM = 100

# Загрузка модели
FILEPATH = f'/YOUR_PATH/glove.6B/glove.6B.{EMBEDDING_DIM}d.txt'
embeddings_index = load_vectors(FILEPATH)

In [None]:
# Массив слов
words = np.array(list(embeddings_index.keys()))
words[:5]

In [None]:
# Массив весов слов
E = np.zeros((len(embeddings_index), EMBEDDING_DIM))
for indx, (word, vector) in enumerate(embeddings_index.items()):
    E[indx] = vector

In [None]:
# Список запросов
q1 = embeddings_index['king']
q2 = embeddings_index['king'] - embeddings_index['man'] + embeddings_index['woman']
q3 = embeddings_index['soldier'] - embeddings_index['braveness']
q4 = embeddings_index['unemployment'] + embeddings_index['work']
q5 = embeddings_index['democracy'] - embeddings_index['law'] + embeddings_index['corruption']
q6 = embeddings_index['pilot'] - embeddings_index['plane'] +  embeddings_index['car']

Q = np.vstack([q1, q2, q3, q4, q5, q6])
Q.shape

In [None]:
# Cosine distance between every row of E and every query in Q,
# computed as 1 - cosine similarity; values lie in [0, 2]
# (0 = same direction, 2 = opposite direction)
S = 1 - cosine_similarity(E, Q)
S.shape

In [None]:
# Words each query was built from; entry i corresponds to column i of S
query_words = [
    ['king'],
    ['king', 'man', 'woman'],
    ['soldier', 'braveness'],
    ['unemployment', 'work'],
    ['democracy', 'law', 'corruption'],
    ['pilot', 'plane', 'car'],
]

n_top = 10  # number of closest words kept per query

W_top = np.empty((S.shape[1], n_top), dtype='object')

for i in range(W_top.shape[0]):
    # Mark the words the i-th query itself was built from
    mask = np.isin(words, query_words[i])
    candidates = words[~mask]
    # Rank the remaining words by ascending distance to the query
    ranking = S[~mask, i].argsort()
    # Keep only the n_top closest ones
    W_top[i] = candidates[ranking][:n_top]

# Show the result as a DataFrame with one column per query
pd.DataFrame(data=W_top.T, columns=[f"q{i+1}" for i in range(W_top.shape[0])])

## Классификация текстовых документов

### Загрузка набора данных

In [None]:
RANDOM_STATE = 100

In [None]:
data = fetch_20newsgroups(
    subset="all", 
    shuffle=True, 
    remove=("headers", "footers", "quotes"), 
    random_state=RANDOM_STATE
)

In [None]:
names = np.array(data.target_names)
names

In [None]:
def make_dataset(data):
    """Build a class-balanced binary dataset from a 20 newsgroups bunch.

    Documents whose topic is one of the listed computer/electronics groups
    get label 1; an equally sized slice of the remaining documents gets
    label 0.

    Args:
        data: Object exposing `data` (documents), `target` (integer labels)
            and `target_names` (topic name for each label).

    Returns:
        Tuple (X, y): shuffled object array of documents and the matching
        array of 0/1 labels.
    """
    positive_topics = [
        'comp.graphics',
        'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
        'comp.windows.x',
        'sci.electronics',
    ]
    X = np.array(data.data, dtype='object')
    y = data.target
    X, y = shuffle(X, y, random_state=RANDOM_STATE)
    # Resolve label ids from the dataset that was passed in, not from a
    # module-level variable that may describe a different dataset
    target_names = np.array(data.target_names)
    topic_labels = np.where(np.isin(target_names, positive_topics))[0]
    topics_mask = np.isin(y, topic_labels)
    X_pos = X[topics_mask]
    # Take at most as many negatives as there are positives
    X_neg = X[~topics_mask][:len(X_pos)]
    # Size the negative labels by the negatives actually available so that
    # documents and labels always have the same length
    y_pos, y_neg = np.ones(len(X_pos)), np.zeros(len(X_neg))
    return shuffle(np.r_[X_pos, X_neg], np.r_[y_pos, y_neg], random_state=RANDOM_STATE)

In [None]:
X, y = make_dataset(data)
X[:2], y[:2]

In [None]:
np.unique(y, return_counts=True)

In [None]:
# Средняя длина текста в символах
np.mean(list(map(lambda x: len(x), X)))

In [None]:
# Формирование тестового множества
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, 
    test_size=0.3, 
    random_state=RANDOM_STATE
)
# Формирование проверочного множества
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, 
    test_size=0.3, 
    random_state=RANDOM_STATE
)

### Преобразование документов в вектор

#### Наивный Байес

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
NUM_FEATURES = 10000

In [None]:
vectorizer = TfidfVectorizer(lowercase=True, stop_words="english",
                             use_idf=False, ngram_range=(1,1),
                             max_features=NUM_FEATURES,
                             smooth_idf=True)
clr = MultinomialNB()

In [None]:
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clr', clr)
])

In [None]:
pipeline.fit(X_trainval, y_trainval)
# Accuracy on the test subset
print("Accuracy =", pipeline.score(X_test, y_test))
# precision_score expects (y_true, y_pred); with the arguments swapped
# the printed value would be recall instead of precision
print("Precision =", precision_score(y_test, pipeline.predict(X_test)))

In [None]:
new_text = """
OpenAI Chief Technology Officer Mira Murati said the updated version of ChatGPT will 
now also have memory capabilities, meaning it can learn from previous conversations 
with users, and can do real-time translation.

“This is the first time that we are really making a huge step forward when it comes to 
the ease of use,” Murati said during the live demo from the company’s San Francisco 
headquarters. “This interaction becomes much more natural and far, far easier.”

The new release comes as OpenAI seeks to stay ahead of the growing competition in the 
AI arms race. Rivals including Google and Meta have been working to build increasingly 
powerful large language models that power chatbots and can be used to bring AI technology 
to various other products.

The OpenAI event came one day ahead of Google’s annual I/O developer conference, at which 
it’s expected to announce updates to its Gemini AI model. Like the new GPT-4o, Google’s 
Gemini is also multimodal, meaning it can interpret and generate text, images and audio. 
OpenAI’s update also comes ahead of expected AI announcements from Apple at its Worldwide 
Developers Conference next month, which could include new ways of incorporating AI into 
the next iPhone or iOS releases.
"""

In [None]:
pipeline.predict([new_text, ])

#### Метод опорных векторов

In [None]:
from sklearn.svm import SVC, LinearSVC

In [None]:
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clr', LinearSVC())
])

pipeline.fit(X_trainval, y_trainval)
# Accuracy on the test subset
print("Accuracy =", pipeline.score(X_test, y_test))
# precision_score expects (y_true, y_pred); with the arguments swapped
# the printed value would be recall instead of precision
print("Precision =", precision_score(y_test, pipeline.predict(X_test)))

### Преобразование слов в вектор с использованием предобученной матрицы

In [None]:
def convert_to_mean_text(X, E_dict, analyzer, embedding_dim=None):
    """Represent each document as the mean of its known word vectors.

    Args:
        X: Sized iterable of raw documents.
        E_dict: Mapping from token to its 1-D vector.
        analyzer: Callable turning a document into a sequence of tokens.
        embedding_dim: Length of the word vectors. When None it is taken
            from the first vector in `E_dict`; if `E_dict` is empty the
            module-level EMBEDDING_DIM is used.

    Returns:
        Array of shape (len(X), embedding_dim). A document without any
        token present in `E_dict` is left as a zero row.
    """
    if embedding_dim is None:
        first_vector = next(iter(E_dict.values()), None)
        embedding_dim = EMBEDDING_DIM if first_vector is None else len(first_vector)
    X_ = np.zeros((len(X), embedding_dim))
    for i, post in enumerate(X):
        # Single lookup per token; tokens missing from E_dict are dropped
        words_vectors = [
            vector
            for vector in map(E_dict.get, analyzer(post))
            if vector is not None
        ]
        if words_vectors:
            X_[i] = np.mean(words_vectors, axis=0)
    return X_

In [None]:
tokenizer = vectorizer.build_tokenizer()  # разбивает текст на слова
analyzer = vectorizer.build_analyzer()    # tokenizer + применяет преобразования

In [None]:
words = analyzer(X[0])
words, len(words)

In [None]:
X_trainval__mean_emb = convert_to_mean_text(
    X=X_trainval, 
    E_dict=embeddings_index, 
    analyzer=analyzer
)
X_test__mean_emb = convert_to_mean_text(
    X=X_test, 
    E_dict=embeddings_index, 
    analyzer=analyzer
)

#### Метод опорных векторов

In [None]:
clr = SVC(kernel='rbf', gamma='scale')
clr.fit(X_trainval__mean_emb, y_trainval)
# Accuracy on the test subset
print("Accuracy =", clr.score(X_test__mean_emb, y_test))
# precision_score expects (y_true, y_pred); with the arguments swapped
# the printed value would be recall instead of precision
print("Precision =", precision_score(y_test, clr.predict(X_test__mean_emb)))

In [None]:
clr.predict(
    convert_to_mean_text(
        X=[new_text, ], 
        E_dict=embeddings_index, 
        analyzer=analyzer
    )
)

#### Многослойная нейронная сеть

In [None]:
def plot_train_val_scores(train_history):
    """Plot training and validation curves side by side.

    The left panel shows the loss, the right panel the binary accuracy,
    both against the epoch number (starting at 1).

    Args:
        train_history: Object whose `history` attribute maps "loss",
            "val_loss", "binary_accuracy" and "val_binary_accuracy" to
            per-epoch sequences.
    """
    first = 0  # index of the first epoch to draw
    history = train_history.history

    # (train key, validation key, y-axis label) for each panel
    panels = (
        ("loss", "val_loss", "loss"),
        ("binary_accuracy", "val_binary_accuracy", "accuracy"),
    )

    plt.figure(figsize=[14, 4])
    epochs = np.arange(1, len(history["loss"]) + 1)

    for position, (train_key, val_key, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs[first:], history[train_key][first:], "-og", label="train")
        plt.plot(epochs[first:], history[val_key][first:], "-o", color="orange", label="val")
        plt.xlabel("epochs")
        plt.ylabel(y_label)
        plt.grid(True)
        plt.legend()

    plt.show()

In [None]:
X_train__mean_emb = convert_to_mean_text(
    X=X_train, 
    E_dict=embeddings_index, 
    analyzer=analyzer
)
X_val__mean_emb = convert_to_mean_text(
    X=X_val, 
    E_dict=embeddings_index, 
    analyzer=analyzer
)

In [None]:
def build_model():
    """Create and compile a dense binary classifier over mean embeddings.

    The network takes vectors of length EMBEDDING_DIM, passes them through
    two ReLU layers (128 and 64 units) and ends with a single sigmoid unit.

    Returns:
        Compiled Sequential model tracking binary accuracy and precision.
    """
    network = models.Sequential([
        layers.Dense(128, activation="relu", input_shape=(EMBEDDING_DIM,)),
        layers.Dense(64, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ])
    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(),
    ]
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return network

In [None]:
# Построение модели
model = build_model()

# Описание модели
model.summary()

In [None]:
# Обучение
train_history = model.fit(
    X_train__mean_emb, y_train, 
    epochs=30, 
    validation_data=(X_val__mean_emb, y_val),
    batch_size=50,
    verbose=1
)

In [None]:
plot_train_val_scores(train_history)

In [None]:
# Выбираем количество эпох и заново обучаем сеть на всём обучающем множестве
best_num_epochs = 10

# Построение модели
model = build_model()

# Обучение
train_history = model.fit(X_trainval__mean_emb, y_trainval,
                          epochs=best_num_epochs, 
                          batch_size=50,
                          verbose=1)

# Оценка качества модели
_, train_error__acc, train_error__prec = model.evaluate(X_trainval__mean_emb, y_trainval)
_, test_error__acc, test_error__prec = model.evaluate(X_test__mean_emb, y_test)

print("Train:\n")
print("\tAccuracy = \t", train_error__acc)
print("\tPrecision = \t", train_error__prec)
print("Test:\n")
print("\tAccuracy = \t", test_error__acc)
print("\tPrecision = \t", test_error__prec)

## Слой векторного представления слов в Keras/TensorFlow

In [None]:
from keras.layers import Embedding

### Подготовка набора данных

In [None]:
BATCH_SIZE = 128

In [None]:
"""
Набор данных для выбора эпохи
"""

# Обучение
X_train__batches = tf.data.Dataset.from_tensor_slices(X_train)
y_train__batches = tf.data.Dataset.from_tensor_slices(y_train)

train__batches = (
    tf.data.Dataset.zip((X_train__batches, y_train__batches))
        .batch(BATCH_SIZE)
        .prefetch(tf.data.AUTOTUNE)
)

# Проверка
X_val__batches = tf.data.Dataset.from_tensor_slices(X_val)
y_val__batches = tf.data.Dataset.from_tensor_slices(y_val)

val__batches = (
    tf.data.Dataset.zip((X_val__batches, y_val__batches))
        .batch(BATCH_SIZE)
        .prefetch(tf.data.AUTOTUNE)
)


for batch in train__batches.take(1):
    print(batch[0].shape, batch[1].shape)

In [None]:
"""
Набор данных для повторного обучения и тестирования
"""

# Обучение
X_trainval__batches = tf.data.Dataset.from_tensor_slices(X_trainval)
y_trainval__batches = tf.data.Dataset.from_tensor_slices(y_trainval)

trainval__batches = (
    tf.data.Dataset.zip((X_trainval__batches, y_trainval__batches))
        .batch(BATCH_SIZE)
        .prefetch(tf.data.AUTOTUNE)
)

# Тестирование
X_test__batches = tf.data.Dataset.from_tensor_slices(X_test)
y_test__batches = tf.data.Dataset.from_tensor_slices(y_test)

test__batches = (
    tf.data.Dataset.zip((X_test__batches, y_test__batches))
        .batch(BATCH_SIZE)
        .prefetch(tf.data.AUTOTUNE)
)

### Слой преобразования слов в индексы

In [None]:
MAX_TEXT_LENGTH = 200
NUM_FEATURES = 10000

In [None]:
# Layer that turns raw text into a fixed-length sequence of vocabulary indices
vectorizer_layer = layers.TextVectorization(
    max_tokens=NUM_FEATURES, 
    output_sequence_length=MAX_TEXT_LENGTH
)

# Build the vocabulary from the training texts. adapt() is documented to
# take a *batched* dataset, so the per-document dataset is batched first
# (this also avoids processing one document per step).
vectorizer_layer.adapt(X_train__batches.batch(BATCH_SIZE))

print(f'Количество элементов словаря:\t{len(vectorizer_layer.get_vocabulary())}')
print(f'Первые элементы словаря:\t{vectorizer_layer.get_vocabulary()[:5]}')

### Инициализация слоя векторизации

In [None]:
# Формирование матрицы весов для embedding слоя
E = np.zeros((NUM_FEATURES, EMBEDDING_DIM))
for i, word in enumerate(vectorizer_layer.get_vocabulary()):
    if word in embeddings_index:
        E[i] = embeddings_index.get(word)

In [None]:
# Embedding layer holding the pre-trained vectors. It is taken from
# tensorflow.keras.layers like every other layer in this notebook, so the
# model never mixes layer classes from two different Keras packages.
embedding_layer = layers.Embedding(
    input_dim=NUM_FEATURES,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_TEXT_LENGTH,
    trainable=False  # keep the pre-trained vectors frozen
)

# Create the weight variable so that it can be overwritten below
embedding_layer.build((1,))

# Load the pre-trained matrix into the layer
embedding_layer.set_weights([E])

### Построение и обучение модели

#### Модель 1

In [None]:
def build_model():
    """Create and compile model 1: flattened embeddings into one sigmoid unit.

    The model reuses the module-level `vectorizer_layer` and
    `embedding_layer`, flattens the embedded sequence and feeds it to a
    single-unit sigmoid output.

    Returns:
        Compiled and built Sequential model tracking binary accuracy and
        precision.
    """
    network = models.Sequential([
        vectorizer_layer,
        embedding_layer,
        layers.Flatten(),
        layers.Dense(1, activation="sigmoid"),
    ])
    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(),
    ]
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    network.build(input_shape=(1, ))
    return network

In [None]:
# Построение модели
model = build_model()

# Описание модели
model.summary()

In [None]:
# Training. train__batches / val__batches are tf.data datasets that are
# already batched, so batch_size must not be passed to fit(): the Keras
# docs say not to specify it for dataset inputs.
train_history = model.fit(
    train__batches, 
    epochs=20, 
    validation_data=val__batches,
    verbose=1
)

In [None]:
plot_train_val_scores(train_history)

In [None]:
# Выбираем количество эпох и заново обучаем сеть на всём обучающем множестве
best_num_epochs = 5

# Построение модели
model = build_model()

# Training. trainval__batches is an already batched tf.data dataset, so
# batch_size is not passed to fit(): the Keras docs say not to specify it
# for dataset inputs (the previous value of 50 did not match the dataset's
# actual batching anyway).
train_history = model.fit(trainval__batches,
                          epochs=best_num_epochs, 
                          verbose=1)

# Оценка качества модели
_, train_error__acc, train_error__prec = model.evaluate(trainval__batches)
_, test_error__acc, test_error__prec = model.evaluate(test__batches)

print("Train:\n")
print("\tAccuracy = \t", train_error__acc)
print("\tPrecision = \t", train_error__prec)
print("Test:\n")
print("\tAccuracy = \t", test_error__acc)
print("\tPrecision = \t", test_error__prec)

#### Модель 2

In [None]:
def build_model():
    """Create and compile model 2: bidirectional LSTM over the embeddings.

    The model reuses the module-level `vectorizer_layer` and
    `embedding_layer`, followed by a bidirectional LSTM (32 units per
    direction), a 32-unit ReLU layer and a single sigmoid output unit.

    Returns:
        Compiled and built Sequential model tracking binary accuracy and
        precision.
    """
    network = models.Sequential([
        vectorizer_layer,
        embedding_layer,
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(32, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ])
    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(),
    ]
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    network.build(input_shape=(1, ))
    return network

In [None]:
# Построение модели
model = build_model()

# Вывод описания модели
model.summary()

In [None]:
# Start training. train__batches / val__batches are already batched
# tf.data datasets, so batch_size is not passed to fit(): the Keras docs
# say not to specify it for dataset inputs.
train_history = model.fit(
    train__batches, 
    epochs=10, 
    validation_data=val__batches,
    verbose=1
)

In [None]:
plot_train_val_scores(train_history)

In [None]:
# Выбираем количество эпох и заново обучаем сеть на всём обучающем множестве
best_num_epochs = 10

# Построение модели
model = build_model()

# Training. trainval__batches is an already batched tf.data dataset, so
# batch_size is not passed to fit(): the Keras docs say not to specify it
# for dataset inputs.
train_history = model.fit(trainval__batches,
                          epochs=best_num_epochs, 
                          verbose=1)

# Оценка качества модели
_, train_error__acc, train_error__prec = model.evaluate(trainval__batches)
_, test_error__acc, test_error__prec = model.evaluate(test__batches)

print("Train:\n")
print("\tAccuracy = \t", train_error__acc)
print("\tPrecision = \t", train_error__prec)
print("Test:\n")
print("\tAccuracy = \t", test_error__acc)
print("\tPrecision = \t", test_error__prec)

In [None]:
# Предсказание для новых данных
model.predict([new_text,])

### Слои модели нейронной сети

In [None]:
# Список слоев
model.layers

In [None]:
# Имена слоев
[layer.name for layer in model.layers]

In [None]:
# Доступ к слою по имени
layer = model.get_layer('embedding')
print(f'Trainable weights: {layer.trainable_variables}')
print(f'Non-trainable weights: {layer.non_trainable_variables}')

In [None]:
# Выходные данные 1го слоя
output_l1 = model.layers[0](np.array(new_text))
output_l1.numpy()

In [None]:
# Выходные данные 2го слоя
output_l2 = model.layers[1](output_l1)
output_l2.numpy()

In [None]:
# Выходные данные 3го слоя
output_l3 = model.layers[2](np.array([output_l2]))
output_l3.numpy()

In [None]:
# Выходные данные 4го слоя
output_l4 = model.layers[3](output_l3)
output_l4.numpy()

In [None]:
# Выходные данные 5го слоя
output_l5 = model.layers[4](output_l4)
output_l5.numpy()

In [None]:
# Предсказание
model.predict([new_text,])

### Обучение слоя векторного представления слов

In [None]:
def build_model():
    """Create and compile a classifier with a trainable embedding layer.

    The model reuses the module-level `vectorizer_layer`, then learns a
    fresh 128-dimensional embedding, flattens it and feeds it to a single
    sigmoid output unit.

    Returns:
        Compiled and built Sequential model tracking binary accuracy and
        precision.
    """
    # NOTE(review): mask_zero=True makes the embedding emit a mask, but the
    # next layer is Flatten - confirm the mask is actually used downstream.
    trainable_embedding = layers.Embedding(
        input_dim=NUM_FEATURES,
        output_dim=128,
        input_length=MAX_TEXT_LENGTH,
        mask_zero=True,
    )
    network = models.Sequential([
        vectorizer_layer,
        trainable_embedding,
        layers.Flatten(),
        layers.Dense(1, activation="sigmoid"),
    ])
    tracked_metrics = [
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(),
    ]
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    network.build(input_shape=(1,))
    return network

In [None]:
# Построение модели
model = build_model()

# Вывод описания модели
model.summary()

In [None]:
# Start training. train__batches / val__batches are already batched
# tf.data datasets, so batch_size is not passed to fit(): the Keras docs
# say not to specify it for dataset inputs.
train_history = model.fit(
    train__batches, 
    epochs=30, 
    validation_data=val__batches,
    verbose=1
)

In [None]:
# Извлечение весов словаря
E_new = model.layers[1].get_weights()[0]
E_new.shape

In [None]:
# Массив слов словаря
words_new = np.array(vectorizer_layer.get_vocabulary())

In [None]:
# Формирование структуры вида: [слово]->[вектор]
embeddings_new_index = {word: E_new[i] for i, word in enumerate(vectorizer_layer.get_vocabulary())}
len(embeddings_new_index)

In [None]:
# Query vector: the learned embedding of the word 'computer'
q = embeddings_new_index['computer']

# Cosine distance between every row of E_new and the query,
# computed as 1 - cosine similarity; values lie in [0, 2]
# (0 = same direction, 2 = opposite direction)
S = 1 - cosine_similarity(E_new, np.array([q, ]))
S.shape

In [None]:
# Words each query was built from; entry i corresponds to column i of S
query_words = [
    ['computer'],
]

n_top = 10  # number of closest words kept per query

W_top = np.empty((S.shape[1], n_top), dtype='object')

for i in range(W_top.shape[0]):
    # Mark the words the i-th query itself was built from
    mask = np.isin(words_new, query_words[i])
    candidates = words_new[~mask]
    # Rank the remaining words by ascending distance to the query
    ranking = S[~mask, i].argsort()
    # Keep only the n_top closest ones
    W_top[i] = candidates[ranking][:n_top]

# Show the result as a DataFrame with one column per query
pd.DataFrame(data=W_top.T, columns=[f"q{i+1}" for i in range(W_top.shape[0])])

## Источники

- [Using pre-trained word embeddings](https://keras.io/examples/nlp/pretrained_word_embeddings/)
- [GloVe: Global Vectors for Word Representation](https://github.com/stanfordnlp/GloVe)
- [Pre-trained word vectors trained using fastText](https://fasttext.cc/docs/en/english-vectors.html)