# Реализация Fast Text векторизации

In [1]:
import os
import json
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import math

In [21]:
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
from google.colab import files

uploaded = files.upload()

Saving transformer_df.csv to transformer_df.csv


In [2]:
df = pd.read_csv("transformer_df.csv")
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Reviews,Summary
0,"['Кроссовки удобные, но быстро порвались.', 'Л...","Кроссовки удобные и лёгкие, но не очень долгов..."
1,"['Телефон быстрый, батарея держит долго.', 'По...",Телефон хороший за свою цену. Камера хорошая д...
2,"['Чайник красивый, но шумный.', 'Пахнет пласти...","Чайник быстро кипятит воду и выглядит стильно,..."
3,"['Качество норм, но после стирки ткань стала ж...","Футболка приятная к телу, но могут быть пробле..."
4,"['Телефон красивый, но корпус маркий – остаютс...",Телефон с хорошей камерой и производительность...


In [3]:
df["Reviews"] = df["Reviews"].apply(lambda x: " ".join(eval(x)).lower())
df["Reviews"] = df["Reviews"].apply(lambda x: sent_tokenize(x))
df['Reviews'][0]

['кроссовки удобные, но быстро порвались.',
 'лёгкие, но скользкие на мокром асфальте.',
 'размер в размер, не натирают.',
 'через месяц начала отклеиваться подошва.',
 'отличные, ношу каждый день!']

In [4]:
df['Reviews'][0][0]

'кроссовки удобные, но быстро порвались.'

In [5]:
from sentence_transformers import SentenceTransformer

In [7]:
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
df["Reviews"] = df["Reviews"].apply(lambda x: model.encode(x))
df['Reviews'][0]

array([[-0.29153085,  0.09215779,  0.40644842, ..., -0.09732504,
        -0.00676169,  0.24600308],
       [ 0.28391683,  0.07282894,  0.25040245, ..., -0.03359343,
         0.17172515,  0.26508424],
       [ 0.09772547,  0.21985465,  0.11679916, ..., -0.13981734,
         0.29028225, -0.0219874 ],
       [ 0.17187361, -0.00571185,  0.06981334, ...,  0.10872033,
         0.0911116 ,  0.1021126 ],
       [-0.08329684,  0.16207618, -0.04685418, ..., -0.32903215,
        -0.06131635, -0.07821988]], dtype=float32)

In [9]:
df['Summary'] = df['Summary'].apply(lambda x: model.encode(x))

In [10]:
df.head()

Unnamed: 0,Reviews,Summary
0,"[[-0.29153085, 0.09215779, 0.40644842, 0.04274...","[0.12072697, 0.04646999, 0.17353131, -0.010355..."
1,"[[-0.34263098, 0.7583325, -0.24740529, 0.00759...","[-0.26656342, 0.4403357, -0.14647679, -0.04963..."
2,"[[0.33664483, -0.09720072, -0.023643577, 0.341...","[0.1088616, 0.0028593147, -0.07166311, 0.20391..."
3,"[[-0.024744416, 0.002452761, 0.20475379, 0.155...","[0.06804938, 0.13916674, 0.0005558752, 0.17586..."
4,"[[-0.18416476, 0.54426146, -0.18336855, -0.098...","[-0.19028546, 0.6323152, -0.23953237, -0.00046..."


In [53]:
df.to_csv("bert_transformer.csv")

In [54]:
from google.colab import files

files.download("bert_transformer.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
df['Reviews'][0].shape

(5, 384)

In [12]:
df['Summary'][0].shape

(384,)

In [14]:
!pip install keras-tuner



# Реализация нейронной сети

In [25]:
X = np.array(df['Reviews'].tolist())
y = np.array(df['Summary'].tolist())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
from keras_tuner import HyperModel
from keras_tuner.tuners import RandomSearch

class TransformerHyperModel(HyperModel):
    def build(self, hp):
        inputs = layers.Input(shape=(5, 384))

        hidden_units = hp.Int('hidden_units', min_value=128, max_value=1024, step=128)
        attention_heads = hp.Int('attention_heads', min_value=1, max_value=8, step=1)
        dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)

        x = layers.MultiHeadAttention(num_heads=attention_heads, key_dim=hidden_units)(inputs, inputs)
        x = layers.Dropout(dropout_rate)(x)
        x = layers.LayerNormalization()(x)

        x = layers.Dense(hidden_units, activation='relu')(x)
        x = layers.Dropout(dropout_rate)(x)

        x = layers.Flatten()(x)
        x = layers.Dense(384, activation='linear')(x)

        model = Model(inputs, x)

        model.compile(optimizer=Adam(), loss='mse', metrics=['mae'])
        return model


tuner = RandomSearch(
    TransformerHyperModel(),
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='transformer_search',
    project_name='transformer'
)

tuner.search(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

best_model = tuner.get_best_models(num_models=1)[0]

loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

perplexity = math.exp(loss)
print(f"Perplexity: {perplexity:.2f}")

Trial 10 Complete [00h 00m 50s]
val_loss: 0.022084364667534828

Best val_loss So Far: 0.021783048287034035
Total elapsed time: 00h 44m 59s


  saveable.load_own_variables(weights_store.get(inner_path))


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 211ms/step - loss: 0.0217 - mae: 0.1160
Test Loss: 0.0218
Test Accuracy: 0.1161
Perplexity: 1.02
