# Imports

In [36]:
import numpy as np
import pandas as pd
import re
import emoji


In [37]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN, GRU


In [38]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\artem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Data preparation

In [39]:
df = pd.read_csv("data/data.csv")
df


Unnamed: 0,product_name,overall_rating,no_ratings,no_reviews,rating,title,review
0,Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...,4.7,15210,900,5,Perfect product!,"Loved it, it's my first MacBook that I earned ..."
1,Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...,4.7,15210,900,5,Fabulous!,Battery lasted longer than my first relationsh...
2,Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...,4.7,15210,900,5,Fabulous!,Such a great deal.. very happy with the perfor...
3,Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...,4.7,15210,900,4,Delightful,"Awesome build quality and very good display, b..."
4,Apple MacBook AIR Apple M2 - (8 GB/256 GB SSD/...,4.7,15210,900,5,Awesome,When i ordered and came to know about seller r...
...,...,...,...,...,...,...,...
24108,MSI Modern 14 Intel Core i5 13th Gen 1335U - (...,4.3,156,24,5,Perfect product!,MSI Laptop is high performance and the best. c...
24109,MSI Modern 14 Intel Core i5 13th Gen 1335U - (...,4.3,156,24,5,Perfect product!,Excellent performance best laptop.
24110,MSI Modern 14 Intel Core i5 13th Gen 1335U - (...,4.3,156,24,4,"Good product, Lacks features.",Decent battery life. Exceptional build quality...
24111,Lenovo IdeaPad 5 2-in-1 WUXGA IPS AMD Ryzen 7 ...,4.4,7,2,3,Nice,The product does not support facial recognitio...


In [40]:
df[["review"]]

Unnamed: 0,review
0,"Loved it, it's my first MacBook that I earned ..."
1,Battery lasted longer than my first relationsh...
2,Such a great deal.. very happy with the perfor...
3,"Awesome build quality and very good display, b..."
4,When i ordered and came to know about seller r...
...,...
24108,MSI Laptop is high performance and the best. c...
24109,Excellent performance best laptop.
24110,Decent battery life. Exceptional build quality...
24111,The product does not support facial recognitio...


In [41]:
def remove_emoji_from_dataframe(df_i, columns=None):
    """
    Удаляет эмодзи из указанных столбцов DataFrame с помощью библиотеки emoji.
    
    Params:
        df_i (pd.DataFrame): Исходный DataFrame
        columns (list): Список столбцов для обработки. Если None, обрабатываются все строковые столбцы.
    
    Returns:
        pd.DataFrame: Очищенный DataFrame (новая копия)
    """
    # Создаем копию DataFrame чтобы не изменять исходный
    df_clean = df_i.copy()
    
    # Определяем целевые столбцы
    if columns is None:
        columns = df_clean.select_dtypes(include=['object', 'string']).columns.tolist()
    
    # Функция для удаления эмодзи
    def remove_emoji(text):
        return emoji.replace_emoji(text, replace='') if isinstance(text, str) else text
    
    # Обрабатываем каждый столбец
    for col in columns:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].apply(remove_emoji)
    
    return df_clean

In [42]:
df = remove_emoji_from_dataframe(df, ["review"])
df[["review"]]

Unnamed: 0,review
0,"Loved it, it's my first MacBook that I earned ..."
1,Battery lasted longer than my first relationsh...
2,Such a great deal.. very happy with the perfor...
3,"Awesome build quality and very good display, b..."
4,When i ordered and came to know about seller r...
...,...
24108,MSI Laptop is high performance and the best. c...
24109,Excellent performance best laptop.
24110,Decent battery life. Exceptional build quality...
24111,The product does not support facial recognitio...


In [43]:
df["review"][0]

"Loved it, it's my first MacBook that I earned from my hardwork "

In [44]:
text = ".".join(df["review"][:10**3])

# Data tokenize

In [45]:

# Токенизация текста
tokens = word_tokenize(text.lower())

# Создание словаря
vocab = sorted(set(tokens))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)


In [46]:

# Параметры
n_gram = 3  # Размер N-граммы
window_size = 5  # Размер окна для BoW

# Подготовка данных для N-грамм
sequences = []
for i in range(len(tokens) - n_gram):
    sequences.append(tokens[i:i + n_gram])

X_ngram, y_ngram = [], []
for seq in sequences:
    X_ngram.append([word_to_idx[word] for word in seq[:-1]])
    y_ngram.append(word_to_idx[seq[-1]])

X_ngram = np.array(X_ngram)
y_ngram = np.array(y_ngram)


In [47]:

# Подготовка данных для BoW
corpus, y_bow = [], []
for i in range(len(tokens) - window_size):
    context = tokens[i:i + window_size]
    corpus.append(' '.join(context))
    y_bow.append(word_to_idx[tokens[i + window_size]])

vectorizer = CountVectorizer(vocabulary=vocab)
X_bow = vectorizer.fit_transform(corpus).toarray()
y_bow = np.array(y_bow)


In [48]:

# Разделение данных
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, y_bow, test_size=0.2)
X_train_ng, X_test_ng, y_train_ng, y_test_ng = train_test_split(X_ngram, y_ngram, test_size=0.2)


In [49]:

# Модель BoW
model_bow = Sequential([
    Dense(128, activation='relu', input_shape=(vocab_size,)),
    Dense(vocab_size, activation='softmax')
])
model_bow.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_bow.fit(X_train_bow, y_train_bow, epochs=10, validation_data=(X_test_bow, y_test_bow))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.0349 - loss: 6.6878 - val_accuracy: 0.0440 - val_loss: 6.0120
Epoch 2/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.0522 - loss: 5.7765 - val_accuracy: 0.0589 - val_loss: 5.8575
Epoch 3/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.0786 - loss: 5.3928 - val_accuracy: 0.0834 - val_loss: 5.6720
Epoch 4/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.1229 - loss: 4.9582 - val_accuracy: 0.0995 - val_loss: 5.4759
Epoch 5/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.1684 - loss: 4.4723 - val_accuracy: 0.1250 - val_loss: 5.2699
Epoch 6/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.2427 - loss: 3.8644 - val_accuracy: 0.1487 - val_loss: 5.0948
Epoch 7/10
[1m543/543

<keras.src.callbacks.history.History at 0x1fae6fa1f10>

In [50]:

# Модель RNN
model_rnn = Sequential([
    Embedding(vocab_size, 64, input_length=n_gram-1),
    SimpleRNN(128),
    Dense(vocab_size, activation='softmax')
])
model_rnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_rnn.fit(X_train_ng, y_train_ng, epochs=10, validation_data=(X_test_ng, y_test_ng))


Epoch 1/10




[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.0342 - loss: 6.6087 - val_accuracy: 0.0702 - val_loss: 5.8755
Epoch 2/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.0822 - loss: 5.5295 - val_accuracy: 0.1276 - val_loss: 5.4388
Epoch 3/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.1498 - loss: 4.7953 - val_accuracy: 0.1540 - val_loss: 5.1680
Epoch 4/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.2115 - loss: 4.1784 - val_accuracy: 0.1955 - val_loss: 4.9561
Epoch 5/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.2908 - loss: 3.6646 - val_accuracy: 0.2162 - val_loss: 4.8578
Epoch 6/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.3482 - loss: 3.2320 - val_accuracy: 0.2464 - val_loss: 4.7734
Epoch 7/10
[1m543/543[0m [32m━

<keras.src.callbacks.history.History at 0x1faefb90510>

In [51]:

# Модель GRU
model_gru = Sequential([
    Embedding(vocab_size, 64, input_length=n_gram-1),
    GRU(128),
    Dense(vocab_size, activation='softmax')
])
model_gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_gru.fit(X_train_ng, y_train_ng, epochs=10, validation_data=(X_test_ng, y_test_ng))


Epoch 1/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.0310 - loss: 6.6762 - val_accuracy: 0.0359 - val_loss: 6.0475
Epoch 2/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.0497 - loss: 5.8392 - val_accuracy: 0.0739 - val_loss: 5.8444
Epoch 3/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.0899 - loss: 5.3934 - val_accuracy: 0.1232 - val_loss: 5.4785
Epoch 4/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.1447 - loss: 4.8437 - val_accuracy: 0.1633 - val_loss: 5.2055
Epoch 5/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.2034 - loss: 4.2696 - val_accuracy: 0.1824 - val_loss: 5.0483
Epoch 6/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.2533 - loss: 3.8219 - val_accuracy: 0.2082 - val_loss: 4.9388
Epoch 7/10
[1m543/54

<keras.src.callbacks.history.History at 0x1fae65c5990>

In [52]:

# Оценка моделей
def evaluate_model(model, X_test, y_test, name):
    y_pred = model.predict(X_test).argmax(axis=1)
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))


In [53]:

evaluate_model(model_bow, X_test_bow, y_test_bow, "BoW")
evaluate_model(model_rnn, X_test_ng, y_test_ng, "RNN")
evaluate_model(model_gru, X_test_ng, y_test_ng, "GRU")

[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step

BoW Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.78      0.52         9
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         1
           9       0.30      0.51      0.38        35
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         9
          12       0.40      0.25      0.31         8
          13       0.44      0.54      0.48        13
          14       0.12      0.23      0.16       122
          15       0.40      0.50      0.44        12
          16       0.00      0.00      0.00         0
          19       0.14      0.18      0.16       147
          20       0.11      0.17      0.13        65
          21       0.08      0.03      0.04        37
          22       0.50      0.2

In [55]:
def predict_next_word(
    model, input_sequence, word_to_idx, idx_to_word, mode="ngram", top_k=3
):
    """
    Предсказывает следующее слово на основе входной последовательности.

    Параметры:
        model: обученная модель (Keras или sklearn).
        input_sequence: исходное предложение (строка).
        word_to_idx: словарь для преобразования слов в индексы.
        idx_to_word: словарь для преобразования индексов в слова.
        mode: тип модели ("ngram" или "bow").
        top_k: количество вариантов для вывода.
    """
    # Токенизация и преобразование в нижний регистр
    tokens = word_tokenize(input_sequence.lower())
    tokens_idx = [
        word_to_idx.get(word, -1) for word in tokens
    ]  # -1 для неизвестных слов

    # Обработка неизвестных слов (замена на <UNK> или пропуск)
    tokens_idx = [
        idx if idx != -1 else word_to_idx.get("<UNK>", -1) for idx in tokens_idx
    ]
    if -1 in tokens_idx:
        print("Есть неизвестные слова!")
        return []

    # Подготовка данных в зависимости от типа модели
    if mode == "bow":
        # Используем последние window_size слов как контекст
        window_size = 5  # Должно совпадать с обучением!
        context = tokens_idx[-window_size:]
        if len(context) < window_size:
            # Дополняем нулями слева (pad_sequences)
            context = [0] * (window_size - len(context)) + context

        # Создаем вектор BoW (количество вхождений каждого слова)
        bow_vector = np.zeros(len(word_to_idx))
        for idx in context:
            if idx < len(word_to_idx):
                bow_vector[idx] += 1
        input_data = bow_vector.reshape(1, -1)

    elif mode == "ngram":
        # Используем последние n-1 слов для N-граммной модели
        n_gram = 3  # Должно совпадать с обучением!
        seq_length = n_gram - 1
        context = tokens_idx[-seq_length:]
        if len(context) < seq_length:
            # Дополняем нулями слева
            context = [0] * (seq_length - len(context)) + context

        input_data = np.array([context])

    else:
        raise ValueError("Режим должен быть 'bow' или 'ngram'")

    # Предсказание
    preds = model.predict(input_data)[0]
    top_indices = preds.argsort()[-top_k:][::-1]  # Топ-K индексов
    top_words = [idx_to_word[idx] for idx in top_indices if idx in idx_to_word]

    return top_words

In [61]:
input_sentence = "I love"
idx_to_word = {v: k for k, v in word_to_idx.items()}  # Создаем обратный словарь

# Предсказание через BoW
bow_prediction = predict_next_word(
    model_bow, 
    input_sentence, 
    word_to_idx, 
    idx_to_word, 
    mode="bow",
    top_k=3
)
" ".join([input_sentence, bow_prediction[0]])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step


'I love it'

In [63]:
input_sentence = "I love"
idx_to_word = {v: k for k, v in word_to_idx.items()}  # Создаем обратный словарь

# Предсказание через BoW
bow_prediction = predict_next_word(
    model_rnn, 
    input_sentence, 
    word_to_idx, 
    idx_to_word, 
    mode="ngram",
    top_k=3
)
" ".join([input_sentence, bow_prediction[0]])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step


'I love this'