In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, FastText
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [2]:

# Загрузка необходимых данных для NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/marat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/marat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/marat/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Загрузка файла CSV
file_path = "mc.csv"  # Замените на путь к вашему файлу
data = pd.read_csv(file_path, encoding='ISO-8859-1')

In [4]:
# Удаление дубликатов
data.drop_duplicates(inplace=True)

# Удаление строк с пропущенными значениями в колонке Comment
data = data.dropna(subset=["review"])

In [6]:
# Очистка текста: удаление ссылок, эмодзи, лишних пробелов
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Удаление ссылок
    text = re.sub(r"[^a-zA-Zа-яА-Я0-9\s]", '', text)  # Удаление всех символов кроме букв и цифр
    text = re.sub(r'\s+', ' ', text).strip()  # Удаление лишних пробелов
    return text

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

In [7]:
data["cleaned_review"] = data["review"].apply(clean_text)

# Нормализация: перевод текста в нижний регистр
data["cleaned_review"] = data["cleaned_review"].str.lower()

# Токенизация
data["tokens"] = data["cleaned_review"].apply(word_tokenize)

# Удаление стоп-слов (по желанию)
stop_words = set(stopwords.words("russian") + stopwords.words("english"))  # Добавьте языки по необходимости

data["tokens_no_stopwords"] = data["tokens"].apply(remove_stopwords)

In [19]:
# Сохранение результата в новый файл
output_file = "processed_data2.xlsx"
data.to_excel(output_file, index=False)

print(f"Предобработка завершена. Файл сохранен: {output_file}")

Предобработка завершена. Файл сохранен: processed_data2.xlsx


In [13]:
# 1. TF-IDF
def apply_tfidf(data):
    vectorizer = TfidfVectorizer(max_features=1000)  # Максимум 1000 признаков для примера
    tfidf_matrix = vectorizer.fit_transform(data["cleaned_review"]).toarray()
    feature_names = vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names

tfidf_matrix, tfidf_features = apply_tfidf(data)
print(f"TF-IDF векторизация завершена. Размерность: {tfidf_matrix.shape}")

TF-IDF векторизация завершена. Размерность: (33396, 1000)


In [14]:
# 2. Word2Vec
def train_word2vec(tokens):
    model = Word2Vec(sentences=tokens, vector_size=100, window=5, min_count=1, workers=4)
    return model

word2vec_model = train_word2vec(data["tokens"])
word2vec_vectors = np.array([
    np.mean([word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv], axis=0)
    if len(tokens) > 0 else np.zeros(word2vec_model.vector_size)
    for tokens in data["tokens"]
])
print(f"Word2Vec векторизация завершена. Размерность: {word2vec_vectors.shape}")

Word2Vec векторизация завершена. Размерность: (33396, 100)


In [15]:
# 3. FastText
def train_fasttext(tokens):
    model = FastText(sentences=tokens, vector_size=100, window=5, min_count=1, workers=4)
    return model

fasttext_model = train_fasttext(data["tokens"])
fasttext_vectors = np.array([
    np.mean([fasttext_model.wv[word] for word in tokens if word in fasttext_model.wv], axis=0)
    if len(tokens) > 0 else np.zeros(fasttext_model.vector_size)
    for tokens in data["tokens"]
])
print(f"FastText векторизация завершена. Размерность: {fasttext_vectors.shape}")

FastText векторизация завершена. Размерность: (33396, 100)


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# 2. Word2Vec
def train_vectorizer(tokens):
    vectorizer = CountVectorizer(max_features=1000)  # Максимум 1000 признаков
    X_counts = vectorizer.fit_transform(data["cleaned_review"])
    return X_counts.toarray()

vectorizer_vectors  = train_vectorizer(data["tokens"]) 
print(f"Word2Vec векторизация завершена. Размерность: {vectorizer_vectors.shape}")

Word2Vec векторизация завершена. Размерность: (33396, 1000)


In [17]:
# Сохранение результатов для дальнейшей работы
np.save("tfidf_vectors.npy", tfidf_matrix)
tfidf_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
np.save("word2vec_vectors.npy", word2vec_vectors)
word2vec_vectors

array([[ 0.24798714,  0.15006742, -0.31912231, ..., -0.5464887 ,
         0.0266407 , -0.23988548],
       [ 0.31747019,  0.05300501,  0.16994421, ..., -0.22851613,
        -0.17921297, -0.03659187],
       [-0.02589575,  0.35349786, -0.39811826, ..., -0.82562613,
         0.22126336, -0.54033977],
       ...,
       [ 0.51317132, -0.0359251 ,  0.0462436 , ..., -0.20405555,
         0.11780679, -0.45187187],
       [ 0.28966463, -0.1966936 ,  0.2243028 , ...,  0.19946809,
        -0.32800543,  0.01052366],
       [ 0.25528821, -0.37508866,  0.44096556, ..., -0.69754839,
         0.67834592, -0.55464244]])

In [19]:
np.save("fasttext_vectors.npy", fasttext_vectors)
fasttext_vectors

array([[ 0.10970908, -0.35354751, -0.57972389, ..., -0.12764983,
         0.43826881,  0.11531623],
       [-0.4964397 , -0.02631063, -0.27072978, ..., -0.4892619 ,
         0.06523692,  0.36658874],
       [ 0.18118294, -0.29088449, -0.51115525, ..., -0.49640018,
        -0.2248057 ,  0.10768735],
       ...,
       [-0.15396152,  0.28586802, -0.06608979, ...,  0.22940528,
         0.5257901 ,  0.11973723],
       [-0.3999916 , -0.35895768,  0.16134413, ..., -0.23990907,
         0.07201809,  0.13066594],
       [-0.90981585, -0.7439577 , -0.22964275, ..., -0.1411529 ,
         0.43326142,  0.61729252]])

In [20]:
np.save("vectorizer_vectors.npy", vectorizer_vectors)
vectorizer_vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# vectors = tfidf_matrix  # Используем TF-IDF как векторизацию (можно заменить на Word2Vec или FastText)
# vectors = word2vec_vectors
# vectors = fasttext_vectors
vectors = vectorizer_vectors

labels = data["rating"]    

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Функция для обучения и оценки модели
def train_and_evaluate(model, vector, name):

    # Разделение на обучающую и тестовую выборки
    X_train, X_test, y_train, y_test = train_test_split(vector, labels, test_size=0.2, random_state=42, stratify=labels)

    # Проверка на наличие отрицательных значений
    print("Min value in X_train:", X_train.min())
    print("Min value in X_test:", X_test.min())

    # Проверка на наличие отрицательных значений
    print("Max value in X_train:", X_train.max())
    print("Max value in X_test:", X_test.max())

    print("Len value in X_train:", len(X_train))
    print("Len value in X_test:", len(X_test))


    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Метрики
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

    print(f"\n=== {name} ===")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=1))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return {
        "model": name,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [24]:
results = []

In [25]:
# 1. Наивный байесовский классификатор
nb_model = MultinomialNB()
results.append(train_and_evaluate(nb_model, tfidf_matrix, "Наивный Байес"))

Min value in X_train: 0.0
Min value in X_test: 0.0
Max value in X_train: 1.0
Max value in X_test: 1.0
Len value in X_train: 26716
Len value in X_test: 6680

=== Наивный Байес ===
Classification Report:
              precision    recall  f1-score   support

      1 star       0.60      0.91      0.72      1886
     2 stars       0.78      0.12      0.20       617
     3 stars       0.64      0.32      0.43       964
     4 stars       0.60      0.37      0.46      1158
     5 stars       0.66      0.80      0.73      2055

    accuracy                           0.63      6680
   macro avg       0.66      0.51      0.51      6680
weighted avg       0.64      0.63      0.59      6680

Confusion Matrix:
[[1720   17   29   20  100]
 [ 416   71   39   29   62]
 [ 356    2  313   99  194]
 [ 174    1   66  433  484]
 [ 222    0   39  146 1648]]


In [26]:
# 2. Логистическая регрессия
lr_model = LogisticRegression()
results.append(train_and_evaluate(lr_model, word2vec_vectors, "Логистическая регрессия"))


Min value in X_train: -3.240269899368286
Min value in X_test: -3.091440200805664
Max value in X_train: 4.25397253036499
Max value in X_test: 4.25397253036499
Len value in X_train: 26716
Len value in X_test: 6680

=== Логистическая регрессия ===
Classification Report:
              precision    recall  f1-score   support

      1 star       0.61      0.88      0.72      1886
     2 stars       0.63      0.13      0.21       617
     3 stars       0.49      0.37      0.42       964
     4 stars       0.61      0.35      0.44      1158
     5 stars       0.65      0.75      0.70      2055

    accuracy                           0.61      6680
   macro avg       0.59      0.50      0.50      6680
weighted avg       0.60      0.61      0.57      6680

Confusion Matrix:
[[1665   17  101   14   89]
 [ 395   79   56   22   65]
 [ 302   18  355   76  213]
 [ 156    7  105  407  483]
 [ 233    5  114  153 1550]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# 3. Метод опорных векторов (SVM)
svm_model = SVC()
results.append(train_and_evaluate(svm_model, fasttext_vectors, "SVM"))

Min value in X_train: -8.752981185913086
Min value in X_test: -3.74164080619812
Max value in X_train: 6.57462739944458
Max value in X_test: 3.836000919342041
Len value in X_train: 26716
Len value in X_test: 6680

=== SVM ===
Classification Report:
              precision    recall  f1-score   support

      1 star       0.59      0.90      0.71      1886
     2 stars       0.97      0.10      0.19       617
     3 stars       0.55      0.33      0.42       964
     4 stars       0.77      0.27      0.40      1158
     5 stars       0.61      0.82      0.70      2055

    accuracy                           0.61      6680
   macro avg       0.70      0.49      0.48      6680
weighted avg       0.66      0.61      0.56      6680

Confusion Matrix:
[[1690    1   57    3  135]
 [ 410   64   48   10   85]
 [ 330    1  320   30  283]
 [ 178    0   93  315  572]
 [ 254    0   60   51 1690]]


In [28]:
# 4. Дерево решений
dt_model = DecisionTreeClassifier()
results.append(train_and_evaluate(dt_model, vectorizer_vectors, "Дерево решений"))


Min value in X_train: 0
Min value in X_test: 0
Max value in X_train: 37
Max value in X_test: 37
Len value in X_train: 26716
Len value in X_test: 6680

=== Дерево решений ===
Classification Report:
              precision    recall  f1-score   support

      1 star       0.74      0.74      0.74      1886
     2 stars       0.47      0.44      0.46       617
     3 stars       0.50      0.46      0.48       964
     4 stars       0.54      0.52      0.53      1158
     5 stars       0.69      0.74      0.72      2055

    accuracy                           0.63      6680
   macro avg       0.59      0.58      0.58      6680
weighted avg       0.63      0.63      0.63      6680

Confusion Matrix:
[[1388  174  127   61  136]
 [ 181  274   67   46   49]
 [ 158   67  448  132  159]
 [  80   26  121  604  327]
 [  76   42  142  274 1521]]


In [29]:
vectors = vectorizer_vectors

labels = data["rating"] 
mapped_labels = labels.map({'1 star': 0, '2 stars': 1, '3 stars': 2, '4 stars': 3, '5 stars': 4})

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(vectors, mapped_labels, test_size=0.2, random_state=42, stratify=labels)
# Нормализация данных для MultinomialNB

def train_and_evaluateBoosting(model, name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

    print(f"\n=== {name} ===")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=1))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    return {
        "model": name,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [30]:
print(type(mapped_labels))

<class 'pandas.core.series.Series'>


In [31]:
# 5. Бустинг (XGBoost)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
results.append(train_and_evaluateBoosting(xgb_model, "XGBoost"))

Parameters: { "use_label_encoder" } are not used.




=== XGBoost ===
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.87      0.77      1886
           1       0.63      0.25      0.36       617
           2       0.62      0.41      0.50       964
           3       0.60      0.44      0.50      1158
           4       0.67      0.83      0.74      2055

    accuracy                           0.66      6680
   macro avg       0.64      0.56      0.57      6680
weighted avg       0.65      0.66      0.64      6680

Confusion Matrix:
[[1633   55   39   30  129]
 [ 300  156   50   34   77]
 [ 246   24  399   90  205]
 [ 105   10   90  505  448]
 [  90    4   63  188 1710]]


In [32]:
results_df = pd.DataFrame(results)

# Вывод результатов
print(results_df)

                     model  accuracy  precision    recall        f1
0            Наивный Байес  0.626497   0.640176  0.626497  0.587104
1  Логистическая регрессия  0.607186   0.602546  0.607186  0.574022
2                      SVM  0.610629   0.657721  0.610629  0.563617
3           Дерево решений  0.633982   0.630165  0.633982  0.631711
4                  XGBoost  0.659132   0.650033  0.659132  0.636343


In [33]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score 

vectors = word2vec_vectors
n_clusters = 5  # Количество кластеров

# Применение K-средних
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_labels = kmeans.fit_predict(vectors)

print(1)
# Оценка качества кластеризации
silhouette_avg_k = silhouette_score(vectors, kmeans_labels)
davies_bouldin_k = davies_bouldin_score(vectors, kmeans_labels)
calinski_harabasz_k = calinski_harabasz_score(vectors, kmeans_labels)


if 'labels' in locals():
    ari = adjusted_rand_score(labels, kmeans_labels)
    print(f"ARI (KMeans): {ari:.4f}")

print(f"Silhouette Score: {silhouette_avg_k:.4f}")
print(f"Davies-Bouldin Index: {davies_bouldin_k:.4f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz_k:.4f}")

1
ARI (KMeans): 0.0809
Silhouette Score: 0.1675
Davies-Bouldin Index: 1.9217
Calinski-Harabasz Index: 6435.8273


In [34]:
from sklearn.mixture import GaussianMixture

# Подбор количества кластеров
n_clusters = 5  # Задайте желаемое количество кластеров

# Обучение GMM
gmm = GaussianMixture(n_components=n_clusters, covariance_type='full', random_state=42)
gmm_labels = gmm.fit_predict(vectors)

# Оценка качества кластеризации
silhouette_avg_g = silhouette_score(vectors, gmm_labels)
davies_bouldin_g = davies_bouldin_score(vectors, gmm_labels)
calinski_harabasz_g = calinski_harabasz_score(vectors, gmm_labels)

print(f"Silhouette Score: {silhouette_avg_g:.4f}")
print(f"Davies-Bouldin Index: {davies_bouldin_g:.4f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz_g:.4f}")


Silhouette Score: 0.1620
Davies-Bouldin Index: 3.2858
Calinski-Harabasz Index: 4408.8550


In [35]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Применение иерархической кластеризации
linkage_matrix = linkage(vectors, method='ward')

# Выделение кластеров
hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
hierarchical_labels = hierarchical.fit_predict(vectors)

# Метрика силуэт
silhouette_avg_h = silhouette_score(vectors, hierarchical_labels)
davies_bouldin_h = davies_bouldin_score(vectors, hierarchical_labels)
calinski_harabasz_h = calinski_harabasz_score(vectors, hierarchical_labels)

print(f"Silhouette Score (Hierarchical): {silhouette_avg_h:.4f}")
print(f"Davies-Bouldin Index: {davies_bouldin_h:.4f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz_h:.4f}")

Silhouette Score (Hierarchical): 0.1229
Davies-Bouldin Index: 1.8923
Calinski-Harabasz Index: 6174.9417


In [36]:
dataClustering = {
    "GMM": [silhouette_avg_g, davies_bouldin_g, calinski_harabasz_g],
    "AgglomerativeClustering": [silhouette_avg_h, davies_bouldin_h, calinski_harabasz_h],
    "K-Means": [silhouette_avg_k, davies_bouldin_k, calinski_harabasz_k]
}

# Индексы для строк
metrics = ["Silhouette Score", "Davies-Bouldin Index", "Calinski-Harabasz Index"]

# Создание DataFrame
dfClustering = pd.DataFrame(dataClustering, index=metrics)

# Вывод таблицы
print(dfClustering)

                                 GMM  AgglomerativeClustering      K-Means
Silhouette Score            0.161985                 0.122929     0.167490
Davies-Bouldin Index        3.285804                 1.892295     1.921726
Calinski-Harabasz Index  4408.854991              6174.941662  6435.827253


In [37]:
data

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,cleaned_review,tokens,tokens_no_stopwords
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star,why does it look like someone spit on my food ...,"[why, does, it, look, like, someone, spit, on,...","[look, like, someone, spit, food, normal, tran..."
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars,itd mcdonalds it is what it is as far as the f...,"[itd, mcdonalds, it, is, what, it, is, as, far...","[itd, mcdonalds, far, food, atmosphere, go, st..."
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star,made a mobile order got to the speaker and che...,"[made, a, mobile, order, got, to, the, speaker...","[made, mobile, order, got, speaker, checked, l..."
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars,my mc crispy chicken sandwich was customer ser...,"[my, mc, crispy, chicken, sandwich, was, custo...","[mc, crispy, chicken, sandwich, customer, serv..."
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star,i repeat my order 3 times in the drive thru an...,"[i, repeat, my, order, 3, times, in, the, driv...","[repeat, order, 3, times, drive, thru, still, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33391,33392,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,4 years ago,They treated me very badly.,1 star,they treated me very badly,"[they, treated, me, very, badly]","[treated, badly]"
33392,33393,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,a year ago,The service is very good,5 stars,the service is very good,"[the, service, is, very, good]","[service, good]"
33393,33394,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,a year ago,To remove hunger is enough,4 stars,to remove hunger is enough,"[to, remove, hunger, is, enough]","[remove, hunger, enough]"
33394,33395,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,5 years ago,"It's good, but lately it has become very expen...",5 stars,its good but lately it has become very expensive,"[its, good, but, lately, it, has, become, very...","[good, lately, become, expensive]"


In [38]:
vectors = word2vec_vectors

In [39]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

tokens = data["tokens_no_stopwords"]
labels = data["rating"] 
y = labels.map({'1 star': 0, '2 stars': 1, '3 stars': 2, '4 stars': 3, '5 stars': 4})
y = tf.keras.utils.to_categorical(y, num_classes=5)

X = vectors

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [40]:
def create_cnn_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Dropout(0.5),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(5, activation='softmax')  # 5 классов
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_lstm_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.LSTM(64, return_sequences=False),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.BatchNormalization(),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(5, activation='softmax')  # 5 классов для многоклассовой классификации
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Определение модели GRU
def create_gru_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.GRU(64, return_sequences=False),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.BatchNormalization(),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(5, activation='softmax')  # 5 классов для многоклассовой классификации
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Определение модели BiRNN
def create_birnn_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.BatchNormalization(),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(5, activation='softmax')  # 5 классов для многоклассовой классификации
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [41]:
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping

models = {
    'CNN': create_cnn_model(X_train.shape[1]),
    'LSTM': create_lstm_model(X_train.shape[1]),
    'GRU': create_gru_model(X_train.shape[1]),
    'BiRNN': create_birnn_model(X_train.shape[1]),
}

results = {}

for model_name, model in models.items():
    print(f"Training {model_name} model...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test accuracy: {accuracy*100:.2f}%')
    
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)  # Получаем индекс с максимальной вероятностью
    # Преобразуем y_test обратно в метки, так как они были в one-hot encoding
    y_test_classes = np.argmax(y_test, axis=1)

    results[model_name] = classification_report(y_test_classes, y_pred_classes, target_names=['1 star', '2 stars', '3 stars', '4 stars', '5 stars'])

# Вывод метрик для каждой модели
for model_name, report in results.items():
    print(f"\n{model_name} Model Classification Report:")
    print(report)

Training CNN model...
Epoch 1/5




[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.4386 - loss: 1.3520 - val_accuracy: 0.5006 - val_loss: 1.2553
Epoch 2/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.4925 - loss: 1.2470 - val_accuracy: 0.5027 - val_loss: 1.2443
Epoch 3/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.4953 - loss: 1.2426 - val_accuracy: 0.5075 - val_loss: 1.2377
Epoch 4/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.4999 - loss: 1.2287 - val_accuracy: 0.5034 - val_loss: 1.2419
Epoch 5/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.5035 - loss: 1.2216 - val_accuracy: 0.5058 - val_loss: 1.2399
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5024 - loss: 1.2453
Test accuracy: 50.75%
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 37ms/step - accuracy: 0.3606 - loss: 1.4882 - val_accuracy: 0.4660 - val_loss: 1.4297
Epoch 2/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 32ms/step - accuracy: 0.4516 - loss: 1.3450 - val_accuracy: 0.4853 - val_loss: 1.3097
Epoch 3/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 36ms/step - accuracy: 0.4671 - loss: 1.3131 - val_accuracy: 0.4901 - val_loss: 1.2833
Epoch 4/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 34ms/step - accuracy: 0.4803 - loss: 1.2934 - val_accuracy: 0.4829 - val_loss: 1.2786
Epoch 5/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 33ms/step - accuracy: 0.4757 - loss: 1.2967 - val_accuracy: 0.4856 - val_loss: 1.2825
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4803 - loss: 1.2829
Test accuracy: 48.29%
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0

In [42]:
vectors = word2vec_vectors

In [43]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

tokens = data["tokens_no_stopwords"]
labels = data["rating"] 
y = labels.map({'1 star': 0, '2 stars': 0, '3 stars': 0, '4 stars': 1, '5 stars': 1})
y = tf.keras.utils.to_categorical(y, num_classes=2)

X = vectors

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [44]:
def create_cnn_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Dropout(0.5),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu'),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')  # 5 классов
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_lstm_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.LSTM(64, return_sequences=False),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.BatchNormalization(),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(2, activation='softmax')  # 5 классов для многоклассовой классификации
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Определение модели GRU
def create_gru_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.GRU(64, return_sequences=False),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.BatchNormalization(),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(2, activation='softmax')  # 5 классов для многоклассовой классификации
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Определение модели BiRNN
def create_birnn_model(input_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.BatchNormalization(),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),  # Добавление Dropout для предотвращения переобучения
        tf.keras.layers.Dense(2, activation='softmax')  # 5 классов для многоклассовой классификации
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [45]:
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping

models = {
    'CNN': create_cnn_model(X_train.shape[1]),
    'LSTM': create_lstm_model(X_train.shape[1]),
    'GRU': create_gru_model(X_train.shape[1]),
    'BiRNN': create_birnn_model(X_train.shape[1]),
}

results = {}

for model_name, model in models.items():
    print(f"Training {model_name} model...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test accuracy: {accuracy*100:.2f}%')
    
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)  # Получаем индекс с максимальной вероятностью
    # Преобразуем y_test обратно в метки, так как они были в one-hot encoding
    y_test_classes = np.argmax(y_test, axis=1)

    results[model_name] = classification_report(y_test_classes, y_pred_classes, target_names=['Negative', 'Positive'])

# Вывод метрик для каждой модели
for model_name, report in results.items():
    print(f"\n{model_name} Model Classification Report:")
    print(report)

Training CNN model...
Epoch 1/5




[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.6947 - loss: 0.5641 - val_accuracy: 0.7400 - val_loss: 0.5234
Epoch 2/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.7415 - loss: 0.5119 - val_accuracy: 0.7418 - val_loss: 0.5150
Epoch 3/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 23ms/step - accuracy: 0.7414 - loss: 0.5089 - val_accuracy: 0.7403 - val_loss: 0.5116
Epoch 4/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.7412 - loss: 0.5102 - val_accuracy: 0.7431 - val_loss: 0.5109
Epoch 5/5
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.7469 - loss: 0.5079 - val_accuracy: 0.7458 - val_loss: 0.5127
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7448 - loss: 0.5123
Test accuracy: 74.31%
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3m

In [46]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tokens = data["tokens_no_stopwords"]
labels = data["rating"] 
y = labels.map({'1 star': 0, '2 stars': 0, '3 stars': 0, '4 stars': 1, '5 stars': 1})
y  = tf.keras.utils.to_categorical(y , num_classes=2)


tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(data["cleaned_review"])
sequences = tokenizer.texts_to_sequences(data["cleaned_review"])

X = pad_sequences(sequences, padding='post', maxlen=50)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [47]:
from tensorflow.keras.layers import Attention

def create_gru_model(input_shape, num_classes=2):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.GRU(64, return_sequences=False),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_lstm_model(input_shape, num_classes=2):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.LSTM(64, return_sequences=False),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_birnn_model(input_shape, num_classes=2):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=input_shape),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


def create_seq2seq_model(input_shape, num_classes=2):
    # Encoder
    encoder_input = tf.keras.layers.Input(shape=(input_shape,))
    embedding = tf.keras.layers.Embedding(input_dim=5000, output_dim=128)(encoder_input)
    encoder_output, state_h, state_c = tf.keras.layers.LSTM(64, return_state=True)(embedding)

    # Attention
    attention = tf.keras.layers.Attention()([encoder_output, embedding])

    # Decoder
    decoder_input = tf.keras.layers.RepeatVector(1)(attention)
    decoder_output = tf.keras.layers.LSTM(64, return_sequences=False)(decoder_input)
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(decoder_output)

    model = tf.keras.Model(inputs=encoder_input, outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_seq2seq_rnn_model(input_shape, num_classes=2):
    # Seq2Seq Encoder
    encoder_input = tf.keras.layers.Input(shape=(input_shape,))
    embedding = tf.keras.layers.Embedding(input_dim=5000, output_dim=128)(encoder_input)
    encoder_output, state_h, state_c = tf.keras.layers.LSTM(64, return_state=True)(embedding)
    
    # RNN Classifier
    rnn_input = tf.keras.layers.RepeatVector(1)(encoder_output)
    rnn_output = tf.keras.layers.LSTM(64, return_sequences=False)(rnn_input)
    dense_output = tf.keras.layers.Dense(64, activation='relu')(rnn_output)
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(dense_output)
    
    # Model definition
    model = tf.keras.Model(inputs=encoder_input, outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_seq2seq_birnn_model(input_shape, num_classes=2):
    # Seq2Seq Encoder
    encoder_input = tf.keras.layers.Input(shape=(input_shape,))
    embedding = tf.keras.layers.Embedding(input_dim=5000, output_dim=128)(encoder_input)
    encoder_output, state_h, state_c = tf.keras.layers.LSTM(64, return_state=True)(embedding)
    
    # BiRNN Classifier
    birnn_output = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=False)
    )(tf.keras.layers.RepeatVector(1)(encoder_output))
    dense_output = tf.keras.layers.Dense(64, activation='relu')(birnn_output)
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(dense_output)
    
    # Model definition
    model = tf.keras.Model(inputs=encoder_input, outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [48]:
# Модели для обучения
models = {
    "GRU": create_gru_model(input_shape=X_train.shape[1]),
    "LSTM": create_lstm_model(input_shape=X_train.shape[1]),
    "BiRNN": create_birnn_model(input_shape=X_train.shape[1]),
    "Seq2Seq+RNN": create_seq2seq_rnn_model(input_shape=X_train.shape[1]),
    "Seq2Seq+BiRNN": create_seq2seq_birnn_model(input_shape=X_train.shape[1]),
}

# Обучение и оценка моделей
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))
    y_pred = np.argmax(model.predict(X_test), axis=1)
    y_true = np.argmax(y_test, axis=1)
    results[model_name] = classification_report(y_true, y_pred, target_names=["Negative", "Positive"])
    print(results[model_name])


Training GRU...
Epoch 1/5




[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.6161 - loss: 0.6281 - val_accuracy: 0.8395 - val_loss: 0.3770
Epoch 2/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.8644 - loss: 0.3364 - val_accuracy: 0.8636 - val_loss: 0.3104
Epoch 3/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.8974 - loss: 0.2575 - val_accuracy: 0.8707 - val_loss: 0.2951
Epoch 4/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.9149 - loss: 0.2143 - val_accuracy: 0.8757 - val_loss: 0.3164
Epoch 5/5
[1m835/835[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - accuracy: 0.9293 - loss: 0.1842 - val_accuracy: 0.8734 - val_loss: 0.3297
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
              precision    recall  f1-score   support

    Negative       0.90      0.86      0.88      3482
    Positiv

In [49]:
def ensemble_predict(models, X):
    # Получаем предсказания всех моделей
    predictions = [model.predict(X) for model in models.values()]
    # Усредняем вероятности
    avg_prediction = np.mean(predictions, axis=0)
    return np.argmax(avg_prediction, axis=1)


In [50]:
y_pred = ensemble_predict(models, X_test)

# Оценка ансамбля
y_true = np.argmax(y_test, axis=1)
print(classification_report(y_true, y_pred, target_names=["Negative", "Positive"]))

[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[1m209/209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      3482
    Positive       0.87      0.89      0.88      3198

    accuracy                           0.89      6680
   macro avg       0.89      0.89      0.89      6680
weighted avg       0.89      0.89      0.89      6680



In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

2025-01-16 19:01:45.671632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-16 19:01:45.856396: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-01-16 19:01:45.856413: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-01-16 19:01:46.636612: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-

In [9]:
def map_rating_to_label(rating):
    if rating <= 3:
        return "negative"
    else:
        return "positive"

# Предположим, вы уже загрузили ваш DataFrame `data`
labels = data['rating'].map({'1 star': 1, '2 stars': 2, '3 stars': 3, '4 stars': 4, '5 stars': 5})
data['label'] = labels.apply(map_rating_to_label)

In [35]:
# Преобразуем рейтинг в категории (отрицательный, положительный)


# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_review'], data['label'], test_size=0.2, random_state=42
)

# Создание кастомного Dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.tolist()
        self.labels = [0 if label == "negative" else 1 for label in labels]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encodings = self.tokenizer(
            text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )
        encodings = {key: val.squeeze(0) for key, val in encodings.items()}
        encodings["labels"] = label
        return encodings

# Токенизация
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset = CustomDataset(X_train, y_train, tokenizer)
test_dataset = CustomDataset(X_test, y_test, tokenizer)

# Загрузка модели
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Настройка Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.1,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Обучение модели
trainer.train()

# Сохранение модели
model.save_pretrained("./custom_model")
tokenizer.save_pretrained("./custom_model")

# Оценка модели на тестовых данных
predictions = trainer.predict(test_dataset)
predicted_labels = [0 if pred < 0.5 else 1 for pred in predictions.predictions.argmax(axis=1)]
y_test_labels = [0 if label == "negative" else 1 for label in y_test]

print("Classification Report:")
print(classification_report(y_test_labels, predicted_labels))



Epoch,Training Loss,Validation Loss
1,0.27,0.304154


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      3482
           1       0.89      0.90      0.89      3198

    accuracy                           0.90      6680
   macro avg       0.90      0.90      0.90      6680
weighted avg       0.90      0.90      0.90      6680

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (inotify)
Traceback (most recent call last):
  File "/home/marat/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/marat/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1042, in launch_instance
    app.initialize(argv)
  File "/home/marat/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 113, in inner
    return method(app, *args, **kwargs)
  File "/home/marat/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 678, in initialize
    self.init_sockets()
  File "/home/marat/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 317, in init_sockets
    self.shell_port = self._bind_socket(self.shell_socket, self.shell_port)
  File "/home/marat/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 252, in _bind_socket
    return self._try_bind_

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [10]:
import torch
from sklearn.metrics import accuracy_score


model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

texts = data["cleaned_review"].tolist()  # Предположим, у вас есть колонка "text"
labels = data['label'].tolist()  # И колонка "label"

batch_size = 512  # Можно уменьшить, если ошибка сохраняется
predictions = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i : i + batch_size]
    inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")

    print(f"Processing batch {i//batch_size + 1}: {len(batch_texts)} samples")  # Проверка размера батча

    with torch.no_grad():
        outputs = model(**inputs)

    batch_predictions = torch.argmax(outputs.logits, dim=1).tolist()
    predictions.extend(batch_predictions)  # Добавляем в общий список

print(f"Final Predictions shape: {len(predictions)}") 

Processing batch 1: 512 samples
Processing batch 2: 512 samples
Processing batch 3: 512 samples
Processing batch 4: 512 samples
Processing batch 5: 512 samples
Processing batch 6: 512 samples
Processing batch 7: 512 samples
Processing batch 8: 512 samples
Processing batch 9: 512 samples
Processing batch 10: 512 samples
Processing batch 11: 512 samples
Processing batch 12: 512 samples
Processing batch 13: 512 samples
Processing batch 14: 512 samples
Processing batch 15: 512 samples
Processing batch 16: 512 samples
Processing batch 17: 512 samples
Processing batch 18: 512 samples
Processing batch 19: 512 samples
Processing batch 20: 512 samples
Processing batch 21: 512 samples
Processing batch 22: 512 samples
Processing batch 23: 512 samples
Processing batch 24: 512 samples
Processing batch 25: 512 samples
Processing batch 26: 512 samples
Processing batch 27: 512 samples
Processing batch 28: 512 samples
Processing batch 29: 512 samples
Processing batch 30: 512 samples
Processing batch 31

In [15]:
print(f"Labels shape: {len(labels)}")
print(f"Predictions shape: {len(predictions)}")
print(predictions)
print(labels)

label_mapping = {'negative': 0, 'positive': 1}
numeric_labels = [label_mapping[label] for label in labels]
print(numeric_labels)

Labels shape: 33396
Predictions shape: 33396
[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 

In [16]:
accuracy = accuracy_score(numeric_labels, predictions)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8626
