In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import spacy
import re
import string
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import *
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertModel
import numpy as np
from pprint import pprint
from tqdm.notebook import tqdm
from transformers import pipeline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Датасет

Для выполгнения лабораторной работы был выбран датасет с отзывами на электронику (https://huggingface.co/datasets/rkf2778/amazon_reviews_mobile_electronics).

В качестве классов рассматриваются оценки пользователей по 5-балльной шкале (столбец `star_rating`).

In [None]:
splits = {'train': 'train.csv', 'test': 'test.csv'}
train_df = pd.read_csv("hf://datasets/rkf2778/amazon_reviews_mobile_electronics/" + splits["train"])[['review_body', 'star_rating']]
test_df = pd.read_csv("hf://datasets/rkf2778/amazon_reviews_mobile_electronics/" + splits["test"])[['review_body', 'star_rating']]

display(train_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,review_body,star_rating
0,Item was as discribed ; was shipped fast and f...,5
1,Came in perfect. Works perfectly,5
2,"Good quality, came with out the extra boom mic...",2
3,I do not like reading my Kindle in a case so t...,5
4,The case came 3 days after I ordered it. It fi...,5
...,...,...
68899,Nice very pretty,3
68900,I absolutley love this! So glad that I bought ...,4
68901,I bought this to give me addtional battery pow...,5
68902,"Great sound from a little box, and what seems ...",5


## Предподготовка данных

Удаляем пустые отзывы, поскольку такие имеются в датасете.

In [None]:
train_df = train_df.dropna(subset=['review_body'])
test_df = test_df.dropna(subset=['review_body'])

Удаляем html-фрагменты и перенос строки ('\n') из отзывов.

In [None]:
train_df['review_body'] =  train_df['review_body'].apply(lambda x: re.sub(r'<(.*?)>', "", str(x)))
test_df['review_body'] =  test_df['review_body'].apply(lambda x: re.sub(r'<(.*?)>', "", str(x)))

train_df['review_body'] =  train_df['review_body'].apply(lambda x: str(x).replace("\n", ""))
test_df['review_body'] =  test_df['review_body'].apply(lambda x: str(x).replace("\n", ""))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['review_body'] =  train_df['review_body'].apply(lambda x: re.sub(r'<(.*?)>', "", str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['review_body'] =  train_df['review_body'].apply(lambda x: str(x).replace("\n", ""))


Приводим все слова к нижнему регистру.

Также появлется новый столбец (`tkn`), в которому будут находиться токены каждого текста.

In [None]:
train_df['tkn'] = train_df['review_body'].apply(str.lower)
test_df['tkn'] = test_df['review_body'].apply(str.lower)

train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['tkn'] = train_df['review_body'].apply(str.lower)


Unnamed: 0,review_body,star_rating,tkn
0,Item was as discribed ; was shipped fast and f...,5,item was as discribed ; was shipped fast and f...
1,Came in perfect. Works perfectly,5,came in perfect. works perfectly
2,"Good quality, came with out the extra boom mic...",2,"good quality, came with out the extra boom mic..."
3,I do not like reading my Kindle in a case so t...,5,i do not like reading my kindle in a case so t...
4,The case came 3 days after I ordered it. It fi...,5,the case came 3 days after i ordered it. it fi...
...,...,...,...
68899,Nice very pretty,3,nice very pretty
68900,I absolutley love this! So glad that I bought ...,4,i absolutley love this! so glad that i bought ...
68901,I bought this to give me addtional battery pow...,5,i bought this to give me addtional battery pow...
68902,"Great sound from a little box, and what seems ...",5,"great sound from a little box, and what seems ..."


Токенизируем и лемманизируем каждый отзыв.

In [None]:
tokenize = spacy.load("en_core_web_sm")
train_df['tkn'] =  train_df['tkn'].apply(lambda x: [i.lemma_ for i in tokenize(x)])
test_df['tkn'] =  test_df['tkn'].apply(lambda x: [i.lemma_ for i in tokenize(x)])

Получется следующий результат:

In [None]:
display(train_df)

Unnamed: 0,review_body,star_rating,tkn
0,Item was as discribed ; was shipped fast and f...,5,"[item, be, as, discribe, ;, be, ship, fast, an..."
1,Came in perfect. Works perfectly,5,"[come, in, perfect, ., work, perfectly]"
2,"Good quality, came with out the extra boom mic...",2,"[good, quality, ,, come, with, out, the, extra..."
3,I do not like reading my Kindle in a case so t...,5,"[I, do, not, like, read, my, kindle, in, a, ca..."
4,The case came 3 days after I ordered it. It fi...,5,"[the, case, come, 3, day, after, I, order, it,..."
...,...,...,...
68899,Nice very pretty,3,"[nice, very, pretty]"
68900,I absolutley love this! So glad that I bought ...,4,"[I, absolutley, love, this, !, so, glad, that,..."
68901,I bought this to give me addtional battery pow...,5,"[I, buy, this, to, give, I, addtional, battery..."
68902,"Great sound from a little box, and what seems ...",5,"[great, sound, from, a, little, box, ,, and, w..."


Как видно, среди токенов есть знаки препинания, числа и слова, которые не несут практического применения в нашей задаче, но везде используются (стоп-слова). Все это удаляем.

In [None]:
stop_words = stopwords.words('english')
print(stop_words)
print(len(stop_words))
punct = string.digits + '...' + string.punctuation + ' I'
train_df['tkn'] = train_df['tkn'].apply(lambda x: ' '.join([i for i in x if (i not in punct) and (i not in stop_words)]))
test_df['tkn'] = test_df['tkn'].apply(lambda x: ' '.join([i for i in x if (i not in punct) and (i not in stop_words)]))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Получется следующий результат:

In [None]:
train_df

Unnamed: 0,review_body,star_rating,tkn
0,Item was as discribed ; was shipped fast and f...,5,item discribe ship fast function well would bu...
1,Came in perfect. Works perfectly,5,come perfect work perfectly
2,"Good quality, came with out the extra boom mic...",2,good quality come extra boom mic cushion cord ...
3,I do not like reading my Kindle in a case so t...,5,like read kindle case sleeve great good protec...
4,The case came 3 days after I ordered it. It fi...,5,case come day order fit ipod really well color...
...,...,...,...
68899,Nice very pretty,3,nice pretty
68900,I absolutley love this! So glad that I bought ...,4,absolutley love glad buy review read would exp...
68901,I bought this to give me addtional battery pow...,5,buy give addtional battery power cell phone us...
68902,"Great sound from a little box, and what seems ...",5,great sound little box seem excellent battery ...


## LinearSVC

Используем `LinearSVC`, который находит гиперплоскости для максимизации расстояния между классифицированными образцами.

In [None]:
vector = TfidfVectorizer(ngram_range=(1, 3))
vector_train = vector.fit_transform(train_df["tkn"])
clf = LinearSVC()
clf.fit(vector_train, train_df["star_rating"])
vector_test = vector.transform(test_df["tkn"])
pred = clf.predict(vector_test)
report = classification_report(test_df["star_rating"], pred)
print(report)

              precision    recall  f1-score   support

           1       0.65      0.77      0.70      3043
           2       0.35      0.07      0.12      1267
           3       0.32      0.13      0.19      1591
           4       0.37      0.30      0.33      2867
           5       0.72      0.89      0.80      8459

    accuracy                           0.64     17227
   macro avg       0.48      0.43      0.43     17227
weighted avg       0.59      0.64      0.60     17227



`Вывод:` Как видно, модель делает меньше ошибок для 1 и 5 классов, что логично, ведь в обучающей выборке классы представлены в неравных пропорциях. Отзывов из 1 и 5 групп больше, чем остальных, поэтому и модель выявляет их лучше.

Учитывая то, что модель находиит гиперплоскости для разделения классов, можно сказать, что результат получился достаточно неплохим, но не идеальным.

## BERT

### Без дообучения

Используем теперь нейронную сеть `BERT`. Посмотрим какой результат покажет модель без дообучения.

In [None]:
train_review = train_df["review_body"]
test_review = test_df["review_body"]
train_labels = train_df["star_rating"] - 1
test_labels = test_df["star_rating"] - 1

token_bert = BertTokenizer.from_pretrained('bert-base-cased')
train_encodings = token_bert(list(train_review), truncation=True, padding=True, return_tensors="tf")
test_encodings = token_bert(list(test_review), truncation=True, padding=True, return_tensors="tf")

train_dataset_bert = tf.data.Dataset.from_tensor_slices((dict(train_encodings), list(train_labels))).batch(8)

model_1 = TFBertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)
model_1.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=0.0001), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

test_dataset_bert = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(8)
pred_1 = model_1.predict(test_dataset_bert).logits
pred_labels = tf.argmax(pred_1, axis=1).numpy()

print(pred_labels)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[3 3 3 ... 3 3 3]


Оценим полученный результат:



In [None]:
accuracy = accuracy_score(test_labels, pred_labels)
precision = precision_score(test_labels, pred_labels, average='macro')
recall = recall_score(test_labels, pred_labels, average='macro')
f1 = f1_score(test_labels, pred_labels, average='macro')
conf_matrix = confusion_matrix(test_labels, pred_labels)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.16
Precision: 0.22
Recall: 0.20
F1 Score: 0.08
Confusion Matrix:
[[   1    0  676 2357    9]
 [   0    0  219 1046    2]
 [   1    0  234 1353    3]
 [   1    0  435 2426    5]
 [   1    0 1437 6995   26]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


`Вывод:` Как видно, без дообучения, модель работает очень плохо и распределяет отзывы преимущественно в две группы, причем они не самые популярные в рассматриваемых данных (самые популярные 1 и 5 классы, а модель решила, что 3 и 4). Она совсем не смогла понять данные и критерии, по которым надо классифицировать.

### С дообучением

Теперь посмотрим какой результат покажет BERT, если дообучить модель. Добавим три эпохи, чтобы модель могла изучить конкретно наши данные и найти закономерности.

In [None]:
train_review = train_df["review_body"]
test_review = test_df["review_body"]
train_labels = train_df["star_rating"] - 1
test_labels = test_df["star_rating"] - 1

token_bert = BertTokenizer.from_pretrained('bert-base-cased')
train_encodings_2 = token_bert(list(train_review), truncation=True, padding=True, max_length=128, return_tensors="tf")
test_encodings_2 = token_bert(list(test_review), truncation=True, padding=True, max_length=128, return_tensors="tf")

train_dataset_bert_2 = tf.data.Dataset.from_tensor_slices((dict(train_encodings_2), list(train_labels))).batch(2)

model_2 = TFBertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)
model_2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
model_2.fit(train_dataset_bert_2, epochs=3)

test_dataset_bert_2 = tf.data.Dataset.from_tensor_slices(dict(test_encodings_2)).batch(2)
pred_2 = model_2.predict(test_dataset_bert_2).logits
pred_labels_2 = tf.argmax(pred_2, axis=1).numpy()

print(pred_labels_2)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
[4 4 4 ... 4 4 4]


Оценим полученный результат:

In [None]:
accuracy = accuracy_score(test_labels, pred_labels_2)
precision = precision_score(test_labels, pred_labels_2, average='macro')
recall = recall_score(test_labels, pred_labels_2, average='macro')
f1 = f1_score(test_labels, pred_labels_2, average='macro')
conf_matrix = confusion_matrix(test_labels, pred_labels_2)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.49
Precision: 0.10
Recall: 0.20
F1 Score: 0.13
Confusion Matrix:
[[   0    0    0    0 3043]
 [   0    0    0    0 1267]
 [   0    0    0    0 1591]
 [   0    0    0    0 2867]
 [   0    0    0    0 8459]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


`Вывод:` Можно увидеть, что модель с дообучением выявила, что больше всего отзывов 5-ой группы, и, видимо, решила, что в этом и заключается логика датасета. Теперь она распределяет все только в 5-ый класс, поскольку он самый частые в датасете. Из-за этого показатели выше, но по факту модель нельзя назвать рабочей.

Из-за большого количества классов эта проблема выражена ярче, нежели бы у нас было 2 класса.

## Zero-shot-classification



Теперь используем `zero-shot-classification`, который сразу принимает тестовые данные, не обучаясь на тренировочных.

Будем использовать модель DeBERTa.

In [None]:
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", device="cuda")
names_labels = ['one star', 'two stars', 'three stars', 'four stars', 'five stars']
pred_zc = []
for text in tqdm(test_df.review_body.tolist()):
  output = classifier(text, names_labels)
  pred_zc.append(names_labels.index(output["labels"][0]))

  0%|          | 0/17227 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Оценим полученный результат:

In [None]:
accuracy = accuracy_score(test_df.star_rating.tolist(), pred_zc)
precision = precision_score(test_df.star_rating.tolist(), pred_zc, average='macro')
recall = recall_score(test_df.star_rating.tolist(), pred_zc, average='macro')
f1 = f1_score(test_df.star_rating.tolist(), pred_zc, average='macro')
conf_matrix = confusion_matrix(test_df.star_rating.tolist(), pred_zc)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.07
Precision: 0.07
Recall: 0.08
F1 Score: 0.07
Confusion Matrix:
[[   0    0    0    0    0    0]
 [2156  408  218  139  122    0]
 [ 704  261  111   93   98    0]
 [ 770  328  169  113  211    0]
 [ 939  733  290  380  525    0]
 [2281 2201 1000  705 2272    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


`Вывод:` Модель на представленных данных совсем не справляется со своей задачей. Результат невероятно плохой. Видимо все также сказывается неравномерное распределение отзывов по классам.

`WINER:` в итоге, лучше всего справился с задачей метод опорных векторов `LinearSVC`.