In [None]:
!pip install datasets
import inspect
import json

import numpy as np
from datasets import load_dataset, Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

# Zero shot

In [None]:
# Load the pretrained zero-shot classification pipeline.
# NOTE(review): facebook/bart-large-mnli is presumably an English-only NLI checkpoint, while the
# texts and candidate labels used below are Russian — TODO confirm and consider a multilingual model.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [None]:
# Candidate categories passed to the zero-shot classifier as candidate_labels
# (Russian for: complaint, suggestion, gratitude).
categories = ["Претензия", "Предложение", "Благодарность"]

In [None]:
# Texts to classify
text = [
    "Сервис ужасный, товар доставили с опозданием.",
    "Сервис отличный, товар доставили вовремя!",
    "Сервис ужасный, мне ничего не понравилось, у меня к вас много претензий",
    "Предлагаю вам улучшить качество обслуживания. Как вы смотрите на мои идеи и предложения сотрудничать?",
    "Спасибо большое за такой сервис, я вам очень благодарен! Самый лучший сервис!",
]
# Prediction: one "labels : rounded scores <--- input text" line per input
result = classifier(text, candidate_labels=categories)
report_lines = []
for res in result:
    rounded_scores = [round(score, 3) for score in res["scores"]]
    report_lines.append(f'{res["labels"]} : {rounded_scores} <--- {res["sequence"]}')
print("\n".join(report_lines))

In [None]:
# Short texts to classify
text = [
    "Всё плохо",
    "Предлагаю идею",
    "Спасибо большое",
]
# Prediction: one "labels : rounded scores <--- input text" line per input
result = classifier(text, candidate_labels=categories)
report_lines = []
for res in result:
    rounded_scores = [round(score, 3) for score in res["scores"]]
    report_lines.append(f'{res["labels"]} : {rounded_scores} <--- {res["sequence"]}')
print("\n".join(report_lines))

In [None]:
# Sanity check: classify the category names themselves
text = categories
# Prediction: one "labels : rounded scores <--- input text" line per input
result = classifier(text, candidate_labels=categories)
report_lines = []
for res in result:
    rounded_scores = [round(score, 3) for score in res["scores"]]
    report_lines.append(f'{res["labels"]} : {rounded_scores} <--- {res["sequence"]}')
print("\n".join(report_lines))

In [None]:
# Text to classify: the whole content of comment1.txt as a one-element batch
with open('./comment1.txt', encoding='utf-8') as file:
    input_text = file.read()
text = [input_text]

# Prediction: one "labels : rounded scores <--- input text" line per input
result = classifier(text, candidate_labels=categories)
report_lines = []
for res in result:
    rounded_scores = [round(score, 3) for score in res["scores"]]
    report_lines.append(f'{res["labels"]} : {rounded_scores} <--- {res["sequence"]}')
print("\n".join(report_lines))

In [None]:
# Text to classify: the whole content of comment2.txt as a one-element batch
with open('./comment2.txt', encoding='utf-8') as file:
    input_text = file.read()
text = [input_text]

# Prediction: one "labels : rounded scores <--- input text" line per input
result = classifier(text, candidate_labels=categories)
report_lines = []
for res in result:
    rounded_scores = [round(score, 3) for score in res["scores"]]
    report_lines.append(f'{res["labels"]} : {rounded_scores} <--- {res["sequence"]}')
print("\n".join(report_lines))

In [None]:
# Text to classify: the whole content of comment3.txt as a one-element batch
with open('./comment3.txt', encoding='utf-8') as file:
    input_text = file.read()
text = [input_text]

# Prediction: one "labels : rounded scores <--- input text" line per input
result = classifier(text, candidate_labels=categories)
report_lines = []
for res in result:
    rounded_scores = [round(score, 3) for score in res["scores"]]
    report_lines.append(f'{res["labels"]} : {rounded_scores} <--- {res["sequence"]}')
print("\n".join(report_lines))

In [None]:
# Text to classify: the whole content of comment4.txt as a one-element batch
with open('./comment4.txt', encoding='utf-8') as file:
    input_text = file.read()
text = [input_text]

# Prediction: one "labels : rounded scores <--- input text" line per input
result = classifier(text, candidate_labels=categories)
report_lines = []
for res in result:
    rounded_scores = [round(score, 3) for score in res["scores"]]
    report_lines.append(f'{res["labels"]} : {rounded_scores} <--- {res["sequence"]}')
print("\n".join(report_lines))

# ruBERT

In [None]:
# 1. Load the training data: a JSON object that is later indexed by "text" and "labels"
with open('./rubert_train_dataset.json', encoding='utf-8') as file:
    data = json.load(file)

# Full (unsplit) dataset built from the same dict
dataset = Dataset.from_dict(mapping=data)

In [None]:
# 2. Load the pretrained model and tokenizer.
# num_labels=3 matches the three target classes named in the evaluation report further down
# ("Претензия", "Предложение", "Благодарность").
model_name = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [None]:
# 3. Tokenization
def preprocess_function(examples, max_length=512):
    """Tokenize a batch of examples for the sequence-classification model.

    :param examples: batch mapping with a "text" entry holding the raw strings
        (this is what ``Dataset.map(..., batched=True)`` passes in).
    :param max_length: hard cap on the tokenized length. Passed explicitly because
        ``truncation=True`` alone only truncates when the tokenizer declares a
        maximum length; NOTE(review): 512 is assumed to be the position limit of
        the BERT-base checkpoint loaded above — confirm against the model config.
    :return: the tokenizer output for the batch, padded to the longest sequence
        in that batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=max_length,
    )

# Hold out 20% of the examples for validation (fixed seed for a repeatable split)
split_parts = train_test_split(
    data["text"],
    data["labels"],
    test_size=0.2,
    random_state=42,
)
train_texts, eval_texts, train_labels, eval_labels = split_parts

# Wrap each split in its own Dataset
train_dataset = Dataset.from_dict({"text": train_texts, "labels": train_labels})
eval_dataset = Dataset.from_dict({"text": eval_texts, "labels": eval_labels})

# Tokenize both splits batch-wise
train_encoded = train_dataset.map(preprocess_function, batched=True)
eval_encoded = eval_dataset.map(preprocess_function, batched=True)


In [None]:
# 4. Training configuration
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy` and later
# dropped the old spelling, so pick whichever keyword the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",         # Directory for checkpoints/results
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=16, # Batch size
    num_train_epochs=10,            # Number of epochs
    weight_decay=0.01,              # Regularization
    save_strategy="epoch",          # Save a checkpoint once per epoch
    logging_dir='./logs',           # Log directory
    logging_steps=10,               # Log the training loss every 10 steps
    **{eval_strategy_key: "epoch"}, # Evaluate once per epoch
)

# 5. Trainer
# The datasets are padded per map() chunk, so rows coming from different chunks can have
# different lengths; the padding collator re-pads every batch to a common length.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=eval_encoded,  # Validation dataset
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [None]:
# 6. Run training
trainer.train()

# 7. Save the fine-tuned model and its tokenizer side by side
# NOTE(review): the path is under ./drive/MyDrive, which presumably requires Google Drive
# to be mounted first — no mount cell is visible here; confirm.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./drive/MyDrive/Esenin colab/finetuned_rubert")

In [None]:
# prompt: plot a graph of loss of trainer

import matplotlib.pyplot as plt

# trainer.state.log_history is a list of dicts. Training-loss entries (logged every
# `logging_steps`) and evaluation entries (logged once per epoch) arrive at different
# frequencies, so each curve needs its own x values — a single shared `epochs` list
# has a different length from either loss list and makes plt.plot raise.
if hasattr(trainer, 'state'):
    if hasattr(trainer.state, 'log_history'):
        log_history = trainer.state.log_history

        # (epoch, loss) pairs for training-loss entries only
        train_points = [
            (log['epoch'], log['loss'])
            for log in log_history
            if 'epoch' in log and 'loss' in log and 'eval_loss' not in log
        ]
        # (epoch, eval_loss) pairs for evaluation entries only
        eval_points = [
            (log['epoch'], log['eval_loss'])
            for log in log_history
            if 'epoch' in log and 'eval_loss' in log
        ]

        train_epochs = [epoch for epoch, _ in train_points]
        train_losses = [loss for _, loss in train_points]
        eval_epochs = [epoch for epoch, _ in eval_points]
        eval_losses = [loss for _, loss in eval_points]

        # Plotting training and eval loss
        plt.figure(figsize=(10, 6))
        plt.plot(train_epochs, train_losses, label="Training Loss")
        plt.plot(eval_epochs, eval_losses, label="Evaluation Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Training and Evaluation Loss")
        plt.legend()
        plt.grid(True)
        plt.show()
    else:
        print("Log history not found in trainer.state")
else:
    print("State attribute not found in the trainer object")

In [None]:
# prompt: plot a graph of following data:
# 1   0.702900	0.412326
# 2	0.152200	0.082027
# 3	0.021600	0.012879
# 4	0.008200	0.011686
# 5	0.005200	0.004955
# 6	0.004100	0.004149
# 7	0.003500	0.003836
# 8	0.003100	0.003733
# 9	0.002900	0.003627
# 10	0.002700	0.003622

import matplotlib.pyplot as plt

# Data copied from the table above.
# NOTE(review): the first column is presumably the epoch number (10 rows, matching
# num_train_epochs=10), followed by training and validation loss — confirm.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y1 = [0.702900, 0.152200, 0.021600, 0.008200, 0.005200, 0.004100, 0.003500, 0.003100, 0.002900, 0.002700]
y2 = [0.412326, 0.082027, 0.012879, 0.011686, 0.004955, 0.004149, 0.003836, 0.003733, 0.003627, 0.003622]

# Plotting
plt.plot(x, y1, label="Training Loss")
plt.plot(x, y2, label="Validation Loss")

# Meaningful axis labels and a title so the figure stands on its own
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")

# Adding legend
plt.legend()

# Displaying the plot
plt.show()

In [None]:
# 8. Predictions on the validation set
predictions = trainer.predict(eval_encoded)  # uses the tokenized validation split
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Ground-truth labels from eval_dataset
true_labels = eval_dataset["labels"]

# 9. Classification quality report
# Class ids 0..2 are passed explicitly: without `labels`, classification_report raises
# when a class is missing from both the true and predicted labels, because the number
# of observed classes no longer matches len(target_names).
target_names = ["Претензия", "Предложение", "Благодарность"]
print(classification_report(
    true_labels,
    predicted_labels,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0
))

In [None]:
from collections import Counter

# Number of examples per class: training labels vs. predicted validation labels
print(Counter(train_dataset["labels"]))
print(Counter(predicted_labels))

# predictions.predictions holds the raw model outputs (the same array argmax was taken
# over), not probabilities, so apply a numerically stable softmax per row first.
logits = np.asarray(predictions.predictions)
shifted_logits = logits - logits.max(axis=1, keepdims=True)
exp_scores = np.exp(shifted_logits)
probabilities = exp_scores / exp_scores.sum(axis=1, keepdims=True)  # per-class probabilities
print(probabilities[:5])  # first 5 rows

In [None]:
# Collect (true label, predicted label, text) triples where the prediction is wrong
labelled_rows = zip(true_labels, predicted_labels, eval_texts)
incorrect_predictions = [row for row in labelled_rows if row[0] != row[1]]

# Print every misclassified validation example
for true, pred, text in incorrect_predictions:
    print(f"True: {true}, Predicted: {pred}, Text: {text}")

In [None]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer from the directory they were saved to.
# Note: this rebinds `classifier`, replacing the zero-shot pipeline created earlier, so the
# zero-shot cells above no longer work after this cell has run.
model_path = "./drive/MyDrive/Esenin colab/finetuned_rubert"  # Path to the fine-tuned model
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)

# Classification helper built on the module-level `classifier` pipeline
def classify_feedback(feedbacks):
    """
    Classify a list of feedback texts with the module-level ``classifier`` pipeline.

    Inputs are truncated to 512 tokens so that long reviews cannot exceed the
    model's input limit (NOTE(review): 512 is assumed to be the limit of the
    BERT-base checkpoint — confirm against the model config).

    :param feedbacks: List[str] - feedback texts; a single str is treated as a
        one-element list
    :return: List[Dict] - one dict per input with keys "text", "label" and
        "score" (rounded to 2 decimals); an empty list for empty input
    """
    # A bare string would otherwise be zipped character by character below.
    if isinstance(feedbacks, str):
        feedbacks = [feedbacks]
    feedbacks = list(feedbacks)
    if not feedbacks:
        return []

    results = classifier(feedbacks, truncation=True, max_length=512)
    return [
        {
            "text": feedback,
            "label": result["label"],
            "score": round(result["score"], 2)  # rounded for readability
        }
        for feedback, result in zip(feedbacks, results)
    ]


In [None]:
# Sample reviews: read the six comment files in order
comment_paths = (
    './comment1.txt',
    './comment2.txt',
    './comment3.txt',
    './comment4.txt',
    './comment5.txt',
    './comment6.txt',
)
feedbacks = []
for comment_path in comment_paths:
    with open(comment_path, 'r', encoding='utf-8') as file:
        input_text = file.read()
    feedbacks.append(input_text)

# Classify the reviews
classified_feedbacks = classify_feedback(feedbacks)

# Print each review followed by its predicted category and confidence
for feedback in classified_feedbacks:
    print(f"Отзыв: {feedback['text']}")
    print(f"Категория: {feedback['label']}, Уверенность: {feedback['score']}\n")


# Test

In [None]:
def preprocessing(json_data, max_chars=512):
    """Clean one review record's text for classification.

    Removes paragraph/list HTML tags (opening and closing), collapses every run
    of whitespace (including newlines and carriage returns) into a single space,
    trims the ends and caps the result at ``max_chars`` characters.

    :param json_data: mapping with a "text" entry holding the raw review string
    :param max_chars: maximum number of characters to keep (default 512)
    :return: the cleaned, possibly truncated text
    :raises KeyError: if the record has no "text" entry
    """
    text = json_data['text']
    # Replace tags with a space rather than nothing so that words on either side
    # of a tag (e.g. "...end</p><p>Start...") do not get glued together.
    for tag in ('<p>', '</p>', '<ul>', '</ul>', '<li>', '</li>'):
        text = text.replace(tag, ' ')
    # split()/join collapses all whitespace runs and strips both ends in one pass.
    text = ' '.join(text.split())
    # The cut can land right after a space; drop it so the result stays trimmed.
    return text[:max_chars].rstrip()

In [None]:
def load_json_data(filepath):
    """Read and parse a UTF-8 encoded JSON file.

    :param filepath: path of the JSON file to read
    :return: the parsed JSON value, or None (after printing an error message)
        when the file does not exist or does not contain valid JSON
    """
    parsed = None
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            parsed = json.load(file)
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{filepath}'.")
    return parsed

In [None]:
# Example usage: load the reviews file and print the cleaned text of the first record.
# load_json_data returns None on a missing or invalid file, hence the truthiness guard.
json_data = load_json_data('./drive/MyDrive/Esenin colab/finetuned_rubert/reviews_bankiru_promsvyazbank.json')
if json_data:
    print(preprocessing(json_data[0]))

In [None]:
# Display the first raw record. Unlike the cell above this is not guarded, so it raises
# if json_data is None (failed load) or empty.
json_data[0]

In [None]:
# prompt: get the texts from json_data with preprocessing and pass them to classify_feedback. Print the output correspondingly for feedback['label']: LABEL_0 is "Претензия", LABEL_1 is "Предложение", LABEL_2 is "Благодарность"

LABEL_0 = "Претензия"
LABEL_1 = "Предложение"
LABEL_2 = "Благодарность"

# Pipeline label id -> human-readable category (built once, not once per record)
label_mapping = {
    "LABEL_0": LABEL_0,
    "LABEL_1": LABEL_1,
    "LABEL_2": LABEL_2
}

json_data = load_json_data('./drive/MyDrive/Esenin colab/real_feedback123.json')
# load_json_data returns None (after printing the reason) when the file is missing or
# malformed; stop here with a clear error instead of a TypeError in the loop below.
if json_data is None:
    raise RuntimeError("Could not load the feedback file; see the error printed above.")

# Clean every record's text, then classify the whole batch
texts_to_classify = [preprocessing(item) for item in json_data]
classified_feedbacks = classify_feedback(texts_to_classify)

resmet = []  # predicted category per record, in input order
result = []  # one output record per input: its metadata (when present) plus the predicted label
for feedback, item in zip(classified_feedbacks, json_data):
    predicted_category = label_mapping.get(feedback['label'], 'Unknown')
    resmet.append(predicted_category)

    # Carry over the review metadata only when the source record has it; not every
    # input file is guaranteed to provide these keys.
    record = {key: item[key] for key in ('grade', 'dateCreate') if key in item}
    record['label'] = predicted_category
    result.append(record)


In [None]:
# prompt: compare predicted `resmet` values and the truth values from `json_data['label']`. Compute the accuracy

from sklearn.metrics import accuracy_score

# Relies on `resmet` (predicted categories) and `json_data` from the classification cell above.
# Ground-truth category of every record, in the same order as the predictions.
true_labels = list(record['label'] for record in json_data)
accuracy = accuracy_score(y_true=true_labels, y_pred=resmet)
print(f"Accuracy: {accuracy}")

In [None]:
# Write the `result` list to Drive as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII (Cyrillic) characters readable instead of \u-escaping them.
with open('./drive/MyDrive/Esenin colab/labeled_reviews.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, ensure_ascii=False, indent=4)
