In [None]:
import pandas as pd

In [None]:
# Held-out test split. squeeze("columns") turns a single-column frame into a
# Series, which later cells require (.tolist() in tokenize, label lookups by
# position); frames with more than one column are left unchanged.
X_test = pd.read_csv('X_test.csv').squeeze("columns")
y_test = pd.read_csv('y_test.csv').squeeze("columns")

In [None]:
# Training split. squeeze("columns") turns a single-column frame into a Series,
# which later cells require (X_train.loc[y_train == 1], X_train.iloc[i].split);
# frames with more than one column are left unchanged.
X_train = pd.read_csv('X_train.csv').squeeze("columns")
y_train = pd.read_csv('y_train.csv').squeeze("columns")

In [None]:
# Validation split. squeeze("columns") turns a single-column frame into a
# Series, which later cells require (.tolist() in tokenize, label lookups by
# position); frames with more than one column are left unchanged.
X_val = pd.read_csv('X_val.csv').squeeze("columns")
y_val = pd.read_csv('y_val.csv').squeeze("columns")

In [None]:
pip install torch

In [None]:
pip install transformers

In [None]:
# Импорт библиотек
import torch
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

In [None]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Reports the selected device (the message text is Russian for "Device in use").
print(f"Используемое устройство: {device}")

In [None]:
pip install sentencepiece

In [None]:
# Load the pretrained RuBERT tokenizer and a BERT classifier with a 2-class head.
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)  # Move the model to the selected device (GPU only if one is available)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [None]:
import torch.nn as nn

In [None]:
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
def tokenize(texts, max_length=256, tok=None):
    """Tokenize a collection of texts into padded, truncated PyTorch tensors.

    Args:
        texts: a pandas Series, a single-column DataFrame, a NumPy array, any
            iterable of strings, or a single string.
        max_length: maximum number of tokens kept per text; longer inputs are
            truncated.
        tok: tokenizer callable to use. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        Whatever the tokenizer returns for ``return_tensors="pt"``.

    Raises:
        ValueError: if ``texts`` is a DataFrame with more than one column, since
            it is then ambiguous which column holds the text.
    """
    if isinstance(texts, pd.DataFrame):
        if texts.shape[1] != 1:
            raise ValueError(
                f"tokenize() expects a single text column, got {texts.shape[1]} columns: "
                f"{list(texts.columns)}"
            )
        texts = texts.iloc[:, 0]

    if isinstance(texts, str):
        # A bare string would otherwise be iterated character by character.
        texts = [texts]
    elif hasattr(texts, "tolist"):
        texts = texts.tolist()
    else:
        texts = list(texts)

    if tok is None:
        tok = tokenizer

    return tok(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
        verbose=True,
    )

In [None]:
train_encodings = tokenize(X_train)

In [None]:
val_encodings = tokenize(X_val)

In [None]:
test_encodings = tokenize(X_test)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Args:
        encodings: mapping from field name to a tensor whose first dimension
            indexes samples (the tokenizer output).
        labels: one label per sample - a pandas Series, a single-column
            DataFrame, a NumPy array or a list.
        target_device: device every returned tensor is moved to. Defaults to
            the notebook-level ``device``.

    Raises:
        ValueError: if ``labels`` is not one-dimensional (after dropping a
            singleton column axis) or its length differs from the encodings.
    """

    def __init__(self, encodings, labels, target_device=None):
        label_array = np.asarray(labels)
        # A single-column DataFrame arrives as (n, 1); flatten it so every
        # sample gets a scalar label rather than a 1-element row.
        if label_array.ndim == 2 and label_array.shape[1] == 1:
            label_array = label_array[:, 0]
        if label_array.ndim != 1:
            raise ValueError(
                f"labels must be one-dimensional, got shape {label_array.shape}"
            )

        for key, val in encodings.items():
            if len(val) != len(label_array):
                raise ValueError(
                    f"encodings[{key!r}] has {len(val)} rows but there are "
                    f"{len(label_array)} labels"
                )

        self.encodings = encodings
        self.labels = labels
        # Converted once up front; int64 is the class-index dtype that
        # CrossEntropyLoss expects.
        self._label_tensor = torch.as_tensor(label_array, dtype=torch.long)
        self.target_device = target_device if target_device is not None else device

    def __getitem__(self, idx):
        """Return the idx-th sample as a dict of tensors on ``target_device``."""
        item = {
            key: val[idx].to(self.target_device)
            for key, val in self.encodings.items()
        }
        item['labels'] = self._label_tensor[idx].to(self.target_device)
        return item

    def __len__(self):
        """Number of samples."""
        return len(self._label_tensor)

In [None]:
# Pair each tokenized split with its labels.
train_dataset = TextDataset(train_encodings, y_train)
test_dataset = TextDataset(test_encodings, y_test)
val_dataset = TextDataset(val_encodings, y_val)

# Only the training loader shuffles; validation order stays fixed.
# NOTE: no loader is created for test_dataset in this cell.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Optimizer over all model parameters, plus the training-loop settings.
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
# Best validation F1 so far; the training loop checkpoints only when this is
# strictly exceeded, so nothing is saved while F1 stays at 0.
best_f1 = 0

In [None]:
from tqdm import tqdm, trange
# Fine-tune for num_epochs epochs. After every epoch the validation set is
# scored and the model is checkpointed when validation F1 beats best_f1.
for epoch in trange(num_epochs):
    # Training
    model.train()
    total_loss = 0

    for batch in train_loader:
        labels = batch['labels']
        optimizer.zero_grad()
        # The whole batch dict (including 'labels') is passed to the model, but
        # the loss that is back-propagated comes from `criterion` on the logits.
        outputs = model(**batch)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # Validation
    model.eval()
    val_preds = []
    val_true = []
    
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(batch['labels'].cpu().numpy())
    
    accuracy = accuracy_score(val_true, val_preds)
    f1 = f1_score(val_true, val_preds)
    
    # Per-epoch report: mean training loss per batch and validation metrics.
    print(f"Эпоха {epoch+1}/{num_epochs}")
    print(f"Train Loss: {total_loss/len(train_loader):.4f}")
    print(f"Val Accuracy: {accuracy:.4f}, F1: {f1:.4f}\n")
    # Keep only the best checkpoint; the file is overwritten on each improvement.
    if f1 > best_f1:
        best_f1 = f1
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, 'best_model.pth')
        print("Cool, that's awesome scores! Model saved to best_model.pth")


### Best model metrics

In [None]:
from tqdm import tqdm, trange

# Rebuild the architecture and restore the weights stored in best_model.pth.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
model.load_state_dict(torch.load('best_model.pth')['model_state_dict'])
# Validation
model.eval()
val_preds = []        # predicted class ids (argmax over the logits)
val_preds_proba = []  # raw per-sample logits, despite the name; not yet probabilities
val_true = []         # ground-truth labels

with torch.no_grad():
    for batch in tqdm(val_loader):
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # extend() iterates the batch tensor, so one row is stored per sample.
        val_preds_proba.extend(logits)
        val_preds.extend(preds.cpu().numpy())
        val_true.extend(batch['labels'].cpu().numpy())

accuracy = accuracy_score(val_true, val_preds)
f1 = f1_score(val_true, val_preds)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(val_true, val_preds))
print(confusion_matrix(val_true, val_preds))

In [None]:
# Bring every stored logit row back to host memory as a NumPy array.
preds_proba = [row.cpu().numpy() for row in val_preds_proba]

In [None]:
preds_proba = torch.softmax(torch.from_numpy(np.array(preds_proba)), dim=-1).cpu().numpy()

In [None]:
preds = (preds_proba[:, 1] > 0.79) * 1
print(classification_report(val_true, preds))
print(confusion_matrix(val_true, preds))

In [None]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(val_true, preds_proba[:, 1])

In [None]:
import matplotlib.pyplot as plt

# precision_recall_curve returns one more precision/recall value than
# thresholds: entry i belongs to thresholds[i] and the final entry has no
# threshold. Drop the LAST element (not the first) to align the arrays.
plt.plot(thresholds, precision[:-1], 'r-', label='precision')
plt.plot(thresholds, recall[:-1], 'b-', label='recall')
plt.xlabel('decision threshold on P(class 1)')
plt.ylabel('score')
plt.title('Validation precision / recall vs. threshold')
plt.legend()
plt.show()

In [None]:
from tqdm import tqdm, trange

# Rebuild the architecture and restore the weights stored in best_model.pth.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
model.load_state_dict(torch.load('best_model.pth')['model_state_dict'])
# Test evaluation
model.eval()
test_preds = []        # predicted class ids (argmax over the logits)
test_preds_proba = []  # raw per-sample logits, despite the name
test_true = []         # ground-truth labels

# No test loader exists at this point in the notebook, so build it here. The
# previous version iterated val_loader and reported validation metrics as
# test metrics.
test_loader = DataLoader(test_dataset, batch_size=batch_size)

with torch.no_grad():
    for batch in tqdm(test_loader):
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # extend() iterates the batch tensor, so one row is stored per sample.
        test_preds_proba.extend(logits)
        test_preds.extend(preds.cpu().numpy())
        test_true.extend(batch['labels'].cpu().numpy())

accuracy = accuracy_score(test_true, test_preds)
f1 = f1_score(test_true, test_preds)

In [None]:
# Bring every stored logit row back to host memory as a NumPy array.
preds_proba_test = [row.cpu().numpy() for row in test_preds_proba]

In [None]:
preds_proba_test = torch.softmax(torch.from_numpy(np.array(preds_proba_test)), dim=-1).cpu().numpy()

In [None]:
preds = (preds_proba_test[:, 1] > 0.79) * 1
print(classification_report(test_true, preds))
print(confusion_matrix(test_true, preds))

### Augmentation

In [None]:
train_dataset = pd.DataFrame({'message_txt': X_train, 'error': y_train})
train_dataset

In [None]:
# Split the training frame by class label
minority = train_dataset[train_dataset['error'] == 1]
majority = train_dataset[train_dataset['error'] == 0]

# "Augment" the minority class by duplicating it verbatim (plain oversampling)
minority_augmented = minority.copy()

# Combine and shuffle; the fixed seed makes the row order reproducible on re-run
df_balanced = pd.concat([majority, minority, minority_augmented]).sample(frac=1, random_state=42)

In [None]:
df_balanced.error.value_counts()

### Обучение на аугментированных данных 

In [None]:
# Start again from the pretrained checkpoint so this experiment does not
# inherit weights from the previous fine-tuning run.
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

In [None]:
train_encodings = tokenize(df_balanced.message_txt)

In [None]:
df_balanced.message_txt

In [None]:
# Training data now comes from the oversampled frame; the test set is unchanged.
train_dataset = TextDataset(train_encodings, df_balanced.error)
test_dataset = TextDataset(test_encodings, y_test)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_dataset is not rebuilt here; it is the object left over from an earlier cell.
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Fresh optimizer bound to the re-initialised model, plus the loop settings.
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
# Reset so the checkpoint written by this run is judged on its own scores.
best_f1 = 0

In [None]:
from tqdm import tqdm, trange
# Fine-tune for num_epochs epochs. After every epoch the validation set is
# scored and the model is checkpointed when validation F1 beats best_f1.
for epoch in trange(num_epochs):
    # Training
    model.train()
    total_loss = 0

    for batch in train_loader:
        labels = batch['labels']
        optimizer.zero_grad()
        # The whole batch dict (including 'labels') is passed to the model, but
        # the loss that is back-propagated comes from `criterion` on the logits.
        outputs = model(**batch)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # Validation
    model.eval()
    val_preds = []
    val_true = []
    
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(batch['labels'].cpu().numpy())
    
    accuracy = accuracy_score(val_true, val_preds)
    f1 = f1_score(val_true, val_preds)
    
    # Per-epoch report: mean training loss per batch and validation metrics.
    print(f"Эпоха {epoch+1}/{num_epochs}")
    print(f"Train Loss: {total_loss/len(train_loader):.4f}")
    print(f"Val Accuracy: {accuracy:.4f}, F1: {f1:.4f}\n")
    # Keep only the best checkpoint; this overwrites the same best_model.pth
    # path that the other training cells in this notebook write to.
    if f1 > best_f1:
        best_f1 = f1
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, 'best_model.pth')
        print("Cool, that's awesome scores! Model saved to best_model.pth")

In [None]:
from tqdm import tqdm, trange

# Rebuild the architecture and restore the weights stored in best_model.pth.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
model.load_state_dict(torch.load('best_model.pth')['model_state_dict'])
# Validation
model.eval()
val_preds = []        # predicted class ids (argmax over the logits)
val_preds_proba = []  # raw per-sample logits, despite the name; not yet probabilities
val_true = []         # ground-truth labels

with torch.no_grad():
    for batch in tqdm(val_loader):
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # extend() iterates the batch tensor, so one row is stored per sample.
        val_preds_proba.extend(logits)
        val_preds.extend(preds.cpu().numpy())
        val_true.extend(batch['labels'].cpu().numpy())

accuracy = accuracy_score(val_true, val_preds)
f1 = f1_score(val_true, val_preds)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(val_true, val_preds))
print(confusion_matrix(val_true, val_preds))

In [None]:
val_preds_proba[0].cpu().numpy()

In [None]:
# Bring every stored logit row back to host memory as a NumPy array.
preds_proba = [row.cpu().numpy() for row in val_preds_proba]

In [None]:
preds_proba = torch.softmax(torch.from_numpy(np.array(preds_proba)), dim=-1).cpu().numpy()

In [None]:
preds = (preds_proba[:, 1] > 0.80) * 1
print(classification_report(val_true, preds))
print(confusion_matrix(val_true, preds))

In [None]:
from tqdm import tqdm, trange

# Rebuild the architecture and restore the weights stored in best_model.pth.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
model.load_state_dict(torch.load('best_model.pth')['model_state_dict'])
# Test evaluation
model.eval()
test_preds = []        # predicted class ids (argmax over the logits)
test_preds_proba = []  # raw per-sample logits, despite the name
test_true = []         # ground-truth labels

with torch.no_grad():
    # Iterate the TEST loader. The previous version looped over val_loader, so
    # the "test" metrics were validation metrics.
    for batch in tqdm(test_loader):
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # extend() iterates the batch tensor, so one row is stored per sample.
        test_preds_proba.extend(logits)
        test_preds.extend(preds.cpu().numpy())
        test_true.extend(batch['labels'].cpu().numpy())

accuracy = accuracy_score(test_true, test_preds)
f1 = f1_score(test_true, test_preds)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(test_true, test_preds))
print(confusion_matrix(test_true, test_preds))

In [None]:
test_preds_proba[0].cpu().numpy()

In [None]:
# Bring every stored logit row back to host memory as a NumPy array.
preds_proba_test = [row.cpu().numpy() for row in test_preds_proba]

In [None]:
preds_proba_test = torch.softmax(torch.from_numpy(np.array(preds_proba_test)), dim=-1).cpu().numpy()

In [None]:
preds = (preds_proba_test[:, 1] > 0.80) * 1
print(classification_report(test_true, preds))
print(confusion_matrix(test_true, preds))

### Аугментация с обратным переводом 

In [None]:
pip install sentencepiece

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
from time import time

In [None]:
from transformers import pipeline

# Translation pipelines for the round trip Russian -> English -> Russian
translator_ru_en = pipeline('translation_ru_to_en', model='Helsinki-NLP/opus-mt-ru-en')
translator_en_ru = pipeline('translation_en_to_ru', model='Helsinki-NLP/opus-mt-en-ru')

In [None]:
def back_translate(text):
    """Paraphrase one Russian text by translating it to English and back.

    Args:
        text: the text to paraphrase.

    Returns:
        The round-tripped Russian text. Empty or whitespace-only input is
        returned unchanged without calling the translation models; splitting on
        '. ' can produce such fragments and there is nothing in them to
        translate.
    """
    if not text or not text.strip():
        return text
    en_text = translator_ru_en(text)[0]['translation_text']
    ru_text = translator_en_ru(en_text)[0]['translation_text']
    return ru_text

In [None]:
from tqdm import tqdm

In [None]:
X_train_new_1_class = []

In [None]:
X_train_1_class = X_train.loc[y_train == 1]

In [None]:
# Length of every '. '-separated fragment across the class-1 training texts.
fragment_lengths = [
    len(piece)
    for text in tqdm(X_train_1_class)
    for piece in text.split('. ')
]
# d: total number of fragments; c: how many of them exceed 400 characters.
d = len(fragment_lengths)
c = sum(1 for length in fragment_lengths if length > 400)
c, d

In [None]:
%%time
for j in X_train.iloc[227].split('. ') * 5:
    translator_ru_en(j)


In [None]:
# Cut the class-1 texts into consecutive groups of 10 for batched translation.
# NOTE: this rebinds the notebook-level batch_size that the DataLoader cells
# also use; the later loader cells set it back to 16 before using it.
batch_size = 10
batches = [X_train_1_class[x:x+batch_size] for x in range(0, len(X_train_1_class), batch_size)]
len(batches)

In [None]:
# For every batch of texts build (a) one flat list of '. '-separated fragments
# and (b) the cumulative fragment offsets, so that text k of the batch owns
# fragments[offsets[k]:offsets[k + 1]].
splitted_batches = []
indexes_batch = []
for batch in tqdm(batches):
    pieces_per_text = [text.split('. ') for text in batch]

    flat_pieces = [piece for pieces in pieces_per_text for piece in pieces]

    offsets = [0]
    for pieces in pieces_per_text:
        offsets.append(offsets[-1] + len(pieces))

    splitted_batches.append(flat_pieces)
    indexes_batch.append(offsets)

In [None]:
indexes_batch

In [None]:
translated_batches = []

In [None]:
def back_translate_batch(text_batch, batch_size=16):
    """Paraphrase a list of Russian texts via a Russian -> English -> Russian round trip.

    Args:
        text_batch: iterable of texts (sentence fragments).
        batch_size: batch size handed to both translation pipelines.

    Returns:
        A list of the same length and order as ``text_batch``. Empty or
        whitespace-only entries are passed through untouched, so positions stay
        aligned with the caller's fragment offsets and the models are never
        asked to translate nothing.
    """
    text_batch = list(text_batch)
    translatable = [pos for pos, text in enumerate(text_batch) if text and text.strip()]
    if not translatable:
        return text_batch

    en_out = translator_ru_en([text_batch[pos] for pos in translatable], batch_size=batch_size)
    en_texts = [entry['translation_text'] for entry in en_out]
    ru_out = translator_en_ru(en_texts, batch_size=batch_size)

    result = list(text_batch)
    for pos, entry in zip(translatable, ru_out):
        result[pos] = entry['translation_text']
    return result

In [None]:
from tqdm import tqdm

# Continue from the first batch that has no translation yet. On a fresh run
# translated_batches is empty and everything is translated; after an
# interruption, re-running the cell picks up where it stopped. (The previous
# hard-coded offset of 181 skipped the first 181 batches on a fresh run.)
start = len(translated_batches)
remaining = max(len(splitted_batches) - start, 0)
for text_batch, indexes in tqdm(zip(splitted_batches[start:], indexes_batch[start:]), total=remaining):
    batch = back_translate_batch(text_batch)
    tmp_batch = []
    # indexes holds cumulative fragment offsets: text j owns
    # batch[indexes[j - 1]:indexes[j]]; the last text takes the rest.
    for j in range(1, len(indexes)):
        if j == len(indexes) - 1:
            tmp_text = ' '.join(batch[indexes[j - 1]:])
        else:
            tmp_text = ' '.join(batch[indexes[j - 1]:indexes[j]])
        tmp_batch.append(tmp_text)
    translated_batches.append(tmp_batch)

In [None]:
# Show the first back-translated batch.
# NOTE(review): the original referenced `batch_text`, which is never defined in
# this notebook; `translated_batches` is presumably the intended name - confirm.
pd.Series(translated_batches[0])

In [None]:
# Number of texts in the first back-translated batch.
# NOTE(review): the original iterated `batch_text`, which is never defined in
# this notebook; `translated_batches` is presumably the intended name - confirm.
c = len(translated_batches[0])
c

In [None]:
# Flatten the per-batch lists of back-translated texts into one list.
# NOTE(review): the original iterated `batch_text`, which is never defined in
# this notebook; `translated_batches` is presumably the intended name - confirm.
all_df = [text for group in translated_batches for text in group]
pd.Series(all_df)

In [None]:
translated_batches = []

In [None]:
from tqdm import tqdm

# Continue from the first batch that has no translation yet. On a fresh run
# translated_batches is empty and everything is translated; after an
# interruption, re-running the cell picks up where it stopped. (The previous
# hard-coded offset of 181 skipped the first 181 batches on a fresh run.)
start = len(translated_batches)
remaining = max(len(splitted_batches) - start, 0)
for text_batch, indexes in tqdm(zip(splitted_batches[start:], indexes_batch[start:]), total=remaining):
    batch = back_translate_batch(text_batch)
    tmp_batch = []
    # indexes holds cumulative fragment offsets: text j owns
    # batch[indexes[j - 1]:indexes[j]]; the last text takes the rest.
    for j in range(1, len(indexes)):
        if j == len(indexes) - 1:
            tmp_text = ' '.join(batch[indexes[j - 1]:])
        else:
            tmp_text = ' '.join(batch[indexes[j - 1]:indexes[j]])
        tmp_batch.append(tmp_text)
    translated_batches.append(tmp_batch)

In [None]:
# Flatten the per-batch lists of back-translated texts into one list.
all_df = [text for group in translated_batches for text in group]
pd.Series(all_df)

In [None]:
# Sentence-by-sentence back-translation of every class-1 training text.
# The original cell did not parse (two `for` statements had no body) and
# appended to the undefined name X_train_new.
# NOTE(review): this reconstruction assumes X_train_new_1_class was the
# intended target and X_train_1_class the intended input - confirm.
# The batched loop above produces the same kind of output much faster, so this
# slow path is off by default to keep "Run All" feasible.
RUN_SENTENCEWISE_BACK_TRANSLATION = False

if RUN_SENTENCEWISE_BACK_TRANSLATION:
    for x in tqdm(X_train_1_class):
        translated_txt = [back_translate(sen) for sen in x.split('. ')]
        X_train_new_1_class.append(' '.join(translated_txt))

In [None]:
# Training frame for this experiment: the original training rows plus the
# back-translated class-1 texts. all_df is a plain list of strings, so the
# previous `df = all_df.copy()` had neither the `message_txt` nor the `error`
# column that the following cells read.
# Every text in all_df derives from X_train_1_class, hence label 1.
# NOTE(review): assumes the intended training set is "original + back-translated
# minority" - confirm against the experiment that produced the reported scores.
df = pd.concat(
    [
        pd.DataFrame({'message_txt': X_train, 'error': y_train}),
        pd.DataFrame({'message_txt': all_df, 'error': 1}),
    ],
    ignore_index=True,
)

In [None]:
# Start again from the pretrained checkpoint so this experiment does not
# inherit weights from the previous fine-tuning run.
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

In [None]:
df[df['error'] == 0]

In [None]:
train_encodings = tokenize(df.message_txt)

In [None]:
test_encodings = tokenize(X_test)

In [None]:
val_encodings = tokenize(X_val)

In [None]:
df_test = pd.DataFrame({'message_txt': X_test, 'error': y_test})

In [None]:
df_test.to_csv('df_test.csv')

In [None]:
# Pair each tokenized split with its OWN labels.
train_dataset = TextDataset(train_encodings, df.error)
test_dataset = TextDataset(test_encodings, y_test)
# Fixed: this used y_test, which scored validation texts against test labels
# and corrupted both checkpoint selection and the validation metrics.
val_dataset = TextDataset(val_encodings, y_val)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Fresh optimizer bound to the re-initialised model, plus the loop settings.
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
# Reset so the checkpoint written by this run is judged on its own scores.
best_f1 = 0

In [None]:
from tqdm import tqdm, trange
# Fine-tune for num_epochs epochs. After every epoch the validation set is
# scored and the model is checkpointed when validation F1 beats best_f1.
for epoch in trange(num_epochs):
    # Training
    model.train()
    total_loss = 0

    for batch in train_loader:
        labels = batch['labels']
        optimizer.zero_grad()
        # The whole batch dict (including 'labels') is passed to the model, but
        # the loss that is back-propagated comes from `criterion` on the logits.
        outputs = model(**batch)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # Validation
    model.eval()
    val_preds = []
    val_true = []
    
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(batch['labels'].cpu().numpy())
    
    accuracy = accuracy_score(val_true, val_preds)
    f1 = f1_score(val_true, val_preds)
    
    # Per-epoch report: mean training loss per batch and validation metrics.
    print(f"Эпоха {epoch+1}/{num_epochs}")
    print(f"Train Loss: {total_loss/len(train_loader):.4f}")
    print(f"Val Accuracy: {accuracy:.4f}, F1: {f1:.4f}\n")
    # Keep only the best checkpoint; this overwrites the same best_model.pth
    # path that the other training cells in this notebook write to.
    if f1 > best_f1:
        best_f1 = f1
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, 'best_model.pth')
        print("Cool, that's awesome scores! Model saved to best_model.pth")

In [None]:
from tqdm import tqdm, trange

# Rebuild the architecture and restore the weights stored in best_model.pth.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
model.load_state_dict(torch.load('best_model.pth')['model_state_dict'])
# Validation
model.eval()
val_preds = []        # predicted class ids (argmax over the logits)
val_preds_proba = []  # raw per-sample logits, despite the name; not yet probabilities
val_true = []         # ground-truth labels

with torch.no_grad():
    for batch in tqdm(val_loader):
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # extend() iterates the batch tensor, so one row is stored per sample.
        val_preds_proba.extend(logits)
        val_preds.extend(preds.cpu().numpy())
        val_true.extend(batch['labels'].cpu().numpy())

accuracy = accuracy_score(val_true, val_preds)
f1 = f1_score(val_true, val_preds)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(val_true, val_preds))
print(confusion_matrix(val_true, val_preds))

In [None]:
import numpy as np

In [None]:
val_preds_proba[0].cpu().numpy()

In [None]:
# Bring every stored logit row back to host memory as a NumPy array.
preds_proba = [row.cpu().numpy() for row in val_preds_proba]

In [None]:
preds_proba = torch.softmax(torch.from_numpy(np.array(preds_proba)), dim=-1).cpu().numpy()

In [None]:
preds = (preds_proba[:, 1] > 0.87) * 1
print(classification_report(val_true, preds))
print(confusion_matrix(val_true, preds))

In [None]:
from tqdm import tqdm, trange

# Rebuild the architecture and restore the weights stored in best_model.pth.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
model.load_state_dict(torch.load('best_model.pth')['model_state_dict'])
# Test evaluation
model.eval()
test_preds = []        # predicted class ids (argmax over the logits)
test_preds_proba = []  # raw per-sample logits, despite the name
test_true = []         # ground-truth labels

with torch.no_grad():
    # Iterate the TEST loader. The previous version looped over val_loader, so
    # the "test" metrics were validation metrics.
    for batch in tqdm(test_loader):
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # extend() iterates the batch tensor, so one row is stored per sample.
        test_preds_proba.extend(logits)
        test_preds.extend(preds.cpu().numpy())
        test_true.extend(batch['labels'].cpu().numpy())

accuracy = accuracy_score(test_true, test_preds)
f1 = f1_score(test_true, test_preds)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(test_true, test_preds))
print(confusion_matrix(test_true, test_preds))

In [None]:
test_preds_proba[0].cpu().numpy()

In [None]:
# Bring every stored logit row back to host memory as a NumPy array.
preds_proba_test = [row.cpu().numpy() for row in test_preds_proba]

In [None]:
preds_proba_test = torch.softmax(torch.from_numpy(np.array(preds_proba_test)), dim=-1).cpu().numpy()

In [None]:
preds = (preds_proba_test[:, 1] > 0.87) * 1
print(classification_report(test_true, preds))
print(confusion_matrix(test_true, preds))