In [None]:
import pandas as pd

In [None]:
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [None]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

In [None]:
X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

In [None]:
pip install -U scikit-learn scipy matplotlib

In [None]:
pip install torch

In [None]:
pip install transformers==4.25.1

In [None]:
pip install datasets

In [None]:
# Imported in this cell because the notebook's main import cell comes later in the file;
# without it a fresh kernel fails here with NameError on GPT2Tokenizer.
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer

model_name = "sberbank-ai/rugpt3small_based_on_gpt2"  # 'gpt2-medium', 'EleutherAI/gpt-neo-125M', etc. can be used as well
num_labels = 2       # number of target classes
batch_size = 8
learning_rate = 2e-5
num_epochs = 10

# Load the tokenizer and the model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Padding reuses the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# The model locates the last real token of every sequence through config.pad_token_id,
# so the id must be exactly the one the tokenizer pads with. Reading it from the tokenizer
# keeps both in sync even if the checkpoint's config.eos_token_id differs.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Load the data and tokenize it.
# NOTE(review): load_data and tokenize_data are not defined or imported anywhere in the
# visible part of this notebook — they must already exist in the kernel when this cell runs.
# tokenized_datasets is later indexed with 'train', 'val' and 'test'.
dataset = load_data()
tokenized_datasets = tokenize_data(tokenizer, dataset)

In [None]:
pip install accelerate==0.28.0

In [None]:
# Imported in this cell because the notebook's main import cell comes later in the file;
# without it a fresh kernel fails here with NameError on TrainingArguments.
from transformers import Trainer, TrainingArguments

# Training arguments: evaluate once per epoch and keep only the best checkpoints
# (ranked by the 'f1' metric, at most two on disk). The best model is NOT reloaded
# automatically at the end of training.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    metric_for_best_model='f1',
    eval_strategy='epoch',
    save_total_limit=2,
    save_strategy="best",
    load_best_model_at_end=False,
    learning_rate=learning_rate,
)

# Build the Trainer.
# NOTE(review): compute_metrics is not defined in the visible part of this notebook; because
# metric_for_best_model='f1', it presumably has to return a dict containing an 'f1' entry.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    compute_metrics=compute_metrics,
)
# Last expression of the cell: shows the device the training arguments resolved to.
training_args.device

In [None]:
# Training
trainer.train()

# Evaluation: evaluate() is called without arguments, i.e. on the eval_dataset given to the
# Trainer, using whatever weights the model holds after train() returns.
eval_results = trainer.evaluate()
print(f"Результаты оценки: {eval_results}")


In [None]:
import torch
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

In [None]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
# Hard-coded checkpoint directory; presumably written by the training run above into
# './results' — adjust the step number to the checkpoint that actually exists.
model_dir = "results/checkpoint-10478"

# Load the tokenizer (from the base model name) and the fine-tuned model (from the checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForSequenceClassification.from_pretrained(model_dir)

# Switch the model to evaluation mode
model.eval()

In [None]:
def predict(text):
    """Classify ``text`` with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed to the tokenizer; it is truncated to at most 512 tokens.
            The final ``.item()`` call requires exactly one prediction, so pass a
            single text.

    Returns:
        The index of the highest logit as a Python int.
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The tokenizer returns CPU tensors; move them to wherever the model lives, otherwise
    # the forward pass fails with a device mismatch once the model has been moved to the GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Run the forward pass without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)

    return predictions.item()

In [None]:
# Use the GPU when one is available; fall back to the CPU instead of crashing on
# machines without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

# Batch the test split for inference
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8)

# Accumulators: raw logits (despite the "probas" name), argmax class ids, true labels
all_preds_probas, all_predictions, all_labels = [], [], []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        labels = batch['labels'].to(model.device)
        # Everything except the labels is fed to the model, on the model's device
        inputs = {
            key: val.to(model.device)
            for key, val in batch.items()
            if key != 'labels'
        }

        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)

        all_preds_probas += list(outputs.logits.cpu().numpy())
        all_predictions += list(predictions.cpu().numpy())
        all_labels += list(labels.cpu().numpy())

In [None]:
# Score against all_labels: they were collected batch by batch together with
# all_predictions, so the two are aligned by construction (y_test is a separately loaded object).
print(classification_report(all_labels, all_predictions))

In [None]:
# The classifier outputs one logit per mutually exclusive class, so class probabilities are
# a softmax across the logits. An element-wise sigmoid ignores the other class's logit and
# disagrees with the argmax predictions (softmax[:, 1] > 0.5 is exactly argmax == 1).
# np.asarray first: building a tensor from a Python list of arrays is slow.
# Any decision threshold that was tuned on sigmoid scores must be re-selected.
preds_proba = torch.softmax(torch.tensor(np.asarray(all_preds_probas)), dim=-1).numpy()

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
# Precision and recall of class 1 for every candidate decision threshold
precision, recall, thr = precision_recall_curve(all_labels, preds_proba[:, 1])

# precision/recall carry one more point than thr, hence the [:-1] slices
fig, ax = plt.subplots()
ax.plot(thr, precision[:-1], label='precision')
ax.plot(thr, recall[:-1], label='recall')
# Label the figure so it can be read on its own
ax.set(xlabel='decision threshold', ylabel='score',
       title='Precision and recall vs. threshold (test set)')
ax.grid()
ax.legend()
plt.show()

In [None]:
# 0.64 is the decision threshold for class 1 (presumably read off the precision/recall
# plot — re-tune it whenever the scores change). Labels come from all_labels, which were
# collected together with the scores and are therefore aligned with them.
print(classification_report(all_labels, (preds_proba[:, 1] > 0.64) * 1))

### Data augmentation with back translation

In [None]:
# NOTE(review): df_test is not defined anywhere in the visible part of this notebook — it
# must already exist in the kernel. This overwrites the y_test loaded from 'y_test.csv'.
y_test = df_test.error.copy()

In [None]:
# Replaces the X_train loaded from 'X_train.csv' with the back-translated training file.
# NOTE(review): y_train is not reassigned in this section — verify the labels still line up.
X_train = pd.read_csv('train_back_translated.csv')

In [None]:
import torch
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

In [None]:
# Hyper-parameters for the back-translation run
model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
num_labels = 2       # number of target classes
batch_size = 8
learning_rate = 2e-5
num_epochs = 10

tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Padding reuses the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# A fresh model is loaded from the base checkpoint
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# The model locates the last real token of every sequence through config.pad_token_id,
# so the id must be exactly the one the tokenizer pads with. Reading it from the tokenizer
# keeps both in sync even if the checkpoint's config.eos_token_id differs.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:

# Rebuild and re-tokenize the dataset for this section.
# NOTE(review): load_data and tokenize_data are not defined in the visible part of this
# notebook; load_data() takes no arguments, so it presumably reads notebook-level
# variables such as X_train — confirm.
dataset = load_data()
tokenized_datasets = tokenize_data(tokenizer, dataset)

In [None]:

# eval_strategy is not passed here, so the library default applies; a checkpoint is
# written every epoch and at most two are kept on disk.
training_args = TrainingArguments(
    # output locations and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    # optimisation
    learning_rate=learning_rate,
    weight_decay=0.01,
    warmup_steps=500,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # checkpointing
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    metric_for_best_model='f1',
)

# Trainer over the 'train' / 'val' splits of the tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
)
# Last expression of the cell: shows the resolved device
training_args.device

In [None]:

# Training
trainer.train()

# Evaluation on the eval_dataset given to the Trainer
eval_results = trainer.evaluate()
print(f"Результаты оценки: {eval_results}")


### Prediction 

In [None]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
# Hard-coded checkpoint directory; presumably written by the training run above into
# './results' — adjust the step number to the checkpoint that actually exists.
model_dir = "results/checkpoint-35385"

# Load the tokenizer (from the base model name) and the fine-tuned model (from the checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForSequenceClassification.from_pretrained(model_dir)

# Switch the model to evaluation mode
model.eval()

In [None]:
def predict(text):
    """Classify ``text`` with the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed to the tokenizer; it is truncated to at most 512 tokens.
            The final ``.item()`` call requires exactly one prediction, so pass a
            single text.

    Returns:
        The index of the highest logit as a Python int.
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The tokenizer returns CPU tensors; move them to wherever the model lives, otherwise
    # the forward pass fails with a device mismatch once the model has been moved to the GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Run the forward pass without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)

    return predictions.item()

In [None]:
# Use the GPU when one is available; fall back to the CPU instead of crashing on
# machines without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

# Batch the test split for inference
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8)

# Accumulators: raw logits (despite the "probas" name), argmax class ids, true labels
all_preds_probas, all_predictions, all_labels = [], [], []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        labels = batch['labels'].to(model.device)
        # Everything except the labels is fed to the model, on the model's device
        inputs = {
            key: val.to(model.device)
            for key, val in batch.items()
            if key != 'labels'
        }

        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)

        all_preds_probas += list(outputs.logits.cpu().numpy())
        all_predictions += list(predictions.cpu().numpy())
        all_labels += list(labels.cpu().numpy())

In [None]:
# Score against all_labels: they were collected batch by batch together with
# all_predictions, so the two are aligned by construction (y_test is a separately built object).
print(classification_report(all_labels, all_predictions))

In [None]:
# The classifier outputs one logit per mutually exclusive class, so class probabilities are
# a softmax across the logits. An element-wise sigmoid ignores the other class's logit and
# disagrees with the argmax predictions (softmax[:, 1] > 0.5 is exactly argmax == 1).
# np.asarray first: building a tensor from a Python list of arrays is slow.
preds_proba = torch.softmax(torch.tensor(np.asarray(all_preds_probas)), dim=-1).numpy()

### Duplication

In [None]:
df = pd.read_csv('df_duplication.csv')
df

In [None]:
X_train = df.message_txt.copy()

In [None]:
y_train = df.error.copy()

In [None]:
model_name = "sberbank-ai/rugpt3small_based_on_gpt2"  # 'gpt2-medium', 'EleutherAI/gpt-neo-125M', etc. can be used as well
num_labels = 2       # number of target classes
batch_size = 16
learning_rate = 2e-5
num_epochs = 10

# Load the tokenizer and the model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Padding reuses the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# The model locates the last real token of every sequence through config.pad_token_id,
# so the id must be exactly the one the tokenizer pads with. Reading it from the tokenizer
# keeps both in sync even if the checkpoint's config.eos_token_id differs.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Load the data and tokenize it.
# NOTE(review): load_data and tokenize_data are not defined in the visible part of this
# notebook; load_data() takes no arguments, so it presumably reads notebook-level
# variables such as X_train / y_train — confirm.
dataset = load_data()
tokenized_datasets = tokenize_data(tokenizer, dataset)

In [None]:
# Training arguments. eval_strategy is not passed here, so the library default applies;
# a checkpoint is written every epoch and at most two are kept on disk.
training_args = TrainingArguments(
    # output locations and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    # optimisation
    learning_rate=learning_rate,
    weight_decay=0.01,
    warmup_steps=500,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # checkpointing
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    metric_for_best_model='f1',
)

# Build the Trainer; the full 'train' and 'val' splits are passed (no subsetting happens here)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
)
# Last expression of the cell: shows the resolved device
training_args.device

In [None]:
# Training
trainer.train()

# Evaluation on the eval_dataset given to the Trainer
eval_results = trainer.evaluate()
print(f"Результаты оценки: {eval_results}")


In [None]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
# Hard-coded checkpoint directory; presumably written by the training run above into
# './results' — adjust the step number to the checkpoint that actually exists.
model_dir = "results/checkpoint-10617"

# Load the tokenizer (from the base model name) and the fine-tuned model (from the checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2ForSequenceClassification.from_pretrained(model_dir)

# Switch the model to evaluation mode
model.eval()

In [None]:
# Use the GPU when one is available; fall back to the CPU instead of crashing on
# machines without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

# Batch the test split for inference
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8)

# Accumulators: raw logits (despite the "probas" name), argmax class ids, true labels
all_preds_probas, all_predictions, all_labels = [], [], []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        labels = batch['labels'].to(model.device)
        # Everything except the labels is fed to the model, on the model's device
        inputs = {
            key: val.to(model.device)
            for key, val in batch.items()
            if key != 'labels'
        }

        outputs = model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)

        all_preds_probas += list(outputs.logits.cpu().numpy())
        all_predictions += list(predictions.cpu().numpy())
        all_labels += list(labels.cpu().numpy())

In [None]:
# Score against all_labels: they were collected batch by batch together with
# all_predictions, so the two are aligned by construction (y_test is a separately built object).
print(classification_report(all_labels, all_predictions))

In [None]:
# The classifier outputs one logit per mutually exclusive class, so class probabilities are
# a softmax across the logits. An element-wise sigmoid ignores the other class's logit and
# disagrees with the argmax predictions (softmax[:, 1] > 0.5 is exactly argmax == 1).
# np.asarray first: building a tensor from a Python list of arrays is slow.
preds_proba = torch.softmax(torch.tensor(np.asarray(all_preds_probas)), dim=-1).numpy()