Скачаем все необходимые библиотеки

In [None]:
pip install torch numpy

In [None]:
pip install -U scikit-learn

In [None]:
pip install transformers googletrans==3.1.0a0 sentencepiece protobuf transformers[torch] accelerate -U tqdm

Импортируем небходимые библиотеки/функции из библиотек

In [2]:
import json
import random
import numpy as np
import torch
from transformers import AutoTokenizer, AdamW, AutoModel
from googletrans import Translator
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from torch.nn.utils.rnn import pad_sequence

Скачаем трансформер кодировщик mDeBERTa V3 и его токенизатор. У модели будет два выходы (две метки: первая будет отвечать за несгенерированность текста, вторая - за сгенерированность)

In [None]:
model_transformer_name = "microsoft/mdeberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_transformer_name)
model_transformer = AutoModel.from_pretrained(model_transformer_name, num_labels=2)

Загрузим тренировочную, валидационную и тестовую выборки и создадим массивы для текстов и для меток, соответсвующих этим текстам

In [5]:
def get_jsonl_from_file(path_to_file):
    with open(path_to_file, "r", encoding="utf-8") as file:
        data = [json.loads(line) for line in file]
    return data

train_data = get_jsonl_from_file("mono_500_train.jsonl")
test_data = get_jsonl_from_file("mono_test.jsonl")
val_data = get_jsonl_from_file("mono_val.jsonl")

train_texts = [elem["text"] for elem in train_data]
train_labels = [elem["label"] for elem in train_data]

test_texts = [elem["text"] for elem in test_data]
test_labels = [elem["label"] for elem in test_data]

val_texts = [elem["text"] for elem in val_data]
val_labels = [elem["label"] for elem in val_data]

Переведем 50% процентов сгененрированных английских текстов в тренировочном датасете

In [6]:
translator = Translator()

def translate_text_from_en_to_ru(text):
    translated = translator.translate(text, dest="ru")
    return translated.text


train_texts_50 = []
progress_bar = tqdm(total=len(train_texts))
for text, label in zip(train_texts, train_labels):
    if label == 1 and random.random() < 0.5:
        translated_text = translate_text_from_en_to_ru(text)
        train_texts_50.append(translated_text)
    else:
        train_texts_50.append(text)
    progress_bar.update(1)
progress_bar.close()

100%|██████████| 500/500 [00:12<00:00, 41.52it/s]


Разделим каждый текст тренировочной выборки на сегменты длиной 256 токенов с покрытием предыдущего сегмента 50-ю токенами

In [7]:
def split_text(text, tokenizer):
    tokens = tokenizer.tokenize(text)
    segments = []
    start_index = 0

    while start_index < len(tokens):
        end_index = min(start_index + 256, len(tokens))
        one_segment = tokens[start_index:end_index]
        segments.append(one_segment)
        start_index += 256 - 50

    text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in segments]

    return text_chunks


train_texts_50_segmented = []
train_labels_50_segmented = []

progress_bar = tqdm(total=len(train_texts_50))
for text, label in zip(train_texts_50, train_labels):
    segments = split_text(text, tokenizer)
    train_texts_50_segmented.extend(segments)
    train_labels_50_segmented.extend([label] * len(segments))
    progress_bar.update(1)
progress_bar.close()

100%|██████████| 500/500 [00:01<00:00, 329.64it/s]


Создадим датасет из тренировочной выборки, для этого опишем класс, в которой будем приводить текста к нужной структуре. Далее разделим датасет на батчи (при этом перемешаем их). Также опишем функцию, которая будет корректно объединять данные в батчи (чтобы они имели одинаковый размер)

In [8]:
class Dataset_for_model(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
            max_length=256
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


def collate_fn(batch):
    input_ids = [elem['input_ids'] for elem in batch]
    attention_mask = [elem['attention_mask'] for elem in batch]
    labels = [elem['labels'] for elem in batch]

    input_ids = pad_sequence(input_ids, batch_first=True)
    attention_mask = pad_sequence(attention_mask, batch_first=True)
    labels = torch.stack(labels)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

batch_size = 16
train_dataset = Dataset_for_model(train_texts_50_segmented, train_labels_50_segmented, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

Создадим датасет из валидационной выборки: разделим каждый текст на текста по 256 токенов (используя перекрытие предыдущего сегмента 50-ю токенами); создадим батчи размером 16

In [9]:
val_texts_segmented = []
val_labels_segmented = []

progress_bar = tqdm(total=len(val_texts))
for text, label in zip(val_texts, val_labels):
    segments = split_text(text, tokenizer)
    val_texts_segmented.extend(segments)
    val_labels_segmented.extend([label] * len(segments))
    progress_bar.update(1)
progress_bar.close()

val_dataset = Dataset_for_model(val_texts_segmented,val_labels_segmented, tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

100%|██████████| 100/100 [00:00<00:00, 289.95it/s]


Опишем головуную часть и основуную части модели в виде классов и запустим обучение (параметры, настройки указаны в отчете). Также во время обучения после каждой эпохи будем оценивать модель на валидационном датасете.

In [10]:
class Head_of_model(nn.Module):
    def __init__(self, hidden_size, num_classes, dropout=0.1):
        super(Head_of_model, self).__init__()
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.gelu(x)
        x = self.dropout(x)

        x = self.fc2(x)
        x = self.gelu(x)
        x = self.dropout(x)

        x = self.fc3(x)
        return x

class Classification(nn.Module):
    def __init__(self, transformer, num_classes=2, dropout=0.1):
        super(Classification, self).__init__()
        self.transformer = transformer
        self.classifier = Head_of_model(transformer.config.hidden_size, num_classes, dropout)

    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        output_to_model = hidden_states[:, 0, :]
        return self.classifier(output_to_model)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Classification(model_transformer)
model.to(device)


epochs = 5
batch_size = 16
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2,4], gamma=0.5)

model.train()
total_iterations = len(train_dataloader) * epochs
total_iterations_val = len(val_dataloader)

current_iteration = 0
current_iteration_val = 0

for epoch in range(epochs):
    # обучение модели
    model.train()
    if epoch == 0 or epoch == 4:
        for parameters in model.transformer.parameters():
            parameters.requires_grad = False
    else:
        for parameters in model.transformer.parameters():
            parameters.requires_grad = True

    progress_bar = tqdm(train_dataloader, desc="Training", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        current_iteration += 1
        progress_bar.set_postfix({"Steps": f"{current_iteration}/{total_iterations}"})

    #валидация после каждой эпохи
    model.eval()
    val_labels_prediction = []
    val_labels_in_dataset = []
    progress_bar_val = tqdm(val_dataloader, desc="Validation", unit="batch")
    with torch.no_grad():
        for batch in progress_bar_val:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs
            predictions_of_tags = torch.argmax(logits, dim=1)

            val_labels_prediction.extend(predictions_of_tags.tolist())
            val_labels_in_dataset.extend(labels.tolist())

            current_iteration_val += 1
            progress_bar_val.set_postfix({"Steps": f"{current_iteration_val}/{total_iterations_val}"})

    current_iteration_val = 0
    print("Epoch:", epoch + 1)
    print(f"Generated Texts: {sum(val_labels_prediction)} out of {len(val_labels_prediction)}")
    print("F1:", f1_score(val_labels_in_dataset, val_labels_prediction))
    print("Accuracy:", accuracy_score(val_labels_in_dataset, val_labels_prediction))
    print()


Training: 100%|██████████| 129/129 [00:42<00:00,  3.06batch/s, Steps=129/645]
Validation: 100%|██████████| 29/29 [00:08<00:00,  3.29batch/s, Steps=29/29]


Epoch: 1
Generated Texts: 458 out of 458
F1: 0.4707846410684474
Accuracy: 0.3078602620087336



Training: 100%|██████████| 129/129 [02:00<00:00,  1.07batch/s, Steps=258/645]
Validation: 100%|██████████| 29/29 [00:09<00:00,  3.16batch/s, Steps=29/29]


Epoch: 2
Generated Texts: 421 out of 458
F1: 0.4341637010676157
Accuracy: 0.3056768558951965



Training: 100%|██████████| 129/129 [02:00<00:00,  1.07batch/s, Steps=387/645]
Validation: 100%|██████████| 29/29 [00:09<00:00,  3.20batch/s, Steps=29/29]


Epoch: 3
Generated Texts: 427 out of 458
F1: 0.44014084507042256
Accuracy: 0.3056768558951965



Training: 100%|██████████| 129/129 [02:00<00:00,  1.07batch/s, Steps=516/645]
Validation: 100%|██████████| 29/29 [00:09<00:00,  3.17batch/s, Steps=29/29]


Epoch: 4
Generated Texts: 425 out of 458
F1: 0.4381625441696113
Accuracy: 0.3056768558951965



Training: 100%|██████████| 129/129 [00:43<00:00,  3.00batch/s, Steps=645/645]
Validation: 100%|██████████| 29/29 [00:09<00:00,  3.10batch/s, Steps=29/29]

Epoch: 5
Generated Texts: 424 out of 458
F1: 0.4389380530973452
Accuracy: 0.3078602620087336






Опишем метод Бенджамини-Хохберга (возвращает массив, в котором на i-ом месте стоит True, если фргамент сгенерирован, иначе стоит False). Протестируем построенную модель на тестовом датасете.

In [11]:
def method_of_Benjamini_Hochberg(p_values):
    m = len(p_values)
    alpha=0.05

    index_sorted = sorted(range(m), key=lambda i: p_values[i])
    p_values_sorted = [p_values[i] for i in index_sorted]

    new_alpha = [i * alpha / m for i in range(1, m+1)]

    max_index = 0
    for i in range(m):
        if p_values_sorted[i] <= new_alpha[i]:
            max_index = i

    ans = [False] * m
    for i in index_sorted[:max_index+1]:
        ans[i] = True

    return ans



model.eval()
generated_verdicts = []
progress_bar = tqdm(total=len(test_texts))
for text in test_texts:
    segments = [text[i:i+512] for i in range(0, len(text), 512)]
    segments_predictions = []

    for segment in segments:
        encoding = tokenizer.encode_plus(
            segment,
            add_special_tokens=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
            max_length=256
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs[0]

            tags = torch.softmax(logits, dim=0)
            tag_of_no_generation = tags[0].item()
            segments_predictions.append(tag_of_no_generation)

    segments_predictions = method_of_Benjamini_Hochberg(segments_predictions)
    avg_generated_prob = sum(segments_predictions) / len(segments_predictions)

    if (avg_generated_prob >= 0.5):
        generated_verdicts.append(1)
    else:
        generated_verdicts.append(0)

    progress_bar.update(1)

progress_bar.close()


num_generated = sum(generated_verdicts)
num_total = len(generated_verdicts)
generated_percentage = num_generated / num_total * 100
print()
print(f"Generated Texts: {num_generated} out of {num_total}")

f1 = f1_score(test_labels, generated_verdicts)
precision = precision_score(test_labels, generated_verdicts)
recall = recall_score(test_labels, generated_verdicts)

print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


100%|██████████| 1000/1000 [02:49<00:00,  5.91it/s]



Generated Texts: 494 out of 1000
F1 Score: 0.3984
Precision: 0.4008
Recall: 0.3960
