In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
import re
from nltk.corpus import stopwords

In [None]:
# Load the competition data (train and test essays) from the Kaggle input directory
train_essays, test_essays = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/llm-detect-ai-generated-text/train_essays.csv",
        "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv",
    )
)

In [None]:
# Text preprocessing: English stop words that clean_text filters out
stop_words = {word for word in stopwords.words('english')}

def clean_text(text):
    """Normalise an essay into lowercase alphabetic tokens without stop words.

    Steps: remove every character that is neither a word character nor
    whitespace, split on whitespace, keep only purely alphabetic tokens,
    lowercase them, and drop tokens found in the module-level ``stop_words``.

    Args:
        text: Raw essay text. Non-string values (``None``, or the float NaN
            pandas uses for a missing CSV cell) are treated as empty text.

    Returns:
        The remaining tokens joined by single spaces; ``''`` if none remain.
    """
    # re.sub raises TypeError on None / NaN, which would abort the whole
    # Series.apply; treat anything that is not a string as an empty essay.
    if not isinstance(text, str):
        return ''

    stripped = re.sub(r'[^\w\s]', '', text)
    # isalpha() is checked on the original token, then the token is lowercased.
    lowered = (token.lower() for token in stripped.split() if token.isalpha())
    return ' '.join(word for word in lowered if word not in stop_words)

# Store the cleaned version of every training essay in a new column
train_essays['clean_text'] = train_essays['text'].map(clean_text)

In [None]:
# Split the data: 80% training / 20% validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    train_essays['clean_text'],
    train_essays['generated'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenizer for BERT
# padding / truncation / max_length are options of the tokenizer *call*;
# passed to from_pretrained they are not applied when encoding, so the
# intended 128-token cap never took effect. model_max_length is the
# constructor-level setting that later `truncation=True` calls honour.
tokenizer = BertTokenizer.from_pretrained(
    '/kaggle/input/bert-base-uncased/bert-base-uncased',
    do_lower_case=True,
    model_max_length=128,
)

In [None]:
# Tokenize the training and validation texts into padded PyTorch tensors
encoded_train, encoded_val = (
    tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='pt')
    for texts in (X_train, X_val)
)

In [None]:
# Convert the label Series into tensors
train_labels, val_labels = (
    torch.tensor(label_series.values) for label_series in (y_train, y_val)
)

In [None]:
# Build TensorDatasets of (input_ids, attention_mask, label) triples
train_dataset, val_dataset = (
    TensorDataset(encoding['input_ids'], encoding['attention_mask'], targets)
    for encoding, targets in (
        (encoded_train, train_labels),
        (encoded_val, val_labels),
    )
)

In [None]:
# DataLoaders for batched iteration: shuffle the training set only
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
    shuffle=False,
)

In [None]:
# Define the BERT model for sequence classification (two output labels)
# Seed PyTorch's global RNG first so that any randomly initialised weights
# and the later DataLoader shuffling are repeatable on Restart & Run All.
# 42 matches the random_state used for the train/validation split.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained('/kaggle/input/bert-base-uncased/bert-base-uncased', num_labels=2)
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Number of training epochs and the optimizer
# (no learning-rate scheduler is created in this cell)
epochs = 10
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    correct_bias=False,
)

In [None]:
# Training loop
for epoch in range(epochs):
    model.train()
    batch_losses = []

    for batch in train_loader:
        # Each batch is an (input_ids, attention_mask, labels) triple
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        batch_losses.append(loss.item())

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss:.2f}")

In [None]:
# Validation loop
model.eval()
# NOTE: val_labels is rebound here from the label tensor to a plain list of
# per-sample labels collected in loader order.
val_preds, val_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        logits = model(input_ids, attention_mask=attention_mask).logits

        # Predicted class = index of the largest logit
        val_preds.extend(logits.argmax(dim=1).cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

In [None]:
# Compute and report validation accuracy
val_accuracy = accuracy_score(y_true=val_labels, y_pred=val_preds)
print(f"Validation Accuracy: {val_accuracy:.2f}")

In [None]:
# Process the test data
# Apply the same cleaning that was applied to the training text, so the
# inputs seen at inference match what the model was fine-tuned on.
test_texts = test_essays['text'].apply(clean_text).tolist()
test_inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')

# Run inference in mini-batches: a single forward pass over the whole test
# set can exhaust device memory when the test set is large.
test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Make sure dropout is disabled even if the training cell was the last one run
model.eval()
test_probs = []

with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        # Move each batch to the same device as the model
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
        # Probability of class 1 ("generated") for every essay in the batch
        test_probs.append(torch.softmax(outputs.logits, dim=1)[:, 1].cpu())

predictions = torch.cat(test_probs).numpy()

submission = pd.DataFrame({
    'id': test_essays['id'],
    'generated': predictions
})

submission.to_csv('/kaggle/working/submission.csv', index=False)