<a href="https://colab.research.google.com/github/GriPet12/mercorAiDetect/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell command: install sentencepiece and transformers before the imports run.
# --quiet suppresses pip's progress output. (sentencepiece is presumably required by
# the DeBERTa tokenizer loaded further down — confirm before removing it.)
!pip install sentencepiece transformers --quiet

# Import

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Load Data

In [None]:
# Read the train/test splits from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Cast both text columns to str in each split so they can be concatenated as
# plain strings later on (note that pandas renders missing values as "nan").
for frame in (train, test):
    for column in ('answer', 'topic'):
        frame[column] = frame[column].astype(str)

# Dataset Class

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes "topic [SEP] answer" strings on access.

    Args:
        topics: Iterable of topic strings.
        answers: Iterable of answer strings, same length as ``topics``.
        labels: Optional sequence of numeric labels, same length as ``topics``.
            When omitted, items carry no ``'labels'`` entry.
        tokenizer: Optional tokenizer callable. When ``None`` the module-level
            ``tokenizer`` is looked up each time an item is fetched.
        max_len: Optional padded/truncated sequence length. When ``None`` the
            module-level ``MAX_LEN`` is looked up each time an item is fetched.

    Raises:
        ValueError: If ``topics``, ``answers`` and ``labels`` (when given) do
            not all have the same length.
    """

    def __init__(self, topics, answers, labels=None, tokenizer=None, max_len=None):
        topics = list(topics)
        answers = list(answers)
        # zip() would silently drop the surplus rows, so fail loudly instead.
        if len(topics) != len(answers):
            raise ValueError(
                f"topics and answers must have the same length "
                f"(got {len(topics)} and {len(answers)})"
            )
        if labels is not None and len(labels) != len(topics):
            raise ValueError(
                f"labels must have one entry per example "
                f"(got {len(labels)} labels for {len(topics)} examples)"
            )

        self.texts = [t + " [SEP] " + a for t, a in zip(topics, answers)]
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict of tensors.

        The dict holds every field the tokenizer returns with the leading
        batch dimension removed, plus a float ``'labels'`` tensor when labels
        were supplied.
        """
        # Fall back to the module-level objects at call time so the dataset
        # can be constructed before those globals exist.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        limit = self.max_len if self.max_len is not None else MAX_LEN

        encodings = tok(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=limit,
            return_tensors="pt"
        )
        # return_tensors="pt" yields a leading batch dim of 1; drop it so the
        # DataLoader can stack items into batches itself.
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Parameters

In [None]:
# --- Training hyperparameters ---
MAX_LEN = 512      # token length every example is padded/truncated to
BATCH_SIZE = 8     # examples per DataLoader batch
EPOCHS = 50        # full passes over the training set
LR = 5e-7          # AdamW learning rate

# --- Checkpoint to fine-tune and where artifacts are written ---
MODEL_NAME = "microsoft/deberta-v3-base"
OUTPUT_DIR = "./deberta_v3_fulltrain"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if it already exists

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer

In [None]:
# Load the tokenizer for the MODEL_NAME checkpoint. TextDataset.__getitem__ reads
# this module-level name, so this cell must run before any dataset item is fetched.
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

# Prepare Datasets

In [None]:
# Labelled training examples: the 'is_cheating' column is the target.
train_dataset = TextDataset(
    topics=train['topic'].tolist(),
    answers=train['answer'].tolist(),
    labels=train['is_cheating'].tolist(),
)
# The test split has no labels, so its items carry no 'labels' entry.
test_dataset = TextDataset(
    topics=test['topic'].tolist(),
    answers=test['answer'].tolist(),
)

# Shuffle only the training data; test batches keep the row order of the CSV.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Model, Optimizer, Loss

In [None]:
# A single output logit: the model is used as a binary classifier whose raw
# score is turned into a probability by the sigmoid inside the loss / at inference.
model = DebertaV2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
)
model = model.to(device)

# BCEWithLogitsLoss applies the sigmoid itself, so it is fed raw logits.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

# Training Loop

In [None]:
# Mean training loss of each epoch, appended once per epoch.
train_losses = []

# Put the model in training mode before the first batch.
model.train()

for epoch in range(EPOCHS):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    total_loss = 0  # sum of per-batch losses for this epoch

    for batch in progress_bar:
        # Move the batch onto the training device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Add a trailing dimension so the targets line up with the logits.
        labels = batch['labels'].to(device).unsqueeze(1)

        # Clear gradients left over from the previous step, then forward/backward.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Cap the global gradient norm at 1.0 before applying the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        current_loss = loss.item()
        total_loss += current_loss
        progress_bar.set_postfix({'loss': f'{current_loss:.4f}'})

    # Average over the number of batches, not the number of examples.
    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1} done. Average Train Loss: {avg_loss:.4f}")

# Persist only the weights (state_dict), not the whole pickled model object.
final_model_path = os.path.join(OUTPUT_DIR, "final_model.pth")
torch.save(model.state_dict(), final_model_path)
# The status emoji had been mis-decoded (UTF-8 bytes read as cp1252); restored.
print(f"✅ Final model saved to {final_model_path}")

# Plot the mean training loss of every completed epoch.
plt.figure(figsize=(10, 5))
# Derive the x-axis from the recorded losses rather than EPOCHS so the plot
# still works when training stopped early (x and y must have equal length).
epochs_run = range(1, len(train_losses) + 1)
plt.plot(epochs_run, train_losses, marker='o', label='Training Loss')
plt.title('Training Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()

# Save before plt.show(): showing the figure may leave nothing to write afterwards.
plot_path = os.path.join(OUTPUT_DIR, "loss_plot.png")
plt.savefig(plot_path)
# The status emoji had been mis-decoded (UTF-8 bytes read as cp1252); restored.
print(f"📊 Loss plot saved to {plot_path}")

plt.show()

# Test Predictions

In [None]:
# Switch to evaluation mode for inference.
model.eval()
test_preds = []

# No gradients are needed for prediction.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting on test set"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits
        # Convert logits to probabilities and move them to host memory.
        batch_probs = torch.sigmoid(outputs).cpu().numpy()
        test_preds.extend(batch_probs)

# Collapse the collected per-example rows into a single 1-D array.
test_preds = np.asarray(test_preds).reshape(-1)