In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
import os
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

In [None]:
import pandas as pd

# --- Build the training corpus: human essays plus sampled AI-generated texts ---

# 1. Load human text (from persuade_combined.csv)
persuade_path = '/kaggle/input/persuade-corpus-ai-generated-dataset/persuade_combined.csv'
persuade_df = pd.read_csv(persuade_path)

# Keep only the rows labelled human-written (generated == 0) and only the text
# column. Selecting rows and columns in one .loc call followed by .copy()
# yields an independent frame, so adding the label column below is safe.
human_df = persuade_df.loc[persuade_df['generated'] == 0, ['text']].copy()
human_df['generated'] = 0      # Label: 0 = human-written

print(f"Number of human texts: {len(human_df)}")

# 2. Load generated text (from slimpajama.csv)
slimpajama_path = '/kaggle/input/slimpajama-ai-generated-parallel-dataset/slimpajama.csv'
slimpajama_df = pd.read_csv(slimpajama_path)

# Aim for 10x as many AI texts as human texts, but never request more rows
# than the file holds: DataFrame.sample without replacement raises ValueError
# when n exceeds the population size.
ai_target_size = 10 * len(human_df)
ai_sample_size = min(ai_target_size, len(slimpajama_df))
if ai_sample_size < ai_target_size:
    print(
        f"Warning: requested {ai_target_size} AI texts but only "
        f"{len(slimpajama_df)} are available; using all of them."
    )
ai_df = slimpajama_df[['text']].sample(n=ai_sample_size, random_state=42).copy()
ai_df['generated'] = 1  # Label: 1 = AI-generated

print(f"Number of AI-generated texts: {len(ai_df)}")

# 3. Combine datasets and shuffle (fixed seed so the row order is repeatable)
combined_df = pd.concat([human_df, ai_df], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 4. Save the new training set
output_path = '/kaggle/working/combined_training_data_large.csv'
combined_df.to_csv(output_path, index=False)
print(f"\n✅ New training data saved to: {output_path}")
print(f"Total samples: {len(combined_df)} (Human: {len(human_df)}, AI: {len(ai_df)})")
# Explicitly disable wandb
os.environ["WANDB_DISABLED"] = "true"

# Set seed for reproducibility
def set_seed(seed=42):
    """Seed the random number generators used by this script.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU plus every CUDA
    device), and configures cuDNN to prefer deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the file's import header does not bring in ``random``.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every GPU, whereas cuda.manual_seed only seeds
    # the current device. Both are no-ops when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False
    
set_seed()

# Check GPU availability and print info
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))
    print("GPU memory:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

# 1. Load data
train_df = combined_df
test_df = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

# Clean data.
# Training rows with a missing text or label are unusable, so drop them.
train_df = train_df[['text', 'generated']].dropna()
# Test rows must NOT be dropped: every id in the test file needs a row in the
# submission. A missing text is replaced with an empty string so the essay is
# still tokenized and scored instead of silently disappearing.
test_df = test_df[['id', 'text']].fillna({'text': ''})

# 2. Split train/validation sets (10% validation, stratified on the label so
# both splits keep the same human/AI ratio)
train_split, val_split = train_test_split(
    train_df,
    test_size=0.1,
    random_state=42,
    stratify=train_df['generated'],
)

# 3. Convert to HuggingFace datasets
train_dataset = Dataset.from_pandas(train_split)
val_dataset = Dataset.from_pandas(val_split)

# 4. Use DeBERTa model with a 2-class classification head
model_name = "microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Ensure model runs on GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Using device: {device}")
# 5. Tokenize function - limit length for stability
def tokenize_fn(example):
    """Tokenize the ``text`` field of ``example`` with the module-level tokenizer.

    Inputs longer than 384 tokens are truncated and shorter ones are padded
    to exactly 384 (``padding='max_length'``).

    Args:
        example: Mapping with a ``'text'`` entry. This script maps the
            function with ``batched=True``, so it receives a batch of texts.

    Returns:
        Whatever the tokenizer call returns for the given text(s).
    """
    encode_options = dict(truncation=True, padding='max_length', max_length=384)
    texts = example['text']
    return tokenizer(texts, **encode_options)

# Tokenize both splits in batches (train first, then validation).
train_dataset, val_dataset = (
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, val_dataset)
)

# 6. The Trainer looks for the target under "labels", so rename the
# "generated" column wherever it is present.
if "generated" in train_dataset.column_names:
    train_dataset = train_dataset.rename_column("generated", "labels")
if "generated" in val_dataset.column_names:
    val_dataset = val_dataset.rename_column("generated", "labels")

# Expose only the model inputs and the label, as torch tensors.
model_columns = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format(type='torch', columns=model_columns)
val_dataset.set_format(type='torch', columns=model_columns)

# 7. Training arguments, grouped by concern and then expanded into
# TrainingArguments.
checkpoint_options = dict(
    output_dir="./results",
    eval_strategy="epoch",           # evaluate once per epoch
    save_strategy="epoch",           # checkpoint once per epoch
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=1,
)
optimization_options = dict(
    learning_rate=1e-5,              # low learning rate
    per_device_train_batch_size=8,   # small per-device batch
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,   # 4 batches accumulated per optimizer step
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.1,                # warm up over the first 10% of steps
    max_grad_norm=1.0,               # gradient-norm clipping threshold
    fp16=False,
    bf16=False,                      # mixed precision explicitly off
)
logging_options = dict(
    report_to="none",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    dataloader_num_workers=0,        # load data in the main process
)
training_args = TrainingArguments(
    **checkpoint_options,
    **optimization_options,
    **logging_options,
)

# 8. Evaluation function
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of the logits.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold_labels = eval_pred
    # NumPy argmax is used (rather than torch) on the received scores.
    predicted = np.argmax(raw_scores, axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold_labels, predicted)
    metrics["f1"] = f1_score(gold_labels, predicted)
    return metrics

from transformers import TrainerCallback
import time
import math

class ProgressCallback(TrainerCallback):
    """Print human-readable training progress to stdout.

    On the first global step and then every ``print_freq`` global steps, one
    line is printed with the current epoch, the position inside that epoch,
    overall progress, recent throughput and an estimated time remaining.
    Epoch start/end and the total training time are reported as well.

    Args:
        print_freq: Number of global steps between two progress lines.
    """

    def __init__(self, print_freq=10):
        self.print_freq = print_freq
        now = time.time()
        self.start_time = now
        self.step_start_time = now
        # Initialised here so on_epoch_end never hits a missing attribute.
        self.epoch_start_time = now
        self.last_log_step = 0

    @staticmethod
    def _epochs_done(state):
        """Return ``state.epoch`` as a float, treating an unset value as 0."""
        return float(state.epoch or 0.0)

    def on_train_begin(self, args, state, control, **kwargs):
        """Restart the clocks and announce the number of epochs."""
        # Reset here rather than relying on __init__, so time spent between
        # constructing the callback and starting training is not counted.
        now = time.time()
        self.start_time = now
        self.step_start_time = now
        self.last_log_step = state.global_step
        print(f"Starting training, total {args.num_train_epochs} epochs")

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Record the epoch start time and print the 1-based epoch number."""
        self.epoch_start_time = time.time()
        # state.epoch is a float count of epochs completed so far; round it so
        # the label is an integer ("1/3" rather than "1.0/3").
        epoch_number = int(round(self._epochs_done(state))) + 1
        print(f"\nStarting epoch {epoch_number}/{args.num_train_epochs}")

    def on_step_end(self, args, state, control, **kwargs):
        """Print a progress line on step 1 and every ``print_freq`` steps."""
        step = state.global_step
        if step <= 0 or not (step % self.print_freq == 0 or step == 1):
            return

        # Position within the current epoch, derived from the global step.
        # Using (step - 1) keeps the last step of an epoch at N/N (100%)
        # instead of wrapping around to 0/N.
        if args.num_train_epochs:
            steps_per_epoch = int(state.max_steps // args.num_train_epochs)
        else:
            steps_per_epoch = 0
        if steps_per_epoch > 0:
            zero_based_step = step - 1
            current_epoch = zero_based_step // steps_per_epoch + 1
            current_epoch_step = zero_based_step % steps_per_epoch + 1
            epoch_progress = (current_epoch_step / steps_per_epoch) * 100
        else:
            # No per-epoch step count available: fall back to the trainer's
            # fractional epoch counter for the label.
            current_epoch = max(1, math.ceil(self._epochs_done(state)))
            current_epoch_step = step
            epoch_progress = 0

        # Throughput since the previous progress line.
        current_time = time.time()
        steps_since_last_log = step - self.last_log_step
        time_since_last_log = current_time - self.step_start_time
        if time_since_last_log > 0:
            steps_per_second = steps_since_last_log / time_since_last_log
        else:
            steps_per_second = 0

        # Remaining time, extrapolated from the recent throughput.
        if steps_per_second > 0:
            steps_remaining = state.max_steps - step
            est_time_remaining = steps_remaining / steps_per_second
            mins_remaining = est_time_remaining // 60
            secs_remaining = est_time_remaining % 60
            time_remaining = f"{int(mins_remaining)}min{int(secs_remaining)}sec"
        else:
            time_remaining = "calculating..."

        print(f"Epoch {current_epoch}/{args.num_train_epochs} | "
              f"Step {current_epoch_step}/{steps_per_epoch} | "
              f"Progress {epoch_progress:.2f}% | "
              f"Total progress {(step/state.max_steps)*100:.2f}% | "
              f"Speed {steps_per_second:.2f} steps/sec | "
              f"Est. time remaining {time_remaining}")

        self.last_log_step = step
        self.step_start_time = current_time

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print the wall-clock duration of the epoch that just finished."""
        epoch_time = time.time() - self.epoch_start_time
        # At this point state.epoch already counts the finished epoch, so it
        # is the 1-based epoch number itself and must not be incremented.
        finished_epoch = max(1, int(round(self._epochs_done(state))))
        print(f"Epoch {finished_epoch} completed, time: {epoch_time:.2f}sec")

    def on_train_end(self, args, state, control, **kwargs):
        """Print the total training time as hours, minutes and seconds."""
        total_time = time.time() - self.start_time
        hours = total_time // 3600
        mins = (total_time % 3600) // 60
        secs = total_time % 60
        print(f"\nTraining complete! Total time: {int(hours)}h{int(mins)}m{int(secs)}s")

# Modified Trainer initialization
# Wire model, data, metrics and progress reporting into the Trainer.
progress_reporter = ProgressCallback(print_freq=20)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[progress_reporter],
)

# Run fine-tuning.
trainer.train()

# Persist the fine-tuned weights together with the tokenizer files.
model_save_path = "/kaggle/working/deberta_model_persuade_V2"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

# Report validation metrics.
print("\nEvaluating model on validation set:")
eval_results = trainer.evaluate()
print(f"Validation accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"Validation F1 score: {eval_results['eval_f1']:.4f}")

# Tokenize the test essays; only the model inputs are exposed as tensors
# because the test set has no labels.
test_dataset = Dataset.from_pandas(test_df).map(tokenize_fn, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Score the test essays.
print("Generating predictions...")
test_outputs = trainer.predict(test_dataset)
test_preds = np.argmax(test_outputs.predictions, axis=-1)  # hard 0/1 class per essay

# Per-essay file: the original test columns plus the predicted class.
detailed_df = test_df.assign(generated_pred=test_preds)
detailed_df.to_csv("/kaggle/working/deberta_detailed_predictions.csv", index=False)
print("Detailed prediction results saved")

# Turn logits into class probabilities; column 1 is the AI-generated class.
test_logits = torch.tensor(test_outputs.predictions)
probs = torch.softmax(test_logits, dim=-1).numpy()
predicted_probs = probs[:, 1]

# Submission file: one probability per test id.
submission_df = pd.DataFrame({'id': test_df['id'], 'generated': predicted_probs})
submission_df.to_csv("/kaggle/working/submission.csv", index=False)

# Optional statistics (disabled)
# print(f"Number of predicted AI-generated texts: {sum(test_preds)}")
# print(f"Number of predicted human-written texts: {len(test_preds) - sum(test_preds)}")
# print(f"Percentage of predicted AI-generated texts: {sum(test_preds) / len(test_preds) * 100:.2f}%")

# Reload a previously saved model and tokenizer from the Kaggle input
# directory; this rebinds the global `tokenizer` and `model`.
from transformers import AutoConfig

model_path = "/kaggle/input/deberta_final/pytorch/default/1/deberta_model_persuade_V2"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Sanity check: number of labels stored in the saved configuration.
config = AutoConfig.from_pretrained(model_path)
print("Model class count:", config.num_labels)