# 🔬 Phase 5: Ablation Study - Preprocessed Code (Task A)

**Project**: SemEval-2026 Task 13 - Machine-Generated Code Detection  
**Phase**: 5 (T029-T035) - Ablation Study  
**Input**: PREPROCESSED code (Comments removed, whitespace normalized)  
**Hypothesis**: Performance should be LOWER than raw code (proving "Fingerprint Paradox")  

## Setup
1. **Runtime → Change runtime type → T4 GPU**
2. Upload `task_a_train.parquet` and `task_a_val.parquet`
3. Run all cells

In [None]:
# Print the NVIDIA GPU status of this runtime so the GPU selection from the
# setup steps can be confirmed before any training starts.
!nvidia-smi

In [None]:
# Quietly (-q) install the Hugging Face packages. Of the three, only
# `transformers` is imported by the visible cells of this notebook.
!pip install -q transformers datasets accelerate

In [None]:
# `files` is Colab's file-transfer helper; the save cell reuses it for
# files.download().
from google.colab import files
print("üìÅ Upload task_a_train.parquet and task_a_val.parquet")
# NOTE(review): files.upload() presumably blocks until files are chosen and
# stores them in the working directory, where pd.read_parquet later looks for
# them by name - confirm.
uploaded = files.upload()

In [None]:
import os, random, numpy as np, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, roc_auc_score, classification_report
import pandas as pd
from datetime import datetime
from tqdm.auto import tqdm

# ============== Configuration ==============
# One seed shared by set_seed() and by the DataLoader shuffle generator.
SEED = 42

# Hyperparameters for the fine-tuning run; every later cell reads them from here.
CONFIG = dict(
    model_name='microsoft/codebert-base',  # checkpoint for tokenizer and encoder
    max_length=512,        # tokenizer padding / truncation length
    batch_size=32,         # used by both the train and validation loaders
    epochs=3,
    learning_rate=2e-5,    # AdamW learning rate
    weight_decay=0.01,     # AdamW weight decay
    warmup_ratio=0.1,      # fraction of total steps spent in LR warmup
    dropout=0.1,           # dropout applied before the classification head
    max_grad_norm=1.0,     # gradient-norm clipping threshold
)

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Args:
        seed: Integer seed handed unchanged to each library's seeding call.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)

# Seed all RNGs up front, before the model and the DataLoaders are created.
set_seed(SEED)
# Use the GPU when PyTorch reports one as available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"‚úÖ Device: {device}")

In [None]:
# ============== Dataset (Preprocessed) ==============
class CodeDataset(Dataset):
    """Map-style dataset that tokenizes one code snippet per item, on demand.

    Args:
        df: DataFrame holding the text column named by ``code_column`` and a
            ``label`` column.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors with a leading batch dimension of 1.
        max_length: Length passed to the tokenizer for padding/truncation.
        code_column: Name of the column to read snippets from. Defaults to
            ``'code_preprocessed'`` (the ablation input), so existing calls
            behave as before; pass another column name to reuse the class
            for a different input variant.
    """

    def __init__(self, df, tokenizer, max_length, code_column='code_preprocessed'):
        # Ablation-critical: the default column is the PREPROCESSED code,
        # not the raw source.
        self.code_column = code_column
        self.codes = df[code_column].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of snippets."""
        return len(self.codes)

    def __getitem__(self, idx):
        """Tokenize snippet ``idx`` and return its tensors plus the label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (leading batch
            dimension squeezed away) and ``'label'`` as a ``torch.long`` scalar.
        """
        encoding = self.tokenizer(
            self.codes[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # The tokenizer returns a batch of one; drop that axis so the
            # DataLoader can stack items into (batch, ...) tensors.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Load data
# Both parquet files are read by bare file name, i.e. from the current
# working directory.
train_df = pd.read_parquet('task_a_train.parquet')
val_df = pd.read_parquet('task_a_val.parquet')
print(f"üìä Train: {len(train_df):,}, Val: {len(val_df):,}")
print(f"‚ö†Ô∏è Using PREPROCESSED code column for training")
# Preview: first 100 characters of the first preprocessed training snippet.
print(f"Sample: {train_df['code_preprocessed'].iloc[0][:100]}...")

In [None]:
# ============== Model ==============
class CodeBERTClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head."""

    def __init__(self, model_name, num_classes=2, dropout=0.1):
        """Load the encoder and size the head from its hidden width.

        Args:
            model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
            num_classes: Number of output logits per example.
            dropout: Dropout probability applied before the head.
        """
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        # Attribute names are kept as-is: they are the keys of the state_dict
        # that the notebook saves and reloads.
        self.codebert = encoder
        self.dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(
            in_features=encoder.config.hidden_size,
            out_features=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return classification logits computed from the first token's state."""
        encoded = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the sequence summary.
        first_token = encoded.last_hidden_state[:, 0]
        regularized = self.dropout(first_token)
        return self.classifier(regularized)

print("ü§ñ Loading CodeBERT...")
# Tokenizer and classifier are both built from CONFIG['model_name'].
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
model = CodeBERTClassifier(CONFIG['model_name'], dropout=CONFIG['dropout']).to(device)
# Counts every parameter of the model (encoder plus classification head).
print(f"‚úÖ Parameters: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# ============== DataLoaders ==============
train_dataset = CodeDataset(train_df, tokenizer, CONFIG['max_length'])
val_dataset = CodeDataset(val_df, tokenizer, CONFIG['max_length'])

# Dedicated generator, seeded with SEED, that drives the training shuffle.
g = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, generator=g)
# No shuffle argument: validation batches follow val_df row order, which the
# per-language analysis relies on to line predictions up with val_df.
val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'])

# Smoke test: fetch one training batch and show its shape.
# NOTE(review): this iterates the shuffled loader, so it presumably advances
# `g` before training starts - confirm if exact batch order must be reproducible.
sample = next(iter(train_loader))
print(f"‚úÖ Batch shape: {sample['input_ids'].shape}")

In [None]:
# ============== Training Setup ==============
criterion = nn.CrossEntropyLoss()
# AdamW over every model parameter with a single learning rate and weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])

# train_epoch steps the scheduler once per batch, so the schedule horizon is
# (batches per epoch) x (epochs).
total_steps = len(train_loader) * CONFIG['epochs']
warmup_steps = int(CONFIG['warmup_ratio'] * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

print(f"üìà Steps: {total_steps}, Warmup: {warmup_steps}")

In [None]:
# ============== Training Functions ==============
def train_epoch(model, loader, optimizer, scheduler, criterion, device, max_grad_norm=None):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        loader: Sized iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch tensor is moved to.
        max_grad_norm: Gradient-norm clipping threshold. ``None`` (the
            default) falls back to ``CONFIG['max_grad_norm']``, the value
            this function previously always read from the global config.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    if max_grad_norm is None:
        max_grad_norm = CONFIG['max_grad_norm']

    model.train()
    total_loss = 0.0
    for batch in tqdm(loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        # Clip after backward and before the optimizer step so the update
        # uses the bounded gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, device):
    """Score ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns two-column logits.
        loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        Tuple ``(f1, roc_auc, report)``: F1 of the argmax predictions,
        ROC-AUC of the class-1 softmax probability, and the text
        classification report.
    """
    model.eval()
    y_true, y_hat, y_score = [], [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask)
            # Column 1 of the softmax is the positive-class probability.
            positive_prob = torch.softmax(logits, dim=1)[:, 1]
            y_score.extend(positive_prob.cpu().numpy())
            y_hat.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Labels never left the CPU, so no device transfer is needed.
            y_true.extend(batch['label'].numpy())
    f1 = f1_score(y_true, y_hat)
    auc = roc_auc_score(y_true, y_score)
    report = classification_report(y_true, y_hat)
    return f1, auc, report

In [None]:
# ============== TRAINING (Ablation) ==============
print("üöÄ Training on PREPROCESSED code (Phase 5 Ablation)...")
print("="*50)

# Mean training loss per epoch; the save cell writes these into the report.
epoch_losses = []
# Best validation F1 so far and a CPU copy of the weights that produced it.
best_f1, best_state = 0.0, None

for epoch in range(CONFIG['epochs']):
    print(f"\nüìç Epoch {epoch + 1}/{CONFIG['epochs']}")
    loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, device)
    epoch_losses.append(loss)
    print(f"   Loss: {loss:.4f}")

    f1, roc_auc, _ = evaluate(model, val_loader, device)
    print(f"   Val F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")

    # Always checkpoint the first epoch: with a strict `f1 > best_f1` alone,
    # a run whose F1 is 0.0 every epoch would leave best_state as None and
    # break the later load_state_dict / torch.save calls.
    if best_state is None or f1 > best_f1:
        best_f1 = f1
        # Clone to CPU so later training steps cannot overwrite the snapshot
        # and the copy does not occupy GPU memory.
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        print(f"   ‚≠ê New best!")

print("\n‚úÖ Training complete!")

In [None]:
# ============== Final Evaluation ==============
# Restore the best-F1 snapshot taken by the training loop before scoring.
# NOTE(review): best_state is None whenever the training loop stored no
# checkpoint; load_state_dict would then fail - confirm that cannot happen.
model.load_state_dict(best_state)
model.to(device)
# f1, roc_auc and report are reused by the save cell to build the results file.
f1, roc_auc, report = evaluate(model, val_loader, device)

print("\n" + "="*50)
print("üìä ABLATION RESULTS (PREPROCESSED CODE)")
print("="*50)
print(f"üéØ F1: {f1:.4f}")
print(f"üìà ROC-AUC: {roc_auc:.4f}")
print(f"\n{report}")

In [None]:
# ============== Save & Download ==============
# Persist the best checkpoint (a plain state_dict) captured during training.
torch.save(best_state, 'model_task_a_preprocessed.pt')
print("üíæ Saved: model_task_a_preprocessed.pt")

# Results markdown, assembled from three pieces:
# header (timestamp + metrics) + one bullet per epoch loss + classification report.
report_header = f"""# CodeBERT Results - PREPROCESSED Code (Task A Ablation)

**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (Colab T4 GPU)

## Metrics
| Metric | Value |
|--------|-------|
| **F1** | **{f1:.4f}** |
| ROC-AUC | {roc_auc:.4f} |

## Training Losses
"""
loss_lines = "\n".join(
    f"- Epoch {epoch_no}: {epoch_loss:.4f}"
    for epoch_no, epoch_loss in enumerate(epoch_losses, start=1)
)
report_footer = f"""

## Classification Report
```
{report}
```
"""
results = report_header + loss_lines + report_footer

with open('codebert_preprocessed_task_a.md', 'w') as f:
    f.write(results)

# Download both files
for artifact in ('model_task_a_preprocessed.pt', 'codebert_preprocessed_task_a.md'):
    files.download(artifact)

In [None]:
# ============== Per-Language F1 (T037-T040) ==============
def compute_per_language_f1(df, y_pred):
    """Print and return the F1 score of ``y_pred`` for each language in ``df``.

    Args:
        df: DataFrame with ``'language'`` and ``'label'`` columns, one row
            per prediction.
        y_pred: Array-like of predicted labels (list, NumPy array, ...),
            aligned with the rows of ``df`` by position.

    Returns:
        List of dicts with keys ``'language'``, ``'samples'`` (plain int) and
        ``'f1'``, ordered by language name.

    Raises:
        ValueError: If ``y_pred`` and ``df`` have different lengths.
    """
    # Coerce once so plain lists can be indexed with a boolean mask as well.
    y_pred = np.asarray(y_pred)
    if len(y_pred) != len(df):
        raise ValueError(
            f"y_pred has {len(y_pred)} entries but df has {len(df)} rows"
        )

    results = []
    print("\n" + "="*50)
    print("üìä PER-LANGUAGE ANALYSIS")
    print("="*50)
    print(f"{'Language':<15} {'Samples':<10} {'F1 Score':<10}")
    print("-" * 35)

    # Hoisted out of the loop: the label array is the same for every language.
    labels = df['label'].values
    for lang in sorted(df['language'].unique()):
        # A plain boolean array selects by position, independent of df's index.
        mask = (df['language'] == lang).values
        n_samples = int(mask.sum())
        lang_f1 = f1_score(labels[mask], y_pred[mask])
        results.append({'language': lang, 'samples': n_samples, 'f1': lang_f1})
        print(f"{lang:<15} {n_samples:<10} {lang_f1:.4f}")

    return results

# Get predictions on validation set
# evaluate() does not return its per-sample predictions, so the forward pass
# over val_loader is repeated here to collect them.
model.eval()
all_preds = []
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Getting predictions"):
        logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

# Array form is required: compute_per_language_f1 indexes it with a boolean mask.
all_preds = np.array(all_preds)

# Compute per-language F1
# Predictions are matched to val_df rows by position; this holds because
# val_loader is built without shuffling and therefore walks val_df in order.
per_lang = compute_per_language_f1(val_df, all_preds)

---
## 📥 Output Handling

1. Compare `codebert_preprocessed_task_a.md` with your RAW code results.
2. If F1 is LOWER here, you have successfully verified the **Fingerprint Paradox**.
3. Add the numbers to your `tasks.md` ablation table.