# 🚀 Phase 3: CodeBERT Training on RAW Code (Task B)

**Project**: SemEval-2026 Task 13 - Machine-Generated Code Detection  
**Phase**: 3 (T015-T024) - CodeBERT Implementation  
**Input**: RAW code (preserves AI fingerprints)  

## Setup
1. **Runtime → Change runtime type → T4 GPU** (Colab)
2. Upload `task_b_train.parquet` and `task_b_val.parquet` (see EDA stats in repo)
3. Run all cells

In [None]:
# Show the GPU attached to this runtime (Setup step 1 asks for a T4 GPU).
!nvidia-smi

In [None]:
!pip install -q transformers datasets accelerate sklearn torch joblib pandas scikit-learn

In [None]:
import os

from google.colab import files

# Later cells read both files by these exact names from the working
# directory, so verify them here and fail with a clear message instead of a
# confusing error further down the notebook.
_REQUIRED_UPLOADS = ('task_b_train.parquet', 'task_b_val.parquet')

print("üìÅ Upload task_b_train.parquet and task_b_val.parquet")
uploaded = files.upload()

_missing_uploads = [name for name in _REQUIRED_UPLOADS if not os.path.exists(name)]
if _missing_uploads:
    raise FileNotFoundError(
        "Missing required file(s) in the working directory: "
        + ", ".join(_missing_uploads)
    )

In [None]:
import os, random, numpy as np, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, roc_auc_score, classification_report
import pandas as pd
from datetime import datetime
from tqdm.auto import tqdm
import joblib

# ============== Configuration =============
# Seed shared by set_seed() and the DataLoader shuffle generator.
SEED = 42

# Hyperparameters for the CodeBERT run. Insertion order matches the original
# literal so any code that iterates the dict sees the same sequence.
CONFIG = dict(
    model_name='microsoft/codebert-base',
    max_length=512,
    batch_size=32,
    epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    dropout=0.1,
    max_grad_norm=1.0,
)

def set_seed(seed):
    """Seed Python's ``random``, NumPy, and PyTorch (CPU and every CUDA device).

    Args:
        seed: Value handed unchanged to each library's seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

set_seed(SEED)

# Use the GPU whenever PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"‚úÖ Device: {device}")

In [None]:
# ============== Dataset =============
# Read both Task B splits from the parquet files in the working directory.
train_df, val_df = (
    pd.read_parquet(path)
    for path in ('task_b_train.parquet', 'task_b_val.parquet')
)
print(f"üìä Train: {len(train_df):,}, Val: {len(val_df):,}")

# Quick sanity check: per-label row counts in the validation split.
val_label_counts = val_df['label'].value_counts().to_dict()
print(val_label_counts)

In [None]:
# ============== TF-IDF Baseline (Task B) =============
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, accuracy_score

# Root folder for everything the baseline writes: reports under results/,
# pickled estimators under models/.
OUT_DIR = 'outputs/002-zeev-mgc-detection'
for _subdir in ('results', 'models'):
    os.makedirs(os.path.join(OUT_DIR, _subdir), exist_ok=True)

# Word-level TF-IDF over 1- to 3-grams. The vocabulary is fitted on the
# training split only and then applied unchanged to the validation split.
tfidf_options = dict(
    max_features=10000,
    ngram_range=(1, 3),
    analyzer='word',
    token_pattern=r"\b\w+\b",
    min_df=2,
)
vectorizer = TfidfVectorizer(**tfidf_options)
y_train, y_val = train_df['label'], val_df['label']
X_train = vectorizer.fit_transform(train_df['code'])
X_val = vectorizer.transform(val_df['code'])

# fit() returns the estimator itself, so construction and fitting can be chained.
clf = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='lbfgs',
).fit(X_train, y_train)

# Score the validation predictions.
preds = clf.predict(X_val)
acc = accuracy_score(y_val, preds)
macro_f1 = f1_score(y_val, preds, average='macro')
report = classification_report(y_val, preds)

print(f"TF-IDF Macro F1: {macro_f1:.4f}")
print(report)

# Persist the fitted vectorizer and classifier. Both file names share one
# timestamp so a matching pair can be identified later.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
_models_dir = os.path.join(OUT_DIR, 'models')
for _stem, _fitted in (('tfidf_vectorizer', vectorizer), ('tfidf_clf', clf)):
    joblib.dump(_fitted, os.path.join(_models_dir, f'{_stem}_{ts}.pkl'))

# write results markdown (Task B template)
md_path = os.path.join(OUT_DIR, 'results', f'tfidf_task_b_{ts}.md')

# Format the timestamp outside the f-string: reusing the same quote character
# inside an f-string expression is a SyntaxError before Python 3.12.
generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Read the settings back from the fitted estimators so the report cannot
# drift from the values that were actually used to build them.
tfidf_params = vectorizer.get_params()
clf_params = clf.get_params()

with open(md_path, 'w', encoding='utf-8') as f:
    f.write('# TF-IDF Baseline Results (Task B)\n\n')
    f.write(f'**Generated**: {generated_at}\n\n')
    f.write('## Metrics\n')
    f.write('| Metric | Value |\n')
    f.write('|--------|-------|\n')
    f.write(f'| Macro F1 | {macro_f1:.4f} |\n')
    f.write(f'| Accuracy | {acc:.4f} |\n\n')
    f.write('## Configuration\n')
    f.write(
        f"- Vectorizer: TF-IDF (max_features={tfidf_params['max_features']}, "
        f"ngram_range={tfidf_params['ngram_range']}, "
        f"min_df={tfidf_params['min_df']})\n"
    )
    f.write(
        f"- Classifier: LogisticRegression (max_iter={clf_params['max_iter']}, "
        f"class_weight={clf_params['class_weight']}, "
        f"solver={clf_params['solver']})\n"
    )
    f.write(f'- Train samples: {len(train_df):,}\n')
    f.write(f'- Validation samples: {len(val_df):,}\n\n')
    f.write('## Detailed Report (validation)\n')
    # Fence the report so its column alignment survives markdown rendering.
    f.write('```\n')
    f.write(report)
    f.write('```\n')

print('Wrote TF-IDF results to', md_path)

---
## Next: CodeBERT training

Run the following cells to train CodeBERT on the uploaded Task B data (GPU recommended). Colab downloads the model weights automatically; if you would rather not download them here, run the repo's trainer script locally, where weight downloads are controlled by its `--allow_model_download` flag.

In [None]:
# ============== Model =============
class CodeBERTClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a single linear head.

    The first-position hidden state of the encoder output is used as the
    representation of the whole input sequence.
    """

    def __init__(self, model_name, num_classes=11, dropout=0.1):
        """Build the encoder and the classification head.

        Args:
            model_name: Identifier passed to ``AutoModel.from_pretrained``.
            num_classes: Width of the output layer (number of logits).
            dropout: Dropout probability applied before the linear head.
        """
        super().__init__()
        self.codebert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder config rather than hard-coding a width.
        self.classifier = nn.Linear(self.codebert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of class logits per input sequence."""
        encoded = self.codebert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0]
        regularized = self.dropout(first_token_state)
        return self.classifier(regularized)

print("ü§ñ Loading CodeBERT...")
checkpoint = CONFIG['model_name']
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Build the classifier, then move it to the selected device.
model = CodeBERTClassifier(checkpoint, dropout=CONFIG['dropout'])
model = model.to(device)

param_count = sum(weights.numel() for weights in model.parameters())
print(f"‚úÖ Parameters: {param_count:,}")

In [None]:
# ============== DataLoaders =============
class CodeDataset(Dataset):
    """Tokenize the ``code`` column of a DataFrame for the classifier.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``label``,
    the keys that ``train_epoch`` and ``evaluate`` read from a batch.
    """

    def __init__(self, df, tokenizer, max_length):
        """Store the raw samples; tokenization happens lazily per item.

        Args:
            df: DataFrame with a ``code`` column and a ``label`` column.
            tokenizer: Callable tokenizer (here the CodeBERT tokenizer).
            max_length: Sequence length every sample is truncated/padded to.
        """
        # Plain lists keep positional indexing independent of the DataFrame index.
        self.codes = df['code'].astype(str).tolist()
        # Assumes labels are integer class ids usable as loss targets — TODO confirm.
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.codes)

    def __getitem__(self, idx):
        # Pad to a fixed length so the default collate function can stack items.
        encoded = self.tokenizer(
            self.codes[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis of size 1; drop it.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
        }


train_dataset = CodeDataset(train_df, tokenizer, CONFIG['max_length'])
val_dataset = CodeDataset(val_df, tokenizer, CONFIG['max_length'])

# A dedicated, seeded generator makes the training shuffle order repeatable.
g = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, generator=g)
val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'])

# Verify batch shape
sample = next(iter(train_loader))
print(f"‚úÖ Batch shape: {sample['input_ids'].shape}")

In [None]:
# ============== Training Functions =============
def train_epoch(model, loader, optimizer, scheduler, criterion, device):
    """Run one optimization pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``label`` entries.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), targets)
        loss.backward()
        # Clip the gradient norm before the update, using the configured cap.
        torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG['max_grad_norm'])
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
    return running_loss / len(loader)

def evaluate(model, loader, device):
    """Score ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``label`` entries.
        device: Device the input tensors are moved to.

    Returns:
        Tuple ``(macro_f1, report)``: the macro-averaged F1 score and the
        text produced by ``classification_report``.
    """
    model.eval()
    # The previous version also collected softmax(...)[:, 1] for every batch,
    # but that list was never used or returned, so it is no longer computed.
    preds, labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels.extend(batch['label'].numpy())
    return f1_score(labels, preds, average='macro'), classification_report(labels, preds)

In [None]:
# ============== TRAINING =============
print("üöÄ Training on RAW code (Phase 3)...")
print("=
,
,
,

,
,
,
,
   Loss: {loss:.4f}\
,
,
,
   Val F1: {f1:.4f}\
,
,