# RESTORING BROKEN BERT: ADVANCED VERSION

Strategies implemented:
1. **Weighted Cross-Entropy**: Penalizing neutral bias to improve recall on positive/negative.
2. **Confident Pseudo-labeling**: Only training on high-confidence test samples (predicted probability >= 0.85).
3. **Longer Embedding Recovery**: 30 epochs for initial tuning + 10 epochs for combined tuning.
4. **Scheduler with Warmup**: Linear LR warmup (200 steps in Phase 1, none in Phase 2) followed by linear decay, to stabilize embedding recovery.

**Note**: Restart the kernel before running.

In [None]:
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import tqdm
from sklearn.metrics import f1_score, classification_report

# Seed both RNGs used in this notebook so repeated runs are comparable.
torch.manual_seed(42)
np.random.seed(42)

# Checkpoint name and competition CSV locations.
MODEL_NAME = "Ilseyar-kfu/broken_bert"
VAL_PATH = "/kaggle/input/cyprus-ai-camp-broken-bert/val_dataset.csv"
TEST_PATH = "/kaggle/input/cyprus-ai-camp-broken-bert/test.csv"

# Tokenization / batching settings shared by every phase.
MAX_LEN = 128
BATCH_SIZE = 32

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Class-name <-> class-index lookups (forward for training targets,
# inverse for decoding predicted indices back to names).
LABEL_MAP = {'neutral': 0, 'positive': 1, 'negative': 2}
INV_LABEL_MAP = {index: name for name, index in LABEL_MAP.items()}

print(f"Device: {DEVICE}")

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from field name (must include ``'input_ids'``)
            to a per-example sequence; every value is converted to a tensor.
        labels: Optional per-example labels. When omitted, items carry no
            ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = {}
        for field, values in encodings.items():
            self.encodings[field] = torch.tensor(values)
        self.labels = None if labels is None else torch.tensor(labels)

    def __len__(self):
        # The number of examples is taken from the 'input_ids' field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return a dict of the ``idx``-th slice of every field (+ label if any)."""
        sample = {field: tensor[idx] for field, tensor in self.encodings.items()}
        if self.labels is None:
            return sample
        sample['labels'] = self.labels[idx]
        return sample

def train_step(model, loader, optimizer, scheduler, criterion):
    """Run one pass over ``loader`` in training mode and return the mean loss.

    For every batch the ``input_ids``, ``attention_mask`` and ``labels``
    tensors are moved to the module-level ``DEVICE``, the loss from
    ``criterion(logits, labels)`` is back-propagated, and both the optimizer
    and the scheduler are stepped once.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: Iterable of batches (dicts with the three keys above).
        optimizer: Optimizer to zero and step each batch.
        scheduler: LR scheduler stepped once per batch.
        criterion: Callable mapping ``(logits, labels)`` to a scalar loss.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    running = 0
    for batch in tqdm.tqdm(loader, leave=False):
        ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        targets = batch['labels'].to(DEVICE)

        # Clear gradients left over from the previous batch before the new pass.
        optimizer.zero_grad()
        logits = model(ids, attention_mask=mask).logits
        batch_loss = criterion(logits, targets)
        batch_loss.backward()

        optimizer.step()
        scheduler.step()
        running += batch_loss.item()
    return running / len(loader)

In [None]:
# Read the validation and test CSVs.
df_val = pd.read_csv(VAL_PATH)
df_test = pd.read_csv(TEST_PATH)

# Load the tokenizer and classifier from the same checkpoint and move the
# model to the selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# 1. Re-initialize the all-zero embedding rows
# A row whose squared norm is exactly 0 is treated as corrupted. Each such row
# is replaced by the mean of the intact rows plus Gaussian noise scaled to
# 10% of the intact rows' overall standard deviation.
with torch.no_grad():
    weights = model.bert.embeddings.word_embeddings.weight.data
    is_zero = weights.pow(2).sum(dim=1) == 0
    intact_rows = weights[~is_zero]
    nz_mean = intact_rows.mean(dim=0)
    nz_std = intact_rows.std()
    noise = torch.randn_like(weights[is_zero])
    weights[is_zero] = nz_mean + noise * nz_std * 0.1

# Freeze everything, then re-enable gradients for the word-embedding matrix only.
model.requires_grad_(False)
model.bert.embeddings.word_embeddings.weight.requires_grad_(True)

print(f"Corrupted tokens initialized: {is_zero.sum().item()}")

### Phase 1: Heavy Tuning with Class Weights

In [None]:
# The validation CSV supplies the texts and (name -> index mapped) labels
# that this phase trains on.
val_texts = df_val["text"].tolist()
val_labels = df_val["labels"].map(LABEL_MAP).tolist()

val_encodings = tokenizer(
    val_texts,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
)
val_dataset = SentimentDataset(val_encodings, val_labels)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Class weights: positive (1) and negative (2) count 2.5x as much as neutral (0).
class_weights = torch.tensor([1.0, 2.5, 2.5], device=DEVICE)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

# The scheduler is stepped once per batch, so its horizon is batches * epochs.
epochs = 30
total_steps = len(val_loader) * epochs
optimizer = AdamW(model.bert.embeddings.word_embeddings.parameters(), lr=8e-4)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=200,
    num_training_steps=total_steps,
)

print("Phase 1: Recovery Loop")
for epoch in range(epochs):
    loss = train_step(model, val_loader, optimizer, scheduler, criterion)
    # Only report every fifth epoch.
    if (epoch + 1) % 5 != 0:
        continue
    print(f"Epoch {epoch+1} Loss: {loss:.4f}")

### Phase 2: High-Confidence Pseudo-Labeling

In [None]:
# Minimum top-class probability for a test prediction to be kept as a pseudo-label.
CONFIDENCE_THRESHOLD = 0.85

# --- Score every test example with the current model ---
model.eval()
test_texts = df_test["text"].tolist()
test_encodings = tokenizer(
    test_texts,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
)
test_dataset = SentimentDataset(test_encodings)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

probs_list = []
preds_list = []

with torch.no_grad():
    for batch in tqdm.tqdm(test_loader, desc="Predicting Test"):
        ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        logits = model(ids, attention_mask=mask).logits
        probs = F.softmax(logits, dim=1)
        # Keep the winning class and its probability for each example.
        conf, pred = probs.max(dim=1)
        probs_list.extend(conf.cpu().numpy())
        preds_list.extend(pred.cpu().numpy())

# --- Keep only the predictions at or above the confidence threshold ---
test_df_pseudo = pd.DataFrame({'text': test_texts, 'label': preds_list, 'prob': probs_list})
is_confident = test_df_pseudo['prob'] >= CONFIDENCE_THRESHOLD
confident_test = test_df_pseudo[is_confident]

print(f"Using {len(confident_test)} / {len(df_test)} samples from test for Phase 2")

# Combine data: validation examples first, then the confident pseudo-labelled ones.
p2_texts = [*val_texts, *confident_test['text'].tolist()]
p2_labels = [*val_labels, *confident_test['label'].tolist()]

p2_encodings = tokenizer(
    p2_texts,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
)
p2_dataset = SentimentDataset(p2_encodings, p2_labels)
p2_loader = DataLoader(p2_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Second stage tuning (lower LR, fresh optimizer, linear decay with no warmup).
epochs_p2 = 10
optimizer = AdamW(model.bert.embeddings.word_embeddings.parameters(), lr=1e-4)
scheduler_p2 = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(p2_loader) * epochs_p2,
)

print("Phase 2: Combined Tuning")
for epoch in range(epochs_p2):
    loss = train_step(model, p2_loader, optimizer, scheduler_p2, criterion)
    # Only report every second epoch.
    if (epoch + 1) % 2 != 0:
        continue
    print(f"P2 Epoch {epoch+1} Loss: {loss:.4f}")

### Final Evaluation & Submission

In [None]:
def _argmax_predictions(batches):
    """Return the argmax class index for every example yielded by ``batches``."""
    indices = []
    with torch.no_grad():
        for batch in batches:
            ids = batch['input_ids'].to(DEVICE)
            mask = batch['attention_mask'].to(DEVICE)
            logits = model(ids, attention_mask=mask).logits
            indices.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return indices


# --- Test predictions -> submission file ---
model.eval()
final_preds = _argmax_predictions(tqdm.tqdm(test_loader, desc="Final Pred"))

submission = pd.DataFrame({"labels": [INV_LABEL_MAP[p] for p in final_preds], "id": df_test["id"]})
submission.to_csv("submission.csv", index=False)
print("Submission saved!")

# Validation Report
# NOTE: the validation examples were part of the training data in both
# phases, so these scores are not a held-out estimate.
eval_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_preds = _argmax_predictions(eval_loader)

# Decode once and reuse for both metrics.
val_pred_names = [INV_LABEL_MAP[p] for p in val_preds]
print(f1_score(df_val["labels"], val_pred_names, average='macro'))
print(classification_report(df_val["labels"], val_pred_names))