# Fixing Broken BERT Embeddings (Fixed Padding)

This notebook focuses on restoring a BERT model whose word embeddings have been corrupted. 
We use the following strategy:
1. **Identify and Initialize**: Identify zeroed-out embedding rows and re-initialize them with the mean of the intact embeddings plus light Gaussian noise.
2. **Freeze & Tune**: Freeze every model parameter except the `word_embeddings` matrix, then fine-tune only that matrix.
3. **Pseudo-labeling**: Use the validation-tuned model to label the test data and perform a second round of fine-tuning on the combined dataset.

**Restrictions**:
- No other transformer models.
- No additional data.

In [None]:
import torch
import pandas as pd
import numpy as np
import hashlib
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, pipeline
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import tqdm
from sklearn.metrics import f1_score, classification_report

# Seed both NumPy and PyTorch with the same value so repeated runs start from
# the same random state.
SEED = 21
np.random.seed(SEED)
torch.manual_seed(SEED)

# Input files and the checkpoint that is being repaired.
VAL_PATH = "/kaggle/input/cyprus-ai-camp-broken-bert/val_dataset.csv"
TEST_PATH = "/kaggle/input/cyprus-ai-camp-broken-bert/test.csv"
MODEL_NAME = "Ilseyar-kfu/broken_bert"

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

### Load Data and Model

In [None]:
# Read the validation split first, then the test split.
df_val, df_test = (pd.read_csv(csv_path) for csv_path in (VAL_PATH, TEST_PATH))

# Tokenizer and classification model both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# ``Module.to`` returns the module itself, so loading and device placement
# can be chained into one expression.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)

print("Model and Tokenizer loaded.")

### Embedding Analysis and Initialization

In [None]:
# Repair the zeroed rows of the word-embedding matrix in place.
with torch.no_grad():
    weights = model.bert.embeddings.word_embeddings.weight.data
    # A row is treated as corrupted when its sum of squares is exactly zero.
    is_zero = weights.square().sum(dim=1) == 0
    non_zero_indices = torch.logical_not(is_zero).nonzero(as_tuple=True)[0]

    # Statistics of the intact rows: a per-dimension mean vector and a single
    # scalar standard deviation taken over all of their entries.
    nz_mean = weights[non_zero_indices].mean(dim=0)
    nz_std = weights[non_zero_indices].std()

    print(f"Total tokens: {weights.shape[0]}")
    print(f"Corrupted tokens: {is_zero.sum().item()}")

    # Every corrupted row becomes the intact mean plus Gaussian noise scaled
    # to 10% of the intact standard deviation.
    weights[is_zero] = nz_mean + torch.randn_like(weights[is_zero]) * nz_std * 0.1

# Turn gradients off for the whole model, then back on for the word-embedding
# matrix only, so it is the single trainable tensor.
model.requires_grad_(False)
model.bert.embeddings.word_embeddings.weight.requires_grad_(True)

### Training Setup

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset over tokenizer output with optional labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    must contain an ``'input_ids'`` entry, which defines the dataset length.
    ``labels`` is an optional per-sample sequence; when it is ``None`` the
    returned items carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The sample count is the number of ``input_ids`` rows.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Convert the idx-th entry of every encoding field to a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Label name -> class id, as predefined by the baseline.
LABEL_MAP = dict(neutral=0, positive=1, negative=2)
# Class id -> label name, used to turn predictions back into strings.
INV_LABEL_MAP = {class_id: name for name, class_id in LABEL_MAP.items()}

def train_model(model, loader, epochs, lr, warmup_steps=0):
    """Fine-tune the word-embedding parameters of ``model`` on ``loader``.

    Only ``model.bert.embeddings.word_embeddings`` is handed to the optimizer.
    The learning rate follows a linear schedule with warmup spanning
    ``len(loader) * epochs`` optimizer steps. The model is switched to train
    mode and left in it; the mean batch loss is printed after every epoch.

    Args:
        model: Model exposing ``bert.embeddings.word_embeddings`` whose
            forward call accepts ``attention_mask`` and ``labels`` and
            returns an object with a ``loss`` attribute.
        loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        epochs: Number of passes over ``loader``.
        lr: Learning rate given to AdamW.
        warmup_steps: Number of warmup steps for the scheduler.
    """
    embedding_params = model.bert.embeddings.word_embeddings.parameters()
    optimizer = AdamW(embedding_params, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=len(loader) * epochs,
    )

    model.train()
    for epoch_number in range(1, epochs + 1):
        running_loss = 0
        progress = tqdm.tqdm(loader, desc=f"Epoch {epoch_number}/{epochs}")
        for batch in progress:
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            outputs = model(
                batch['input_ids'].to(DEVICE),
                attention_mask=batch['attention_mask'].to(DEVICE),
                labels=batch['labels'].to(DEVICE),
            )
            loss = outputs.loss
            loss.backward()
            # Update the weights first, then advance the learning-rate schedule.
            optimizer.step()
            scheduler.step()
            running_loss += loss.item()
        print(f"Average Loss: {running_loss / len(loader):.4f}")

### Phase 1: Fine-tuning on Validation Set

In [None]:
# Tokenization length and batch size shared by every later cell.
MAX_LEN = 128
BATCH_SIZE = 32

val_texts = df_val["text"].tolist()

# Series.map yields NaN for any value that is not a key of LABEL_MAP. A single
# NaN would turn the whole label list into floats and only surface later,
# inside training, so validate here and fail with a clear message instead.
mapped_labels = df_val["labels"].map(LABEL_MAP)
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unknown_labels = sorted(df_val.loc[unmapped_rows, "labels"].astype(str).unique())
    raise ValueError(
        f"Validation data contains labels missing from LABEL_MAP: {unknown_labels}"
    )
val_labels = mapped_labels.astype(int).tolist()

# CRITICAL: Use padding='max_length' to ensure consistent sizes when combining datasets
val_encodings = tokenizer(val_texts, truncation=True, padding='max_length', max_length=MAX_LEN)
val_dataset = SentimentDataset(val_encodings, val_labels)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

# First round: tune the embeddings on the labelled validation set.
print("Starting Phase 1 Training...")
train_model(model, val_loader, epochs=15, lr=5e-4, warmup_steps=100)

### Phase 2: Pseudo-labeling and Combined Tuning

In [None]:
# Switch to inference mode before predicting labels for the test set.
model.eval()
test_texts = df_test["text"].tolist()
# CRITICAL: pad to max_length here as well, so test rows have the same length
# as the validation rows they are concatenated with below.
test_encodings = tokenizer(
    test_texts,
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
)
test_dataset = SentimentDataset(test_encodings)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Predict one class id per test row, in dataset order (the loader is unshuffled).
pseudo_labels = []
with torch.no_grad():
    for batch in tqdm.tqdm(test_loader, desc="Pseudo-labeling"):
        logits = model(
            batch['input_ids'].to(DEVICE),
            attention_mask=batch['attention_mask'].to(DEVICE),
        ).logits
        pseudo_labels.extend(logits.argmax(dim=1).cpu().numpy())

# Concatenate validation and test rows field by field. Every row is padded to
# MAX_LEN, so the DataLoader can batch the combined data without size errors.
combined_encodings = {
    field: val_encodings[field] + test_encodings[field]
    for field in val_encodings.keys()
}
combined_labels = val_labels + pseudo_labels
combined_dataset = SentimentDataset(combined_encodings, combined_labels)
combined_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True)

print("Starting Phase 2 Combined Training...")
# Second round uses a lower learning rate than the first round.
train_model(model, combined_loader, epochs=5, lr=1e-4)

### Final Evaluation on Validation

In [None]:
# NOTE: val_dataset is the same data the model was trained on in both phases,
# so the scores below measure fit to that data rather than generalization.
model.eval()
val_preds = []
eval_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
with torch.no_grad():
    for batch in tqdm.tqdm(eval_loader, desc="Evaluating"):
        logits = model(
            batch['input_ids'].to(DEVICE),
            attention_mask=batch['attention_mask'].to(DEVICE),
        ).logits
        val_preds.extend(logits.argmax(dim=1).cpu().numpy())

# Map class ids back to label strings so they compare against the raw column.
val_predict_labels = [INV_LABEL_MAP[class_id] for class_id in val_preds]
true_labels = df_val["labels"]
macro_f1 = f1_score(true_labels, val_predict_labels, average='macro')
print("F1 Score (Macro):", macro_f1)
print(classification_report(true_labels, val_predict_labels))

### Create Submission

In [None]:
# Predict the test set again with the model as it stands after Phase 2.
model.eval()
final_preds = []
with torch.no_grad():
    for batch in tqdm.tqdm(test_loader, desc="Final Prediction"):
        logits = model(
            batch['input_ids'].to(DEVICE),
            attention_mask=batch['attention_mask'].to(DEVICE),
        ).logits
        final_preds.extend(logits.argmax(dim=1).cpu().numpy())

# Convert class ids to label strings and pair them with the test ids.
test_ans = [INV_LABEL_MAP[class_id] for class_id in final_preds]
submission = pd.DataFrame({"labels": test_ans, "id": df_test["id"]})

# Name one copy after a short hash of the CSV content, so different
# predictions end up in different files.
csv_text = submission.to_csv(index=False)
hsh = hashlib.sha256(csv_text.encode('utf-8')).hexdigest()[:8]
submit_path = f"submission_{hsh}.csv"
for out_path in (submit_path, "submission.csv"):
    submission.to_csv(out_path, index=False)

print(f"Submission created: {submit_path}")
print(submission.head())