# Fixing Broken BERT Embeddings (ROBUST VERSION)

**IMPORTANT**: To avoid variable collisions, please **Restart Kernel & Run All** after copying this code.

Strategy:
1. **Initialize**: Replace all-zero rows in `word_embeddings` with the mean of the healthy (non-zero) vectors plus small Gaussian noise.
2. **Phase 1**: Tune ONLY embeddings on `val_dataset.csv`.
3. **Phase 2**: Pseudo-label `test.csv`, then tune on BOTH datasets using fixed padding.


In [None]:
import torch
import pandas as pd
import numpy as np
import hashlib
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, pipeline
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import tqdm
from sklearn.metrics import f1_score, classification_report

# Seed the NumPy and PyTorch global RNGs with the same fixed value.
np.random.seed(21)
torch.manual_seed(21)

# Input CSV locations and the Hugging Face model identifier.
VAL_PATH = "/kaggle/input/cyprus-ai-camp-broken-bert/val_dataset.csv"
TEST_PATH = "/kaggle/input/cyprus-ai-camp-broken-bert/test.csv"
MODEL_NAME = "Ilseyar-kfu/broken_bert"

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# Tokenizer sequence length and DataLoader batch size used by every phase.
MAX_LEN = 128
BATCH_SIZE = 32

# Label name -> class id, and the inverse lookup used when writing predictions.
LABEL_MAP = dict(neutral=0, positive=1, negative=2)
INV_LABEL_MAP = dict(zip(LABEL_MAP.values(), LABEL_MAP.keys()))

### Dataset Class

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset over pre-tokenized examples with optional labels.

    Args:
        encodings: Mapping from field name to a per-example sequence that
            ``torch.tensor`` accepts. It must contain ``'input_ids'`` for
            ``len()`` to work.
        labels: Optional per-example labels. Leave as ``None`` for
            unlabelled data; items then carry no ``'labels'`` entry.

    Raises:
        ValueError: If ``labels`` is given and its length differs from the
            number of rows of any encoding field.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = {k: torch.tensor(v) for k, v in encodings.items()}
        self.labels = torch.tensor(labels) if labels is not None else None

        # Fail at construction time instead of raising IndexError (or silently
        # ignoring surplus labels) in the middle of an epoch.
        if self.labels is not None:
            n_labels = len(self.labels)
            for key, tensor in self.encodings.items():
                if len(tensor) != n_labels:
                    raise ValueError(
                        f"encodings[{key!r}] has {len(tensor)} rows "
                        f"but {n_labels} labels were given"
                    )

    def __getitem__(self, idx):
        """Return a dict with every encoding field (and the label, if any) at ``idx``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of examples, taken from the ``'input_ids'`` field."""
        return len(self.encodings['input_ids'])

### Load Data and Model

In [None]:
# Read the labelled validation split and the test split.
df_val = pd.read_csv(VAL_PATH)
df_test = pd.read_csv(TEST_PATH)

# Fetch the tokenizer and the classifier, then move the classifier to DEVICE.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# Repair the word-embedding table in place. Rows whose squared sum is exactly
# zero are overwritten with the mean of the remaining rows plus Gaussian noise
# scaled to 10% of the remaining rows' overall standard deviation.
with torch.no_grad():
    weights = model.bert.embeddings.word_embeddings.weight.data
    is_zero = weights.pow(2).sum(dim=1).eq(0)
    non_zero_indices = torch.where(is_zero.logical_not())[0]
    nz_mean = weights[non_zero_indices].mean(dim=0)
    nz_std = weights[non_zero_indices].std()
    weights[is_zero] = nz_mean + torch.randn_like(weights[is_zero]) * nz_std * 0.1

# Freeze the whole network, then re-enable gradients for the word embeddings only.
model.requires_grad_(False)
model.bert.embeddings.word_embeddings.weight.requires_grad_(True)

print("Model prepared and embeddings initialized.")

### Phase 1: Training on Validation

In [None]:
# Phase 1: tune only the word-embedding matrix on the labelled validation split.
val_texts = df_val["text"].tolist()

# Series.map yields NaN for any label missing from LABEL_MAP, which would
# silently turn the label list into floats instead of integer class ids.
# Stop early and name the offending labels instead.
mapped_labels = df_val["labels"].map(LABEL_MAP)
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unknown_labels = sorted(df_val.loc[unmapped_rows, "labels"].astype(str).unique())
    raise ValueError(f"Labels not present in LABEL_MAP: {unknown_labels}")
val_labels = mapped_labels.astype(int).tolist()

# Pad or truncate every example to MAX_LEN so all batches share one shape.
val_encodings = tokenizer(val_texts, truncation=True, padding='max_length', max_length=MAX_LEN)
val_dataset = SentimentDataset(val_encodings, val_labels)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Only the word-embedding parameters are handed to the optimizer.
optimizer = AdamW(model.bert.embeddings.word_embeddings.parameters(), lr=5e-4)
epochs = 15
# The scheduler is stepped once per batch, so the total is batches * epochs.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(val_loader)*epochs)

model.train()
for epoch in range(epochs):
    for batch in tqdm.tqdm(val_loader, desc=f"P1 Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()
        optimizer.step()
        scheduler.step()

### Phase 2: Pseudo-labeling and Combined Tuning

In [None]:
# Step 1: predict a class id for every test row with the current model.
model.eval()
test_texts = df_test["text"].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=MAX_LEN)
test_dataset = SentimentDataset(test_encodings)
# shuffle=False keeps the predictions in the same order as test_texts.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

pseudo_labels = []
with torch.no_grad():
    for batch in tqdm.tqdm(test_loader, desc="Pseudo-labeling"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask)
        # The highest-scoring class per row becomes that row's pseudo-label.
        pseudo_labels.extend(outputs.logits.argmax(dim=1).cpu().numpy())

# Step 2: build one dataset from the validation rows (true labels) followed by
# the test rows (pseudo-labels), tokenized again with the same fixed padding.
print("Combining and re-tokenizing...")
all_texts = val_texts + test_texts
all_labels = val_labels + pseudo_labels

combined_encodings = tokenizer(all_texts, truncation=True, padding='max_length', max_length=MAX_LEN)
combined_dataset = SentimentDataset(combined_encodings, all_labels)
combined_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Step 3: continue tuning the word embeddings with a fresh optimizer and a
# constant learning rate (no scheduler is attached here).
optimizer = AdamW(model.bert.embeddings.word_embeddings.parameters(), lr=1e-4)
for epoch in range(5):
    model.train()
    for batch in tqdm.tqdm(combined_loader, desc=f"P2 Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()
        optimizer.step()

### Final Prediction

In [None]:
# Predict a class id for each test row with the tuned model.
model.eval()
final_preds = []
with torch.no_grad():
    for batch in tqdm.tqdm(test_loader, desc="Final Pred"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask)
        final_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())

# Translate class ids back to label names and write them next to the test ids.
predicted_names = list(map(INV_LABEL_MAP.__getitem__, final_preds))
submission = pd.DataFrame({"labels": predicted_names, "id": df_test["id"]})
submission.to_csv("submission.csv", index=False)
print("Done! Output saved to submission.csv")