# Approach: Embedding + MLP

In [141]:
import os
import pandas as pd
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import csv


In [142]:
train = pd.read_csv("../../data/train.csv")
test = pd.read_csv("../../data/test.csv")

# IN KAGGLE:
#train = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/train.csv")
#test = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/test.csv")

### Generate Embeddings

In [143]:
from sentence_transformers import SentenceTransformer

# SBERT Modell laden
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# IN KAGGLE (UPLOAD sbert_model.zip TO NOTEBOOK AS DATASET FIRST):
#sbert_model = SentenceTransformer("/kaggle/input/sentence-bert")

# Erzeuge Embeddings
embeddings = sbert_model.encode(train["full_text"].tolist(), show_progress_bar=True)

# Optional: Embeddings in die DataFrame schreiben
train["embedding"] = embeddings.tolist()


Batches: 100%|██████████| 123/123 [00:10<00:00, 12.26it/s]


### Define Dataset & Dataloader

In [144]:

class SBERTEmbeddingDataset(Dataset):
    def __init__(self, dataframe):
        self.embeddings = dataframe["embedding"].tolist()
        self.labels = dataframe[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]].values.astype(float)

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        embedding = torch.tensor(self.embeddings[idx], dtype=torch.float)
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        return embedding, labels


In [145]:
dataset = SBERTEmbeddingDataset(train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

### Define simple Feed Forward Net

In [146]:
def model_fn():
    return nn.Sequential(
        nn.Linear(384, 512),
        nn.ReLU(),
        nn.BatchNorm1d(512),
        nn.Dropout(0.4),

        nn.Linear(512, 256),
        nn.ReLU(),
        nn.BatchNorm1d(256),
        nn.Dropout(0.3),

        nn.Linear(256, 128),
        nn.ReLU(),
        nn.BatchNorm1d(128),
        nn.Dropout(0.2),

        nn.Linear(128, 6)  # Multi-Target Regression
    )


### Train Net

In [147]:
class MCRMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')  # wichtig!

    def forward(self, preds, targets):
        # preds/targets: [batch_size, num_targets]
        colwise_mse = self.mse(preds, targets).mean(dim=0)  # Mittelwert pro Spalte
        colwise_rmse = torch.sqrt(colwise_mse + 1e-8)       # RMSE pro Ziel
        return colwise_rmse.mean()                          # Mittelwert über Ziele


In [148]:
def cross_validate_kfold(dataset, model_fn, k=5, num_epochs=10, lr=1e-3, batch_size=32):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    fold_val_losses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
        print(f"\n--- Fold {fold + 1} ---")

        train_subset = torch.utils.data.Subset(dataset, train_idx)
        val_subset = torch.utils.data.Subset(dataset, val_idx)

        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=batch_size)

        model = model_fn().to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
        loss_fn = MCRMSELoss()

        for epoch in range(num_epochs):
            model.train()
            total_loss = 0

            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                preds = model(xb)
                loss = loss_fn(preds, yb)
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * xb.size(0)

            avg_loss = total_loss / len(train_loader.dataset)
            print(f"Epoch {epoch+1}: Train Loss = {avg_loss:.4f}")

            model.eval()
            val_loss = 0
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    preds = model(xb)
                    loss = loss_fn(preds, yb)
                    val_loss += loss.item() * xb.size(0)

            avg_val_loss = val_loss / len(val_loader.dataset)
            scheduler.step(avg_val_loss)
            print(f"Fold {fold + 1} Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")

        fold_val_losses.append(avg_val_loss)

    print(f"\nAverage Validation Loss across {k} folds: {sum(fold_val_losses)/k:.4f}")
    return fold_val_losses, model


In [149]:
dataset = SBERTEmbeddingDataset(train)

_, model = cross_validate_kfold(
    dataset=dataset,
    model_fn=model_fn,
    k=5,
    num_epochs=25,
    lr=1e-3,
    batch_size=32
)



--- Fold 1 ---
Epoch 1: Train Loss = 2.9261
Fold 1 Epoch 1 Validation Loss: 2.4199
Epoch 2: Train Loss = 1.4318
Fold 1 Epoch 2 Validation Loss: 0.7171
Epoch 3: Train Loss = 0.8524
Fold 1 Epoch 3 Validation Loss: 0.6066
Epoch 4: Train Loss = 0.7816
Fold 1 Epoch 4 Validation Loss: 0.5831
Epoch 5: Train Loss = 0.7530
Fold 1 Epoch 5 Validation Loss: 0.5922
Epoch 6: Train Loss = 0.7035
Fold 1 Epoch 6 Validation Loss: 0.5769
Epoch 7: Train Loss = 0.6840
Fold 1 Epoch 7 Validation Loss: 0.5652
Epoch 8: Train Loss = 0.6646
Fold 1 Epoch 8 Validation Loss: 0.5743
Epoch 9: Train Loss = 0.6497
Fold 1 Epoch 9 Validation Loss: 0.5685
Epoch 10: Train Loss = 0.6430
Fold 1 Epoch 10 Validation Loss: 0.5660
Epoch 11: Train Loss = 0.6247
Fold 1 Epoch 11 Validation Loss: 0.5643
Epoch 12: Train Loss = 0.6132
Fold 1 Epoch 12 Validation Loss: 0.5564
Epoch 13: Train Loss = 0.6161
Fold 1 Epoch 13 Validation Loss: 0.5625
Epoch 14: Train Loss = 0.6038
Fold 1 Epoch 14 Validation Loss: 0.5597
Epoch 15: Train Loss =

### Generate Submission

In [150]:
embeddings = sbert_model.encode(test["full_text"].tolist(), show_progress_bar=True)

# Optional: Embeddings in die DataFrame schreiben
test["embedding"] = embeddings.tolist()

Batches: 100%|██████████| 1/1 [00:00<00:00, 29.18it/s]


In [151]:
class SBERTTestDataset(Dataset):
    def __init__(self, dataframe):
        self.ids = dataframe["text_id"].tolist()
        self.embeddings = dataframe["embedding"].tolist()

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        embedding = torch.tensor(self.embeddings[idx], dtype=torch.float)
        return self.ids[idx], embedding

In [152]:
test_dataset = SBERTTestDataset(test)
test_loader = DataLoader(test_dataset, batch_size=32)

In [153]:
model.eval()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

predictions = []
text_ids = []

with torch.no_grad():
    for ids, xb in test_loader:
        xb = xb.to(device)
        preds = model(xb)
        preds = preds.clamp(0, 5)
        preds = preds.cpu().numpy()
        predictions.extend(preds.tolist())
        text_ids.extend(ids)

submission_df = pd.DataFrame(predictions, columns=[
    "cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"
])
submission_df.insert(0, "text_id", text_ids)

submission_df.to_csv("submission.csv", index=False)