In [None]:
#!pip install transformers datasets accelerate

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from transformers import get_scheduler
from sentence_transformers import SentenceTransformer, losses
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sentence_transformers import InputExample

# Multitask Regression


In [None]:
from sentence_transformers import SentenceTransformer

# Read the competition train/test tables (in that order) and load the
# locally stored SBERT checkpoint that serves as the text encoder.
traindf, testdf = map(
    pd.read_csv,
    (
        "/kaggle/input/feedback-prize-english-language-learning/train.csv",
        "/kaggle/input/feedback-prize-english-language-learning/test.csv",
    ),
)

sbert_model = SentenceTransformer("/kaggle/input/sbert-model")

In [None]:
from torch.utils.data import Dataset
import torch

class EssayRegressionDataset(Dataset):
    """Map-style dataset yielding ``(essay_text, scores)`` pairs.

    The essay text is read from the ``full_text`` column; the scores are the
    six columns ``cohesion``, ``syntax``, ``vocabulary``, ``phraseology``,
    ``grammar`` and ``conventions`` (in that order), stored as a float32 array.
    """

    def __init__(self, dataframe):
        """Extract texts and the float32 score matrix from ``dataframe``."""
        score_columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
        self.texts = dataframe.loc[:, "full_text"].tolist()
        self.labels = dataframe[score_columns].to_numpy().astype('float32')

    def __len__(self):
        """Number of essays in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the essay string and a fresh float tensor of its scores."""
        scores = torch.tensor(self.labels[idx], dtype=torch.float32)
        return self.texts[idx], scores


In [None]:
from sentence_transformers import SentenceTransformer, models, losses, InputExample
from torch import nn
import torch
from torch.utils.data import DataLoader

# Step 1: load the SBERT backbone (already done above as `sbert_model`)


# Optional: freeze SBERT first (controlled via `requires_grad` in the class below)


# Step 2: define the regression head
class SBERTRegressor(nn.Module):
    """Sentence-BERT encoder followed by a small MLP regression head.

    Args:
        sbert: A ``SentenceTransformer``-like module exposing ``tokenize(texts)``
            and a forward pass that returns a dict containing
            ``"sentence_embedding"``.
        output_dim: Number of regression targets predicted per text.
        embedding_dim: Width of the sentence embedding fed to the head. When
            ``None`` it is taken from ``sbert.get_sentence_embedding_dimension()``
            if available, otherwise it falls back to 384.
        freeze_sbert: If ``True`` the encoder weights are frozen and only the
            regression head is trained; the default keeps them trainable.
    """

    def __init__(self, sbert, output_dim=6, embedding_dim=None, freeze_sbert=False):
        super().__init__()
        self.sbert = sbert

        # Trainable by default (fine-tuning); frozen when requested.
        for param in self.sbert.parameters():
            param.requires_grad = not freeze_sbert

        if embedding_dim is None:
            get_dim = getattr(sbert, "get_sentence_embedding_dimension", None)
            embedding_dim = get_dim() if callable(get_dim) else None
            if embedding_dim is None:
                embedding_dim = 384  # historical default of this notebook

        self.regressor = nn.Sequential(
            nn.Linear(embedding_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, output_dim)
        )

    def forward(self, sentences):
        """Predict the targets for one text or a batch of texts.

        Args:
            sentences: A single string, or a list/tuple of strings.

        Returns:
            Tensor of shape ``(output_dim,)`` for a single string, or
            ``(batch, output_dim)`` for a sequence of strings.
        """
        single = isinstance(sentences, str)
        batch = [sentences] if single else list(sentences)

        # Do NOT use ``sbert.encode`` here: it is an inference helper that runs
        # without gradient tracking and returns detached embeddings, so the
        # encoder would never receive gradients. Tokenise and call the module's
        # forward pass directly instead.
        features = self.sbert.tokenize(batch)
        device = next(self.regressor.parameters()).device
        features = {
            key: value.to(device) if isinstance(value, torch.Tensor) else value
            for key, value in features.items()
        }
        embeddings = self.sbert(features)["sentence_embedding"]

        preds = self.regressor(embeddings)
        # Preserve the 1-D output for single-string calls.
        return preds[0] if single else preds


In [None]:
class MCRMSELoss(nn.Module):
    """Mean column-wise root mean squared error.

    For inputs of shape ``[batch_size, num_targets]`` the squared error is
    averaged over the batch per target, square-rooted (with a ``1e-8`` offset
    under the root), and the per-target RMSE values are then averaged.
    """

    def __init__(self):
        super().__init__()
        # Element-wise errors are required so they can be reduced per column.
        self.mse = nn.MSELoss(reduction='none')

    def forward(self, preds, targets):
        """Return the scalar MCRMSE between ``preds`` and ``targets``."""
        squared_errors = self.mse(preds, targets)
        per_target_mse = squared_errors.mean(dim=0)
        per_target_rmse = torch.sqrt(per_target_mse + 1e-8)
        return per_target_rmse.mean()



In [None]:
# Hold out 15% of the labelled essays for validation (fixed seed for a
# reproducible split), then wrap both parts in datasets and batch-size-8
# loaders. Only the training loader shuffles.
train_df, val_df = train_test_split(traindf, test_size=0.15, random_state=42)

train_dataset, val_dataset = (
    EssayRegressionDataset(train_df),
    EssayRegressionDataset(val_df),
)

train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=8,
    shuffle=False,
)


In [None]:


# Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SBERTRegressor(sbert_model).to(device)

# Optimisation
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()

epochs = 30
for epoch in range(epochs):
    # --- Training phase ---
    # train() has to be re-enabled every epoch because the validation phase
    # below switches the model to eval(); otherwise every epoch after the
    # first would train with dropout disabled.
    model.train()
    train_loss = 0
    for texts, labels in train_loader:
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # --- Validation phase (no gradient tracking, dropout off) ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            # The loader yields (texts, labels); move the labels to the
            # model's device before computing the loss.
            labels = labels.to(device)
            outputs = model(texts)
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()

    # Report the mean per-batch loss for both phases.
    print(f"Epoch {epoch+1} | Train Loss: {train_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f}")

In [None]:
import pandas as pd

# Load the competition test set and collect the raw essay strings for inference.
test_df = pd.read_csv(
    "/kaggle/input/feedback-prize-english-language-learning/test.csv"
)
test_texts = test_df.loc[:, "full_text"].tolist()


In [None]:
import numpy as np

# Inference: score each test essay individually with dropout disabled and no
# gradient tracking, then stack the per-essay outputs into one 2-D array.
model.eval()
with torch.no_grad():
    predictions = np.vstack(
        [model(text).cpu().numpy() for text in test_texts]
    )
print(predictions)

In [None]:

# Assemble the submission table: one column per predicted score, with the
# essay identifier placed first when the test file provides one.
columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
pred_df = pd.DataFrame(data=predictions, columns=columns)

if "text_id" in test_df.columns:
    pred_df.insert(loc=0, column="text_id", value=test_df["text_id"])

pred_df.to_csv("/kaggle/working/submission.csv", index=False)
