In [None]:
#!pip install transformers datasets accelerate

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from transformers import get_scheduler
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Multitask Regression


In [None]:
class EssayDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=512):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        targets = torch.tensor(self.targets[idx], dtype=torch.float)

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'targets': targets
        }


In [None]:
class SentenceBertRegressionModel(nn.Module):
    def __init__(self, dropoutRate=0.3, num_outputs=6):
        super().__init__()
        #self.encoder = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
        self.encoder = AutoModel.from_pretrained("./sbert_model/")

        for param in self.encoder.parameters():
            param.requires_grad = False

        # # Step 2: Unfreeze the last two encoder layers
        for layer in self.encoder.layer[-2:]:  # Last two transformer blocks
             for param in layer.parameters():
                 param.requires_grad = True

        self.dropout = nn.Dropout(dropoutRate)
        hidden_size = self.encoder.config.hidden_size
        self.regressor = nn.Linear(hidden_size, num_outputs)

    def forward(self, input_ids, attention_mask, targets=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0]
        x = self.dropout(cls_output)
        preds = self.regressor(x)
        return preds


In [None]:
df = pd.read_csv("../../data/train.csv")
texts = df["full_text"].tolist()
targets = df[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]].values

train_texts, val_texts, train_targets, val_targets = train_test_split(texts, targets, test_size=0.15, random_state=42)

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # SentenceBert ausprobieren
#model_name = "sentence-transformers/all-MiniLM-L6-v2"
#tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
tokenizer = AutoTokenizer.from_pretrained("./sbert_model/", local_files_only=True)

train_dataset = EssayDataset(train_texts, train_targets, tokenizer)
val_dataset = EssayDataset(val_texts, val_targets, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)


In [5]:
class MCRMSELoss(nn.Module): # noch damit ausprobieren
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')  # wichtig!

    def forward(self, preds, targets):
        # preds/targets: [batch_size, num_targets]
        colwise_mse = self.mse(preds, targets).mean(dim=0)  # Mittelwert pro Spalte
        colwise_rmse = torch.sqrt(colwise_mse + 1e-8)       # RMSE pro Ziel
        return colwise_rmse.mean()                         # Mittelwert über Ziele


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# best_model = SentenceBertRegressionModel(dropoutRate=best_params["dropout"]).to(device)
best_model = SentenceBertRegressionModel(dropoutRate=0.3).to(device)


# train_loader = DataLoader(train_dataset, batch_size=best_params["batch_size"], shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=best_params["batch_size"])
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

optimizer = torch.optim.AdamW(best_model.parameters(), lr=2e-5)
loss_fn = MCRMSELoss()


epochs = 3

for epoch in range(epochs):
    best_model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        optimizer.zero_grad()
        preds = best_model(input_ids, attention_mask)
        loss = loss_fn(preds, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}: Training Loss = {avg_train_loss:.4f}")

# Evaluation
predictions = []
best_model.eval()
val_loss = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)

        preds = best_model(input_ids, attention_mask)
        loss = loss_fn(preds, targets)
        val_loss += loss.item()
        predictions.append(preds.cpu().numpy())

predictions = np.vstack(predictions)


avg_val_loss = val_loss / len(val_loader)
print(f"\n📊 Final Validation Loss (MCRMSE) with best params: {avg_val_loss:.4f}")

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'RobertaForMaskedLM' object has no attribute 'layer'

In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

val_df = pd.read_csv('validation_set.csv')

val_texts = val_df['text'].tolist()
val_labels = val_df['label'].tolist()

val_embeddings = best_model.encode(val_texts, convert_to_tensor=False)

predictions = [1 if np.dot(emb, emb) > 0.5 else 0 for emb in val_embeddings]  

accuracy = accuracy_score(val_labels, predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

In [None]:
# Epoch 1: loss = 3.1710
# Epoch 2: loss = 2.4400
# Epoch 3: loss = 2.0149

In [None]:
import pandas as pd

test_df = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/test.csv")  
test_texts = test_df["full_text"].tolist()


In [None]:

columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]

pred_df = pd.DataFrame(predictions, columns=columns)

if "text_id" in test_df.columns:
    pred_df.insert(0, "text_id", test_df["text_id"])


pred_df.to_csv("/kaggle/working/submission.csv", index=False)
