In [1]:
#!pip install transformers datasets accelerate

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel
from transformers import get_scheduler
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna

  from .autonotebook import tqdm as notebook_tqdm


# Multitask Regression


In [2]:
class EssayDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=512):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        targets = torch.tensor(self.targets[idx], dtype=torch.float)

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'targets': targets
        }


In [3]:
class BertRegressionModel(BertPreTrainedModel):
    def __init__(self, config, dropoutRate= 0.3, num_outputs=6):
        super().__init__(config)
        self.bert = BertModel(config)

        #for param in bert.parameters():
            # Nur die linearen Layer trainieren
            # param.requires_grad = False
    
        for name, param in self.bert.named_parameters():
            #Nur die letzten 2 Layer trainieren
            if "layer.10" in name or "layer.11" in name:
                param.requires_grad = True
            else:
                param.requires_grad = False

        self.dropout = nn.Dropout(dropoutRate)
        self.regressor = nn.Linear(config.hidden_size, num_outputs)

    def forward(self, input_ids, attention_mask, targets=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_output)
        preds = self.regressor(x)
        return preds


In [4]:
df = pd.read_csv("../../data/train.csv")
texts = df["full_text"].tolist()
targets = df[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]].values

train_texts, val_texts, train_targets, val_targets = train_test_split(texts, targets, test_size=0.1)

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # SentenceBert ausprobieren
tokenizer = BertTokenizer.from_pretrained("./mein_bert_model/")

train_dataset = EssayDataset(train_texts, train_targets, tokenizer)
val_dataset = EssayDataset(val_texts, val_targets, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)


In [5]:
class MCRMSELoss(nn.Module): # noch damit ausprobieren
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')  # wichtig!

    def forward(self, preds, targets):
        # preds/targets: [batch_size, num_targets]
        colwise_mse = self.mse(preds, targets).mean(dim=0)  # Mittelwert pro Spalte
        colwise_rmse = torch.sqrt(colwise_mse + 1e-8)       # RMSE pro Ziel
        return colwise_rmse.mean()                         # Mittelwert über Ziele


In [8]:
def objective(trail):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    lr = trail.suggest_float("lr", 1e-5, 5e-5, log=True)
    dropout = trail.suggest_float("dropout", 0.1, 0.5)
    batch_size = trail.suggest_categorical("batch_size", [8, 16, 32])
    epochs = trail.suggest_int("epochs", 3, 10)

    model = BertRegressionModel.from_pretrained("./mein_bert_model/", dropoutRate=dropout).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = MCRMSELoss()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            optimizer.zero_grad()
            preds = model(input_ids, attention_mask)
            loss = loss_fn(preds, targets)
            loss.backward()
            optimizer.step()
    
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            preds = model(input_ids, attention_mask)
            loss = loss_fn(preds, targets)
            val_loss += loss.item()

    return val_loss / len(val_loader)

In [9]:

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[I 2025-06-26 14:00:21,138] A new study created in memory with name: no-name-749f8ee3-1db4-4a85-8d8b-fe4f1c9a812f
Some weights of BertRegressionModel were not initialized from the model checkpoint at ./mein_bert_model/ and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[W 2025-06-26 14:00:46,526] Trial 0 failed with parameters: {'lr': 4.139995659321839e-05, 'dropout': 0.26614683632225256, 'batch_size': 32, 'epochs': 5} because of the following error: RuntimeError('CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul with transpose_mat1 1 transpose_mat2 0 m 3072 n 16384 k 768 mat1_ld 768 mat2_ld 768 result_ld 3072 abcType 0 computeType 68 scaleType 0').
Traceback (most recent call last):
  File "c:\Users\timok\Documents\HSKA\Info_Master\2.Semester\AI_Labor\AI_Project\aiProjectVenv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_t

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul with transpose_mat1 1 transpose_mat2 0 m 3072 n 16384 k 768 mat1_ld 768 mat2_ld 768 result_ld 3072 abcType 0 computeType 68 scaleType 0

In [None]:
print("Beste Parameter:", study.best_params)
best_params = study.best_params

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#model = BertRegressionModel.from_pretrained("bert-base-uncased", num_outputs=6)
model = BertRegressionModel.from_pretrained("./mein_bert_model/",dropoutRate= best_params["dropout"], num_outputs=6)

train_loader = DataLoader(train_dataset, batch_size=best_params["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best_params["batch_size"])

model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=best_params["lr"])
#optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = MCRMSELoss()
num_epochs = best_params["epochs"]

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch + 1}: loss = {loss.item():.4f}")


Some weights of BertRegressionModel were not initialized from the model checkpoint at ./mein_bert_model/ and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: loss = 1.1264
Epoch 2: loss = 0.4747
Epoch 3: loss = 0.5721


In [20]:
import pandas as pd

test_df = pd.read_csv("../../data/test.csv")  
test_texts = test_df["full_text"].tolist()


In [21]:
test_dataset = EssayDataset(test_texts, targets=[[0]*6]*len(test_texts), tokenizer=tokenizer)  
test_loader = DataLoader(test_dataset, batch_size=8)


In [22]:
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.append(outputs.cpu().numpy())

import numpy as np
predictions = np.vstack(predictions)
print(predictions)


[[3.375704  3.175569  3.3665185 3.3763192 3.151475  3.2728837]
 [3.3416333 3.1318235 3.3881595 3.1415646 3.1871712 3.2017174]
 [3.7802155 3.502886  3.9107623 3.7809057 3.628125  3.6386893]]


In [23]:

columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]

pred_df = pd.DataFrame(predictions, columns=columns)

if "text_id" in test_df.columns:
    pred_df.insert(0, "text_id", test_df["text_id"])


pred_df.to_csv("predictions.csv", index=False)


In [25]:
def round_half(x):
    return round(x * 2) / 2

pred_df[columns] = pred_df[columns].map(round_half)

display(pred_df)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.5,3.0,3.5,3.5,3.0,3.5
1,000BAD50D026,3.5,3.0,3.5,3.0,3.0,3.0
2,00367BB2546B,4.0,3.5,4.0,4.0,3.5,3.5
