In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np

In [3]:
# Read the feature-engineered training table from CSV and dump its text
# representation to stdout.
df = pd.read_csv(filepath_or_buffer="data/feature_engineered/train_vars.csv")
print(df)

           text_id                                          full_text  \
0     0016926B079C  i think that students would benefit from learn...   
1     0022683E9EA5  when a problem is a change you have to let it ...   
2     00299B378633  dear, principal  if u change the school policy...   
3     003885A45F42  the best time in life is when you become yours...   
4     0049B1DF5CCC  small act of kindness can impact in other peop...   
...            ...                                                ...   
3926  FFD29828A873  i believe using cellphones in class for educat...   
3927  FFD9A83B0849  working alone, students do not have to argue w...   
3928  FFDC4011AC9C  "a problem is a chance for you to do your best...   
3929  FFE16D704B16  many people disagree with albert schweitzer's ...   
3930  FFED00D6E0BD  do you think that failure is the main thing fo...   

      cohesion  syntax  vocabulary  phraseology  grammar  conventions  \
0          3.5     3.5         3.0          3.0   

In [4]:
# Names of the score columns; these are excluded from the features and used
# as the prediction targets further down.
label_cols = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]

In [5]:
# Feature columns: every column of df that is neither the id, the raw text,
# nor one of the label columns. Index.difference sorts the remaining names
# where possible, so the resulting order does not depend on the CSV layout.
feature_cols = df.columns.difference(['text_id', 'full_text', *label_cols]).tolist()

In [6]:
# Discard every row with a missing value in any column. axis=0 / how='any'
# are the pandas defaults, spelled out here; df is rebound to the result.
df = df.dropna(axis=0, how='any')

In [7]:

# Loss function: mean column-wise RMSE (MCRMSE)
class MCRMSELoss(nn.Module):
    """Mean column-wise root-mean-squared error.

    The squared error is averaged over dim 0 separately for each column,
    the square root of every per-column mean is taken, and the resulting
    per-column RMSE values are averaged into a single scalar.

    Args:
        eps: Non-negative constant added to each column's MSE before the
            square root. The default ``1e-8`` matches the previously
            hard-coded value and keeps the gradient of ``sqrt`` finite when
            a column's error is exactly zero. Pass ``0.0`` to obtain the
            exact metric when the loss is only used for evaluation.

    Raises:
        ValueError: If ``eps`` is negative.
    """

    def __init__(self, eps=1e-8):
        super().__init__()
        if eps < 0:
            raise ValueError(f"eps must be non-negative, got {eps}")
        self.eps = eps
        # reduction='none' keeps the element-wise squared errors so they can
        # be averaged per column in forward().
        self.mse = nn.MSELoss(reduction='none')

    def forward(self, preds, targets):
        """Return the scalar MCRMSE between ``preds`` and ``targets``.

        Args:
            preds: Tensor of predictions; dim 0 is averaged over.
            targets: Tensor of ground-truth values, same shape as ``preds``.

        Returns:
            A 0-dim tensor holding the mean of the per-column RMSE values.
        """
        colwise_mse = self.mse(preds, targets).mean(dim=0)
        colwise_rmse = torch.sqrt(colwise_mse + self.eps)
        return colwise_rmse.mean()

# Feature matrix and targets (all label columns at once)
X = df[feature_cols].to_numpy()
y_all = df[label_cols].to_numpy()

# One shared train/test split for every target column
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_all,
    test_size=0.2,
    random_state=42,
)

# Fit a single multi-output random forest on the training part
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

# Predict the held-out part
y_pred = model.predict(X_test)

# Convert predictions and ground truth to float32 torch tensors
y_pred_tensor, y_test_tensor = (
    torch.tensor(arr, dtype=torch.float32) for arr in (y_pred, y_test)
)

# Compute the loss on the held-out part
mcrmse_loss = MCRMSELoss()
loss_value = mcrmse_loss(preds=y_pred_tensor, targets=y_test_tensor)

print(f"Evaluation mit MCRMSE: {loss_value.item():.4f}")

Evaluation mit MCRMSE: 0.5501
