In [2]:
import pandas as pd
import json

In [4]:
# JSON-Datei laden
with open('generated_responses_few_errors.json', 'r') as f:
    data = json.load(f)

# Daten in ein DataFrame umwandeln
rows = []
for entry in data:
    question_id = entry["question_id"]
    question = entry["question"]
    question_type = entry["type"]
    for answer in entry["answers"]:
        rows.append({
            "question_id": question_id,
            "question": question,
            "type": question_type,
            "answer_text": answer["text"],
            "answer_label": answer["label"],
            "timestamp": answer["timestamp"]
        })

# DataFrame erstellen
df = pd.DataFrame(rows)

# DataFrame anzeigen
#print(df.head())

In [42]:
#df

In [22]:
single_select_df = df[df['type'] == 'SINGLE_SELECT']
#single_select_df

In [44]:
#single_select_df.isna().sum()

In [45]:
#print(single_select_df['answer_label'].unique())

In [23]:
single_select_df = single_select_df[['question', 'answer_text', 'answer_label']]

In [24]:
single_select_df

Unnamed: 0,question,answer_text,answer_label
0,May we process your data?,Yes.,Yes
1,May we process your data?,"No, I do not consent to the processing of my d...",No
2,May we process your data?,"Yes, you may process my data.",Yes
3,May we process your data?,"No, I do not consent.",No
4,May we process your data?,I don't know,No Preference
...,...,...,...
695,May we process your data?,I don't know,No Preference
696,May we process your data?,"Yes, you may process my data as described in y...",Yes
697,May we process your data?,,No Preference
698,May we process your data?,"Yes, you may process my data.",Yes


In [25]:
from sklearn.preprocessing import LabelEncoder

In [26]:
label_encoder = LabelEncoder()
single_select_df["label"] = label_encoder.fit_transform(single_select_df["answer_label"])  # z. B. "Yes" â†’ 0, "No" â†’ 1, "No Preference" â†’ 2

print(label_encoder.classes_)  # Zeigt, welche Zahl welchem Label entspricht

['No' 'No Preference' 'Yes']


In [27]:
from sklearn.model_selection import train_test_split

In [29]:
train_df, test_df = train_test_split(single_select_df, test_size=0.2, random_state=42, stratify=single_select_df["label"])

In [30]:
# Speichere die Datasets als JSON
train_df[["question", "answer_text", "label"]].to_json("train.json", orient="records", lines=True)
test_df[["question", "answer_text", "label"]].to_json("test.json", orient="records", lines=True)

In [31]:
print("Train size:", len(train_df), "Test size:", len(test_df))

Train size: 560 Test size: 140


In [19]:
!pip install datasets

In [20]:
from datasets import load_dataset

In [32]:
dataset = load_dataset("json", data_files={"train": "train.json", "test": "test.json"})
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer_text', 'label'],
        num_rows: 560
    })
    test: Dataset({
        features: ['question', 'answer_text', 'label'],
        num_rows: 140
    })
})


In [33]:
from transformers import AutoTokenizer

In [34]:
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    text_inputs = [q + " " + a for q, a in zip(examples["question"], examples["answer_text"])]
    return tokenizer(text_inputs, truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [35]:
from transformers import AutoModelForSequenceClassification

In [36]:
num_labels = len(label_encoder.classes_)  # Anzahl der Klassen (3)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
from transformers import TrainingArguments

In [38]:
training_args = TrainingArguments(
    output_dir="./roberta_classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)



In [61]:
# Use the One Cycle Policy
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    lr_scheduler_type="linear",  # 'linear' or 'cosine'
    warmup_steps=500,  # Steps to gradually increase the learning rate
    max_grad_norm=1.0,  # Gradient clipping
)



In [62]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Funktion zur Berechnung der Metriken
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Nimmt die Klasse mit der hÃ¶chsten Wahrscheinlichkeit
    acc = accuracy_score(labels, predictions)  # Accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")  # Andere Scores
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [63]:
from transformers import Trainer

In [64]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Hier fÃ¼gen wir die Metrik-Funktion hinzu!
)


trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,2e-05,1.0,1.0,1.0,1.0
2,0.0,7e-06,1.0,1.0,1.0,1.0
3,0.0,4e-06,1.0,1.0,1.0,1.0


TrainOutput(global_step=210, training_loss=3.185824306066414e-05, metrics={'train_runtime': 70.06, 'train_samples_per_second': 23.979, 'train_steps_per_second': 2.997, 'total_flos': 110507635445760.0, 'train_loss': 3.185824306066414e-05, 'epoch': 3.0})

In [65]:
model.save_pretrained("./roberta_finetuned")
tokenizer.save_pretrained("./roberta_finetuned")

('./roberta_finetuned/tokenizer_config.json',
 './roberta_finetuned/special_tokens_map.json',
 './roberta_finetuned/vocab.json',
 './roberta_finetuned/merges.txt',
 './roberta_finetuned/added_tokens.json',
 './roberta_finetuned/tokenizer.json')

In [66]:
label_mapping = {
    "LABEL_0": "No",
    "LABEL_1": "No Preference",
    "LABEL_2": "Yes"
}

In [72]:
test_df

Unnamed: 0,question,answer_text,answer_label,label
680,May we process your data?,"Yes, you may process my data.",Yes,2
444,May we process your data?,,No Preference,1
431,May we process your data?,"Yes, No, or Unsure?",No,0
693,May we process your data?,"Yes, No, or Unsure?",No,0
316,May we process your data?,"Yes, you may process my data.",Yes,2
...,...,...,...,...
554,May we process your data?,"Yes, you may process my data.",Yes,2
596,May we process your data?,"Yes, you may process my data as outlined in yo...",Yes,2
45,May we process your data?,"No, I do not consent.",No,0
32,May we process your data?,I don't care,No Preference,1


In [73]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./roberta_finetuned", tokenizer="./roberta_finetuned")

test_input = {
    "question": "May we process your data?",
    "answer_text": ""
}

# Kombinieren wie im Training (Frage + Antwort)
test_text = test_input["question"] + " " + test_input["answer_text"]
result = classifier(test_text)
print(result)

# Die vorhergesagte Klasse zurÃ¼ck in ein Label umwandeln
predicted_label = label_mapping[result[0]["label"]]

print("Vorhergesagtes Label:", predicted_label)


Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9999938011169434}]
Vorhergesagtes Label: No Preference


In [51]:
from transformers import pipeline

# Modell und Tokenizer aus gespeichertem Ordner laden
classifier = pipeline("text-classification", model="./roberta_finetuned", tokenizer="./roberta_finetuned")


OSError: Incorrect path_or_model_id: './roberta_finetuned'. Please provide either the path to a local folder or the repo_id of a model on the Hub.