<a href="https://colab.research.google.com/github/Michel-p16/PDS-Project/blob/capstone_korbi/DTS_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Distilbert zur Evaluierung


In [80]:
!pip install transformers datasets
#1. Daten laden + filtern (SINGLE SELECT here)
import json
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import classification_report

import os

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn

with open("generated_responses_few_errors.json", "r") as file:
    dataset = json.load(file)

# Filtere das Dataset nach Fragen mit dem Typ "SINGLE_SELECT"
filtered_dataset = [example for example in dataset if example["type"] == "SINGLE_SELECT"]





In [68]:
#2. Daten formatieren

def convert_to_distilbert_format(data):
    formatted_data = []
    all_labels = []  # Labels für Single-Select

    for example in data:
        question = example["question"]
        answers = example["answers"]

        for answer in answers:
            text = answer["text"]
            label = answer.get("label")

            # Kontext aus der Antwort extrahieren (da er nicht separat vorhanden ist)
            context = text

            if label is not None:
                all_labels.append(label)
            formatted_data.append({
                "question": question,
                "context": context,  # Kontext ist hier die Antwort selbst
                "answers": {"text": [text], "answer_start": [0]}, # answer_start ist 0, da Kontext = Antwort
                "label": label
            })

    # Label-Encodierung für Single-Select
    label_encoder = LabelEncoder()
    label_encoder.fit(list(set(all_labels)))
    for example in formatted_data:
        if example["label"] is not None:
            example["label"] = label_encoder.transform([example["label"]])[0]

    return formatted_data, label_encoder  # Gib formatted_data und label_encoder zurück


# Hier den Funktionsaufruf außerhalb der Funktion platzieren:
formatted_dataset, label_encoder = convert_to_distilbert_format(filtered_dataset)

In [69]:
#3. Daten splitten 80% Training 20% Evaluation

train_data_formatted, eval_data_formatted = train_test_split(formatted_dataset, test_size=0.2, random_state=42)

print(f"Trainingsdaten: {len(train_data_formatted)}")
print(f"Evaluationsdaten: {len(eval_data_formatted)}")

Trainingsdaten: 560
Evaluationsdaten: 140


In [70]:
import random

# Anzahl der Beispiele, die ausgegeben werden sollen
num_samples = 10

# Zufällige Beispiele aus den Trainingsdaten auswählen
random_indices_train = random.sample(range(len(train_data_formatted)), num_samples)

print("Formatierte Trainingsdaten:")
for index in random_indices_train:
    example = train_data_formatted[index]
    print(f"Beispiel {index + 1}:")
    print(f"  Frage: {example['question']}")
    print(f"  Kontext: {example['context']}")
    print(f"  Antworten: {example['answers']}")
    print(f"  Label: {example['label']}")  # Überprüfen Sie hier den Label-Typ
    print("-" * 20)

# Zufällige Beispiele aus den Evaluationsdaten auswählen
random_indices_eval = random.sample(range(len(eval_data_formatted)), num_samples)

print("\nFormatierte Evaluationsdaten:")
for index in random_indices_eval:
    example = eval_data_formatted[index]
    print(f"Beispiel {index + 1}:")
    print(f"  Frage: {example['question']}")
    print(f"  Kontext: {example['context']}")
    print(f"  Antworten: {example['answers']}")
    print(f"  Label: {example['label']}")  # Überprüfen Sie hier den Label-Typ
    print("-" * 20)

Formatierte Trainingsdaten:
Beispiel 115:
  Frage: May we process your data?
  Kontext: No, I do not consent to the processing of my data.
  Antworten: {'text': ['No, I do not consent to the processing of my data.'], 'answer_start': [0]}
  Label: 0
--------------------
Beispiel 26:
  Frage: May we process your data?
  Kontext: Yes, you may process my data.
  Antworten: {'text': ['Yes, you may process my data.'], 'answer_start': [0]}
  Label: 2
--------------------
Beispiel 282:
  Frage: May we process your data?
  Kontext: No, I do not consent to the processing of my data.
  Antworten: {'text': ['No, I do not consent to the processing of my data.'], 'answer_start': [0]}
  Label: 0
--------------------
Beispiel 251:
  Frage: May we process your data?
  Kontext: I don't care
  Antworten: {'text': ["I don't care"], 'answer_start': [0]}
  Label: 1
--------------------
Beispiel 229:
  Frage: May we process your data?
  Kontext: No, I do not consent to the processing of my data.
  Antworten:

In [71]:
# 4. Labels vorbereiten
# Extrahiere die Labels für den LabelEncoder
train_labels = [example["label"] for example in train_data_formatted if example["label"] is not None]

# Erstellen und Anpassen des LabelEncoders
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)

# Funktion zum Transformieren der Labels
def transform_label(example):
    if example["label"] is not None:
        example["label"] = label_encoder.transform([example["label"]])[0]
    return example

# Transformiere die Labels in den formatierten Daten
train_data_formatted = [transform_label(example) for example in train_data_formatted]

In [74]:
# 5. Datasets erstellen
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data_formatted))
eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data_formatted))


# Überprüfen Datenmenge
print(f"Anzahl der Trainingsdaten: {len(train_dataset)}")
print(f"Anzahl der Evaluationsdaten: {len(eval_dataset)}")


Anzahl der Trainingsdaten: 560
Anzahl der Evaluationsdaten: 140


In [75]:
# 6. Vorverarbeitung

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(
        examples["question"], examples["context"],
        padding=True, truncation=True, max_length=128
    )


train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [76]:
from collections import Counter

train_labels = [sample['label'] for sample in train_data_formatted]
eval_labels = [sample['label'] for sample in eval_data_formatted]

print("Trainingsdaten Klassenverteilung:", Counter(train_labels))
print("Evaluationsdaten Klassenverteilung:", Counter(eval_labels))


Trainingsdaten Klassenverteilung: Counter({2: 228, 0: 226, 1: 106})
Evaluationsdaten Klassenverteilung: Counter({2: 54, 0: 53, 1: 33})


In [77]:
# 7.Training start
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_encoder.classes_))  # num_labels anpassen

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # Evaluation nach jeder Epoche
    save_strategy="epoch",  # Speichern nach jeder Epoche
    logging_dir="./logs",
    load_best_model_at_end=True, # bestes Modell am Ende laden,
    metric_for_best_model="accuracy",  # Wähle die Metrik, die das beste Modell definiert
    greater_is_better=True,  # Für Metriken wie Accuracy, Precision, etc
    report_to="none",  # Deaktiviert WandB
    logging_steps=10,  # Logge Fortschritte alle 10 Schritte
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Evaluationsdatensatz hinzufügen
    compute_metrics=compute_metrics,  # Metrikenfunktion hinzufügen

)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0632,0.054803,1.0,1.0,1.0,1.0
2,0.0241,0.019766,1.0,1.0,1.0,1.0


TrainOutput(global_step=140, training_loss=0.25494763063532966, metrics={'train_runtime': 453.0649, 'train_samples_per_second': 2.472, 'train_steps_per_second': 0.309, 'total_flos': 7534217658240.0, 'train_loss': 0.25494763063532966, 'epoch': 2.0})

In [83]:
#8. Evaluation nach Training

# 1. Laden des besten Modells
checkpoints = [ckpt for ckpt in os.listdir("./results") if ckpt.startswith("checkpoint")]
latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('-')[1]))
model_path = f"./results/{latest_checkpoint}"
print(f"Using model from: {model_path}")

# 2. Evaluation Dataset vorbereiten
eval_df = pd.DataFrame(eval_data_formatted)  # Verwende deine formatierten Evaluationsdaten
eval_texts = eval_df["question"].tolist()
eval_labels = eval_df["label"].tolist()  # Extrahiere die tatsächlichen Labels

# 3. Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Hugging Face Dataset erstellen
eval_dataset = Dataset.from_dict({"text": eval_texts, "label": eval_labels})
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# 4. Modellvorhersagen
predictions = trainer.predict(eval_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# 5. Vorhersagen in DataFrame einfügen
eval_df["predicted_label"] = predicted_labels

# 6. Labels zurückmappen (falls codiert)
eval_df["true_label"] = eval_df["label"]
eval_df["predicted_label_decoded"] = label_encoder.inverse_transform(eval_df["predicted_label"])
eval_df["true_label_decoded"] = label_encoder.inverse_transform(eval_df["true_label"])

# 7. Berechnung der Metriken
# Konvertiere die Klassenlabels in Strings
target_names = [str(label) for label in label_encoder.classes_]
# Klassifikationsbericht erstellen und ausgeben
print("Klassifikationsbericht:")
print(classification_report(eval_df["true_label"], eval_df["predicted_label"], target_names=target_names))





Using model from: ./results/checkpoint-140


Map:   0%|          | 0/140 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Klassifikationsbericht:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        53
           1       0.24      1.00      0.38        33
           2       0.00      0.00      0.00        54

    accuracy                           0.24       140
   macro avg       0.08      0.33      0.13       140
weighted avg       0.06      0.24      0.09       140



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [88]:
# 9. Manuelle Eingabe Test

# 1. Laden des Modells und Tokenizers
model_path = f"./results/{latest_checkpoint}"  # Stelle sicher, dass das Modell korrekt geladen wird
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Mapping der Labels
label_mapping = {
    0: "No",
    1: "No Preference",
    2: "Yes"
}

# 2. Manuelle Eingabe
text = input("Gib einen Text ein, den das Modell klassifizieren soll: ")

# 3. Vorverarbeitung der Eingabe
inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

# 4. Modellvorhersage
model.eval()  # Setzt das Modell in den Evaluationsmodus
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

# 5. Decodierung der Vorhersage
predicted_label = label_mapping[predicted_class]  # Nutze das Mapping

# 6. Ausgabe
print(f"Der eingegebene Text wurde als '{predicted_label}' klassifiziert.")


Gib einen Text ein, den das Modell klassifizieren soll: sure u can do that
Der eingegebene Text wurde als 'No Preference' klassifiziert.


In [None]:
#evaluationsergebnisse
print(eval_df[["question", "predicted_label"]])