<a href="https://colab.research.google.com/github/Michel-p16/PDS-Project/blob/capstone_korbi/DTS_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Distilbert zur Evaluierung


In [27]:
!pip install transformers datasets
#1. Daten laden + filtern (SINGLE SELECT here)
import json
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn

with open("generated_responses_few_errors.json", "r") as file:
    dataset = json.load(file)

# Filtere das Dataset nach Fragen mit dem Typ "SINGLE_SELECT"
filtered_dataset = [example for example in dataset if example["type"] == "SINGLE_SELECT"]

# Hier den Code zum Reduzieren der Datenmenge einfügen:
#smaller_dataset = filtered_dataset[:500]  # Wähle die ersten 500 Datenpunkte
#formatted_dataset, label_encoder = convert_to_distilbert_format(smaller_dataset)




In [14]:
#2. Daten formatieren

def convert_to_distilbert_format(data):
    formatted_data = []
    all_labels = []  # Labels für Single-Select

    for example in data:
        question = example["question"]
        answers = example["answers"]

        for answer in answers:
            text = answer["text"]
            label = answer.get("label")

            # Kontext aus der Antwort extrahieren (da er nicht separat vorhanden ist)
            context = text

            if label is not None:
                all_labels.append(label)
            formatted_data.append({
                "question": question,
                "context": context,  # Kontext ist hier die Antwort selbst
                "answers": {"text": [text], "answer_start": [0]}, # answer_start ist 0, da Kontext = Antwort
                "label": label
            })

    # Label-Encodierung für Single-Select
    label_encoder = LabelEncoder()
    label_encoder.fit(list(set(all_labels)))
    for example in formatted_data:
        if example["label"] is not None:
            example["label"] = label_encoder.transform([example["label"]])[0]

    return formatted_data, label_encoder  # Gib formatted_data und label_encoder zurück


# Hier den Funktionsaufruf außerhalb der Funktion platzieren:
formatted_dataset, label_encoder = convert_to_distilbert_format(filtered_dataset)

In [15]:
#3. Daten splitten 80% Training 20% Evaluation

train_data_formatted, eval_data_formatted = train_test_split(formatted_dataset, test_size=0.2, random_state=42)

print(f"Trainingsdaten: {len(train_data_formatted)}")
print(f"Evaluationsdaten: {len(eval_data_formatted)}")

Trainingsdaten: 560
Evaluationsdaten: 140


In [16]:
import random

# Anzahl der Beispiele, die ausgegeben werden sollen
num_samples = 10

# Zufällige Beispiele aus den Trainingsdaten auswählen
random_indices_train = random.sample(range(len(train_data_formatted)), num_samples)

print("Formatierte Trainingsdaten:")
for index in random_indices_train:
    example = train_data_formatted[index]
    print(f"Beispiel {index + 1}:")
    print(f"  Frage: {example['question']}")
    print(f"  Kontext: {example['context']}")
    print(f"  Antworten: {example['answers']}")
    print(f"  Label: {example['label']}")  # Überprüfen Sie hier den Label-Typ
    print("-" * 20)

# Zufällige Beispiele aus den Evaluationsdaten auswählen
random_indices_eval = random.sample(range(len(eval_data_formatted)), num_samples)

print("\nFormatierte Evaluationsdaten:")
for index in random_indices_eval:
    example = eval_data_formatted[index]
    print(f"Beispiel {index + 1}:")
    print(f"  Frage: {example['question']}")
    print(f"  Kontext: {example['context']}")
    print(f"  Antworten: {example['answers']}")
    print(f"  Label: {example['label']}")  # Überprüfen Sie hier den Label-Typ
    print("-" * 20)

Formatierte Trainingsdaten:
Beispiel 215:
  Frage: May we process your data?
  Kontext: Sure, or no.
  Antworten: {'text': ['Sure, or no.'], 'answer_start': [0]}
  Label: 0
--------------------
Beispiel 20:
  Frage: May we process your data?
  Kontext: I don't care
  Antworten: {'text': ["I don't care"], 'answer_start': [0]}
  Label: 1
--------------------
Beispiel 17:
  Frage: May we process your data?
  Kontext: I don't care
  Antworten: {'text': ["I don't care"], 'answer_start': [0]}
  Label: 1
--------------------
Beispiel 237:
  Frage: May we process your data?
  Kontext: Sure, or no.
  Antworten: {'text': ['Sure, or no.'], 'answer_start': [0]}
  Label: 0
--------------------
Beispiel 469:
  Frage: May we process your data?
  Kontext: Yes, you may process my data as described in your privacy policy.
  Antworten: {'text': ['Yes, you may process my data as described in your privacy policy.'], 'answer_start': [0]}
  Label: 2
--------------------
Beispiel 537:
  Frage: May we process 

In [17]:
# 4. Labels vorbereiten
# Extrahiere die Labels für den LabelEncoder
train_labels = [example["label"] for example in train_data_formatted if example["label"] is not None]

# Erstellen und Anpassen des LabelEncoders
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)

# Funktion zum Transformieren der Labels
def transform_label(example):
    if example["label"] is not None:
        example["label"] = label_encoder.transform([example["label"]])[0]
    return example

# Transformiere die Labels in den formatierten Daten
train_data_formatted = [transform_label(example) for example in train_data_formatted]

In [19]:
# 5. Datasets erstellen
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data_formatted))
eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data_formatted))

# Begrenzen der Datenpunkte
train_dataset = train_dataset.select(range(400))  # Begrenze Trainingsdaten auf 100 Punkte
eval_dataset = eval_dataset.select(range(100))    # Begrenze Evaluationsdaten auf 25 Punkte

# Überprüfen der Begrenzung
print(f"Anzahl der Trainingsdaten: {len(train_dataset)}")
print(f"Anzahl der Evaluationsdaten: {len(eval_dataset)}")


Anzahl der Trainingsdaten: 400
Anzahl der Evaluationsdaten: 100


In [9]:
# 6. Vorverarbeitung

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["question"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [44]:
"""
# class weights balancen
# Extrahiere die Labels für den LabelEncoder aus train_data_formatted
train_labels = [example["label"] for example in train_data_formatted]

# Wandle die Labels in ein NumPy-Array um
train_labels = np.array(train_labels)

# Berechne die Class Weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)

# Wandle die Class Weights in einen PyTorch-Tensor um
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)



class WeightedLoss(nn.Module):
    def __init__(self, weight):
        super(WeightedLoss, self).__init__()
        self.weight = weight

    def forward(self, input, target):
        # Hier die originale Loss-Funktion (z.B. CrossEntropyLoss) verwenden
        criterion = nn.CrossEntropyLoss(weight=self.weight)
        loss = criterion(input, target)
        return loss

# Definiere das Gerät (GPU oder CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Erstelle eine Instanz der WeightedLoss-Klasse
weighted_loss = WeightedLoss(weight=class_weights_tensor.to(device))

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss
        loss = weighted_loss(logits, labels)  # weighted_loss ist deine Custom Loss Function
        return (loss, outputs) if return_outputs else loss

def preprocess_function(examples):
    return tokenizer(examples["question"], padding="max_length", truncation=True, return_tensors="pt")
"""

'\n# class weights balancen\n# Extrahiere die Labels für den LabelEncoder aus train_data_formatted\ntrain_labels = [example["label"] for example in train_data_formatted]\n\n# Wandle die Labels in ein NumPy-Array um\ntrain_labels = np.array(train_labels)\n\n# Berechne die Class Weights\nclass_weights = compute_class_weight(class_weight=\'balanced\', classes=np.unique(train_labels), y=train_labels) \n\n# Wandle die Class Weights in einen PyTorch-Tensor um\nclass_weights_tensor = torch.tensor(class_weights, dtype=torch.float)\n\n\n\nclass WeightedLoss(nn.Module):\n    def __init__(self, weight):\n        super(WeightedLoss, self).__init__()\n        self.weight = weight\n\n    def forward(self, input, target):\n        # Hier die originale Loss-Funktion (z.B. CrossEntropyLoss) verwenden\n        criterion = nn.CrossEntropyLoss(weight=self.weight) \n        loss = criterion(input, target)\n        return loss\n\n# Definiere das Gerät (GPU oder CPU)\ndevice = torch.device("cuda" if torch.cu

In [45]:
from collections import Counter

train_labels = [sample['label'] for sample in train_data_formatted]
eval_labels = [sample['label'] for sample in eval_data_formatted]

print("Trainingsdaten Klassenverteilung:", Counter(train_labels))
print("Evaluationsdaten Klassenverteilung:", Counter(eval_labels))


Trainingsdaten Klassenverteilung: Counter({2: 228, 0: 226, 1: 106})
Evaluationsdaten Klassenverteilung: Counter({2: 54, 0: 53, 1: 33})


In [42]:
# 7.Training start
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_encoder.classes_))  # num_labels anpassen

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # Evaluation nach jeder Epoche
    save_strategy="epoch",  # Speichern nach jeder Epoche
    logging_dir="./logs",
    load_best_model_at_end=True, # bestes Modell am Ende laden,
    metric_for_best_model="accuracy",  # Wähle die Metrik, die das beste Modell definiert
    greater_is_better=True,  # Für Metriken wie Accuracy, Precision, etc
    report_to="none",  # Deaktiviert WandB
    logging_steps=10,  # Logge Fortschritte alle 10 Schritte
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Evaluationsdatensatz hinzufügen
    compute_metrics=compute_metrics,  # Metrikenfunktion hinzufügen

)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: You have to specify either input_ids or inputs_embeds

In [None]:
#8. Evaluation nach Training
from transformers import AutoModelForSequenceClassification

model_path = "./results/checkpoint-XXX" # Ersetze XXX durch den Checkpoint-Schritt
model = AutoModelForSequenceClassification.from_pretrained(model_path)


eval_df = pd.DataFrame(eval_data)
eval_texts = eval_df["question"].tolist()

def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

from datasets import Dataset
eval_dataset = Dataset.from_dict({"text": eval_texts})
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

import numpy as np

predictions = trainer.predict(eval_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Füge die vorhergesagten Labels zum eval_df hinzu
eval_df["predicted_label"] = predicted_labels

# Mappe die numerischen Labels zurück auf die ursprünglichen Labels (falls erforderlich)
# ... (Hier musst du den Code einfügen, um die numerischen Labels auf die ursprünglichen Labels zu mappen) ...

In [None]:
#evaluationsergebnisse
print(eval_df[["question", "predicted_label"]])