Multi Select Questions

In [None]:
<a href="https://colab.research.google.com/github/Michel-p16/PDS-Project/blob/capstone_korbi/distilbert_multi_trainingwQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
# Mount Google Drive so the dataset stored there is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Path to the multi-select dataset on the mounted drive.
dataset_path_multi = '/content/drive/My Drive/ColabData/final_multi_question_data.json'

# Load the dataset. The encoding is passed explicitly so the result does not
# depend on the platform's default locale encoding (JSON is normally UTF-8).
import json
with open(dataset_path_multi, 'r', encoding='utf-8') as file:
    dataset_multi = json.load(file)

!pip install transformers datasets
#1. Daten laden + filtern (MULTI SELECT here)
import json
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

import os

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn

# Keep only the entries whose "type" field equals "MULTI_SELECT".
filtered_dataset_multi = list(
    filter(lambda entry: entry["type"] == "MULTI_SELECT", dataset_multi)
)



#2. Daten formatieren (diesmal inkl Fragen Einbezug für training) -> binäre vektoren durch MultiLabelBinarizer

from sklearn.preprocessing import MultiLabelBinarizer

# Labels direkt in Float32 konvertieren
def convert_to_multi_select_format(data, mlb):
    """Flatten questions and their answers into multi-label training records.

    Args:
        data: Iterable of question dicts. Each needs a "question" entry and an
            "answers" list; every answer may carry "answer_text" and a
            comma-separated "answer_label" string.
        mlb: Fitted binarizer-like object. ``mlb.transform([labels])`` must
            return one binary row per label list.

    Returns:
        A list of dicts with the keys "question", "text" and "labels", where
        "labels" is the float32 multi-hot vector of that answer. Answers that
        carry no non-empty label are skipped.
    """
    formatted_data = []

    for example in data:
        question = example["question"]

        for answer in example["answers"]:
            text = answer.get("answer_text", "")
            raw_labels = answer.get("answer_label") or ""

            # "".split(",") returns [""], which is truthy. Empty fragments are
            # dropped so that answers without labels are actually skipped.
            labels = [part.strip() for part in raw_labels.split(",")]
            labels = [label for label in labels if label]
            if not labels:
                continue

            # First (and only) row of the transform result, stored as float32
            # because the loss used for training works on float targets.
            label_vector = mlb.transform([labels])[0].astype("float32")

            formatted_data.append({
                "question": question,   # question text is kept as model input
                "text": text,           # answer text is kept as model input
                "labels": label_vector,
            })

    return formatted_data



# Collect every distinct label that occurs in the filtered dataset.
all_labels = set()
for example in filtered_dataset_multi:
    for answer in example["answers"]:
        raw_labels = answer.get("answer_label") or ""
        # Strip whitespace and drop empty fragments: "".split(",") returns [""],
        # which would otherwise register "" as a label class of its own and
        # add a meaningless output column.
        all_labels.update(
            label.strip() for label in raw_labels.split(",") if label.strip()
        )

# Initialise the binarizer with a sorted class list so that the column order
# of the label vectors is the same on every run.
multi_label_binarizer = MultiLabelBinarizer(classes=sorted(all_labels))
multi_label_binarizer.fit([list(all_labels)])

# Convert the dataset into question/answer/label-vector records.
formatted_multi_dataset = convert_to_multi_select_format(filtered_dataset_multi, multi_label_binarizer)

# Quick check of the result.
print(f"Anzahl der formatierten Beispiele: {len(formatted_multi_dataset)}")
if formatted_multi_dataset:
    print(f"Beispiel: {formatted_multi_dataset[0]}")
print(f"Alle möglichen Labels: {multi_label_binarizer.classes_}")

# 3. Split the data and wrap both parts in Hugging Face datasets

# 80/20 train/eval split with a fixed seed.
train_data_multi_formatted, eval_data_multi_formatted = train_test_split(
    formatted_multi_dataset,
    test_size=0.2,
    random_state=42,
)

# The record lists go through pandas before they become Dataset objects.
train_frame_multi = pd.DataFrame(train_data_multi_formatted)
eval_frame_multi = pd.DataFrame(eval_data_multi_formatted)
train_dataset_multi = Dataset.from_pandas(train_frame_multi)
eval_dataset_multi = Dataset.from_pandas(eval_frame_multi)

# Report the size of both splits.
print(f"Anzahl der Trainingsdaten: {len(train_dataset_multi)}")
print(f"Anzahl der Evaluationsdaten: {len(eval_dataset_multi)}")
#4. Dataset vorverarbeitung

from transformers import AutoTokenizer
import torch
from datasets import Dataset

# Load the tokenizer of the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Preprocessing function for the multi-label data
def preprocess_function_multi(examples):
    """Tokenize question/answer pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping with "question" and "text" entries; in this file it
            is the batch handed over by ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``tokenizer`` returns for the pairs, padded and truncated to
        a fixed length of 128 tokens.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    questions = examples["question"]
    answer_texts = examples["text"]
    return tokenizer(questions, answer_texts, **encoding_options)

# Apply the tokenizer function to both splits, batch by batch.
train_dataset_multi = train_dataset_multi.map(preprocess_function_multi, batched=True)
eval_dataset_multi = eval_dataset_multi.map(preprocess_function_multi, batched=True)

# Expose only the model inputs and the labels, as torch tensors.
torch_columns = ["input_ids", "attention_mask", "labels"]
for split in (train_dataset_multi, eval_dataset_multi):
    split.set_format(type="torch", columns=torch_columns)

# Inspect the first training example.
first_train_example = train_dataset_multi[0]
print(first_train_example)
print(first_train_example["labels"])  # should be a float tensor
print(first_train_example["labels"].dtype)  # expected: torch.float32


#5. Set the PyTorch-compatible format (repeats the call made above)
for split in (train_dataset_multi, eval_dataset_multi):
    split.set_format(type="torch", columns=torch_columns)

#6. Training

import pickle
from sklearn.metrics import precision_recall_fscore_support
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer
import torch.nn.functional as F
import torch

# Loss function for multi-label classification
def custom_loss(predictions, labels):
    """Return the binary cross-entropy between logits and multi-hot targets.

    Args:
        predictions: Tensor of raw (pre-sigmoid) logits.
        labels: Tensor of multi-hot targets with the same shape as
            ``predictions``; any numeric dtype is accepted.

    Returns:
        Loss tensor as produced by ``F.binary_cross_entropy_with_logits``
        with its default reduction.
    """
    # The loss expects floating-point targets; integer (or differently sized
    # float) label tensors are cast to the dtype of the logits first.
    targets = labels.to(dtype=predictions.dtype)
    return F.binary_cross_entropy_with_logits(predictions, targets)

# Trainer subclass that plugs in the custom multi-label loss
class CustomTrainer(Trainer):
    """Trainer whose loss is computed by ``custom_loss`` on the model logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the loss for one batch.

        Args:
            model: Model that is called with the remaining batch entries.
            inputs: Batch dict; its "labels" entry is removed in place before
                the model is called.
            return_outputs: When true, the model outputs are returned as well.
            **kwargs: Accepted and ignored.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple if ``return_outputs``.
        """
        targets = inputs.pop("labels")  # the model is called without labels
        model_outputs = model(**inputs)
        batch_loss = custom_loss(model_outputs.logits, targets)
        if return_outputs:
            return (batch_loss, model_outputs)
        return batch_loss

# Initialise the model in multi-label mode with one output per label class.
label_classes = multi_label_binarizer.classes_
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_classes),
    problem_type="multi_label_classification",  # required for multi-label
)

# Index -> label and label -> index lookups for decoding predictions later.
label_mapping = dict(enumerate(label_classes))
inverse_label_mapping = {label: idx for idx, label in label_mapping.items()}

# Persist the index -> label mapping next to the model on Drive.
label_mapping_path = "/content/drive/My Drive/label_mapping_multi_wQ.pkl"
with open(label_mapping_path, "wb") as mapping_file:
    pickle.dump(label_mapping, mapping_file)

print("Label-Mapping erfolgreich gespeichert.")

# Metrics adapted to multi-label classification
def compute_metrics(pred):
    """Compute exact-match accuracy and weighted precision/recall/F1.

    Args:
        pred: Object with ``predictions`` (logits) and ``label_ids`` (true
            multi-hot labels).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
        "accuracy" is the share of examples whose complete label vector is
        predicted correctly.
    """
    true_labels = pred.label_ids
    logits = torch.tensor(pred.predictions)
    # Sigmoid turns logits into probabilities; 0.5 is the decision threshold.
    predicted = (torch.sigmoid(logits) > 0.5).numpy()

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predicted, average="weighted"
    )
    # An example only counts as correct if every label matches.
    exact_match = (predicted == true_labels).all(axis=1).mean()

    return {
        "accuracy": exact_match,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Training configuration
# Optional optimizer settings that are currently left at their defaults:
#   adam_beta1=0.9             lower if the optimizer reacts slowly
#   adam_beta2=0.999           lower if there is little learning progress
#   adam_epsilon=1e-6          raise if training is unstable
#   label_smoothing_factor=0.1 for better generalisation
training_config = {
    "output_dir": "./results",
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "num_train_epochs": 20,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,  # regularisation; candidate for a later increase to 0.05
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    "evaluation_strategy": "epoch",  # evaluate after every epoch
    "save_strategy": "epoch",  # checkpoint after every epoch
    "logging_dir": "./logs",
    "load_best_model_at_end": True,  # reload the best checkpoint when done
    "metric_for_best_model": "f1",  # "best" is judged by F1
    "greater_is_better": True,  # a higher F1 is better
    "report_to": "none",  # disabled because there is no access to the repo
    "logging_steps": 10,  # log every 10 steps
}
training_args = TrainingArguments(**training_config)

# Assemble the trainer from model, configuration, data and metrics.
trainer_components = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset_multi,
    "eval_dataset": eval_dataset_multi,
    "compute_metrics": compute_metrics,
}
trainer = CustomTrainer(**trainer_components)

# Run the training.
trainer.train()

# Mount Drive and store model and tokenizer in the same directory.
from google.colab import drive
drive.mount('/content/drive')

model_output_dir = "/content/drive/My Drive/multi_distilbert_wQ"
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

print("Das Modell wurde erfolgreich gespeichert.")


#7. manueller test

import torch
import random
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the saved model and tokenizer.
model_path = "/content/drive/My Drive/multi_distilbert_wQ"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Switch to evaluation mode once; nothing inside the loop below changes it.
model.eval()

# Load the index -> label mapping.
# NOTE: pickle.load can execute code contained in the file; only load mapping
# files that were written by this notebook.
label_mapping_path = "/content/drive/My Drive/label_mapping_multi_wQ.pkl"
with open(label_mapping_path, "rb") as file:
    label_mapping = pickle.load(file)

# Collect the distinct questions of the dataset.
all_questions = list(set(example["question"] for example in dataset_multi))

# Pick up to 5 random questions. random.sample raises ValueError when asked
# for more items than exist, so the count is capped at the number available.
selected_questions = random.sample(all_questions, min(5, len(all_questions)))

# Probability cut-off above which a label counts as predicted.
# NOTE(review): compute_metrics thresholds at 0.5 while this manual test uses
# 0.25 - confirm that the difference is intended.
threshold = 0.25

# Ask for an answer to every selected question and show the prediction.
for question_number, question in enumerate(selected_questions, start=1):
    print(f"Frage {question_number}: {question}")

    # Answer typed in by the user.
    user_answer = input("Bitte geben Sie eine Antwort ein: ")

    # Tokenize the question/answer pair the same way as during training.
    inputs = tokenizer(
        question,
        user_answer,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128
    )

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # raw model scores

    # squeeze(0) removes only the leading batch dimension, so probs stays a
    # list even if the model has a single label (a bare squeeze() would
    # collapse that case to a scalar and break the enumeration below).
    probs = torch.sigmoid(logits).squeeze(0).tolist()

    predicted_labels_indices = [
        label_idx for label_idx, prob in enumerate(probs) if prob > threshold
    ]

    # Map the predicted indices back to label names.
    predicted_labels = [label_mapping[label_idx] for label_idx in predicted_labels_indices]

    # Show the prediction.
    print("Predicted weighted vector (probabilities for each label):")
    print(probs)

    print("Predicted binary vector (thresholded):")
    binary_vector = [1 if prob > threshold else 0 for prob in probs]
    print(binary_vector)

    print("Predicted Labels:")
    print(predicted_labels if predicted_labels else "Keine Labels vorhergesagt")

    print("-" * 60)  # separator line between questions

!pip install tensorboard
# TensorBoard-Extension laden
%load_ext tensorboard

# TensorBoard starten und Logs visualisieren
%tensorboard --logdir ./logs

import os


def _print_dir_listing(label, path):
    """Print the entries of *path* after *label*, or a notice if it is missing.

    os.listdir raises FileNotFoundError for a missing directory; the run
    directories below are hard-coded to one specific run, so a missing
    directory is reported instead of aborting the cell.
    """
    if os.path.isdir(path):
        print(label, os.listdir(path))
    else:
        print(label, f"Verzeichnis nicht gefunden: {path}")


# Top-level logging directory.
log_dir = "./logs"
_print_dir_listing("Inhalt des Log-Verzeichnisses:", log_dir)

# "fit" directory inside the logging directory.
fit_dir = "./logs/fit"
_print_dir_listing("Inhalt des fit-Ordners:", fit_dir)

# Directory of one specific run (hard-coded name).
fit_subdir = "./logs/fit/20250130-114034"
_print_dir_listing("Inhalt des Unterordners:", fit_subdir)

# "train" directory of that run.
train_log_dir = "./logs/fit/20250130-114034/train"
_print_dir_listing("Inhalt des train-Ordners:", train_log_dir)


%load_ext tensorboard
%tensorboard --logdir ./logs/fit/20250130-114034/train
