Multi Select Questions

In [14]:
# Laden des neuen Datensatzes
import json
with open('final_multi_question_data.json', 'r') as file:
    data = json.load(file)

In [11]:
import json
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#!pip install datasets
#!pip install transformers
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

import os

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn

In [32]:
filtered_dataset_multi = [example for example in data if example["type"] == "MULTI_SELECT"]
filtered_dataset_multi

[{'question': 'What are your product interests?',
  'type': 'MULTI_SELECT',
  'answers': [{'answer_text': "I'm interested in improving data quality and generating business cards.",
    'answer_label': 'DataQuality, BusinessCards',
    'timestamp': '2025-01-25T21:31:30.976867'},
   {'answer_text': 'My priorities are data enrichment and visit reports.',
    'answer_label': 'DataEnrichment, VisitReport',
    'timestamp': '2025-01-25T21:31:30.976977'},
   {'answer_text': 'I need data cleansing and improved data quality.',
    'answer_label': 'Data Cleansing, DataQuality',
    'timestamp': '2025-01-25T21:31:30.977001'},
   {'answer_text': "I'm focused on business cards and data cleansing.",
    'answer_label': 'BusinessCards, Data Cleansing',
    'timestamp': '2025-01-25T21:31:30.977012'},
   {'answer_text': 'My interest lies in visit reports and data enrichment.',
    'answer_label': 'VisitReport, DataEnrichment',
    'timestamp': '2025-01-25T21:31:30.977022'},
   {'answer_text': "I'm expl

In [15]:
""" rows = []
for entry in data:
    #question_id = entry["question_id"]
    question = entry["question"]
    question_type = entry["type"]
    for answer in entry["answers"]:
        rows.append({
            #"question_id": question_id,
            "question": question,
            "type": question_type,
            "answer_text": answer["answer_text"],
            "answer_label": answer["answer_label"],
            "timestamp": answer["timestamp"]
        })

filtered_dataset_multi = pd.DataFrame(rows) """

In [33]:
filtered_dataset_multi

[{'question': 'What are your product interests?',
  'type': 'MULTI_SELECT',
  'answers': [{'answer_text': "I'm interested in improving data quality and generating business cards.",
    'answer_label': 'DataQuality, BusinessCards',
    'timestamp': '2025-01-25T21:31:30.976867'},
   {'answer_text': 'My priorities are data enrichment and visit reports.',
    'answer_label': 'DataEnrichment, VisitReport',
    'timestamp': '2025-01-25T21:31:30.976977'},
   {'answer_text': 'I need data cleansing and improved data quality.',
    'answer_label': 'Data Cleansing, DataQuality',
    'timestamp': '2025-01-25T21:31:30.977001'},
   {'answer_text': "I'm focused on business cards and data cleansing.",
    'answer_label': 'BusinessCards, Data Cleansing',
    'timestamp': '2025-01-25T21:31:30.977012'},
   {'answer_text': 'My interest lies in visit reports and data enrichment.',
    'answer_label': 'VisitReport, DataEnrichment',
    'timestamp': '2025-01-25T21:31:30.977022'},
   {'answer_text': "I'm expl

In [47]:
#2. Daten formatieren (diesmal inkl Fragen Einbezug für training) -> binäre vektoren durch MultiLabelBinarizer

from sklearn.preprocessing import MultiLabelBinarizer

# Labels direkt in Float32 konvertieren
def convert_to_multi_select_format(data, mlb):
    formatted_data = []

    for example in data:
        question = example["question"]
        answers = example["answers"]

        for answer in answers:
            text = answer.get("answer_text", "")
            labels = answer.get("answer_label", "").split(",")

            # Labels bereinigen
            labels = [label.strip() for label in labels]
            #print(labels)

            if labels:  # Falls Labels vorhanden sind
                label_vector = mlb.transform([labels])[0]  # Binärvektor
                label_vector = label_vector.astype(float)  # **Hier direkt in float konvertieren!**

                formatted_data.append({
                    "question": question,  # Frage bleibt erhalten
                    "text": text,  # Antwort bleibt erhalten
                    "labels": label_vector  # Labels nun als float
                })

    return formatted_data

In [48]:
all_labels = set()
for example in filtered_dataset_multi:
    for answer in example["answers"]:
        labels = answer.get("answer_label", "").split(",")
        labels = [label.strip() for label in labels]  # Führende/nachfolgende Leerzeichen entfernen
        all_labels.update(labels)

all_labels

{"'Angelina Haug'",
 "'Domiki Stein'",
 "'Erik Schneider'",
 "'Jens Roschmann'",
 "'Jessica Hanke'",
 "'Joachim Wagner'",
 "'Johannes Wagner'",
 "'Marisa Peng'",
 "'Oliver Eibel'",
 "'Sandro Kalter'",
 "'Sean Kennin'",
 "'Stephan Maier'",
 "'Tim Persson'",
 '1 week',
 '100',
 '2 weeks',
 '200',
 '234',
 '256',
 '3 weeks',
 '300',
 'AKW100',
 'AX100',
 'Automotive radar target simulation',
 'BusinessCards',
 'Capture trade fair contacts',
 'Clean up CRM',
 'Competitor',
 'Data Cleansing',
 'DataEnrichment',
 'DataQuality',
 'Display port debugging and compliance',
 'Double-Pulse Testing',
 'Email',
 'Existing customer',
 'Extract data from emails',
 'High-speed interconnect testing',
 'Improve CRM data quality',
 'JS EcoLine',
 'JTS',
 'MY-SYSTEM',
 'New customer',
 'No action',
 'Noise figure measurements',
 'Notion',
 'Phone',
 'Press',
 'Prospect',
 'Scan business cards',
 'Schedule a Visit',
 'Supplier',
 'VisitReport',
 'media'}

In [49]:
# MultiLabelBinarizer initialisieren
multi_label_binarizer = MultiLabelBinarizer(classes=sorted(list(all_labels)))  # Sortiert für Konsistenz
multi_label_binarizer.fit([list(all_labels)])

In [50]:
# Daten formatieren
formatted_multi_dataset = convert_to_multi_select_format(filtered_dataset_multi, multi_label_binarizer)
# Überprüfung der Ergebnisse
print(f"Anzahl der formatierten Beispiele: {len(formatted_multi_dataset)}")
if formatted_multi_dataset:
    print(f"Beispiel: {formatted_multi_dataset[0]}")
print(f"Alle möglichen Labels: {multi_label_binarizer.classes_}")

Anzahl der formatierten Beispiele: 745
Beispiel: {'question': 'What are your product interests?', 'text': "I'm interested in improving data quality and generating business cards.", 'labels': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])}
Alle möglichen Labels: ["'Angelina Haug'" "'Domiki Stein'" "'Erik Schneider'" "'Jens Roschmann'"
 "'Jessica Hanke'" "'Joachim Wagner'" "'Johannes Wagner'" "'Marisa Peng'"
 "'Oliver Eibel'" "'Sandro Kalter'" "'Sean Kennin'" "'Stephan Maier'"
 "'Tim Persson'" '1 week' '100' '2 weeks' '200' '234' '256' '3 weeks'
 '300' 'AKW100' 'AX100' 'Automotive radar target simulation'
 'BusinessCards' 'Capture trade fair contacts' 'Clean up CRM' 'Competitor'
 'Data Cleansing' 'DataEnrichment' 'DataQuality'
 'Display port debugging and compliance' 'Double-Pulse Testing' 'Email'

In [51]:
#3. Daten splitten + formatieren in Hugging Face Dataset

#Split 80/20
train_data_multi_formatted, eval_data_multi_formatted = train_test_split(formatted_multi_dataset, test_size=0.2, random_state=42)

# Multi-Label-Trainings- und Evaluations-Dataset erstellen
train_dataset_multi = Dataset.from_pandas(pd.DataFrame(train_data_multi_formatted))
eval_dataset_multi = Dataset.from_pandas(pd.DataFrame(eval_data_multi_formatted))

# Überprüfung der Datenmengen
print(f"Anzahl der Trainingsdaten: {len(train_dataset_multi)}")
print(f"Anzahl der Evaluationsdaten: {len(eval_dataset_multi)}")

Anzahl der Trainingsdaten: 596
Anzahl der Evaluationsdaten: 149


In [54]:
#4. Dataset vorverarbeitung

from transformers import AutoTokenizer
import torch
from datasets import Dataset

model_name = 'deepset/roberta-base-squad2'

# Tokenizer initialisieren
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Vorverarbeitungsfunktion für Multi-Label-Daten
def preprocess_function_multi(examples):
    return tokenizer(
        examples["question"],  # Frage
        examples["text"],      # Antwort
        padding="max_length",
        truncation=True,
        max_length=128  # Falls nötig, auf 128 erhöht
    )

# Tokenizer-Funktion an
train_dataset_multi = train_dataset_multi.map(preprocess_function_multi, batched=True)
eval_dataset_multi = eval_dataset_multi.map(preprocess_function_multi, batched=True)

# PyTorch-kompatibles Format setzen
train_dataset_multi.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset_multi.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Struktur überprüfen
print(train_dataset_multi[0])
print(train_dataset_multi[0]["labels"])  # Sollte ein Float-Tensor sein
print(train_dataset_multi[0]["labels"].dtype)  # Erwartet: torch.float32

Map:   0%|          | 0/596 [00:00<?, ? examples/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

{'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'input_ids': tensor([    0,  2264,    32,   110,  1152,  3168,   116,     2,     2,   100,
          240,    10,  3944,    13, 22738,   154,   127,  2111,   414,     4,
            2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1, 

In [55]:
#5. PyTorch-kompatibles Format setzen
train_dataset_multi.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset_multi.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [56]:
#6. Training

import pickle
from sklearn.metrics import precision_recall_fscore_support
from transformers import  AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch.nn.functional as F
import torch

# Loss-Funktion für Multi-Label-Klassifikation
def custom_loss(predictions, labels):
    return F.binary_cross_entropy_with_logits(predictions, labels)

# Eigene Trainer-Klasse mit angepasster Loss-Funktion
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")  # Entferne Labels aus Inputs
        outputs = model(**inputs)      # Modell-Vorhersagen
        logits = outputs.logits        # Logits extrahieren
        loss = custom_loss(logits, labels)  # BCEWithLogitsLoss berechnen
        return (loss, outputs) if return_outputs else loss

# Modell initialisieren (Multi-Label-Modus)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(multi_label_binarizer.classes_),  # Anzahl der Labels
    problem_type="multi_label_classification"  # WICHTIG für Multi-Label
)

# Speichern des Label-Mappings für späteres Decoding
label_mapping = {idx: label for idx, label in enumerate(multi_label_binarizer.classes_)}
inverse_label_mapping = {label: idx for idx, label in enumerate(multi_label_binarizer.classes_)}

label_mapping_path = "/content/drive/My Drive/CapStone_models/label_mapping_multi_wQ.pkl"
with open(label_mapping_path, "wb") as file:
    pickle.dump(label_mapping, file)

print("Label-Mapping erfolgreich gespeichert.")

# Anpassung der Metriken (für Multi-Label Klassifikation)
def compute_metrics(pred):
    labels = pred.label_ids  # Wahre Labels
    preds = torch.sigmoid(torch.tensor(pred.predictions)) > 0.5  # Wahrscheinlichkeiten -> Binärwerte

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds.numpy(), average="weighted"
    )
    acc = (preds.numpy() == labels).all(axis=1).mean()  # Beispielgenauigkeit
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# TrainingArgs
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    learning_rate=5e-5,
    weight_decay=0.01,  # Regularisierung -> erhöhen danach .05
    #adam_beta1=0.9, #senken falls langsam reaktion
    #adam_beta2=0.999, #senekn bei wenig lernprogress
    #adam_epsilon=1e-6,  # erhöhen falls instabil
    #label_smoothing_factor=0.1,  #bessere Generalisierung
    evaluation_strategy="epoch",  # eval pro epoch
    save_strategy="epoch",  # save pro epoch
    logging_dir="./logs",
    load_best_model_at_end=True,  # save best
    metric_for_best_model="f1",  # opt F1
    greater_is_better=True,  # higher better F1
    report_to="none",  # ausgeschaltet, weil kein Zugriff zu repo
    logging_steps=10,  # Alle 10 Schritte loggen
)

# Trainer-Objekt erstellen
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_multi,
    eval_dataset=eval_dataset_multi,
    compute_metrics=compute_metrics,
)

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label-Mapping erfolgreich gespeichert.


In [57]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1611,0.150821,0.0,0.0,0.0,0.0
2,0.119,0.113145,0.013423,0.074295,0.123268,0.06689
3,0.1005,0.093368,0.100671,0.165969,0.17629,0.177258
4,0.0851,0.080282,0.134228,0.295283,0.347241,0.287625
5,0.0798,0.075441,0.161074,0.361613,0.371295,0.377926
6,0.0778,0.069628,0.194631,0.394066,0.442596,0.404682
7,0.0617,0.064154,0.228188,0.511963,0.555161,0.518395
8,0.0646,0.059953,0.308725,0.559898,0.585561,0.571906
9,0.0601,0.054432,0.281879,0.516306,0.624882,0.481605
10,0.0495,0.050268,0.42953,0.655942,0.712402,0.64214


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

TrainOutput(global_step=2980, training_loss=0.06606806441641494, metrics={'train_runtime': 1206.9165, 'train_samples_per_second': 9.876, 'train_steps_per_second': 2.469, 'total_flos': 784429977661440.0, 'train_loss': 0.06606806441641494, 'epoch': 20.0})

In [70]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [73]:
model.save_pretrained("/content/drive/My Drive/CapStone_models/multi_roberta")
tokenizer.save_pretrained("/content/drive/My Drive/CapStone_models/multi_roberta")

print("Das Modell wurde erfolgreich gespeichert.")

Das Modell wurde erfolgreich gespeichert.


In [77]:
#import torch
import random
import pickle
#from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [78]:
# Modell & Tokenizer laden
model_path = "/content/drive/My Drive/CapStone_models/multi_roberta"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Label Mapping laden
label_mapping_path = "/content/drive/My Drive/CapStone_models/label_mapping_multi_wQ.pkl"
with open(label_mapping_path, "rb") as file:
    label_mapping = pickle.load(file)

# Fragen aus dem Dataset extrahieren
all_questions = list(set(example["question"] for example in dataset_multi))

# 5 Zufällige Frage auswählen
selected_questions = random.sample(all_questions, 5)

# Evaluation für jede Frage
for idx, question in enumerate(selected_questions, start=1):
    print(f"Frage {idx}: {question}")

    # Manuelle Eingabe der Antwort
    user_answer = input("Bitte geben Sie eine Antwort ein: ")

    # Eingabe tokenisieren (Frage + Antwort)
    inputs = tokenizer(
        question,  # Frage
        user_answer,  # Antwort
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128
    )

    # Modellvorhersage (ohne Gradientenberechnung)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # Rohwerte des Modells

    # Wahrscheinlichkeiten berechnen
    probs = torch.sigmoid(logits).squeeze().tolist()

    # Schwellenwert setzen (z.B. 0.5)
    threshold = 0.25
    predicted_labels_indices = [idx for idx, prob in enumerate(probs) if prob > threshold]

    # Vorhergesagte Labels mappen
    predicted_labels = [label_mapping[idx] for idx in predicted_labels_indices]

    # Ausgabe der Vorhersagen
    print("Predicted weighted vector (probabilities for each label):")
    print(probs)

    print("Predicted binary vector (thresholded):")
    binary_vector = [1 if prob > threshold else 0 for prob in probs]
    print(binary_vector)

    print("Predicted Labels:")
    print(predicted_labels if predicted_labels else "Keine Labels vorhergesagt")

    print("-" * 60)  # Trennlinie für bessere Übersicht

Frage 1: What follow-up is planned?
Bitte geben Sie eine Antwort ein: calling
Predicted weighted vector (probabilities for each label):
[0.0027646347880363464, 0.003016249742358923, 0.0028603042010217905, 0.0033549643121659756, 0.0031317363027483225, 0.0040959883481264114, 0.004378580953925848, 0.0029001489747315645, 0.004173404071480036, 0.003602740354835987, 0.003498687408864498, 0.0030648282263427973, 0.0030200104229152203, 0.007016663905233145, 0.0024000201374292374, 0.0038833804428577423, 0.004095747135579586, 0.0047347666695714, 0.0036954085808247328, 0.007274362724274397, 0.004682971630245447, 0.004003577865660191, 0.004632485564798117, 0.002078949473798275, 0.005900014191865921, 0.003712946083396673, 0.00705999368801713, 0.005040167365223169, 0.0063518742099404335, 0.006251918151974678, 0.010179026983678341, 0.003208704525604844, 0.0060785445384681225, 0.042951494455337524, 0.006216627545654774, 0.002083328552544117, 0.00432062242180109, 0.006760545540601015, 0.0027503881137818

In [18]:
























#7. manueller test

import torch
import random
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification



!pip install tensorboard
# TensorBoard-Extension laden
%load_ext tensorboard

# TensorBoard starten und Logs visualisieren
%tensorboard --logdir ./logs

import os

log_dir = "./logs"
print("Inhalt des Log-Verzeichnisses:", os.listdir(log_dir))

import os

fit_dir = "./logs/fit"
print("Inhalt des fit-Ordners:", os.listdir(fit_dir))

fit_subdir = "./logs/fit/20250130-114034"
print("Inhalt des Unterordners:", os.listdir(fit_subdir))


train_log_dir = "./logs/fit/20250130-114034/train"
print("Inhalt des train-Ordners:", os.listdir(train_log_dir))


%load_ext tensorboard
%tensorboard --logdir ./logs/fit/20250130-114034/train


TypeError: string indices must be integers, not 'str'