<a href="https://colab.research.google.com/github/Michel-p16/PDS-Project/blob/capstone_korbi/final_train_single_select_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#connect drive
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers datasets
#1. Daten laden + filtern (SINGLE SELECT here)
import json
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import classification_report

import os

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn

# Laden des single Datensatzes
dataset_path_single = '/content/drive/My Drive/ColabData/final_single_question_data.json'
with open(dataset_path_single, 'r') as file:
    dataset_single = json.load(file)

# Filtere das Dataset nach Fragen mit dem Typ "SINGLE_SELECT"
filtered_dataset = [example for example in dataset_single if example["type"] == "SINGLE_SELECT"]



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#2. Daten formatieren

def convert_to_distilbert_format(data):
    formatted_data = []
    all_labels = []  # Labels für Single-Select

    for example in data:
        question = example["question"]
        answers = example["answers"]

        for answer in answers:
            text = answer["answer_text"]
            label = answer.get("answer_label")

            # Kontext aus der Antwort extrahieren (da er nicht separat vorhanden ist)
            context = text

            if label is not None:
                all_labels.append(label)
            formatted_data.append({
                "question": question,
                "context": context,  # Kontext ist hier die Antwort selbst
                "answers": {"text": [text], "answer_start": [0]}, # answer_start ist 0, da Kontext = Antwort
                "label": label
            })

    # Label-Encodierung für Single-Select
    label_encoder = LabelEncoder()
    label_encoder.fit(list(set(all_labels)))
    for example in formatted_data:
        if example["label"] is not None:
            example["label"] = label_encoder.transform([example["label"]])[0]

    return formatted_data, label_encoder  # Gib formatted_data und label_encoder zurück


# Hier den Funktionsaufruf außerhalb der Funktion platzieren:
formatted_dataset, label_encoder = convert_to_distilbert_format(filtered_dataset)

In [5]:
#3. Daten splitten 80% Training 20% Evaluation

train_data_formatted, eval_data_formatted = train_test_split(formatted_dataset, test_size=0.2, random_state=42)

print(f"Trainingsdaten: {len(train_data_formatted)}")
print(f"Evaluationsdaten: {len(eval_data_formatted)}")

Trainingsdaten: 5544
Evaluationsdaten: 1386


In [6]:
import random

# Anzahl der Beispiele, die ausgegeben werden sollen
num_samples = 10

# Zufällige Beispiele aus den Trainingsdaten auswählen
random_indices_train = random.sample(range(len(train_data_formatted)), num_samples)

print("Formatierte Trainingsdaten:")
for index in random_indices_train:
    example = train_data_formatted[index]
    print(f"Beispiel {index + 1}:")
    print(f"  Frage: {example['question']}")
    print(f"  Kontext: {example['context']}")
    print(f"  Antworten: {example['answers']}")
    print(f"  Label: {example['label']}")  # Überprüfen Sie hier den Label-Typ
    print("-" * 20)

# Zufällige Beispiele aus den Evaluationsdaten auswählen
random_indices_eval = random.sample(range(len(eval_data_formatted)), num_samples)

print("\nFormatierte Evaluationsdaten:")
for index in random_indices_eval:
    example = eval_data_formatted[index]
    print(f"Beispiel {index + 1}:")
    print(f"  Frage: {example['question']}")
    print(f"  Kontext: {example['context']}")
    print(f"  Antworten: {example['answers']}")
    print(f"  Label: {example['label']}")  # Überprüfen Sie hier den Label-Typ
    print("-" * 20)

Formatierte Trainingsdaten:
Beispiel 3502:
  Frage: What's the average size of your trade fair team?
  Kontext: We have a large team, over 40 strong, dedicated to our trade fair activities.
  Antworten: {'text': ['We have a large team, over 40 strong, dedicated to our trade fair activities.'], 'answer_start': [0]}
  Label: 57
--------------------
Beispiel 2557:
  Frage: What industry are you in?
  Kontext: I'm involved in the design and implementation of Ethernet networks.
  Antworten: {'text': ["I'm involved in the design and implementation of Ethernet networks."], 'answer_start': [0]}
  Label: 35
--------------------
Beispiel 2798:
  Frage: What is the specific customer group you're targeting?
  Kontext: We're targeting wholesalers and distributors who are technologically advanced and data-driven.
  Antworten: {'text': ["We're targeting wholesalers and distributors who are technologically advanced and data-driven."], 'answer_start': [0]}
  Label: 54
--------------------
Beispiel 505:

In [7]:
# 4. Labels vorbereiten
# Extrahiere die Labels für den LabelEncoder
train_labels = [example["label"] for example in train_data_formatted if example["label"] is not None]

# Erstellen und Anpassen des LabelEncoders
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)

# Funktion zum Transformieren der Labels
def transform_label(example):
    if example["label"] is not None:
        example["label"] = label_encoder.transform([example["label"]])[0]
    return example

# Transformiere die Labels in den formatierten Daten
train_data_formatted = [transform_label(example) for example in train_data_formatted]

In [8]:
# 5. Datasets erstellen
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data_formatted))
eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data_formatted))


# Überprüfen Datenmenge
print(f"Anzahl der Trainingsdaten: {len(train_dataset)}")
print(f"Anzahl der Evaluationsdaten: {len(eval_dataset)}")


Anzahl der Trainingsdaten: 5544
Anzahl der Evaluationsdaten: 1386


In [9]:
# 6. Vorverarbeitung

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(
        examples["question"], examples["context"],
        padding=True, truncation=True, max_length=36
    )


train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1386 [00:00<?, ? examples/s]

In [10]:
#test ob inbalanced answers/labels
from collections import Counter

train_labels = [sample['label'] for sample in train_data_formatted]
eval_labels = [sample['label'] for sample in eval_data_formatted]

print("Trainingsdaten Klassenverteilung:", Counter(train_labels))
print("Evaluationsdaten Klassenverteilung:", Counter(eval_labels))


Trainingsdaten Klassenverteilung: Counter({37: 195, 55: 194, 13: 137, 27: 115, 11: 106, 53: 104, 40: 102, 20: 101, 47: 101, 43: 100, 31: 100, 42: 99, 32: 99, 25: 98, 17: 98, 15: 98, 22: 97, 51: 96, 38: 96, 46: 95, 33: 95, 34: 95, 35: 94, 50: 94, 29: 92, 23: 91, 3: 91, 5: 91, 16: 91, 44: 91, 12: 91, 30: 90, 14: 90, 56: 90, 8: 89, 10: 89, 36: 89, 0: 89, 52: 89, 57: 88, 19: 88, 21: 88, 49: 88, 26: 87, 4: 87, 18: 86, 41: 86, 54: 85, 39: 84, 28: 84, 7: 82, 45: 82, 48: 82, 9: 82, 24: 79, 1: 76, 6: 75, 2: 73})
Evaluationsdaten Klassenverteilung: Counter({55: 42, 37: 41, 21: 34, 11: 33, 6: 32, 38: 31, 52: 31, 50: 29, 36: 29, 23: 29, 7: 29, 29: 28, 35: 28, 24: 27, 18: 27, 39: 27, 42: 27, 32: 26, 25: 26, 2: 26, 12: 26, 26: 25, 10: 25, 44: 25, 28: 25, 22: 25, 43: 24, 31: 24, 13: 24, 3: 24, 14: 24, 19: 23, 41: 23, 57: 23, 15: 23, 51: 23, 40: 22, 1: 22, 47: 22, 46: 21, 17: 21, 9: 20, 8: 20, 49: 19, 20: 19, 27: 18, 30: 18, 34: 18, 45: 18, 54: 17, 0: 17, 16: 16, 48: 16, 33: 16, 56: 15, 4: 15, 53: 14,

In [33]:
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import matplotlib.pyplot as plt
import os


# Modell laden
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_encoder.classes_)
)

# Erweiterte compute_metrics-Funktion
metrics_log = []  #save list

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    # save metrics, losses im callback
    metrics_log.append({
        'epoch': len(metrics_log) + 1,
        'training_loss': None,  # Wird aus dem Callback gefüllt
        'validation_loss': None,  # Wird aus dem Callback gefüllt
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    })

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }



# TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    learning_rate=1.5e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    logging_steps=50,  # Fortschritt loggen
)

# Early Stopping Callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=1,  # stop at 1 epoch worse
    early_stopping_threshold=0.000  #  min same
)

#loss add for csv
from transformers import TrainerCallback

class LossCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        """Speichert Training & Validation Loss"""
        if logs:
            if "loss" in logs and len(metrics_log) > 0:
                metrics_log[-1]["training_loss"] = logs["loss"]

            if "eval_loss" in logs and len(metrics_log) > 0:
                metrics_log[-1]["validation_loss"] = logs["eval_loss"]


# Trainer-Objekt
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping, LossCallback()],  # Loss Callback
)



# 1. Training starten
trainer.train()

# 2. Metriken speichern
metrics_df = pd.DataFrame(metrics_log)
metrics_csv_path = "/content/drive/My Drive/distilSingle_training_metrics.csv"
metrics_df.to_csv(metrics_csv_path, index=False)
print(f"Metriken wurden erfolgreich unter {metrics_csv_path} gespeichert.")

# 4. Modell und Tokenizer speichern
save_path = "/content/drive/My Drive/single_distilbert_final"
os.makedirs(save_path, exist_ok=True)  # Erstelle den Pfad, falls er nicht existiert
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Das Modell wurde erfolgreich unter {save_path} gespeichert.")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.3494,1.050884,0.914863,0.896305,0.891099,0.914863
2,0.232,0.148654,0.971861,0.971786,0.974231,0.971861
3,0.0653,0.075071,0.981241,0.981031,0.981538,0.981241
4,0.0299,0.085079,0.977633,0.977606,0.979062,0.977633


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Metriken wurden erfolgreich unter /content/drive/My Drive/distilSingle_training_metrics.csv gespeichert.
Das Modell wurde erfolgreich unter /content/drive/My Drive/single_distilbert_final gespeichert.


In [24]:
# Nachträgliches Speichern der CSV-Datei
metrics_df = pd.DataFrame(metrics_log).fillna(0)  # Sicherstellen, dass keine NaN-Werte vorhanden sind
metrics_csv_path = "/content/drive/My Drive/valuesdistilSingle_training_metrics.csv"
metrics_df.to_csv(metrics_csv_path, index=False, encoding='utf-8')
print(f"CSV-Datei erfolgreich nachträglich gespeichert: {metrics_csv_path}")


CSV-Datei erfolgreich nachträglich gespeichert: /content/drive/My Drive/valuesdistilSingle_training_metrics.csv


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 1. Confusion-Matrix erstellen
eval_preds = trainer.predict(eval_dataset)
true_labels = eval_preds.label_ids
pred_labels = eval_preds.predictions.argmax(-1)

# 2. Confusion-Matrix berechnen
cm = confusion_matrix(true_labels, pred_labels)

# 3. Optionen zur Verbesserung der Confusion-Matrix
# Normierte Confusion-Matrix
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# Kürze die Labels, falls notwendig
short_labels = [label[:10] for label in label_encoder.classes_]  # Kürze auf 10 Zeichen

# Speichern der vollständigen Metrik
metrics_df = pd.DataFrame({
    "True Label": true_labels,
    "Predicted Label": pred_labels
})
metrics_df_path = "/content/drive/My Drive/distilSingle_confusion_metrics.csv"
metrics_df.to_csv(metrics_df_path, index=False)
print(f"Metriken wurden unter {metrics_df_path} gespeichert.")

# 4. Verschiedene Darstellungen der Confusion-Matrix
# Original Confusion-Matrix
fig1, ax1 = plt.subplots(figsize=(15, 15))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(cmap="Blues", ax=ax1, xticks_rotation="vertical")
plt.title("Confusion Matrix (Original)")
plt.savefig("/content/drive/My Drive/distilSingle_confusion_matrix_original.png", bbox_inches="tight")

# Normierte Confusion-Matrix
fig2, ax2 = plt.subplots(figsize=(15, 15))
disp_normalized = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=label_encoder.classes_)
disp_normalized.plot(cmap="Blues", ax=ax2, xticks_rotation="vertical")
plt.title("Confusion Matrix (Normalized)")
plt.savefig("/content/drive/My Drive/distilSingle_confusion_matrix_normalized.png", bbox_inches="tight")

# Fehlklassifikationen anzeigen (ohne Diagonale)
cm_off_diagonal = cm.copy()
np.fill_diagonal(cm_off_diagonal, 0)  # Setze die Diagonale auf 0
fig3, ax3 = plt.subplots(figsize=(15, 15))
disp_off_diag = ConfusionMatrixDisplay(confusion_matrix=cm_off_diagonal, display_labels=label_encoder.classes_)
disp_off_diag.plot(cmap="Oranges", ax=ax3, xticks_rotation="vertical")
plt.title("Confusion Matrix (Fehlklassifikationen)")
plt.savefig("/content/drive/My Drive/distilSingle_confusion_matrix_off_diag.png", bbox_inches="tight")

# Reduzierte Confusion-Matrix (häufigste Labels)
top_classes = [0, 1, 2, 3, 4]  # Wähle die Top 5 Klassen
cm_reduced = cm[np.ix_(top_classes, top_classes)]
labels_reduced = [label_encoder.classes_[i] for i in top_classes]
fig4, ax4 = plt.subplots(figsize=(10, 10))
disp_reduced = ConfusionMatrixDisplay(confusion_matrix=cm_reduced, display_labels=labels_reduced)
disp_reduced.plot(cmap="Blues", ax=ax4, xticks_rotation="vertical")
plt.title("Reduced Confusion Matrix (Top 5 Classes)")
plt.savefig("/content/drive/My Drive/distilSingle_confusion_matrix_reduced.png", bbox_inches="tight")

# Anzeigen der Plots
plt.show()

# 5. Hinweis auf Speicherorte
print("Confusion-Matrices wurden gespeichert:")
print("- Original: /content/drive/My Drive/distilSingle_confusion_matrix_original.png")
print("- Normalized: /content/drive/My Drive/distilSingle_confusion_matrix_normalized.png")
print("- Fehlklassifikationen: /content/drive/My Drive/distilSingle_confusion_matrix_off_diag.png")
print("- Reduced: /content/drive/My Drive/distilSingle_confusion_matrix_reduced.png")


In [None]:
#8. Evaluation nach Training

# 1. Laden des besten Modells
checkpoints = [ckpt for ckpt in os.listdir("./results") if ckpt.startswith("checkpoint")]
latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('-')[1]))
model_path = f"./results/{latest_checkpoint}"
print(f"Using model from: {model_path}")

# 2. Evaluation Dataset vorbereiten
eval_df = pd.DataFrame(eval_data_formatted)  # Verwende deine formatierten Evaluationsdaten
eval_texts = eval_df["question"].tolist()
eval_labels = eval_df["label"].tolist()  # Extrahiere die tatsächlichen Labels

# 3. Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Hugging Face Dataset erstellen
eval_dataset = Dataset.from_dict({"text": eval_texts, "label": eval_labels})
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# 4. Modellvorhersagen
predictions = trainer.predict(eval_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# 5. Vorhersagen in DataFrame einfügen
eval_df["predicted_label"] = predicted_labels

# 6. Labels zurückmappen (falls codiert)
eval_df["true_label"] = eval_df["label"]
eval_df["predicted_label_decoded"] = label_encoder.inverse_transform(eval_df["predicted_label"])
eval_df["true_label_decoded"] = label_encoder.inverse_transform(eval_df["true_label"])

# 7. Berechnung der Metriken
# Konvertiere die Klassenlabels in Strings
target_names = [str(label) for label in label_encoder.classes_]
# Klassifikationsbericht erstellen und ausgeben
print("Klassifikationsbericht:")
print(classification_report(eval_df["true_label"], eval_df["predicted_label"], target_names=target_names))





Using model from: ./results/checkpoint-2772


Map:   0%|          | 0/1386 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Klassifikationsbericht:
              precision    recall  f1-score   support

           0       0.19      1.00      0.32        17
           1       0.00      0.00      0.00        22
           2       0.00      0.00      0.00        26
           3       0.00      0.00      0.00        24
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00        14
           6       0.00      0.00      0.00        32
           7       0.00      0.00      0.00        29
           8       0.00      0.00      0.00        20
           9       0.00      0.00      0.00        20
          10       0.00      0.00      0.00        25
          11       0.00      0.00      0.00        33
          12       0.00      0.00      0.00        26
          13       0.00      0.00      0.00        24
          14       0.00      0.00      0.00        24
          15       0.00      0.00      0.00        23
          16       0.09      1.00      0.17        16
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# 9. Manuelle Eingabe Test

# 1. Laden des Modells und Tokenizers
model_path = f"./results/{latest_checkpoint}"  # Stelle sicher, dass das Modell korrekt geladen wird
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Mapping der Labels
label_mapping = {
    0: "No",
    1: "No Preference",
    2: "Yes"
}

# 2. Manuelle Eingabe
text = input("Gib einen Text ein, den das Modell klassifizieren soll: ")

# 3. Vorverarbeitung der Eingabe
inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

# 4. Modellvorhersage
model.eval()  # Setzt das Modell in den Evaluationsmodus
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()



# 6. Ausgabe
print(f"Der eingegebene Text wurde als '{predicted_class}' klassifiziert.")


Gib einen Text ein, den das Modell klassifizieren soll: thats okay for me
Der eingegebene Text wurde als '55' klassifiziert.
