<a href="https://colab.research.google.com/github/Michel-p16/PDS-Project/blob/capstone_korbi/distilbert_singleQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive so the dataset stored there is reachable from Colab.
import json

from google.colab import drive

drive.mount('/content/drive')

dataset_path = '/content/drive/My Drive/ColabData/final_single_question_data.json'

# Load the dataset. The encoding is pinned to UTF-8 so the non-ASCII
# characters that occur in the answers (e.g. typographic apostrophes) are
# decoded the same way regardless of the runtime's locale default.
with open(dataset_path, 'r', encoding='utf-8') as file:
    dataset = json.load(file)

Mounted at /content/drive


In [4]:
!pip install transformers datasets
#1. Daten laden + filtern (SINGLE SELECT here)
import json
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

import os

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn

# Keep only the questions whose type is "SINGLE_SELECT".
filtered_dataset = list(
    filter(lambda example: example["type"] == "SINGLE_SELECT", dataset)
)





In [11]:
#2. Daten formatieren

def convert_to_distilbert_format(data):
    """Flatten question/answer records into one row per answer and encode labels.

    Args:
        data: Iterable of question dicts. Each dict is read via the keys
            ``"question"`` and ``"answers"``; every answer is read via
            ``"answer_text"`` and (optionally) ``"answer_label"``.

    Returns:
        A tuple ``(formatted_data, label_encoder)``:

        * ``formatted_data`` is a list of dicts with the keys ``"question"``,
          ``"context"``, ``"answers"`` and ``"label"``. ``"label"`` holds the
          integer class id, or ``None`` when the answer carried no label.
        * ``label_encoder`` is the ``LabelEncoder`` fitted on the distinct
          labels, so ids can be mapped back with ``inverse_transform``.
    """
    formatted_data = []
    all_labels = []  # raw labels, in the same order as the labelled rows

    for example in data:
        question = example["question"]

        for answer in example["answers"]:
            text = answer["answer_text"]
            label = answer.get("answer_label")

            if label is not None:
                all_labels.append(label)
            formatted_data.append({
                "question": question,
                # There is no separate context in the data, so the answer
                # text itself is used as the context.
                "context": text,
                # answer_start is 0 because context == answer text.
                "answers": {"text": [text], "answer_start": [0]},
                "label": label,
            })

    # Fit the encoder on the distinct labels.
    label_encoder = LabelEncoder()
    label_encoder.fit(list(set(all_labels)))

    if all_labels:
        # Encode every label in ONE vectorized call instead of one
        # transform() call per row. `all_labels` was filled in the same order
        # as the labelled rows, so the ids can be handed out sequentially.
        encoded_labels = iter(label_encoder.transform(all_labels))
        for row in formatted_data:
            if row["label"] is not None:
                row["label"] = next(encoded_labels)

    return formatted_data, label_encoder


# Call the function here, outside of its definition; this binds both the
# formatted rows and the fitted label encoder returned by it.
formatted_dataset, label_encoder = convert_to_distilbert_format(filtered_dataset)

In [12]:
# 3. Split the data: 80% training, 20% evaluation (fixed seed for a
#    repeatable split).

train_data_formatted, eval_data_formatted = train_test_split(
    formatted_dataset,
    random_state=42,
    test_size=0.2,
)

# Report the size of each split.
print(
    f"Trainingsdaten: {len(train_data_formatted)}",
    f"Evaluationsdaten: {len(eval_data_formatted)}",
    sep="\n",
)

Trainingsdaten: 5544
Evaluationsdaten: 1386


In [13]:
import random

# Number of examples to print per split
num_samples = 10


def _print_random_samples(rows, heading, sample_count):
    """Print ``heading`` followed by randomly chosen rows from ``rows``.

    Args:
        rows: List of formatted examples (dicts with the keys ``"question"``,
            ``"context"``, ``"answers"`` and ``"label"``).
        heading: Line printed before the examples.
        sample_count: Number of rows to show. Capped at ``len(rows)`` so a
            small split does not make ``random.sample`` raise ``ValueError``.
    """
    print(heading)
    chosen_indices = random.sample(range(len(rows)), min(sample_count, len(rows)))
    for index in chosen_indices:
        example = rows[index]
        print(f"Beispiel {index + 1}:")
        print(f"  Frage: {example['question']}")
        print(f"  Kontext: {example['context']}")
        print(f"  Antworten: {example['answers']}")
        print(f"  Label: {example['label']}")  # check the label type here
        print("-" * 20)


# Random examples from the training data, then from the evaluation data.
_print_random_samples(train_data_formatted, "Formatierte Trainingsdaten:", num_samples)
_print_random_samples(eval_data_formatted, "\nFormatierte Evaluationsdaten:", num_samples)

Formatierte Trainingsdaten:
Beispiel 2489:
  Frage: How satisfied are you with our service?
  Kontext: The service was inadequate and unprofessional.
  Antworten: {'text': ['The service was inadequate and unprofessional.'], 'answer_start': [0]}
  Label: 51
--------------------
Beispiel 4089:
  Frage: What industry are you in?
  Kontext: I'm a mechanical engineer for an industrial manufacturing company.
  Antworten: {'text': ["I'm a mechanical engineer for an industrial manufacturing company."], 'answer_start': [0]}
  Label: 29
--------------------
Beispiel 449:
  Frage: Want email marketing updates?
  Kontext: I want email updates on your email marketing techniques.
  Antworten: {'text': ['I want email updates on your email marketing techniques.'], 'answer_start': [0]}
  Label: 55
--------------------
Beispiel 338:
  Frage: What industry are you in?
  Kontext: I’m a registered nurse in a busy trauma ward.
  Antworten: {'text': ['I’m a registered nurse in a busy trauma ward.'], 'answer_

In [14]:
# 4. Prepare labels
# The labels in the formatted rows are already integer class ids (they were
# encoded inside convert_to_distilbert_format). This cell re-fits an encoder
# on those ids. It must see the labels of BOTH splits and be applied to BOTH
# splits: fitting on the training split alone would renumber the training
# labels whenever a class happens to be missing from that split, while the
# evaluation labels would keep their old ids.
train_labels = [example["label"] for example in train_data_formatted if example["label"] is not None]
eval_split_labels = [example["label"] for example in eval_data_formatted if example["label"] is not None]

# Create and fit the LabelEncoder on every label that occurs in either split
label_encoder = LabelEncoder()
label_encoder.fit(train_labels + eval_split_labels)


def transform_label(example):
    """Replace ``example["label"]`` with its encoded id (in place).

    Examples whose label is ``None`` are returned unchanged. The module-level
    ``label_encoder`` is used for the lookup.
    """
    if example["label"] is not None:
        example["label"] = label_encoder.transform([example["label"]])[0]
    return example


# Transform the labels of both splits with the same encoder
train_data_formatted = [transform_label(example) for example in train_data_formatted]
eval_data_formatted = [transform_label(example) for example in eval_data_formatted]

In [15]:
# 5. Build the datasets: each list of formatted rows goes through a pandas
#    DataFrame into a `Dataset`.
train_dataset, eval_dataset = (
    Dataset.from_pandas(pd.DataFrame(rows))
    for rows in (train_data_formatted, eval_data_formatted)
)

# Sanity check of the number of rows per split
print(
    f"Anzahl der Trainingsdaten: {len(train_dataset)}",
    f"Anzahl der Evaluationsdaten: {len(eval_dataset)}",
    sep="\n",
)


Anzahl der Trainingsdaten: 5544
Anzahl der Evaluationsdaten: 1386


In [16]:
# 6. Preprocessing

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples, max_length=36):
    """Tokenize a batch of question/context pairs to a fixed length.

    Args:
        examples: Batch with the columns ``"question"`` and ``"context"``.
        max_length: Length every sequence is truncated and padded to. The
            default of 36 is the same value the inference cell uses.

    Returns:
        The tokenizer output for the batch.
    """
    # padding="max_length" (rather than padding=True) pads every row to the
    # same length. padding=True pads only to the longest row of the current
    # map batch, so rows from different map batches could end up with
    # different lengths. It also matches the padding used at inference time.
    return tokenizer(
        examples["question"], examples["context"],
        padding="max_length", truncation=True, max_length=max_length
    )


train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1386 [00:00<?, ? examples/s]

In [17]:
# Check whether the answers/labels are imbalanced
from collections import Counter

train_labels = [sample['label'] for sample in train_data_formatted]
eval_labels = [sample['label'] for sample in eval_data_formatted]

print("Trainingsdaten Klassenverteilung:", Counter(train_labels))
print("Evaluationsdaten Klassenverteilung:", Counter(eval_labels))

# Tokenization length check.
# Question and context are encoded as a PAIR, exactly like in
# preprocess_function. Encoding the concatenated string instead would measure
# a single-sequence input, not the pair input the model actually receives.
lengths = [len(tokenizer.encode(example["question"], example["context"])) for example in formatted_dataset]
print(f"Max length: {max(lengths)}, 95th percentile: {np.percentile(lengths, 95)}")


Trainingsdaten Klassenverteilung: Counter({37: 195, 55: 194, 13: 137, 27: 115, 11: 106, 53: 104, 40: 102, 20: 101, 47: 101, 43: 100, 31: 100, 42: 99, 32: 99, 25: 98, 17: 98, 15: 98, 22: 97, 51: 96, 38: 96, 46: 95, 33: 95, 34: 95, 35: 94, 50: 94, 29: 92, 23: 91, 3: 91, 5: 91, 16: 91, 44: 91, 12: 91, 30: 90, 14: 90, 56: 90, 8: 89, 10: 89, 36: 89, 0: 89, 52: 89, 57: 88, 19: 88, 21: 88, 49: 88, 26: 87, 4: 87, 18: 86, 41: 86, 54: 85, 39: 84, 28: 84, 7: 82, 45: 82, 48: 82, 9: 82, 24: 79, 1: 76, 6: 75, 2: 73})
Evaluationsdaten Klassenverteilung: Counter({55: 42, 37: 41, 21: 34, 11: 33, 6: 32, 38: 31, 52: 31, 50: 29, 36: 29, 23: 29, 7: 29, 29: 28, 35: 28, 24: 27, 18: 27, 39: 27, 42: 27, 32: 26, 25: 26, 2: 26, 12: 26, 26: 25, 10: 25, 44: 25, 28: 25, 22: 25, 43: 24, 31: 24, 13: 24, 3: 24, 14: 24, 19: 23, 41: 23, 57: 23, 15: 23, 51: 23, 40: 22, 1: 22, 47: 22, 46: 21, 17: 21, 9: 20, 8: 20, 49: 19, 20: 19, 27: 18, 30: 18, 34: 18, 45: 18, 54: 17, 0: 17, 16: 16, 48: 16, 33: 16, 56: 15, 4: 15, 53: 14,

In [19]:
# 7. Start of training
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification

# The classification head gets one output per class known to the label
# encoder, so num_labels follows the data instead of being hard-coded.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_encoder.classes_),
)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same numbers as the default ("warn" also
    # scores undefined classes as 0) but without emitting a warning for every
    # class that was never predicted, which happens in early epochs.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Configuration of the fine-tuning run.
training_args = TrainingArguments(
    # Output locations and logging
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,   # log progress every 10 steps
    report_to="none",   # no external experiment tracker (original note: disables WandB)
    # Optimisation settings
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint after every epoch so the best checkpoint
    # (by accuracy, higher is better) can be restored when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,  # metrics reported at each evaluation
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,        # evaluated after every epoch
)

trainer.train()

# Connect Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Persist the model and the tokenizer side by side in Google Drive
model.save_pretrained("/content/drive/My Drive/final_model")
tokenizer.save_pretrained("/content/drive/My Drive/final_model")

print("Das Modell wurde erfolgreich in Google Drive gespeichert.")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.6425,1.438614,0.847042,0.831509,0.840213,0.847042
2,0.5008,0.382813,0.967532,0.967299,0.970609,0.967532
3,0.1681,0.148088,0.978355,0.97814,0.978776,0.978355
4,0.0785,0.104821,0.977633,0.977477,0.978211,0.977633
5,0.1035,0.091002,0.981241,0.981276,0.98219,0.981241


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Das Modell wurde erfolgreich in Google Drive gespeichert.


In [14]:
# Inspect the classes known to the LabelEncoder currently bound to
# `label_encoder`.
print(f"Alle Labels im LabelEncoder: {label_encoder.classes_}")


Alle Labels im LabelEncoder: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57]


In [12]:
# Collect the unique labels of exactly those questions the classifier was
# trained on, so that the index -> label mapping lines up with the class ids
# produced during training.
unique_labels = set()
for example in dataset:
    # Mirror the training filter: only SINGLE_SELECT questions were used.
    if example.get("type") != "SINGLE_SELECT":
        continue
    for answer in example.get("answers", []):
        label = answer.get("answer_label")
        # `is not None` (not truthiness) matches the check used when the
        # training labels were collected; a falsy-but-valid label such as ""
        # or 0 must not be dropped here, or all later indices would shift.
        if label is not None:
            unique_labels.add(label)

# Generate the mapping. Labels are sorted once; the position in the sorted
# list is the class id (LabelEncoder also keeps its classes sorted).
sorted_labels = sorted(unique_labels)
label_mapping = dict(enumerate(sorted_labels))
inverse_label_mapping = {label: idx for idx, label in label_mapping.items()}

print("Label Mapping:", label_mapping)

# Save the mapping for later use
import pickle
with open("/content/drive/My Drive/label_mapping.pkl", "wb") as file:
    pickle.dump(label_mapping, file)


Label Mapping: {0: '1-10', 1: '1-5', 2: '11-15', 3: '11-50', 4: '16-20', 5: '201-2000', 6: '21-30', 7: '31-40', 8: '51-200', 9: '6-10', 10: 'Adito', 11: 'Aerospace', 12: 'Applicant', 13: 'Automotive', 14: 'CAS', 15: 'Call', 16: 'Close.io', 17: 'Computers & Networks', 18: 'Construction company', 19: 'Consultant, Planner, Architect', 20: 'Craft enterprises', 21: 'Defense', 22: 'Education sector', 23: 'End User', 24: 'English', 25: 'Existing customer', 26: 'German', 27: 'Government', 28: 'HubSpot', 29: 'Industrial', 30: 'Italian', 31: 'Japanese ', 32: 'Medical', 33: 'Meeting', 34: 'Microsoft Dynamics', 35: 'Network Operators & Infrastructure', 36: 'New customer', 37: 'No', 38: 'Offer', 39: 'Partner', 40: 'Physical Security', 41: 'Pipedrive', 42: 'Production company', 43: 'Public Safety / Law Enforcement', 44: 'R&D', 45: 'SAP Sales Cloud', 46: 'Salesforce', 47: 'Satisfied', 48: 'Scaffolding company', 49: 'Spanish', 50: 'Trading company', 51: 'Unsatisfied', 52: 'Very satisfied', 53: 'Very u

In [20]:
import random
import pickle

# Load the trained model and tokenizer
model_path = "/content/drive/My Drive/final_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Put the model into evaluation mode once, before the loop; it does not need
# to be repeated for every question.
model.eval()

# Load the label mapping.
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load a file that this notebook wrote itself; never one from an
# untrusted source.
label_mapping_path = "/content/drive/My Drive/label_mapping.pkl"
with open(label_mapping_path, "rb") as file:
    label_mapping = pickle.load(file)

# Extract all unique questions from the dataset. Sorting gives the candidate
# list a stable order (plain set iteration order of strings can differ
# between interpreter runs).
all_questions = sorted(set(example["question"] for example in dataset))

# Randomly select up to 3 questions (fewer if the dataset has fewer, instead
# of letting random.sample raise ValueError).
selected_questions = random.sample(all_questions, min(3, len(all_questions)))

# Iterate through the selected questions and classify user inputs
for idx, question in enumerate(selected_questions, start=1):
    print(f"\nQuestion {idx}: {question}")
    context = input("Please enter your answer: ")

    # Preprocess the input as a question/answer pair, with the same
    # max_length as was used for the training data.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=36
    )

    # Perform prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Map the predicted class to the corresponding label
    predicted_label = label_mapping.get(predicted_class, "Unknown Label")

    # Display the result
    print(f"The response was classified as: '{predicted_label}'.")



Question 1: What language should we use to communicate?
Please enter your answer: i can not speak english, i prefer german
The response was classified as: 'German'.

Question 2: What type of customer are you?
Please enter your answer: i am already long time customer from you
The response was classified as: 'Existing customer'.

Question 3: Do you consent to the processing of your data?
Please enter your answer: thats okay for me
The response was classified as: 'Yes'.
