In [90]:
import os
import json
import time
import pandas as pd
import google.generativeai as genai
import datetime
import re
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
#!pip install datasets
from datasets import Dataset, DatasetDict
#!pip install transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Data Extraction

In [18]:
json_dir = "Raw_data"

merged_data = []

for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)
            merged_data.extend(json_data)

with open("merged_questionnaires.json", "w", encoding="utf-8") as output_file:
    json.dump(merged_data, output_file, indent=4)

print("Merged JSON data with all information saved successfully!")

Merged JSON data with all information saved successfully!


In [19]:
with open("merged_questionnaires.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Extract relevant fields
data = []
for entry in json_data:
    question_type = entry["type"]
    question = entry["question"]
    for option in entry["options"]:
        data.append([question_type, question, option["option"]])

# Create DataFrame
df_questionnaires = pd.DataFrame(data, columns=["Type", "Question", "Label"])
print(df_questionnaires)


              Type                  Question              Label
0    SINGLE_SELECT             Customer type       New customer
1    SINGLE_SELECT             Customer type  Existing customer
2    SINGLE_SELECT             Customer type            Partner
3    SINGLE_SELECT             Customer type          Applicant
4    SINGLE_SELECT     Customer satisfaction     Very satisfied
..             ...                       ...                ...
111   MULTI_SELECT  Who to copy in follow up      Sandro Kalter
112   MULTI_SELECT  Who to copy in follow up     Jens Roschmann
113   MULTI_SELECT  Who to copy in follow up       Domiki Stein
114   MULTI_SELECT  Who to copy in follow up        Sean Kennin
115   MULTI_SELECT  Who to copy in follow up        Tim Persson

[116 rows x 3 columns]


In [20]:
df_single_select_questions = df_questionnaires[df_questionnaires["Type"] == "SINGLE_SELECT"]
#df_single_select_questions

In [11]:
df_multi_select_questions = df_questionnaires[df_questionnaires["Type"] == "MULTI_SELECT"]
#df_multi_select_questions

# Generating Data with Gemini

In [21]:
#api_key = os.environ['gemini_api']
genai.configure(api_key='') # Enter your API key

In [22]:
def api_call_for_generating_question(question):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        prompt = f"Generate a full understandable and short question based on the following: {question}. Direct the message to me. Print the question only!"
        response = model.generate_content(prompt)
        #print(response.text.strip())
        return response.text.strip()
    except Exception as e:
        print(f"Error with Gemini API: {e}")
        return prompt  # Fallback to original if API fails

In [23]:
def generate_question(df):
    generated_questions = dict()

    for question in df["Question"]:

        if question not in generated_questions.keys():
            full_question = api_call_for_generating_question(question)
            generated_questions[question] = full_question
            #print(f"{question}: {full_question}")
            time.sleep(3)
            
    df['Question'] = df['Question'].map(generated_questions)
    print("Questions in dataframe with new Questions replaced.")

    return df


In [24]:
df_single_select_questions = generate_question(df_single_select_questions)

Questions in dataframe with new Questions replaced.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Question'] = df['Question'].map(generated_questions)


In [25]:
df_single_select_questions

Unnamed: 0,Type,Question,Label
0,SINGLE_SELECT,What type of customer are you?,New customer
1,SINGLE_SELECT,What type of customer are you?,Existing customer
2,SINGLE_SELECT,What type of customer are you?,Partner
3,SINGLE_SELECT,What type of customer are you?,Applicant
4,SINGLE_SELECT,How satisfied are you with our service?,Very satisfied
5,SINGLE_SELECT,How satisfied are you with our service?,Satisfied
6,SINGLE_SELECT,How satisfied are you with our service?,Unsatisfied
7,SINGLE_SELECT,How satisfied are you with our service?,Very unsatisfied
8,SINGLE_SELECT,What's the average size of your trade fair teams?,1-5
9,SINGLE_SELECT,What's the average size of your trade fair teams?,6-10


In [26]:
df_multi_select_questions = generate_question(df_multi_select_questions)

Questions in dataframe with new Questions replaced.


In [27]:
df_multi_select_questions

Unnamed: 0,Type,Question,Label
23,MULTI_SELECT,What product interests you?,BusinessCards
24,MULTI_SELECT,What product interests you?,DataEnrichment
25,MULTI_SELECT,What product interests you?,VisitReport
26,MULTI_SELECT,What product interests you?,Data Cleansing
27,MULTI_SELECT,What product interests you?,DataQuality
28,MULTI_SELECT,What problem are you trying to solve?,Scan business cards
29,MULTI_SELECT,What problem are you trying to solve?,Clean up CRM
30,MULTI_SELECT,What problem are you trying to solve?,Extract data from emails
31,MULTI_SELECT,What problem are you trying to solve?,Improve CRM data quality
32,MULTI_SELECT,What problem are you trying to solve?,Capture trade fair contacts


In [28]:
def make_api_call_for_answers(question, label, type):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        #prompt = f"Generate diverse responses for the following question with the answer label '{label}': {question}"
        prompt = f"Generate 100 full diverse answers as one sentence split in rows for the following context '{question}' with the answer label : '{label}'. Print the answers ONLY. If the label is yes or no also include answers without the label."

        if type == "MULTI_SELECT":
            prompt = f"Generate 100 full diverse answers as one sentence split in rows for the following context '{question}' with the possible answer labels : '{labels}'. Consider that multiple labels can be selected for answering, so include answers with all the possible combinations. Print in the generated answer followd up by the used labels in brackets ONLY. Also include answers without the label. Do not print additional information."

        response = model.generate_content(prompt)
        print(f"Answers for Question \"{question}\" with label \"{label}\" generated.")
        return response.text.strip()
    except Exception as e:
        print(f"Error with Gemini API: {e}")
        return prompt  # Fallback to original if API fails

In [38]:
def generate_diverse_answers(df):

    generated_answers = []
    processed_questions = set()

    for _, row in df.iterrows():

        type = row["Type"]
        question = row["Question"]

        if type == "SINGLE_SELECT":

            label = row["Label"]

            answers = make_api_call_for_answers(question, label, type=type)

            each_response = answers.split("\n")

            for curr_response in each_response:
                    generated_answers.append({
                        #'question_id': row['question_id'],
                        'question': question,
                        'type': type,
                        'answer_text': curr_response,
                        'answer_label': label,
                        'timestamp': datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")  # Aktueller Timestamp
                    })
            
            time.sleep(3)
        
        else:
             
            for _, row in df.iterrows():
                
                labels = df[df['Question'] == question]['Label'].tolist()

                if question not in processed_questions:

                    processed_questions.add(question)

                    answers = make_api_call_for_answers(question, labels, type=type)

                    #answers.pop(0)
                    #answers.pop()

                    answers_with_tags = []
                    pattern = r'^(.*?)\s+\[([^\]]+)\]$'

                    for answer in answers.split("\n"):
                        
                        # Using re.match to extract both text and tags
                        match = re.match(pattern, answer)
                        #print(match.group(0))
                        #print(match.group(1))
                        #
                        #print(match.group(2))

                        try:
                            # Your code that may raise an exception
                            if match is not None:
                                answers_with_tags.append({
                                "answer": match.group(1),
                                "label": match.group(2)
                                })
                        except AttributeError as e:  # Replace TypeError with the specific exception you want to catch
                            print(f"Caught an exception: {e}")
                                        
                        #answers_with_tags[match.group(1)] = match.group(2)
                    
                    for row in answers_with_tags:
                        generated_answers.append({
                                #'question_id': row['question_id'],
                                'question': question,
                                'type': type,
                                'answer_text': row["answer"],
                                'answer_label': row["label"],
                                'timestamp': datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")  # Aktueller Timestamp
                        })
                    
                    time.sleep(3) 

        #print("Answers generated")  
             
    return generated_answers

In [39]:
df_single_select_with_new_q_and_a = generate_diverse_answers(df_single_select_questions)

Answers for Question "What type of customer are you?" with label "New customer" generated.
Answers for Question "What type of customer are you?" with label "Existing customer" generated.
Answers for Question "What type of customer are you?" with label "Partner" generated.
Answers for Question "What type of customer are you?" with label "Applicant" generated.
Answers for Question "How satisfied are you with our service?" with label "Very satisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Satisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Unsatisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Very unsatisfied" generated.
Answers for Question "What's the average size of your trade fair teams?" with label "1-5" generated.
Answers for Question "What's the average size of your trade fair teams?" with label "6-10" generated.
Answers for Question "What's the a

In [40]:
df_single_select_with_new_q_and_a

[{'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm a first-time buyer exploring your offerings.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-30T20:47:06.766613'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': 'This is my initial purchase from your company.',
  'answer_label': 'New customer',
  'timestamp': '2025-01-30T20:47:06.766673'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I've never used your services before.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-30T20:47:06.766684'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm a brand-new customer looking for information.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-30T20:47:06.766691'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm trying your product

In [None]:
df_multi_select_with_new_q_and_a = generate_diverse_answers(df_multi_select_questions)

Answers for Question "Productinterests" with label "['BusinessCards', 'DataEnrichment', 'VisitReport', 'Data Cleansing', 'DataQuality']" generated.
Answers for Question "Searches a solution for" with label "['Scan business cards', 'Clean up CRM', 'Extract data from emails', 'Improve CRM data quality', 'Capture trade fair contacts']" generated.
Answers for Question "What is the type of contact?" with label "['Existing customer', 'Supplier', 'New customer / Prospect', 'Press / media', 'Competitor']" generated.
Answers for Question "What is the contact person interested in?" with label "['100 Additive Manufacturing', '200 Automation', '300 Advanced Manufacturing', '234 Assembly Systems', '256 Joining Systems for large components', 'Others']" generated.
Answers for Question "When does the contact person wish to receive a follow up?" with label "['1 week', '2 weeks', '3 weeks']" generated.
Answers for Question "What products are you interested in?" with label "['Automotive radar target simu

In [None]:
for item in df_multi_select_with_new_q_and_a:
    item['answer_label'] = item['answer_label'].replace('/', ',')

In [16]:
""" for entry in data:
    question_id = entry["question_id"]
    question = entry["question"]
    question_type = entry["type"]
    for answer in entry["answers"]:
        rows.append({
            "question_id": question_id,
            "question": question,
            "type": question_type,
            "answer_text": answer["text"],
            "answer_label": answer["label"],
            "timestamp": answer["timestamp"]
        }) """

' for entry in data:\n    question_id = entry["question_id"]\n    question = entry["question"]\n    question_type = entry["type"]\n    for answer in entry["answers"]:\n        rows.append({\n            "question_id": question_id,\n            "question": question,\n            "type": question_type,\n            "answer_text": answer["text"],\n            "answer_label": answer["label"],\n            "timestamp": answer["timestamp"]\n        }) '

In [41]:
def save_in_json(data, filename):
    grouped_data = defaultdict(lambda: {'type': None, 'answers': []})

    for entry in data:
        question = entry['question']
        if grouped_data[question]['type'] is None:
            grouped_data[question]['type'] = entry['type']
        grouped_data[question]['answers'].append({
            'answer_text': entry['answer_text'],
            'answer_label': entry['answer_label'].replace("  ", " ").strip(),
            'timestamp': entry['timestamp']
        })

    # Convert to final JSON structure
    final_json = [
        {
            'question': question,
            'type': details['type'],
            'answers': details['answers']
        }
        for question, details in grouped_data.items()
    ]

    # Save to file
    with open(filename, 'w') as f:
        json.dump(final_json, f, indent=4)

    # Print output
    print(json.dumps(final_json, indent=4))

In [42]:
save_in_json(df_single_select_with_new_q_and_a, 'final_single_question_data.json')

[
    {
        "question": "What type of customer are you?",
        "type": "SINGLE_SELECT",
        "answers": [
            {
                "answer_text": "I'm a first-time buyer exploring your offerings.",
                "answer_label": "New customer",
                "timestamp": "2025-01-30T20:47:06.766613"
            },
            {
                "answer_text": "This is my initial purchase from your company.",
                "answer_label": "New customer",
                "timestamp": "2025-01-30T20:47:06.766673"
            },
            {
                "answer_text": "I've never used your services before.",
                "answer_label": "New customer",
                "timestamp": "2025-01-30T20:47:06.766684"
            },
            {
                "answer_text": "I'm a brand-new customer looking for information.",
                "answer_label": "New customer",
                "timestamp": "2025-01-30T20:47:06.766691"
            },
            {
          

In [None]:
save_in_json(df_multi_select_with_new_q_and_a, "final_multi_question_data_test.json")

[
    {
        "question": "Productinterests",
        "type": "MULTI_SELECT",
        "answers": [
            {
                "answer_text": "I need high-quality business cards to make a strong impression, and I'm also interested in data enrichment services to improve my client database.",
                "answer_label": "BusinessCards, DataEnrichment",
                "timestamp": "2025-01-27T21:19:17.915056"
            },
            {
                "answer_text": "My primary interest is in improving data quality, which includes data cleansing and enrichment.",
                "answer_label": "DataQuality, Data Cleansing, DataEnrichment",
                "timestamp": "2025-01-27T21:19:17.915119"
            },
            {
                "answer_text": "I'm looking for a solution to enhance my visit reports with data enrichment capabilities.",
                "answer_label": "VisitReport, DataEnrichment",
                "timestamp": "2025-01-27T21:19:17.915129"
           

# Fine-Tuning Q&A Model

## Roberta

1. Preprocessing Single Select Questions

In [61]:
def convert_json_to_df(json_file):
    rows = []

    for entry in json_file:

        question = entry["question"]
        question_type = entry["type"]
        for answer in entry["answers"]:
            rows.append({
                #"question_id": question_id,
                "question": question,
                "type": question_type,
                "answer_text": answer["answer_text"],
                "answer_label": answer["answer_label"],
                "timestamp": answer["timestamp"]
            })
    
    return pd.DataFrame(rows)

In [62]:
# JSON-Datei laden
with open('final_single_question_data.json', 'r') as f:
    single_select_model_data = json.load(f)

df_single_select_final = convert_json_to_df(single_select_model_data)

In [64]:
df_single_select_final = df_single_select_final[['question', 'answer_text', 'answer_label']]

In [65]:
df_single_select_final

Unnamed: 0,question,answer_text,answer_label
0,What type of customer are you?,I'm a first-time buyer exploring your offerings.,New customer
1,What type of customer are you?,This is my initial purchase from your company.,New customer
2,What type of customer are you?,I'm a new customer looking for information.,New customer
3,What type of customer are you?,I've never used your services before.,New customer
4,What type of customer are you?,"I'm a brand-new customer, excited to try your ...",New customer
...,...,...,...
6925,What is the specific customer group you're tar...,Our specific customer base is R&D personnel in...,R&D
6926,What is the specific customer group you're tar...,We're aiming to reach R&D professionals workin...,R&D
6927,What is the specific customer group you're tar...,The specific customer group is R&D scientists ...,R&D
6928,What is the specific customer group you're tar...,Our focus is on R&D teams working on national ...,R&D


In [91]:
label_encoder_single_select = LabelEncoder()
df_single_select_final["label"] = label_encoder_single_select.fit_transform(df_single_select_final["answer_label"])  # z. B. "Yes" → 0, "No" → 1, "No Preference" → 2

print(label_encoder_single_select.classes_)  # Zeigt, welche Zahl welchem Label entspricht

['1-10' '1-5' '11-15' '11-50' '16-20' '201-2000' '21-30' '31-40' '51-200'
 '6-10' 'Adito' 'Aerospace' 'Applicant' 'Automotive' 'CAS' 'Call'
 'Close.io' 'Computers & Networks' 'Construction company'
 'Consultant, Planner, Architect' 'Craft enterprises' 'Defense'
 'Education sector' 'End User' 'English' 'Existing customer' 'German'
 'Government' 'HubSpot' 'Industrial' 'Italian' 'Japanese ' 'Medical'
 'Meeting' 'Microsoft Dynamics' 'Network Operators & Infrastructure'
 'New customer' 'No' 'Offer' 'Partner' 'Physical Security' 'Pipedrive'
 'Production company' 'Public Safety / Law Enforcement' 'R&D'
 'SAP Sales Cloud' 'Salesforce' 'Satisfied' 'Scaffolding company'
 'Spanish' 'Trading company' 'Unsatisfied' 'Very satisfied'
 'Very unsatisfied' 'Wholesaler, Distributor' 'Yes' 'larger than 2000'
 'more than 40']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single_select_final["label"] = label_encoder_single_select.fit_transform(df_single_select_final["answer_label"])  # z. B. "Yes" → 0, "No" → 1, "No Preference" → 2


In [71]:
train_df_single_select, test_df_single_select = train_test_split(df_single_select_final, test_size=0.2, random_state=42, stratify=df_single_select_final["label"])

In [None]:
# Speichere die Datasets als JSON
#train_df_single_select[["question", "answer_text", "label"]].to_json("train.json", orient="records", lines=True)
#test_df_single_select[["question", "answer_text", "label"]].to_json("test.json", orient="records", lines=True)

In [72]:
print("Train size:", len(train_df_single_select), "Test size:", len(test_df_single_select))

Train size: 5544 Test size: 1386


In [79]:
dataset_single_select = DatasetDict({
    "train": Dataset.from_pandas(train_df_single_select),
    "test": Dataset.from_pandas(test_df_single_select)
})

In [80]:
dataset_single_select

DatasetDict({
    train: Dataset({
        features: ['question', 'answer_text', 'answer_label', 'label', '__index_level_0__'],
        num_rows: 5544
    })
    test: Dataset({
        features: ['question', 'answer_text', 'answer_label', 'label', '__index_level_0__'],
        num_rows: 1386
    })
})

In [None]:
#dataset = load_dataset("json", data_files={"train": "train.json", "test": "test.json"})
#print(dataset)

In [89]:
#model_name = "roberta-base"
model_name_roberta = "deepset/roberta-base-squad2"
tokenizer_roberta = AutoTokenizer.from_pretrained(model_name_roberta)

def tokenize_function_roberta_single_select(examples):
    text_inputs = [q + " " + a for q, a in zip(examples["question"], examples["answer_text"])]
    return tokenizer_roberta(text_inputs, truncation=True, padding="max_length", max_length=128)

tokenized_dataset_roberta = dataset_single_select.map(tokenize_function_roberta_single_select, batched=True)


Map:   0%|          | 0/5544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1386 [00:00<?, ? examples/s]

In [92]:
#num_labels = len(label_encoder.classes_)  # Anzahl der Klassen (3)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=label_encoder_single_select.classes_)

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [None]:
from transformers import TrainingArguments

In [None]:
training_args = TrainingArguments(
    output_dir="./roberta_classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)



In [None]:
# Use the One Cycle Policy
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    lr_scheduler_type="linear",  # 'linear' or 'cosine'
    warmup_steps=500,  # Steps to gradually increase the learning rate
    max_grad_norm=1.0, # Gradient clipping
    report_to="none"
)



In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Funktion zur Berechnung der Metriken
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Nimmt die Klasse mit der höchsten Wahrscheinlichkeit
    acc = accuracy_score(labels, predictions)  # Accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")  # Andere Scores
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [None]:
from transformers import Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Hier fügen wir die Metrik-Funktion hinzu!
)


trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2249,0.169862,0.945887,0.930416,0.945887,0.934197
2,0.0973,0.115767,0.968975,0.971862,0.968975,0.96869
3,0.0958,0.091591,0.979798,0.980607,0.979798,0.979795


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=2079, training_loss=0.6966733673749606, metrics={'train_runtime': 583.7577, 'train_samples_per_second': 28.491, 'train_steps_per_second': 3.561, 'total_flos': 1094565840850944.0, 'train_loss': 0.6966733673749606, 'epoch': 3.0})

In [None]:
model.save_pretrained("./roberta_finetuned")
tokenizer.save_pretrained("./roberta_finetuned")

('./roberta_finetuned/tokenizer_config.json',
 './roberta_finetuned/special_tokens_map.json',
 './roberta_finetuned/vocab.json',
 './roberta_finetuned/merges.txt',
 './roberta_finetuned/added_tokens.json',
 './roberta_finetuned/tokenizer.json')

In [None]:
label_mapping = {f"LABEL_{i}": label for i, label in enumerate(label_encoder.classes_)}
print(label_mapping)

{'LABEL_0': '1-10', 'LABEL_1': '1-5', 'LABEL_2': '11-15', 'LABEL_3': '11-50', 'LABEL_4': '16-20', 'LABEL_5': '201-2000', 'LABEL_6': '21-30', 'LABEL_7': '31-40', 'LABEL_8': '51-200', 'LABEL_9': '6-10', 'LABEL_10': 'Adito', 'LABEL_11': 'Aerospace', 'LABEL_12': 'Applicant', 'LABEL_13': 'Automotive', 'LABEL_14': 'CAS', 'LABEL_15': 'Call', 'LABEL_16': 'Close.io', 'LABEL_17': 'Computers & Networks', 'LABEL_18': 'Construction company', 'LABEL_19': 'Consultant, Planner, Architect', 'LABEL_20': 'Craft enterprises', 'LABEL_21': 'Defense', 'LABEL_22': 'Education sector', 'LABEL_23': 'End User', 'LABEL_24': 'English', 'LABEL_25': 'Existing customer', 'LABEL_26': 'German', 'LABEL_27': 'Government', 'LABEL_28': 'HubSpot', 'LABEL_29': 'Industrial', 'LABEL_30': 'Italian', 'LABEL_31': 'Japanese ', 'LABEL_32': 'Medical', 'LABEL_33': 'Meeting', 'LABEL_34': 'Microsoft Dynamics', 'LABEL_35': 'Network Operators & Infrastructure', 'LABEL_36': 'New customer', 'LABEL_37': 'No', 'LABEL_38': 'Offer', 'LABEL_39':

In [None]:
test_df

Unnamed: 0,question,answer_text,answer_label,label
2474,What CRM system are you currently using?,We've recently transitioned to Adito for our CRM.,Adito,10
3120,What language should we use to communicate?,Using Italian gives a sense of connection to I...,Italian,30
4402,How many people work at your company?,Our workforce currently numbers between 51 and...,51-200,8
1824,What CRM system are you currently using?,Our team utilizes Pipedrive for CRM functional...,Pipedrive,41
5234,What industry are you in?,My government work centers on immigration and ...,Government,27
...,...,...,...,...
4186,How many people work at your company?,We're a tight-knit group of one to ten employees.,1-10,0
111,What type of customer are you?,I'm a new customer who is delighted with the o...,New customer,36
3009,What language should we use to communicate?,Our team’s expertise in German makes it the mo...,German,26
6590,What is the specific customer group you're tar...,Investors investing in businesses.,End User,23


In [None]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./roberta_finetuned", tokenizer="./roberta_finetuned")

test_input = {
    "question": "What type of customer are you?",
    "answer_text": "first time here"
}

# Kombinieren wie im Training (Frage + Antwort)
test_text = test_input["question"] + " " + test_input["answer_text"]
result = classifier(test_text)
#print(result)

# Die vorhergesagte Klasse zurück in ein Label umwandeln
predicted_label = label_mapping[result[0]["label"]]
accuracy = result[0]["score"]

print(f"Vorhergesagtes Label: {predicted_label} (Accuracy: {accuracy})")


Device set to use cuda:0


Vorhergesagtes Label: New customer (Accuracy: 0.9966944456100464)


In [None]:
import random

In [None]:
from transformers import pipeline

# Initialize the classifier with the fine-tuned RoBERTa model and tokenizer
classifier = pipeline("text-classification", model="./roberta_finetuned", tokenizer="./roberta_finetuned")

question_list = df_single_select["question"].tolist()

# Interactive loop
while True:
    # Get user input for question and answer
    random_index = random.randint(0, len(question_list) - 1)
    question = question_list[random_index]
    print(f"Question: {question}")
    answer_text = input("Enter the answer text: ")

    # Combine question and answer as done in training
    test_text = question + " " + answer_text

    # Run the classifier on the combined text
    result = classifier(test_text)

    # Output the result
    print(f"Model result: {result}")

    # Assuming the result contains a class label (e.g., 'LABEL_0', 'LABEL_1', etc.), map it back to a label
    predicted_label = label_mapping.get(result[0]["label"], "Unknown label")

    # Print the predicted label
    print("Predicted label:", predicted_label)

    # Ask if the user wants to continue
    continue_input = input("Do you want to test another input? (y/n): ")
    if continue_input.lower() != 'y':
        print("Exiting the interactive loop.")
        break


Device set to use cuda:0


Question: What type of customer are you?
Enter the answer text: long time ago
Model result: [{'label': 'LABEL_39', 'score': 0.6796396374702454}]
Predicted label: Partner
Do you want to test another input? (y/n): n
Exiting the interactive loop.


In [None]:
#print(label_mapping)

{'LABEL_0': '1-10', 'LABEL_1': '1-5', 'LABEL_2': '11-15', 'LABEL_3': '11-50', 'LABEL_4': '16-20', 'LABEL_5': '201-2000', 'LABEL_6': '21-30', 'LABEL_7': '31-40', 'LABEL_8': '51-200', 'LABEL_9': '6-10', 'LABEL_10': 'Adito', 'LABEL_11': 'Aerospace', 'LABEL_12': 'Applicant', 'LABEL_13': 'Automotive', 'LABEL_14': 'CAS', 'LABEL_15': 'Call', 'LABEL_16': 'Close.io', 'LABEL_17': 'Computers & Networks', 'LABEL_18': 'Construction company', 'LABEL_19': 'Consultant, Planner, Architect', 'LABEL_20': 'Craft enterprises', 'LABEL_21': 'Defense', 'LABEL_22': 'Education sector', 'LABEL_23': 'End User', 'LABEL_24': 'English', 'LABEL_25': 'Existing customer', 'LABEL_26': 'German', 'LABEL_27': 'Government', 'LABEL_28': 'HubSpot', 'LABEL_29': 'Industrial', 'LABEL_30': 'Italian', 'LABEL_31': 'Japanese ', 'LABEL_32': 'Medical', 'LABEL_33': 'Meeting', 'LABEL_34': 'Microsoft Dynamics', 'LABEL_35': 'Network Operators & Infrastructure', 'LABEL_36': 'New customer', 'LABEL_37': 'No', 'LABEL_38': 'Offer', 'LABEL_39':

In [None]:
import random
from transformers import pipeline
from torch.utils.tensorboard import SummaryWriter

# Initialize the classifier with the fine-tuned RoBERTa model and tokenizer
classifier = pipeline("text-classification", model="./roberta_finetuned", tokenizer="./roberta_finetuned")


# Initialize TensorBoard Writer
log_dir = './logs'  # Path where logs will be stored
writer = SummaryWriter(log_dir)

# Assuming df_single_select is a DataFrame containing your questions
question_list = df_single_select["question"].tolist()

# Interactive loop
step = 0  # Step counter for logging
while True:
    # Get user input for question and answer
    random_index = random.randint(0, len(question_list) - 1)
    question = question_list[random_index]
    print(f"Question: {question}")

    # Prompt the user for an answer
    answer_text = input("Enter the answer text: ")

    # Combine question and answer as done in training
    test_text = question + " " + answer_text

    # Run the classifier on the combined text
    result = classifier(test_text)

    # Extract the predicted label and score
    predicted_label = result[0]["label"]
    predicted_score = result[0]["score"]

    # Map the label to a human-readable form
    readable_label = label_mapping.get(predicted_label, "Unknown label")

    # Log the result in TensorBoard
    writer.add_scalar('Prediction/Score', predicted_score, step)
    writer.add_text('Prediction/Label', readable_label, step)

    # Output the result
    print(f"Model result: {result}")
    print("Predicted label:", readable_label)

    # Increment the step for the next iteration
    step += 1

    # Ask if the user wants to continue
    continue_input = input("Do you want to test another input? (y/n): ")
    if continue_input.lower() != 'y':
        print("Exiting the interactive loop.")
        break

# Close the TensorBoard writer when done
writer.close()


Device set to use cuda:0


Question: What CRM system are you currently using?
Enter the answer text: none
Model result: [{'label': 'LABEL_16', 'score': 0.44915056228637695}]
Predicted label: Close.io
Do you want to test another input? (y/n): y
Question: What CRM system are you currently using?
Enter the answer text: close.io
Model result: [{'label': 'LABEL_16', 'score': 0.9967896342277527}]
Predicted label: Close.io
Do you want to test another input? (y/n): y
Question: What CRM system are you currently using?
Enter the answer text: hate
Model result: [{'label': 'LABEL_10', 'score': 0.7869200110435486}]
Predicted label: Adito
Do you want to test another input? (y/n): n
Exiting the interactive loop.


In [None]:
!pip install tensorboard



In [None]:
%tensorboard --logdir=./logs

UsageError: Line magic function `%tensorboard` not found.


In [None]:
import shutil

# Create a zip file for easier download
shutil.make_archive('/content/roberta_finetuned_model', 'zip', '/content', 'roberta_finetuned')

# The zip file is now ready to be downloaded
from google.colab import files
files.download("roberta_finetuned_model.zip")

'/content/roberta_finetuned_model.zip'

In [None]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Define the folder where the model was saved (this is the directory containing your model and tokenizer)
folder_to_zip = '/content/roberta_finetuned'  # The folder containing the fine-tuned model

# Specify the folder in Google Drive to save the model
model_save_path = 'model_path/roberta_finetuned'



# Copy model folder directly to Google Drive
shutil.copytree(folder_to_zip, model_save_path)
print(f"Model saved to Google Drive at {model_save_path}")


Mounted at /content/drive
Model saved to Google Drive at /content/drive/MyDrive/CapStone_models/roberta_finetuned


# Multi-Select Questions

In [None]:
import json
import pandas as pd

In [None]:
# JSON-Datei laden
with open('final_multi_question_data.json', 'r') as f:
    data = json.load(f)

# Daten in ein DataFrame umwandeln
rows = []
for row in data:
    question = row["question"]
    for ans in row["answers"]:
        rows.append({
            "question": question,
            "text": ans["answer_text"],
            "labels": ans["answer_label"].split(", "),  # Convert to list
            #"timestamp": ans["timestamp"]
        })

# DataFrame erstellen
df_multi_select = pd.DataFrame(rows)

# DataFrame anzeigen
#print(df.head())

In [None]:
df_multi_select

Unnamed: 0,question,text,labels
0,What are your product interests?,I'm interested in improving data quality and g...,"[DataQuality, BusinessCards]"
1,What are your product interests?,My priorities are data enrichment and visit re...,"[DataEnrichment, VisitReport]"
2,What are your product interests?,I need data cleansing and improved data quality.,"[Data Cleansing, DataQuality]"
3,What are your product interests?,I'm focused on business cards and data cleansing.,"[BusinessCards, Data Cleansing]"
4,What are your product interests?,My interest lies in visit reports and data enr...,"[VisitReport, DataEnrichment]"
...,...,...,...
740,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Jens Roschma...","['Stephan Maier', 'Joachim Wagner', 'Jens Rosc..."
741,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Domiki Stein...","['Stephan Maier', 'Joachim Wagner', 'Domiki St..."
742,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Sean Kennin ...","['Stephan Maier', 'Joachim Wagner', 'Sean Kenn..."
743,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Tim Persson ...","['Stephan Maier', 'Joachim Wagner', 'Tim Perss..."


In [None]:
question_label_mapping = {}

for _, row in df_multi_select.iterrows():
    question = row["question"]
    labels = row["labels"]

    # Ensure it's a list and update the mapping
    if question in question_label_mapping:
        question_label_mapping[question].update(labels)
    else:
        question_label_mapping[question] = set(labels)  # Use a set to avoid duplicates


In [None]:
question_label_mapping = {q: list(labels) for q, labels in question_label_mapping.items()}
question_label_mapping

{'What are your product interests?': ['DataQuality',
  'DataEnrichment',
  'BusinessCards',
  'Data Cleansing',
  'VisitReport'],
 'What problem are you trying to solve?': ['Improve CRM data quality',
  'Capture trade fair contacts',
  'Scan business cards',
  'Extract data from emails',
  'Clean up CRM'],
 'What type of contact is it?': ['media',
  'Existing customer',
  'Competitor',
  'New customer',
  'Prospect',
  'Supplier',
  'Press'],
 'What are you interested in?': ['100', '256', '300', '200', '234'],
 'When would you like a follow-up?': ['1 week', '2 weeks', '3 weeks'],
 'What products interest you?': ['Double-Pulse Testing',
  'Automotive radar target simulation',
  'Display port debugging and compliance',
  'High-speed interconnect testing',
  'Noise figure measurements'],
 'What products are you interested in?': ['AKW100',
  'JTS',
  'JS EcoLine',
  'Notion',
  'MY-SYSTEM',
  'AX100'],
 'What follow-up is planned?': ['Phone',
  'Email',
  'Schedule a Visit',
  'No action']

In [None]:
df_multi_select

Unnamed: 0,question,text,labels
0,What are your product interests?,I'm interested in improving data quality and g...,"[DataQuality, BusinessCards]"
1,What are your product interests?,My priorities are data enrichment and visit re...,"[DataEnrichment, VisitReport]"
2,What are your product interests?,I need data cleansing and improved data quality.,"[Data Cleansing, DataQuality]"
3,What are your product interests?,I'm focused on business cards and data cleansing.,"[BusinessCards, Data Cleansing]"
4,What are your product interests?,My interest lies in visit reports and data enr...,"[VisitReport, DataEnrichment]"
...,...,...,...
740,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Jens Roschma...","['Stephan Maier', 'Joachim Wagner', 'Jens Rosc..."
741,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Domiki Stein...","['Stephan Maier', 'Joachim Wagner', 'Domiki St..."
742,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Sean Kennin ...","['Stephan Maier', 'Joachim Wagner', 'Sean Kenn..."
743,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Tim Persson ...","['Stephan Maier', 'Joachim Wagner', 'Tim Perss..."


In [None]:
# !pip install datasets
from datasets import Dataset

In [None]:
dataset_df_multi_select = Dataset.from_pandas(df_multi_select)
dataset_df_multi_select

Dataset({
    features: ['question', 'text', 'labels'],
    num_rows: 745
})

In [None]:
all_labels = set(label for sublist in df_multi_select['labels'] for label in sublist)
all_labels = list(all_labels)
len(all_labels)

53

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

In [None]:
model_name="deepset/roberta-base-squad2"
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define weights
ASSIGNED_LABEL_WEIGHT = 1.0   # Weight for labels that are actually assigned
UNASSIGNED_LABEL_WEIGHT = 0.1 # Weight for valid labels that are not assigned
INVALID_LABEL_WEIGHT = -100   # Mask invalid labels so they don't contribute to loss


# Tokenization and encoding function with batch processing fixed
def tokenize_and_encode_data(examples):
    batch_size = len(examples['question'])  # Number of examples in the batch

    # Combine question and text
    input_text = [q + " [SEP] " + a for q, a in zip(examples["question"], examples["text"])]
    encoding = tokenizer(input_text, padding="max_length", truncation=True, max_length=128)

    # Initialize labels matrix for the batch
    labels_matrix = np.full((batch_size, len(all_labels)), INVALID_LABEL_WEIGHT, dtype=np.float32)  # Default: mask invalid labels

    # Process each example in the batch
    for idx in range(batch_size):
        question = examples['question'][idx]
        valid_labels = question_label_mapping.get(question, [])  # Get valid labels for the question
        assigned_labels = examples['labels'][idx]  # Get actual labels assigned

        # Assign weights
        for label in valid_labels:
            label_idx = all_labels.index(label)
            if label in assigned_labels:
                labels_matrix[idx, label_idx] = ASSIGNED_LABEL_WEIGHT  # Assigned labels get full weight
            else:
                labels_matrix[idx, label_idx] = UNASSIGNED_LABEL_WEIGHT  # Unassigned valid labels get small weight

    # Add labels to encoding
    encoding["labels"] = labels_matrix.tolist()

    return encoding

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
tokenized_dataset_df_multi_select = dataset_df_multi_select.map(tokenize_and_encode_data, batched=True, remove_columns=['question', 'text'])

Map:   0%|          | 0/745 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset_df_multi_select[0]

{'labels': [-100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  1.0,
  0.10000000149011612,
  -100.0,
  0.10000000149011612,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  0.10000000149011612,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  1.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0,
  -100.0],
 'input_ids': [0,
  2264,
  32,
  110,
  1152,
  3168,
  116,
  646,
  3388,
  510,
  742,
  38,
  437,
  2509,
  11,
  3927,
  414,
  1318,
  8,
  10846,
  265,
  3591,
  4,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
 

In [None]:
tokenized_dataset_df_multi_select = tokenized_dataset_df_multi_select.train_test_split(test_size=0.2)
tokenized_dataset_df_multi_select

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 596
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 149
    })
})

In [None]:
# Create a dictionary that maps labels to integers
label_to_int = {label: idx for idx, label in enumerate(all_labels)}

# Create a dictionary that maps integers back to labels
int_to_label = {idx: label for idx, label in enumerate(all_labels)}

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(all_labels), problem_type="multi_label_classification", id2label=int_to_label, label2id=label_to_int)

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # Mehr Epochen für bessere Generalisierung
    per_device_train_batch_size=32,  # Falls GPU es erlaubt, größere Batch Size
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,  # Statt fester Warmup-Steps eine Ratio nutzen (~10% des Trainings)
    weight_decay=0.01,  # Höheres Weight Decay für bessere Generalisierung
    logging_dir='./logs',
    logging_steps=50,  # Häufigeres Logging zur besseren Überwachung
    evaluation_strategy='epoch',  # Evaluierung am Ende jeder Epoche
    save_strategy="epoch",  # Speichern nach jeder Epoche statt nach Schritten
    learning_rate=6e-5,  # Niedrigere Lernrate für stabilere Konvergenz
    lr_scheduler_type="linear",  # Statt Cosine einen linearen Scheduler nutzen
    report_to="none",
    load_best_model_at_end=True,  # Bestes Modell am Ende laden
    metric_for_best_model="accuracy",  # Falls du F1 bevorzugst, ändere das
    greater_is_better=True
)




In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# !pip install evaluate
import evaluate

In [None]:
#!pip install ipdb
import ipdb

In [None]:
import numpy as np
import torch
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

def multi_label_metrics(predictions, labels, threshold=0.5):
    """
    Computes multi-label classification metrics.

    Args:
        predictions: Raw model predictions (logits).
        labels: True labels (0 or 1), with -100 ignored.
        threshold: Threshold for converting probabilities to binary labels.

    Returns:
        A dictionary containing F1-score, ROC-AUC, and accuracy.
    """
    sigmoid = torch.nn.Sigmoid()

    # Convert logits to probabilities
    probs = sigmoid(torch.tensor(predictions))

    # Convert probabilities to binary predictions (0 or 1)
    y_pred = (probs >= threshold).numpy().astype(int)
    print(str(y_pred))

    # Convert labels to a NumPy array
    labels = np.array(labels)

    # Mask out labels that are -100 (ignored in loss)
    valid_mask = labels != -100  # Boolean mask
    y_true = np.where(valid_mask, labels, 0)  # Replace -100 with 0 (ignored in metric calculation)

    # Ensure both y_true and y_pred are in the same shape (batch_size, num_labels)
    if y_true.shape != y_pred.shape:
        raise ValueError(f"Shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}")

    # Compute metrics only if there are valid labels
    if np.any(y_true):
        f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
        roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    else:
        f1_micro = 0.0
        roc_auc = 0.0  # Avoid errors when all labels are -100

    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro, 'roc_auc': roc_auc, 'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """
    Computes metrics using Hugging Face's Trainer.

    Args:
        p: EvalPrediction object from Trainer.

    Returns:
        Dictionary of computed metrics.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    # Ensure labels are properly formatted
    labels = np.array(p.label_ids, dtype=int)

    return multi_label_metrics(predictions=preds, labels=labels)


In [None]:
# Step 5: Prepare Trainer with the dataset and training arguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_df_multi_select["train"],
    eval_dataset=tokenized_dataset_df_multi_select["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # If you want to compute metrics like accuracy, you can define a custom function here
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,-616.385254,0.0,0.5,0.0


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


KeyboardInterrupt: 

Ab hier alt aktuell

In [None]:
# Strip extra spaces and quotation marks from labels
df_multi_select['labels'] = df_multi_select['labels'].apply(lambda x: [label.strip().strip('\'') for label in x])

# List of all unique labels
all_labels = set(label for sublist in df_multi_select['labels'] for label in sublist)

# Initialize new columns for each label
for label in all_labels:
    df_multi_select[label] = df_multi_select['labels'].apply(lambda x: 1 if label in x else 0)

# Drop the 'labels' column as it is no longer needed
df_multi_select = df_multi_select.drop(columns=['labels'])


# Create a dictionary that maps labels to integers
label_to_int = {label: idx for idx, label in enumerate(all_labels)}

# Create a dictionary that maps integers back to labels
int_to_label = {idx: label for idx, label in enumerate(all_labels)}

In [None]:
df_multi_select

Unnamed: 0,question,text,100,Marisa Peng,JTS,Competitor,Phone,AKW100,200,Improve CRM data quality,...,1 week,Automotive radar target simulation,Jens Roschmann,Schedule a Visit,Extract data from emails,Oliver Eibel,Double-Pulse Testing,3 weeks,VisitReport,Sandro Kalter
0,Productinterests,I need high-quality business cards to make a s...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Productinterests,My primary interest is in improving data quali...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Productinterests,I'm looking for a solution to enhance my visit...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,Productinterests,I'm interested in business cards and data clea...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Productinterests,I need help improving my data quality through ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150,Who to copy in follow up,"Copy Marisa Peng, Jessica Hanke and Sean Kennin.",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1151,Who to copy in follow up,"Copy Marisa Peng, Jessica Hanke and Tim Persson.",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1152,Who to copy in follow up,"Copy Marisa Peng, Sandro Kalter and Jens Rosch...",0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1153,Who to copy in follow up,"Copy Marisa Peng, Sandro Kalter and Domiki Stein.",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
#int_to_label

In [None]:
# !pip install datasets
from datasets import Dataset

In [None]:
dataset_df_multi_select = Dataset.from_pandas(df_multi_select)
dataset_df_multi_select

Dataset({
    features: ['question', 'text', '100', 'Marisa Peng', 'JTS', 'Competitor', 'Phone', 'AKW100', '200', 'Improve CRM data quality', 'Johannes Wagner', 'Prospect', 'Scan business cards', 'DataEnrichment', 'Notion', 'Existing customer', 'BusinessCards', '2 weeks', 'No action', 'Data Cleansing', 'Jessica Hanke', 'High-speed interconnect testing', 'Domiki Stein', 'Joachim Wagner', 'Clean up CRM', 'AX100', 'Tim Persson', 'New customer', '300', 'JS EcoLine', '256', 'Display port debugging and compliance', 'MY-SYSTEM', 'Press', '100,200', '234', 'media', 'Stephan Maier', 'Erik Schneider', 'Sean Kennin', 'Angelina Haug', 'Supplier', 'DataQuality', 'Noise figure measurements', 'Email', 'Capture trade fair contacts', '1 week', 'Automotive radar target simulation', 'Jens Roschmann', 'Schedule a Visit', 'Extract data from emails', 'Oliver Eibel', 'Double-Pulse Testing', '3 weeks', 'VisitReport', 'Sandro Kalter'],
    num_rows: 1155
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

In [None]:
question_label_mapping.get("Productinterests", [])

['DataEnrichment',
 'BusinessCards',
 'Data Cleansing',
 'VisitReport',
 'DataQuality']

In [None]:
#question_label_mapping

In [None]:
# Load tokenizer
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# neu mit Berücksichtigung von validen labels

def tokenize_function_multi(examples):
    # Take a batch of texts and concatenate question and answer
    text = [q + " [SEP] " + a for q, a in zip(examples["question"], examples["text"])]

    # Encode them with padding and truncation
    encoding = tokenizer(text, padding=True, truncation=True, max_length=384)

    # Initialize a labels matrix with zeros (shape: batch_size x num_labels)
    labels_matrix = np.zeros((len(text), len(all_labels)))

    # Iterate over the batch of examples
    for idx, row in enumerate(examples['question']):
        # Get the valid labels for the current question
        valid_labels = question_label_mapping.get(row, [])

        # Get the labels that are associated with the current example
        labels_batch = examples["labels"]#[idx]
        #print(labels_batch)

        # For each valid label, check if it is in the labels of the current example
        for label in labels_batch:
            if label in valid_labels:
                label_id = all_labels.index(label)  # Find the index of the valid label
                labels_matrix[idx, label_id] = 1  # Set to 1 if the label is valid and present

    # Add the labels matrix to the encoding
    encoding["labels"] = labels_matrix.tolist()

    return encoding


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function_multi(examples):
  # take a batch of texts
  text = [q + " [SEP] " + a for q, a in zip(examples["question"], examples["text"])]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  #print(text)
  #print(1)

  # Erstelle eine One-Hot-Encoding-Matrix mit der gleichen Größe wie die Anzahl der Labels
  #labels_matrix = np.zeros((len(text), len(all_labels)))  # Hier `all_labels` ist die gesamte Liste aller möglichen Labels
    # Filtere die Labels basierend auf der Frage
  #for idx, row in examples.iterrows():
  #  question = row["question"]
  #  valid_labels = question_label_mapping.get(question, [])
  #  for label in row["labels"]:
  #    if label in valid_labels:
  #        label_id = label2id.get(label)
  #        if label_id is not None:
  #            labels_matrix[idx, label_id] = 1  # Setze 1 für das gültige Label

  # Füge das One-Hot-Label-Matrix-Array zur Tokenizer-Ausgabe hinzu
  #encoding["labels"] = labels_matrix.tolist()




  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in all_labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(all_labels)))
  # fill numpy array
  for idx, label in enumerate(all_labels):
    labels_matrix[:, idx] = labels_batch[label]

  #print(labels_matrix[0])
  #print()

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [None]:
tokenized_dataset_multi_select = dataset_df_multi_select.map(tokenize_function_multi, batched=True, remove_columns=dataset_df_multi_select.column_names)


Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

In [None]:
example = tokenized_dataset_multi_select[50]
print(example.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
#tokenizer.decode(example['labels'])

In [None]:
tokenized_dataset_multi_select = tokenized_dataset_multi_select.train_test_split(test_size=0.2)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(all_labels), problem_type="multi_label_classification", id2label=int_to_label, label2id=label_to_int)

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

In [None]:


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # Mehr Epochen für bessere Generalisierung
    per_device_train_batch_size=32,  # Falls GPU es erlaubt, größere Batch Size
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,  # Statt fester Warmup-Steps eine Ratio nutzen (~10% des Trainings)
    weight_decay=0.01,  # Höheres Weight Decay für bessere Generalisierung
    logging_dir='./logs',
    logging_steps=50,  # Häufigeres Logging zur besseren Überwachung
    evaluation_strategy='epoch',  # Evaluierung am Ende jeder Epoche
    save_strategy="epoch",  # Speichern nach jeder Epoche statt nach Schritten
    learning_rate=3e-5,  # Niedrigere Lernrate für stabilere Konvergenz
    lr_scheduler_type="linear",  # Statt Cosine einen linearen Scheduler nutzen
    report_to="none",
    load_best_model_at_end=True,  # Bestes Modell am Ende laden
    metric_for_best_model="accuracy",  # Falls du F1 bevorzugst, ändere das
    greater_is_better=True
)




In [None]:
#!pip install evaluate
import evaluate

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

#!pip install evaluate
import evaluate

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [None]:
# Step 5: Prepare Trainer with the dataset and training arguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_multi_select["train"],
    eval_dataset=tokenized_dataset_multi_select["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # If you want to compute metrics like accuracy, you can define a custom function here
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.452763,0.0,0.5,0.0
2,No log,0.309083,0.0,0.5,0.0
3,0.448900,0.247199,0.0,0.5,0.0
4,0.448900,0.214315,0.0,0.5,0.0
5,0.448900,0.196051,0.0,0.5,0.0
6,0.229500,0.18554,0.0,0.5,0.0
7,0.229500,0.17897,0.0,0.5,0.0
8,0.187400,0.174773,0.0,0.5,0.0
9,0.187400,0.172507,0.0,0.5,0.0
10,0.187400,0.171651,0.0,0.5,0.0


TrainOutput(global_step=190, training_loss=0.265409339101691, metrics={'train_runtime': 291.5318, 'train_samples_per_second': 20.444, 'train_steps_per_second': 0.652, 'total_flos': 392214988830720.0, 'train_loss': 0.265409339101691, 'epoch': 10.0})

In [None]:
type(tokenized_dataset_multi_select["train"]["labels"])

list

In [None]:
model.save_pretrained("./roberta_finetuned_multiselect")
tokenizer.save_pretrained("./roberta_finetuned_multiselect")

('./roberta_finetuned_multiselect/tokenizer_config.json',
 './roberta_finetuned_multiselect/special_tokens_map.json',
 './roberta_finetuned_multiselect/vocab.json',
 './roberta_finetuned_multiselect/merges.txt',
 './roberta_finetuned_multiselect/added_tokens.json',
 './roberta_finetuned_multiselect/tokenizer.json')

In [None]:
from transformers import pipeline

# Lade die Pipeline für Multi-Label-Klassifikation
classifier = pipeline("text-classification", model="./roberta_finetuned_multiselect", tokenizer="./roberta_finetuned_multiselect", framework="tf")

# Frage stellen
question = "What are your product interests?"

# Benutzerantwort eingeben
answer = input("Bitte gib deine Antwort auf die Frage ein: ")

# Kombiniere Frage und Antwort
input_text = question + " " + answer

# Vorhersage der Labels mit der Pipeline
predictions = classifier(input_text)

# Ausgabe der Vorhersage
print("Vorhersage der Labels:", predictions)


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Device set to use 0


Bitte gib deine Antwort auf die Frage ein: data quality
Vorhersage der Labels: [{'label': 'High-speed interconnect testing', 'score': 0.44536134600639343}]


AttributeError: 'list' object has no attribute 'logits'

Ab hier alt

In [None]:
# 🔹 Step 3: Clean Labels
mlb = MultiLabelBinarizer()
df_multi_select['labels'] = mlb.fit_transform(df_multi_select['labels']).tolist()  # Convert to binary format

In [None]:
df_multi_select

Unnamed: 0,question,text,labels
0,What are your product interests?,I'm interested in improving data quality and g...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,What are your product interests?,My priorities are data enrichment and visit re...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,What are your product interests?,I need data cleansing and improved data quality.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,What are your product interests?,I'm focused on business cards and data cleansing.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,What are your product interests?,My interest lies in visit reports and data enr...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
740,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Jens Roschma...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
741,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Domiki Stein...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
742,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Sean Kennin ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
743,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Tim Persson ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, ..."


In [None]:
import numpy as np

In [None]:
for index, row in df_multi_select.iterrows():
    labels = row["labels"]
    labels = np.array(labels).astype(np.float32).tolist()
    df_multi_select.at[index, "labels"] = labels

df_multi_select

Unnamed: 0,question,text,labels
0,What are your product interests?,I'm interested in improving data quality and g...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,What are your product interests?,My priorities are data enrichment and visit re...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,What are your product interests?,I need data cleansing and improved data quality.,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,What are your product interests?,I'm focused on business cards and data cleansing.,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,What are your product interests?,My interest lies in visit reports and data enr...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
740,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Jens Roschma...","[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
741,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Domiki Stein...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
742,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Sean Kennin ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
743,Who should I CC on the follow-up?,"CC Stephan Maier, Joachim Wagner, Tim Persson ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [None]:
dataset_multiselect = Dataset.from_pandas(df_multi_select)

In [None]:
dataset_multiselect

Dataset({
    features: ['question', 'text', 'labels'],
    num_rows: 745
})

In [None]:
#!pip install -Uqq ipdb
import ipdb

In [None]:
model_name = 'deepset/roberta-base-squad2'
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# prompt: Step 4: Tokenization and other transformations roberta huggingface considering question and answer as input and label as output

def tokenize_function_multi(examples):
    # Use separator token to distinguish question & text
    text_inputs = [q + " [SEP] " + a for q, a in zip(examples["question"], examples["text"])]
    #ipdb.set_trace()
    tokenized_inputs = tokenizer(text_inputs, truncation=True, padding="max_length", max_length=128)
    tokenized_inputs["labels"] = [torch.tensor(label, dtype=torch.float) for label in examples["labels"]]


    return tokenized_inputs

In [None]:
tokenized_dataset_multiselect = dataset_multiselect.map(tokenize_function_multi, batched=True)

Map:   0%|          | 0/745 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset_multiselect = tokenized_dataset_multiselect.train_test_split(test_size=0.2)

In [None]:
num_labels = len(mlb.classes_)  # Number of unique labels
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, problem_type="multi_label_classification")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 4: Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',  # Evaluate at the end of each epoch
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
#!pip install evaluate

In [None]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Stelle sicher, dass logits ein Tensor sind (falls sie als ndarray vorliegen)
    logits = torch.tensor(logits) if isinstance(logits, np.ndarray) else logits

    # Wende Sigmoid auf die Logits an, um Wahrscheinlichkeiten zu erhalten
    predictions = torch.sigmoid(logits).cpu().numpy()

    # Verwende einen Schwellenwert von 0.5, um Vorhersagen im multi-hot Format zu erhalten
    predictions = (predictions > 0.5).astype(int)

    # Konvertiere Labels zu NumPy-Array und stelle sicher, dass sie im richtigen Format sind
    labels = labels.cpu().numpy()

    # Berechne Accuracy und F1-Score
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='macro')  # macro: F1 Score über alle Klassen hinweg

    return {'accuracy': accuracy, 'f1': f1}



In [None]:

# Step 5: Prepare Trainer with the dataset and training arguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_multiselect["train"],
    eval_dataset=tokenized_dataset_multiselect["test"],
    tokenizer=tokenizer,
    compute_metrics=None,  # If you want to compute metrics like accuracy, you can define a custom function here
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4454,0.413896
2,0.3915,0.355573
3,0.3195,0.290695


TrainOutput(global_step=114, training_loss=0.3958316309410229, metrics={'train_runtime': 51.7145, 'train_samples_per_second': 34.574, 'train_steps_per_second': 2.204, 'total_flos': 117664496649216.0, 'train_loss': 0.3958316309410229, 'epoch': 3.0})

In [None]:
model.save_pretrained("./roberta_finetuned_multiselect")
tokenizer.save_pretrained("./roberta_finetuned_multiselect")

('./roberta_finetuned_multiselect/tokenizer_config.json',
 './roberta_finetuned_multiselect/special_tokens_map.json',
 './roberta_finetuned_multiselect/vocab.json',
 './roberta_finetuned_multiselect/merges.txt',
 './roberta_finetuned_multiselect/added_tokens.json')

In [None]:
from transformers import pipeline

# Lade die Pipeline für Multi-Label-Klassifikation
classifier = pipeline("text-classification", model="./roberta_finetuned_multiselect", tokenizer="./roberta_finetuned_multiselect", framework="tf")

# Frage stellen
question = "What are your product interests?"

# Benutzerantwort eingeben
answer = input("Bitte gib deine Antwort auf die Frage ein: ")

# Kombiniere Frage und Antwort
input_text = question + " " + answer

# Vorhersage der Labels mit der Pipeline
predictions = classifier(input_text)

# Ausgabe der Vorhersage
print("Vorhersage der Labels:", predictions)


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Device set to use 0


Bitte gib deine Antwort auf die Frage ein: data quality
Vorhersage der Labels: [{'label': 'LABEL_40', 'score': 0.29404422640800476}]
