In [33]:
import pandas as pd
import json

In [34]:
# JSON-Datei laden
with open('final_single_question_data.json', 'r') as f:
    data = json.load(f)

# Daten in ein DataFrame umwandeln
rows = []
for entry in data:
    #question_id = entry["question_id"]
    question = entry["question"]
    question_type = entry["type"]
    for answer in entry["answers"]:
        rows.append({
            #"question_id": question_id,
            "question": question,
            "type": question_type,
            "answer_text": answer["answer_text"],
            "answer_label": answer["answer_label"],
            "timestamp": answer["timestamp"]
        })

# DataFrame erstellen
df_single_select = pd.DataFrame(rows)

# DataFrame anzeigen
#print(df.head())

In [35]:
df_single_select

Unnamed: 0,question,type,answer_text,answer_label,timestamp
0,What type of customer are you?,SINGLE_SELECT,I'm a first-time buyer exploring your offerings.,New customer,2025-01-24T00:52:21.492617
1,What type of customer are you?,SINGLE_SELECT,This is my initial purchase from your company.,New customer,2025-01-24T00:52:21.492675
2,What type of customer are you?,SINGLE_SELECT,I'm a new customer looking for information.,New customer,2025-01-24T00:52:21.492687
3,What type of customer are you?,SINGLE_SELECT,I've never used your services before.,New customer,2025-01-24T00:52:21.492696
4,What type of customer are you?,SINGLE_SELECT,"I'm a brand-new customer, excited to try your ...",New customer,2025-01-24T00:52:21.492704
...,...,...,...,...,...
6925,What is the specific customer group you're tar...,SINGLE_SELECT,Our specific customer base is R&D personnel in...,R&D,2025-01-24T01:06:02.563679
6926,What is the specific customer group you're tar...,SINGLE_SELECT,We're aiming to reach R&D professionals workin...,R&D,2025-01-24T01:06:02.563685
6927,What is the specific customer group you're tar...,SINGLE_SELECT,The specific customer group is R&D scientists ...,R&D,2025-01-24T01:06:02.563691
6928,What is the specific customer group you're tar...,SINGLE_SELECT,Our focus is on R&D teams working on national ...,R&D,2025-01-24T01:06:02.563697


In [36]:
df_single_select = df_single_select[['question', 'answer_text', 'answer_label']]

In [37]:
df_single_select

Unnamed: 0,question,answer_text,answer_label
0,What type of customer are you?,I'm a first-time buyer exploring your offerings.,New customer
1,What type of customer are you?,This is my initial purchase from your company.,New customer
2,What type of customer are you?,I'm a new customer looking for information.,New customer
3,What type of customer are you?,I've never used your services before.,New customer
4,What type of customer are you?,"I'm a brand-new customer, excited to try your ...",New customer
...,...,...,...
6925,What is the specific customer group you're tar...,Our specific customer base is R&D personnel in...,R&D
6926,What is the specific customer group you're tar...,We're aiming to reach R&D professionals workin...,R&D
6927,What is the specific customer group you're tar...,The specific customer group is R&D scientists ...,R&D
6928,What is the specific customer group you're tar...,Our focus is on R&D teams working on national ...,R&D


In [38]:
from sklearn.preprocessing import LabelEncoder

In [39]:
label_encoder = LabelEncoder()
df_single_select["label"] = label_encoder.fit_transform(df_single_select["answer_label"])  # z. B. "Yes" → 0, "No" → 1, "No Preference" → 2

print(label_encoder.classes_)  # Zeigt, welche Zahl welchem Label entspricht

['1-10' '1-5' '11-15' '11-50' '16-20' '201-2000' '21-30' '31-40' '51-200'
 '6-10' 'Adito' 'Aerospace' 'Applicant' 'Automotive' 'CAS' 'Call'
 'Close.io' 'Computers & Networks' 'Construction company'
 'Consultant, Planner, Architect' 'Craft enterprises' 'Defense'
 'Education sector' 'End User' 'English' 'Existing customer' 'German'
 'Government' 'HubSpot' 'Industrial' 'Italian' 'Japanese ' 'Medical'
 'Meeting' 'Microsoft Dynamics' 'Network Operators & Infrastructure'
 'New customer' 'No' 'Offer' 'Partner' 'Physical Security' 'Pipedrive'
 'Production company' 'Public Safety / Law Enforcement' 'R&D'
 'SAP Sales Cloud' 'Salesforce' 'Satisfied' 'Scaffolding company'
 'Spanish' 'Trading company' 'Unsatisfied' 'Very satisfied'
 'Very unsatisfied' 'Wholesaler, Distributor' 'Yes' 'larger than 2000'
 'more than 40']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single_select["label"] = label_encoder.fit_transform(df_single_select["answer_label"])  # z. B. "Yes" → 0, "No" → 1, "No Preference" → 2


In [40]:
from sklearn.model_selection import train_test_split

In [41]:
train_df, test_df = train_test_split(df_single_select, test_size=0.2, random_state=42, stratify=df_single_select["label"])

In [42]:
# Speichere die Datasets als JSON
train_df[["question", "answer_text", "label"]].to_json("train.json", orient="records", lines=True)
test_df[["question", "answer_text", "label"]].to_json("test.json", orient="records", lines=True)

In [43]:
print("Train size:", len(train_df), "Test size:", len(test_df))

Train size: 5544 Test size: 1386


In [44]:
# pip install datasets

In [45]:
from datasets import load_dataset

In [46]:
dataset = load_dataset("json", data_files={"train": "train.json", "test": "test.json"})
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer_text', 'label'],
        num_rows: 5544
    })
    test: Dataset({
        features: ['question', 'answer_text', 'label'],
        num_rows: 1386
    })
})


In [47]:
from transformers import AutoTokenizer

In [48]:
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    text_inputs = [q + " " + a for q, a in zip(examples["question"], examples["answer_text"])]
    return tokenizer(text_inputs, truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/5544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1386 [00:00<?, ? examples/s]

In [49]:
from transformers import AutoModelForSequenceClassification

In [50]:
num_labels = len(label_encoder.classes_)  # Anzahl der Klassen (3)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
from transformers import TrainingArguments

In [52]:
training_args = TrainingArguments(
    output_dir="./roberta_classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)



In [53]:
# Use the One Cycle Policy
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    lr_scheduler_type="linear",  # 'linear' or 'cosine'
    warmup_steps=500,  # Steps to gradually increase the learning rate
    max_grad_norm=1.0, # Gradient clipping
    report_to="none"
)



In [54]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Funktion zur Berechnung der Metriken
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Nimmt die Klasse mit der höchsten Wahrscheinlichkeit
    acc = accuracy_score(labels, predictions)  # Accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")  # Andere Scores
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [55]:
from transformers import Trainer

In [56]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Hier fügen wir die Metrik-Funktion hinzu!
)


trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1592,0.146767,0.959596,0.951587,0.959596,0.953841
2,0.0878,0.11575,0.975469,0.976074,0.975469,0.975446
3,0.0728,0.107475,0.976912,0.978866,0.976912,0.976602


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=2079, training_loss=0.6596670270005547, metrics={'train_runtime': 495.507, 'train_samples_per_second': 33.566, 'train_steps_per_second': 4.196, 'total_flos': 1094565840850944.0, 'train_loss': 0.6596670270005547, 'epoch': 3.0})

In [57]:
model.save_pretrained("./roberta_finetuned")
tokenizer.save_pretrained("./roberta_finetuned")

('./roberta_finetuned/tokenizer_config.json',
 './roberta_finetuned/special_tokens_map.json',
 './roberta_finetuned/vocab.json',
 './roberta_finetuned/merges.txt',
 './roberta_finetuned/added_tokens.json',
 './roberta_finetuned/tokenizer.json')

In [84]:
label_mapping = {f"LABEL_{i}": label for i, label in enumerate(label_encoder.classes_)}
print(label_mapping)

{'LABEL_0': '1-10', 'LABEL_1': '1-5', 'LABEL_2': '11-15', 'LABEL_3': '11-50', 'LABEL_4': '16-20', 'LABEL_5': '201-2000', 'LABEL_6': '21-30', 'LABEL_7': '31-40', 'LABEL_8': '51-200', 'LABEL_9': '6-10', 'LABEL_10': 'Adito', 'LABEL_11': 'Aerospace', 'LABEL_12': 'Applicant', 'LABEL_13': 'Automotive', 'LABEL_14': 'CAS', 'LABEL_15': 'Call', 'LABEL_16': 'Close.io', 'LABEL_17': 'Computers & Networks', 'LABEL_18': 'Construction company', 'LABEL_19': 'Consultant, Planner, Architect', 'LABEL_20': 'Craft enterprises', 'LABEL_21': 'Defense', 'LABEL_22': 'Education sector', 'LABEL_23': 'End User', 'LABEL_24': 'English', 'LABEL_25': 'Existing customer', 'LABEL_26': 'German', 'LABEL_27': 'Government', 'LABEL_28': 'HubSpot', 'LABEL_29': 'Industrial', 'LABEL_30': 'Italian', 'LABEL_31': 'Japanese ', 'LABEL_32': 'Medical', 'LABEL_33': 'Meeting', 'LABEL_34': 'Microsoft Dynamics', 'LABEL_35': 'Network Operators & Infrastructure', 'LABEL_36': 'New customer', 'LABEL_37': 'No', 'LABEL_38': 'Offer', 'LABEL_39':

In [63]:
test_df

Unnamed: 0,question,answer_text,answer_label,label
2474,What CRM system are you currently using?,We've recently transitioned to Adito for our CRM.,Adito,10
3120,What language should we use to communicate?,Using Italian gives a sense of connection to I...,Italian,30
4402,How many people work at your company?,Our workforce currently numbers between 51 and...,51-200,8
1824,What CRM system are you currently using?,Our team utilizes Pipedrive for CRM functional...,Pipedrive,41
5234,What industry are you in?,My government work centers on immigration and ...,Government,27
...,...,...,...,...
4186,How many people work at your company?,We're a tight-knit group of one to ten employees.,1-10,0
111,What type of customer are you?,I'm a new customer who is delighted with the o...,New customer,36
3009,What language should we use to communicate?,Our team’s expertise in German makes it the mo...,German,26
6590,What is the specific customer group you're tar...,Investors investing in businesses.,End User,23


In [102]:
from transformers import pipeline

classifier = pipeline("text-classification", model="./roberta_finetuned", tokenizer="./roberta_finetuned")

test_input = {
    "question": "What type of customer are you?",
    "answer_text": "first time here"
}

# Kombinieren wie im Training (Frage + Antwort)
test_text = test_input["question"] + " " + test_input["answer_text"]
result = classifier(test_text)
#print(result)

# Die vorhergesagte Klasse zurück in ein Label umwandeln
predicted_label = label_mapping[result[0]["label"]]
accuracy = result[0]["score"]

print(f"Vorhergesagtes Label: {predicted_label} (Accuracy: {accuracy})")


Device set to use cuda:0


Vorhergesagtes Label: New customer (Accuracy: 0.9964930415153503)


In [96]:
import random

In [105]:
from transformers import pipeline

# Initialize the classifier with the fine-tuned RoBERTa model and tokenizer
classifier = pipeline("text-classification", model="./roberta_finetuned", tokenizer="./roberta_finetuned")

question_list = df_single_select["question"].tolist()

# Interactive loop
while True:
    # Get user input for question and answer
    random_index = random.randint(0, len(question_list) - 1)
    question = question_list[random_index]
    print(f"Question: {question}")
    answer_text = input("Enter the answer text: ")

    # Combine question and answer as done in training
    test_text = question + " " + answer_text

    # Run the classifier on the combined text
    result = classifier(test_text)

    # Output the result
    print(f"Model result: {result}")

    # Assuming the result contains a class label (e.g., 'LABEL_0', 'LABEL_1', etc.), map it back to a label
    predicted_label = label_mapping.get(result[0]["label"], "Unknown label")

    # Print the predicted label
    print("Predicted label:", predicted_label)

    # Ask if the user wants to continue
    continue_input = input("Do you wan#t to test another input? (y/n): ")
    if continue_input.lower() != 'y':
        print("Exiting the interactive loop.")
        break


Device set to use cuda:0


Question: What kind of company is this?
Enter the answer text: tech
Model result: [{'label': 'LABEL_18', 'score': 0.7822200059890747}]
Predicted label: Construction company
Do you wan#t to test another input? (y/n): n
Exiting the interactive loop.


In [107]:
import shutil

# Create a zip file for easier download
shutil.make_archive('/content/roberta_finetuned_model', 'zip', '/content', 'roberta_finetuned')

# The zip file is now ready to be downloaded
from google.colab import files
files.download("roberta_finetuned_model.zip")

'/content/roberta_finetuned_model.zip'

In [111]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Define the folder where the model was saved (this is the directory containing your model and tokenizer)
folder_to_zip = '/content/roberta_finetuned'  # The folder containing the fine-tuned model

# Specify the folder in Google Drive to save the model
model_save_path = 'model_path/roberta_finetuned'



# Copy model folder directly to Google Drive
shutil.copytree(folder_to_zip, model_save_path)
print(f"Model saved to Google Drive at {model_save_path}")


Mounted at /content/drive
Model saved to Google Drive at /content/drive/MyDrive/CapStone_models/roberta_finetuned
