# Train with Transformer

In [None]:
# !pip install transformers



In [None]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Data Preparation
We load the data from Google Drive and prepare it for training.


In [None]:
# Mount Google Drive when running on Colab.
# Outside Colab the `google.colab` package is not installed; the import is
# guarded so this cell no longer has to be commented out by hand.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available; skipping the Google Drive mount.')
else:
    drive.mount('/content/drive/')

In [None]:
# Code Cell: Data Preparation
# Project folder on the mounted Google Drive. It has to be a quoted string:
# the bare path is a syntax error.
path = '/content/drive/My Drive/Colab Notebooks/Chatbot'

# Training CSV (read in a later cell) and the separate test CSV (read here).
data_path = f'{path}/train-without-Prep.csv'
test_data = pd.read_csv(f'{path}/test-without-Prep.csv')



In [None]:
# Load the training CSV and shuffle its rows with a fixed seed.
data = pd.read_csv(data_path).sample(frac=1, random_state=42)

# 80/20 split into a training part and a held-out part (the latter is used as
# validation data during training). The split is seeded as well; otherwise
# every run of this cell yields a different partition and the seeded shuffle
# above gives no reproducibility.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['utterances'],
    data['intent'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Utterances of the separate test file, evaluated by the custom callback.
test_texts_callback = test_data['utterances']

# Encode the test intents against the categories of the training data so that
# a given intent gets the same integer code in both files. Encoding the test
# file on its own would number its intents independently, which disagrees with
# the training codes whenever the two files do not contain the same intents.
# Intents that never occur in the training data are encoded as -1.
intent_categories = pd.Categorical(data['intent']).categories
test_labels_callback = pd.Categorical(
    test_data['intent'], categories=intent_categories
).codes

In [None]:
# Convert the intent categories into integer codes.
# Both splits are encoded against the categories of the full training frame.
# Encoding each split on its own would shift the codes whenever an intent is
# missing from one split (easily the case for rare intents in the 20% part).
# NOTE: run this once per split; feeding already-encoded labels through this
# cell again would map every value to -1, because integer codes are not among
# the intent categories.
intent_categories = pd.Categorical(data['intent']).categories
train_labels = pd.Categorical(train_labels, categories=intent_categories).codes
test_labels = pd.Categorical(test_labels, categories=intent_categories).codes

# Tokenization and Model Initialization
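We initialize the BERT tokenizer and tokenize the training and held-out (validation) splits. The model itself is created later, inside `train_eval_model`, once per hyperparameter combination.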

In [None]:
# Code Cell: Tokenization
# Tokenizer for the same 'bert-base-german-cased' checkpoint that the training
# function loads as the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

# train_labels / test_labels are already integer codes at this point, so they
# are deliberately not passed through pd.Categorical(...).codes again: a second
# pass re-numbers the codes densely and silently changes the label meaning of
# any split that happens to lack one of the classes.

# Tokenize both splits into TensorFlow tensors. padding=True pads to the
# longest sequence of the respective split; truncation=True cuts sequences
# that exceed the model's maximum input length.
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, return_tensors="tf")
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, return_tensors="tf")


# Custom Callback and Training Function
We define a custom callback to log test metrics and a function to train the model with different hyperparameters.

In [None]:
class TestCallback(tf.keras.callbacks.Callback):
    """Keras callback that evaluates the model on a test set every 5th epoch.

    Each evaluation is appended to ``test_history`` as a dict with the keys
    ``"epoch"`` (1-based), ``"test_loss"`` and ``"test_accuracy"``.
    """

    def __init__(self, test_data):
        """Store the test set.

        Args:
            test_data: An ``(x, y)`` pair; it is unpacked and handed to
                ``model.evaluate`` unchanged.
        """
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        self.test_data = test_data
        self.test_history = []

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the test set after every 5th epoch and record the result.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metric dict supplied by Keras; not used here. Defaults to
                ``None`` rather than a shared mutable ``{}``.
        """
        if (epoch + 1) % 5 == 0:
            x, y = self.test_data
            # Assumes the model was compiled with exactly one metric
            # (accuracy), so that evaluate() returns [loss, accuracy].
            loss, accuracy = self.model.evaluate(x, y, verbose=0)
            self.test_history.append({"epoch": epoch + 1, "test_loss": loss, "test_accuracy": accuracy})


In [None]:
# Function to Train and Evaluate Model
def train_eval_model(learning_rate, epochs, batch_size):
    """Fine-tune a fresh 'bert-base-german-cased' classifier and collect its metrics.

    Reads the notebook-level names ``data``, ``train_encodings``,
    ``test_encodings``, ``train_labels``, ``test_labels`` and ``test_callback``,
    which must exist before this function is called.

    Args:
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Batch size passed to ``model.fit``.

    Returns:
        A dict of lists with one entry per epoch: ``"Epochs"``,
        ``"Training Loss"``, ``"Validation Loss"``, ``"Training Accuracy"``,
        ``"Validation Accuracy"`` and the two ``"Test ... (every 5 epochs)"``
        lists, which hold ``None`` for epochs without a test evaluation.
    """
    # Clear the global Keras state left behind by earlier runs *before* the new
    # model is built. This function is called once per hyperparameter
    # combination, and that state otherwise keeps accumulating.
    tf.keras.backend.clear_session()

    # Initialize the model and optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # The loss is configured for raw logits (from_logits=True).
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model = TFBertForSequenceClassification.from_pretrained(
        'bert-base-german-cased',
        num_labels=len(data['intent'].unique()),
    )
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Train the model; the held-out split serves as validation data and the
    # shared callback evaluates the separate test set every 5th epoch.
    history = model.fit(
        [train_encodings.input_ids, train_encodings.attention_mask], np.array(train_labels),
        validation_data=([test_encodings.input_ids, test_encodings.attention_mask], np.array(test_labels)),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[test_callback]
    )

    # Collect and return training and validation metrics
    results = {
        "Epochs": list(range(1, epochs + 1)),
        "Training Loss": history.history["loss"],
        "Validation Loss": history.history["val_loss"],
        "Training Accuracy": history.history["accuracy"],
        "Validation Accuracy": history.history["val_accuracy"],
        "Test Loss (every 5 epochs)": [None] * epochs,
        "Test Accuracy (every 5 epochs)": [None] * epochs
    }

    # Add test metrics from the custom callback. The callback object is shared
    # between runs, so its history may still hold entries from earlier runs;
    # they are visited first and then overwritten by the current run's entries
    # for the same epoch.
    for test_result in test_callback.test_history:
        epoch = test_result["epoch"]
        results["Test Loss (every 5 epochs)"][epoch - 1] = test_result["test_loss"]
        results["Test Accuracy (every 5 epochs)"][epoch - 1] = test_result["test_accuracy"]

    return results

# Hyperparameter Tuning
In this section, we perform hyperparameter tuning by training the model with different combinations of learning rates, batch sizes, and epochs. The performance metrics for each combination are saved to a CSV file.

In [None]:
# Search grid for the tuning loop: every learning rate is combined with every
# batch size.
learning_rates = [
    1e-1,
    1e-2,
    1e-3,
    1e-4,
]
batch_sizes = [
    16,
    32,
    64,
]

# Number of epochs used for every run ("epoche" is handed to
# train_eval_model as its `epochs` argument).
epoche = 25

In [None]:
# Tokenize the separate test set. `test_encodings_callback` was used below
# without ever being defined, which raised a NameError; the settings mirror
# those used for the training and validation encodings.
test_encodings_callback = tokenizer(
    list(test_texts_callback), truncation=True, padding=True, return_tensors="tf"
)

# Initialize the custom callback with the separate test dataset
test_callback = TestCallback((
    [test_encodings_callback.input_ids, test_encodings_callback.attention_mask],
    np.array(test_labels_callback),
))

In [None]:

# Initialize list to collect training evaluations (one dict per run)
trainings_eval = []

# Loop through all combinations of hyperparameters
i = 0
for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        i += 1
        print(f'Run {i}: Learning Rate = {learning_rate}, Batch Size = {batch_size}')

        # Train and evaluate the model
        training_result = train_eval_model(learning_rate, epoche, batch_size)

        # Append the result, tagged with the hyperparameters that produced it;
        # without these two columns the rows of the combined CSV cannot be
        # told apart.
        trainings_eval.append({
            "Learning Rate": learning_rate,
            "Batch Size": batch_size,
            **training_result,
        })

        # Save intermediate results (all runs so far) to CSV, so finished runs
        # survive if a later run is interrupted.
        df = pd.DataFrame(trainings_eval)
        df.to_csv(f"{path}/BERT_Hyperparameter_Results_LR_{learning_rate}_BS_{batch_size}.csv", index=False)

# Save final results to CSV. The f-prefix is required: without it the file is
# written to a literal "{path}" directory instead of the project folder.
df = pd.DataFrame(trainings_eval)
df.to_csv(f"{path}/BERT_Hyperparameter_Results.csv", index=False)
