# Training BoW
This Jupyter Notebook is part of a bachelor thesis that investigates the capabilities of specialized chatbots. Specifically, we train a neural-network intent classifier on Bag-of-Words (BoW) features and evaluate its performance over a grid of hyperparameters.

# Import Libraries
We import all the necessary libraries for data manipulation, machine learning, and visualization.


In [None]:
# Import Libraries
import nltk
import numpy as np
import tensorflow as tf
import tflearn
import random
import os
import json
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import GermanStemmer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from tflearn.data_utils import to_categorical, pad_sequences
import datetime


# Data Preparation
We load the data from Google Drive and prepare it for training.

# Mount Google Drive (Only for Google Colab, comment this out if running locally)
from google.colab import drive
drive.mount('/content/drive/')

# Base directory that holds train.csv / test.csv and receives all output files.
# Change this path if you are running the notebook locally
path = '/content/drive/My Drive/Colab Notebooks/Chatbot'

# Load the training data.
# The file names must be f-strings: a plain '{path}/train.csv' literal is
# passed to pandas verbatim and never resolves to the directory above.
df = pd.read_csv(f'{path}/train.csv')

# Load the test data
df_test = pd.read_csv(f'{path}/test.csv')


# Text Preprocessing
We tokenize the utterances and remove stopwords.

In [None]:
# German Snowball stemmer, shared by the vocabulary and question preprocessing
stemmer = GermanStemmer()

# Tokens to discard: a few punctuation marks followed by NLTK's German stopwords
stop_words = stopwords.words('german')
ignore_words = ['?', '.', ',', '=']
ignore_words += stop_words

# Split every utterance into tokens (tokenizer called with language='german')
df['tokenized_utterances'] = df['utterances'].apply(
    lambda text: nltk.word_tokenize(text, language='german')
)

# Keep only the tokens whose lowercase form is not on the ignore list
df['filtered_utterances'] = df['tokenized_utterances'].apply(
    lambda tokens: [token for token in tokens if token.lower() not in ignore_words]
)


In [None]:
# Flatten arbitrarily nested lists into one flat list
def flatten_list(nested_list):
    """Return the non-list elements of *nested_list* in depth-first order.

    Only ``list`` instances are descended into; every other element
    (strings, tuples, numbers, ...) is copied to the output unchanged.

    Args:
        nested_list: Iterable whose elements may themselves be lists.

    Returns:
        A new flat list with all nested lists expanded in place.
    """
    flat = []
    # Stack of partially consumed iterators; the innermost list is on top.
    pending = [iter(nested_list)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the outer iterator resumes after this sub-list.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            # Innermost iterator is exhausted, go back up one level.
            pending.pop()
    return flat

# Build the vocabulary: every filtered token, lowercased and stemmed
all_words = [
    stemmer.stem(token.lower())
    for token in flatten_list(df['filtered_utterances'])
]
# Sorted, de-duplicated stems; the position in this list is the BoW feature index
unique_words = sorted(set(all_words))
# Sorted, de-duplicated intent labels
classes = sorted(set(df['intent'].values))

We also create a Bag-of-Words representation for each utterance.

In [None]:
# Create a numerical representation for each filtered utterance
def bow_representation(words_list, vocabulary=None):
    """Encode the words of one utterance as a binary Bag-of-Words vector.

    Args:
        words_list: Iterable of hashable words belonging to one utterance.
        vocabulary: Ordered sequence of vocabulary entries that defines the
            vector layout. When omitted, the module-level ``unique_words``
            is used, which keeps the original one-argument call working.

    Returns:
        A list with one entry per vocabulary word: 1 if that word occurs
        in *words_list*, otherwise 0.
    """
    if vocabulary is None:
        vocabulary = unique_words
    # Build the set once so each vocabulary lookup is O(1) instead of
    # scanning the utterance for every vocabulary word.
    present = set(words_list)
    return [1 if word in present else 0 for word in vocabulary]


In [None]:
# unique_words contains lowercased stems, so the tokens have to be lowercased
# and stemmed as well before they are matched against the vocabulary.
df['BoW'] = df['filtered_utterances'].apply(
    lambda tokens: bow_representation([stemmer.stem(token.lower()) for token in tokens])
)
# The test utterances go through the same steps as the training utterances:
# tokenize, drop ignored tokens, lowercase + stem, then encode.
df_test['BoW'] = df_test['utterances'].apply(
    lambda text: bow_representation([
        stemmer.stem(token.lower())
        for token in nltk.word_tokenize(text, language='german')
        if token.lower() not in ignore_words
    ])
)
# Show the DataFrame with the new columns
df.head()

In [None]:
# Learn the mapping from intent label to integer id ...
le = LabelEncoder()
le.fit(df['intent'])
# ... and store the integer id of every row next to its label
df['intent_numerical'] = le.transform(df['intent'])

## Testing and Prediction Functions

In [None]:
# Domain-specific synonyms: a lowercased question token found as a key is
# replaced by its value before stemming (see preprocess_question)
synonyms = dict(
    ordner='akte',
    dateien='dokument',
    vorgang='register',
)

In [None]:
# Function to preprocess a question
def preprocess_question(question):
    """Turn a raw question into a list of normalized word stems.

    The question is tokenized, each token is lowercased, replaced by its
    entry in ``synonyms`` if one exists, and stemmed. Stems that match a
    stemmed entry of the module-level ``ignore_words`` are dropped.

    Args:
        question: The question text.

    Returns:
        The remaining stems in their original order.
    """
    tokenized_words = nltk.word_tokenize(question, language='german')
    # Use a distinct local name here: assigning to `ignore_words` inside the
    # function would make that name local, and reading the module-level list
    # on the same line would then raise UnboundLocalError.
    stemmed_ignore_words = {stemmer.stem(word.lower()) for word in ignore_words}
    processed_words = [
        stemmer.stem(synonyms.get(word.lower(), word).lower())
        for word in tokenized_words
    ]
    return [word for word in processed_words if word not in stemmed_ignore_words]

In [None]:
# Function to generate Bag-of-Words representation
def generate_bow(question, words):
    """Encode a raw question as a binary Bag-of-Words vector.

    Args:
        question: The question text; it is normalized with
            ``preprocess_question`` first.
        words: Ordered vocabulary that defines the vector layout.

    Returns:
        A NumPy array with one entry per element of *words*: 1 if the
        element occurs among the preprocessed question words, otherwise 0.
    """
    # Encode against the vocabulary passed in by the caller; the vector is
    # built here because bow_representation is not guaranteed to accept a
    # vocabulary argument.
    processed_words = set(preprocess_question(question))
    return np.array([1 if word in processed_words else 0 for word in words])

In [101]:
# Minimum model output a class needs to be reported by classify_question;
# classes scoring at or below this value are discarded.
ERROR_THRESHOLD = 0.30

In [None]:
# Rank the intents the model considers plausible for a question
def classify_question(question, model):
    """Return the intents scoring above ``ERROR_THRESHOLD`` for *question*.

    Args:
        question: The question text.
        model: Object whose ``predict`` takes a batch of BoW vectors and
            returns one score sequence per vector.

    Returns:
        A list of ``(class_name, score)`` tuples, highest score first.
        The list is empty when no score exceeds the threshold.
    """
    features = generate_bow(question, unique_words)
    # The model works on batches, so wrap the single vector and unwrap the result.
    scores = model.predict(np.array([features]))[0]
    ranked = sorted(
        ((index, score) for index, score in enumerate(scores) if score > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [(classes[index], score) for index, score in ranked]

In [None]:
# Pick the single most likely intent for a question
def predict_class(question, model):
    """Return the top-ranked class name for *question*.

    Args:
        question: The question text.
        model: Model handed through to ``classify_question``.

    Returns:
        The class name of the first entry returned by
        ``classify_question``, or an empty string if it returned nothing.
    """
    ranked = classify_question(question, model)
    if not ranked:
        return ""
    return ranked[0][0]

# Model Training
We define the architecture of the neural network and train it using different hyperparameters.

In [75]:
# Get today's date as YYYYMMDD; it is used as a prefix for output file names
today = datetime.date.today().strftime("%Y%m%d")
# Identifier of the training framework (TFLearn), also part of output file names
identifier = "tflearn"

In [None]:
# Hyperparameter grid: every combination of the three lists is trained once
learning_rates = [0.1, 0.01, 0.001]
batch_sizes = [16, 32, 64]
# 100, 150, ..., 300 training epochs
epochs = list(range(100, 350, 50))

In [93]:
# Shuffle the DataFrame: frac=1 samples every row, and the fixed
# random_state makes the resulting order the same on every run
df = df.sample(frac=1, random_state=42)

In [None]:
# Convert the integer intent ids to one-hot vectors (one column per distinct id)
y_categorical = to_categorical(df['intent_numerical'], nb_classes=len(set(df['intent_numerical'])))

# Split data into training and val sets (80 % / 20 %).
# The split is seeded like the shuffle so that every run of the notebook
# trains and validates on the same rows and results stay comparable.
X_train, X_val, y_train, y_val = train_test_split(
    df['BoW'].tolist(), y_categorical, test_size=0.2, random_state=42
)

In [None]:
# Log directory for this run; train_eval_model passes it to tflearn.DNN
# as tensorboard_dir
model_path = f"{path}/{today}_{identifier}_train_logs"

In [108]:
# Callback that records the metrics reported at the end of every epoch
class MetricLogger(tflearn.callbacks.Callback):
    """Collect per-epoch training and validation metrics.

    After training, ``epoch_data`` holds one dict per finished epoch with
    the keys ``train_acc`` and ``train_loss`` and, when the training state
    provides them, ``val_acc`` and ``val_loss``.
    """

    def __init__(self):
        # One dict of metrics per epoch, in chronological order
        self.epoch_data = []

    def on_epoch_end(self, training_state):
        """Append the metrics found on *training_state* to ``epoch_data``."""
        record = {
            'train_acc': training_state.acc_value,
            'train_loss': training_state.global_loss,
        }
        # Validation metrics are only stored when they are not None
        for key in ('val_acc', 'val_loss'):
            value = getattr(training_state, key)
            if value is not None:
                record[key] = value
        self.epoch_data.append(record)




In [None]:
# Function to train and evaluate the model
def train_eval_model(learning_rate, epoch, batch_size):
    """Train one network for a hyperparameter combination and evaluate it.

    The network has two fully connected layers with 64 units each and a
    softmax output layer with one unit per class. It is trained on
    ``X_train``/``y_train`` and validated on ``X_val``/``y_val``.

    Side effects: resets the default TensorFlow graph, saves the trained
    model under ``path``, overwrites ``df_test['actual']`` with the model's
    predictions for the test utterances, and prints the metrics.

    Args:
        learning_rate: Learning rate passed to ``tflearn.regression``.
        epoch: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        A pandas Series with the metrics logged for the last epoch plus
        ``test_acc``, ``learning_rate``, ``epoche`` and ``batch_size``.
    """
    # Reset the default graph (important for retraining)
    tf.compat.v1.reset_default_graph()
    # Initialize the metric logger
    logger = MetricLogger()

    # Define the neural network architecture
    net = tflearn.input_data(shape=[None, len(X_train[0])])
    net = tflearn.fully_connected(net, 64)
    net = tflearn.fully_connected(net, 64)
    net = tflearn.fully_connected(net, len(y_train[0]), activation='softmax')
    net = tflearn.regression(net, learning_rate=learning_rate)

    # Initialize and train the model. `model_path` is the module-level log
    # directory; this function must not assign to that name, otherwise it
    # becomes a local variable and this line raises UnboundLocalError.
    model = tflearn.DNN(net, tensorboard_dir=model_path)
    model.fit(
        X_train,
        y_train,
        validation_set=(X_val, y_val),
        n_epoch=epoch,
        batch_size=batch_size,
        show_metric=True,
        callbacks=logger,
    )

    # Save the model
    save_path = f"{path}/LR {learning_rate} BatchSize {batch_size} Epoch {epoch} model.tflearn"
    model.save(save_path)

    # Keep only the last epoch; copy the row so that adding entries below
    # does not write into a slice of the temporary DataFrame.
    metrics = pd.DataFrame(logger.epoch_data).iloc[-1].copy()

    # Calculate the accuracy of the model on the test utterances
    df_test['actual'] = df_test['utterances'].apply(lambda row: predict_class(row, model))
    metrics["test_acc"] = accuracy_score(df_test['intent'], df_test['actual'])
    metrics["learning_rate"] = learning_rate
    metrics["epoche"] = epoch
    metrics["batch_size"] = batch_size
    print(metrics)
    return metrics

In [None]:
# Grid search: train and evaluate one model per hyperparameter combination.
# Results are appended one by one, so finished runs stay available even if
# a later run is interrupted.
train_eval_results = []
for lr in learning_rates:
    for epoch in epochs:
        for batch in batch_sizes:
            train_eval_results.append(train_eval_model(lr, epoch, batch))

In [None]:
# Collect the per-run metrics in their own DataFrame. A separate name is used
# so that the training DataFrame `df` is not overwritten and earlier cells
# can still be re-run.
results_df = pd.DataFrame(train_eval_results)
# Write next to the other outputs; no extra leading slash in front of `path`
results_df.to_csv(f"{path}/BoW_Hyperparameter_Results.csv", index=False)

In [None]:
# Save all of our data structures. The vocabulary lives in `unique_words`;
# it is stored under the key 'words'. The context manager makes sure the
# file is flushed and closed.
with open(f"{path}/{today}_{identifier}_train_data", "wb") as data_file:
    pickle.dump(
        {'words': unique_words, 'classes': classes, 'train_x': X_train, 'train_y': y_train},
        data_file,
    )