# Introduction
This Jupyter notebook is part of a bachelor thesis that explores the capabilities of specialized chatbots, particularly those built using BERT and Bag-of-Words (BoW). We preprocess and analyze two CSV files, one per model type, containing performance metrics recorded for these models.



In [25]:
# !pip install seaborn
# !pip install jinja2
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Data Preprocessing for BERT
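In this section, we read the CSV file containing data related to BERT models into a DataFrame and keep only the hyperparameter and metric columns. We then sort the rows by batch size (ascending), learning rate (descending), and epoch (ascending), reset the index, and print the result as a LaTeX table.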

In [None]:
# Load the BERT results, keep only the hyperparameter and metric columns,
# then order the rows by batch size (ascending), learning rate (descending)
# and epoch (ascending), and renumber the index from zero.
df_bert = (
    pd.read_csv("transformer_data.csv")[
        [
            'batch_size',
            'epoch',
            'learning_rate',
            'train_accuracy',
            'train_loss',
            'val_accuracy',
            'val_loss',
            'test_accuracy',
        ]
    ]
    .sort_values(
        by=['batch_size', 'learning_rate', 'epoch'],
        ascending=[True, False, True],
    )
    .reset_index(drop=True)
)

# Emit the prepared BERT frame as a LaTeX table; the row index is left out.
print("BERT Data:")
print(df_bert.to_latex(index=False))


# Data Preprocessing for BoW
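In this section, we read the CSV file containing data related to BoW models into a DataFrame and keep only the hyperparameter and metric columns. We then sort the rows by batch size (ascending), learning rate (descending), and epoch (ascending), and reset the index.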

In [None]:
# Load the BoW results, keep only the hyperparameter and metric columns,
# then order the rows by batch size (ascending), learning rate (descending)
# and epoch (ascending), and renumber the index from zero.
df_bow = (
    pd.read_csv("BoW_Hyperparameter_Results.csv")[
        [
            'batch_size',
            'epoch',
            'learning_rate',
            'train_accuracy',
            'train_loss',
            'val_accuracy',
            'val_loss',
            'test_accuracy',
        ]
    ]
    .sort_values(
        by=['batch_size', 'learning_rate', 'epoch'],
        ascending=[True, False, True],
    )
    .reset_index(drop=True)
)


In [None]:
# Emit the prepared BoW frame as a LaTeX table; the row index is left out.
# The header is printed first so it still appears if rendering fails.
print("BoW Data:")
bow_latex = df_bow.to_latex(index=False)
print(bow_latex)


# Data Visualization
Finally, we visualize the data by plotting test accuracy against epochs. For each model (BERT and BoW) we draw one figure per batch size, with one line per learning rate.

In [None]:
# Plotting: for every model and every batch size, draw one figure showing
# test accuracy over epochs, with a separate curve per learning rate.
frames_by_model = {'BERT': df_bert, 'BoW': df_bow}

for model_type, frame in frames_by_model.items():
    # unique() keeps the batch sizes in the order they occur in the frame.
    for batch in frame['batch_size'].unique():
        per_batch = frame.loc[frame['batch_size'] == batch]
        fig, ax = plt.subplots(figsize=(10, 6))

        for lr in per_batch['learning_rate'].unique():
            curve = per_batch.loc[per_batch['learning_rate'] == lr]
            ax.plot(
                curve['epoch'],
                curve['test_accuracy'],
                marker='o',
                label=f"Learning Rate: {lr}",
            )

        ax.set_title(f"{model_type} - Accuracy vs Epochs for Batch Size: {batch}")
        ax.set_xlabel('Epochs')
        ax.set_ylabel('Accuracy')
        # Fixed 0..1 axis with ticks every 0.2 so all figures share one scale.
        ax.set_ylim(0, 1)
        ax.set_yticks(np.arange(0, 1.1, 0.2))
        ax.legend()
        ax.grid(True)
        plt.show()
