In [1]:
import numpy as np
import pandas as pd 
import json

In [2]:
import nltk

In [None]:
nltk.download('stopwords')

In [3]:
from nltk.corpus import stopwords

In [4]:
import string

In [5]:
# Functionality for preprocessing the data independent of the training and inference process
class PreprocessingHelper: 

    def __init__(self, labels): 
        self.labels = labels
    
    # Read in data 
    # Produces a dataframe of the form label | inputs
    def read_in_json(self, data_src): 

        f = open(data_src, "r")
        json_data = f.read()
        f.close()

        data = json.loads(json_data)
        df = pd.json_normalize(data['intents'])

        df_sentences = pd.DataFrame({"label": [], "input": []})

        labels = []
        inputs = []
    

        for index, row in df.iterrows(): 
            for sentence in row["input"]:
                labels.append(row["label"])
                inputs.append(sentence)
            
        df_sentences["label"] = labels
        df_sentences["input"] = inputs
    
        return df_sentences

    # Function that encodes labels as One Hot Vectors
    def one_hot_labels(self, sentence): 
        return [1 if sentence == label else 0 for label in self.labels]

        
    # Crate bag of words
    # Expects dataframe of form specified in the upper function 
    def fill_bag_of_words(self,table): 
        vocabulary = []
        for row in table:
            for word in row: 
                vocabulary.append(word)
        return [x for x in set(vocabulary)]

In [6]:
class ChatBotHelper: 

    def __init__(self, bag_of_words): 
        self.bag_of_words = bag_of_words
    
    @staticmethod
    def nlp_preprocessing(sentence): 
        stop_words = set(stopwords.words('english'))

        tokens = nltk.word_tokenize(sentence) 

        lowercased_tokens = [token.lower() for token in tokens]
        
        filtered_tokens_stopwords = [token for token in lowercased_tokens if token not in stop_words]

        filtered_tokens = [token for token in filtered_tokens_stopwords if token not in string.punctuation]

        filtered_tokens_two = [token for token in filtered_tokens if token not in ["'","'s", "’"]]

      #  return filtered_tokens

        return filtered_tokens_two
        
    
    def embed(self, sentence):
        return [1 if word in sentence else 0 for word in self.bag_of_words]

    

In [7]:
# Create Training and Test Set 
preprocessingHelper = PreprocessingHelper(["greeting", "weather", "news", "motivation", "breakfast"])

training_data = preprocessingHelper.read_in_json("Data/training_data.json")

training_data = training_data.sample(frac=1).reset_index(drop=True)

training_data_x, training_data_y = training_data['input'], training_data['label']

training_data_x = training_data_x.apply(lambda x: ChatBotHelper.nlp_preprocessing(x))


bag_of_words = preprocessingHelper.fill_bag_of_words(training_data_x)

chatbotHelper = ChatBotHelper(bag_of_words)

X_train = np.array(training_data_x.apply(lambda x: chatbotHelper.embed(x)).tolist())
y_train = np.array(training_data_y.apply(lambda x: preprocessingHelper.one_hot_labels(x)).tolist())


In [8]:
test_data = preprocessingHelper.read_in_json("Data/test_data.json")

test_data_x, test_data_y = test_data['input'], test_data['label']

test_data_x = test_data_x.apply(lambda x: ChatBotHelper.nlp_preprocessing(x))

X_test = np.array(test_data_x.apply(lambda x: chatbotHelper.embed(x)).tolist())
y_test = np.array(test_data_y.apply(lambda x: preprocessingHelper.one_hot_labels(x)).tolist())

In [7]:
# Create Model 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([

    Dense(128, input_dim=98, activation='relu'),  # 128 units, ReLU activation
    
    Dropout(0.5),

    Dense(64, activation='relu'),  # 64 units, ReLU activation
    
    Dense(5, activation='softmax')  # Output layer with softmax for classification
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=200, batch_size=15)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NameError: name 'X_train' is not defined

In [None]:
model.evaluate(X_test,y_test)

In [None]:
# Analyzing the error

In [9]:
# Confusion Matrix 
from sklearn.metrics import confusion_matrix
import seaborn

In [None]:
predictions = np.argmax(model.predict(X_test), axis = 1) 
cnf_matrix = confusion_matrix(np.argmax(y_test, axis=1), predictions)
seaborn.heatmap(cnf_matrix, annot= True)

In [None]:
# Classification Report
from sklearn.metrics import classification_report
report = classification_report(np.argmax(y_test, axis=1), predictions, target_names=["greeting", "weather", "news", "motivation", "breakfast"])
print(report)

In [8]:
# Betrachten des bag_of_words
#bag_of_words
# => '’', "'s", "'m"
from difflib import SequenceMatcher

class ChatBotHelper_Improved: 

    def __init__(self, bag_of_words): 
        self.bag_of_words = bag_of_words
    
    @staticmethod
    def nlp_preprocessing(sentence): 
        stop_words = set(stopwords.words('english'))

        tokens = nltk.word_tokenize(sentence) 

        lowercased_tokens = [token.lower() for token in tokens]
        
        filtered_tokens_stopwords = [token for token in lowercased_tokens if token not in stop_words]

        filtered_tokens = [token for token in filtered_tokens_stopwords if token not in string.punctuation]

        filtered_tokens_two = [token for token in filtered_tokens if token not in ["'","'s", "’"]]

      #  return filtered_tokens

        return filtered_tokens_two
        
    
    def embed(self, sentence):
        return [1 if word in sentence else 0 for word in self.bag_of_words or lexical_similarity(word, word2)>0.75]

    def lexical_similarity(word1, word2):
        # Ratio of longest common subsequence
        return SequenceMatcher(None, word1, word2).ratio()
        

    

In [9]:
# Mehr Daten für Motivation
def add_data(df):
    new_rows = pd.DataFrame([
        {'label': 'motivation', 'input': 'Give me some words of motivation'},
        {'label': 'motivation', 'input': 'Give me some words of motivation'}
    ])
    return pd.concat([df, new_rows], ignore_index=True)

In [10]:
# Create Training and Test Set 
preprocessingHelper = PreprocessingHelper(["greeting", "weather", "news", "motivation", "breakfast"])

training_data = preprocessingHelper.read_in_json("Data/training_data.json")

training_data = add_data(training_data)

training_data = training_data.sample(frac=1).reset_index(drop=True)

training_data_x, training_data_y = training_data['input'], training_data['label']

training_data_x = training_data_x.apply(lambda x: ChatBotHelper.nlp_preprocessing(x))


bag_of_words = preprocessingHelper.fill_bag_of_words(training_data_x)

chatbotHelper = ChatBotHelper_Improved(bag_of_words)

X_train_improved = np.array(training_data_x.apply(lambda x: chatbotHelper.embed(x)).tolist())
y_train_improved = np.array(training_data_y.apply(lambda x: preprocessingHelper.one_hot_labels(x)).tolist())


In [11]:
test_data = preprocessingHelper.read_in_json("Data/test_data.json")

test_data_x, test_data_y = test_data['input'], test_data['label']

test_data_x = test_data_x.apply(lambda x: ChatBotHelper_Improved.nlp_preprocessing(x))

X_test_improved = np.array(test_data_x.apply(lambda x: chatbotHelper.embed(x)).tolist())
y_test_improved = np.array(test_data_y.apply(lambda x: preprocessingHelper.one_hot_labels(x)).tolist())

In [12]:
# Create Model 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([

    Dense(128, input_dim=98, activation='relu'),  # 128 units, ReLU activation
    
    Dropout(0.5),

    Dense(64, activation='relu'),  # 64 units, ReLU activation
    
    Dense(5, activation='softmax')  # Output layer with softmax for classification
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_improved, y_train_improved, epochs=200, batch_size=15)

Epoch 1/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1737 - loss: 1.6374  
Epoch 2/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1636 - loss: 1.6040
Epoch 3/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2855 - loss: 1.5452
Epoch 4/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2843 - loss: 1.5547
Epoch 5/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3316 - loss: 1.5287
Epoch 6/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4549 - loss: 1.5076
Epoch 7/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5342 - loss: 1.4749
Epoch 8/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5762 - loss: 1.4287
Epoch 9/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x2960ee060>

In [13]:
model.evaluate(X_test_improved,y_test_improved)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.8400 - loss: 0.6280


[0.6279746890068054, 0.8399999737739563]

In [None]:
# Hyperparamter Tuning 
# Omitted to prevent overfitting

In [19]:
X_train

NameError: name 'X_train' is not defined

In [16]:
model.predict(np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
)

ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("sequential_1_1/Cast:0", shape=(32,), dtype=float32). Expected shape (None, 98), but input has incompatible shape (32,)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32,), dtype=int64)
  • training=False
  • mask=None
  • kwargs=<class 'inspect._empty'>

In [21]:
# Save Model
model.save('chatbot_model.h5', include_optimizer=False)



In [22]:
# Save Bag of Words
with open('vocab.json', 'w') as f:
    json.dump(bag_of_words, f)