In [39]:
# Imports
import json 
import pandas as pd
import nltk
import numpy as np

In [2]:
def read_json(src): 
    f = open(src, "r")
    json_data = f.read()
    f.close()

    data = json.loads(json_data)
    df = pd.json_normalize(data['intents'])

    df_sentences = pd.DataFrame({"label": [], "input": []})

    labels = []
    inputs = []
    

    for index, row in df.iterrows(): 
        for sentence in row["input"]:
            labels.append(row["label"])
            inputs.append(sentence)
            
    df_sentences["label"] = labels
    df_sentences["input"] = inputs
    
    return df_sentences

In [120]:
# Function that encodes labels as One Hot Vectors
def one_hot_labels(sentence, labels): 
    return np.array([1 if sentence == label else 0 for label in labels])

In [19]:
# NLP Preprocessing 
def preprocessing(sentence):
    # Word Tokenization 
    tokens = nltk.word_tokenize(sentence)
    # Lowercasing 
    tokens_lower = [token.lower() for token in tokens]
    # Removing Stopwords
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens_cleaned = [tokens for tokens in tokens_lower if not tokens in stop_words]
    # Stemming
    ps = nltk.stem.PorterStemmer() 
    stems = [ps.stem(w) for w in tokens_cleaned]
        
    return "".join(stems)

In [177]:
train_df = read_json("Training_Data/training_data.json")
train_df['label'] = train_df['label'].apply(lambda x: one_hot_labels(x, ['greeting', 'breakfast','weather','news', 'motivation']))
train_df['input'] = train_df['input'].apply(preprocessing)

test_df = read_json("Training_Data/test.json")
test_df['label'] = test_df['label'].apply(lambda x: one_hot_labels(x,['greeting', 'breakfast','weather','news', 'motivation']))
test_df['input'] = test_df['input'].apply(preprocessing)
test_df.head()

Unnamed: 0,label,input
0,"[1, 0, 0, 0, 0]",hello!
1,"[1, 0, 0, 0, 0]",goodmorn!
2,"[1, 0, 0, 0, 0]",hey!
3,"[1, 0, 0, 0, 0]","yo,'sgo?"
4,"[1, 0, 0, 0, 0]",sup?


In [178]:
train_df = read_json("Training_Data/train.json")
test_df = read_json("Training_Data/test.json")

In [179]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

training_sentences = read_json("Training_Data/train.json")

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences["input"])

# Convert sentences to sequences
training_sequences = tokenizer.texts_to_sequences(training_sentences["input"])

# Pad the sequences
training_padded = pad_sequences(training_sequences, maxlen=10, padding='post', truncating='post')

# Print results
print("Word Index:", tokenizer.word_index)
print("Sequences:", training_sequences)
print("Padded Sequences:", training_padded)

X_train =  pad_sequences(training_sequences, maxlen=10, padding='post', truncating='post')
y_train = train_df['label'].apply(lambda x: one_hot_labels(x, ['greeting', 'breakfast','weather','news', 'motivation']))
X_test = pad_sequences(tokenizer.texts_to_sequences(read_json("Training_Data/test.json")['input']),  maxlen=10, padding='post', truncating='post')
y_test = test_df['label'].apply(lambda x: one_hot_labels(x,['greeting', 'breakfast','weather','news', 'motivation']))

y_train = [x.tolist() for x in y_train]
y_test = [x.tolist() for x in y_test]

X_train = X_train.tolist()
X_test = X_test.tolist()

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

mymodel = Sequential([
    Dense(384, input_shape=(10,), activation='relu'), # 107
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(5, activation='softmax')
]) 

mymodel.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
mymodel.fit(X_train, y_train, epochs=200, batch_size=15)

Word Index: {'<OOV>': 1, 'i': 2, 'what': 3, 'me': 4, 'today': 5, 'the': 6, 'will': 7, 'is': 8, 'do': 9, 'it': 10, 'something': 11, 'give': 12, 'to': 13, 'eat': 14, 'morning': 15, 'weather': 16, 'be': 17, 'or': 18, 'say': 19, 'did': 20, 'news': 21, 'any': 22, 'headlines': 23, 'am': 24, 'make': 25, 'happy': 26, 'a': 27, "don't": 28, 'should': 29, 'idea': 30, 'hi': 31, 'hello': 32, 'good': 33, 'hey': 34, "what's": 35, 'up': 36, 'how': 37, 'need': 38, 'an': 39, 'umbrella': 40, 'rain': 41, 'warm': 42, 'rainy': 43, 'sunny': 44, 'tell': 45, 'does': 46, 'forecast': 47, 'miss': 48, 'on': 49, 'happen': 50, 'interesting': 51, 'stories': 52, 'making': 53, 'inform': 54, 'about': 55, 'has': 56, 'relevant': 57, 'happened': 58, 'sad': 59, 'feel': 60, 'unmotivated': 61, 'motivate': 62, 'quote': 63, 'that': 64, 'want': 65, 'anything': 66, 'recipe': 67, 'ideas': 68, 'hungry': 69, 'cook': 70, 'know': 71, 'you': 72, 'have': 73, 'breakfast': 74}
Sequences: [[31], [32], [33, 15], [34], [35, 36], [15], [37, 8

<keras.callbacks.History at 0x28838b190>

In [172]:
y_train

[[1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0,

In [180]:
loss, accuracy = mymodel.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


Test Accuracy: 0.32


In [153]:
mymodel.evaluate(X_test,y_test)



[0.0, 1.0]

In [166]:
test = ["Do I need an umbrella today?"]
sequence = tokenizer.texts_to_sequences(test) 
x_padded = pad_sequences(sequence, maxlen=10, padding='post', truncating='post')
mymodel.predict(x_padded)



array([[1., 0., 0., 0., 0.]], dtype=float32)