In [None]:
!pip install speechrecognition

In [None]:
!pip install gtts

In [None]:
!pip install tensorflow

In [None]:
# Import Libraries
import json
import nltk
import time
import random
import string
import pickle
import numpy as np
import pandas as pd
from gtts import gTTS
from io import BytesIO
import tensorflow as tf
import IPython.display as ipd
import speech_recognition as sr
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Embedding, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Flatten, Dense, GlobalMaxPool1D

In [None]:
# NLTK resources needed by this notebook:
#   punkt     - sentence/word tokenizer models used by nltk.word_tokenize
#   punkt_tab - pickle-free tokenizer tables that recent NLTK releases load
#               in place of 'punkt' (harmless extra download on older releases)
#   wordnet   - lemmatization data for WordNetLemmatizer
#   omw-1.4   - multilingual wordnet data
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
# Importing the dataset
# JSON is UTF-8 by specification; being explicit keeps non-ASCII patterns and
# responses intact on platforms whose default encoding is not UTF-8.
with open('data.json', encoding='utf-8') as content:
  data1 = json.load(content)

# Gets all data into a list
tags = [] # data tag
inputs = [] # input data or patterns
responses = {} # data responses
words = [] # Word data
classes = [] # Class or Tag Data
documents = [] # Document Sentence Data
ignore_words = ['?', '!'] # Ignores special character tags

for intent in data1['intents']:
  tag = intent['tag']
  responses[tag] = intent['responses']
  # Visit every pattern exactly once. (Looping over the patterns inside another
  # loop over the same patterns would add each document len(patterns) times.)
  for pattern in intent['patterns']:
    inputs.append(pattern)
    tags.append(tag)
    # Tokenize the pattern and record it together with its tag
    w = nltk.word_tokenize(pattern)
    words.extend(w)
    documents.append((w, tag))
    # add to our classes list
    if tag not in classes:
      classes.append(tag)

# Convert json data into dataframe
data = pd.DataFrame({"patterns":inputs, "tags":tags})

In [None]:
# Display the patterns/tags DataFrame built from data.json
data

In [None]:
# Removing Punctuations: lowercase each pattern and drop every character that
# appears in string.punctuation, keeping the result as a single string
data['patterns'] = data['patterns'].apply(
    lambda text: ''.join(ch.lower() for ch in text if ch not in string.punctuation)
)

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatize the lowercased tokens (skipping the ignored characters), then keep
# the unique lemmas in sorted order
words = sorted({
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
})

print(len(words), "unique lemmatized words", words)

In [None]:
# De-duplicate the tags and keep them in sorted order
classes = sorted(set(classes))
print(len(classes), "classes", classes)

In [None]:
# Each document is a (tokenized pattern, intent tag) pair
document_count = len(documents)
print(document_count, "documents")

In [None]:
# Fit the tokenizer on the cleaned patterns and turn each pattern into a
# sequence of integer word ids
pattern_texts = data['patterns']
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(pattern_texts)
train = tokenizer.texts_to_sequences(pattern_texts)
train

In [None]:
# Pad the id sequences into one rectangular array
x_train = pad_sequences(sequences=train)
print(x_train) # padded sequences

In [None]:
# Map every tag string to an integer class id
tag_column = data['tags']
le = LabelEncoder().fit(tag_column)
y_train = le.transform(tag_column)
print(y_train) # encoded labels

In [None]:
# Length of one padded input sequence (last axis of x_train)
input_shape = x_train.shape[-1]
print(input_shape)

In [None]:
# Vocabulary size: number of distinct words the tokenizer has indexed
word_index = tokenizer.word_index
vocabulary = len(word_index)
print("number of unique words : ", vocabulary)

# Output length: number of distinct tags known to the label encoder
output_length = len(le.classes_)
print("output length: ", output_length)

In [None]:
# Persist the vocabulary and the class list. The context managers guarantee
# each file is flushed and closed as soon as its dump finishes.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

In [None]:
# Persist the fitted label encoder and tokenizer. The context managers
# guarantee each file is flushed and closed as soon as its dump finishes.
with open('le.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)
with open('tokenizers.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Define a function to perform NLU and sentiment analysis
def perform_nlu_sentiment(user_input):
    """Run tokenization, POS tagging, entity chunking and sentiment scoring.

    Args:
        user_input: The raw text typed by the user.

    Returns:
        A dict with the keys 'words' (lowercased tokens), 'filtered_words'
        (tokens without English stop words), 'tagged_words' (POS-tagged
        filtered tokens), 'named_entities' (result of ``nltk.ne_chunk``),
        'sentiment_scores' (VADER polarity scores of the raw input) and
        'intent' (result of ``identify_intent``).

    Note:
        Besides the tokenizer data, this needs further NLTK resources to be
        downloaded first: the stop word corpus, the POS tagger, the named
        entity chunker with its word list, and the VADER lexicon
        (typically 'stopwords', 'averaged_perceptron_tagger',
        'maxent_ne_chunker', 'words' and 'vader_lexicon'; exact resource
        names depend on the installed NLTK version).
    """
    # These names are not provided by the notebook's import cell, so they are
    # imported here to keep the function usable on a fresh kernel.
    from nltk.corpus import stopwords
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    from nltk.tokenize import word_tokenize

    # Tokenize the user input into individual words
    words = word_tokenize(user_input.lower())

    # Remove stop words from the tokenized words
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    # Perform part-of-speech tagging to identify the type of each word
    tagged_words = nltk.pos_tag(filtered_words)

    # Identify named entities in the user input
    named_entities = nltk.ne_chunk(tagged_words)

    # Perform sentiment analysis on the user input
    sentiment_analyzer = SentimentIntensityAnalyzer()
    sentiment_scores = sentiment_analyzer.polarity_scores(user_input)

    # Identify the intent of the user input
    intent = identify_intent(tagged_words)

    # Return the results of the NLU and sentiment analysis
    return {
        'words': words,
        'filtered_words': filtered_words,
        'tagged_words': tagged_words,
        'named_entities': named_entities,
        'sentiment_scores': sentiment_scores,
        'intent': intent
    }

# Define a function to identify the intent of the user input
def identify_intent(tagged_words):
    """Return the intent label for part-of-speech tagged input.

    Placeholder: no classification is implemented yet, so ``tagged_words`` is
    ignored and every input maps to ``'unknown'``. A rule-based approach or a
    machine learning model (for example a decision tree classifier over the
    tags) can be plugged in here.
    """
    fallback_intent = 'unknown'
    return fallback_intent

In [None]:
# Creating the model: embedding -> LSTM -> flatten -> softmax over the tags
i = Input(shape=(input_shape,))
x = Embedding(input_dim=vocabulary + 1, output_dim=10)(i)  # 10-dimensional word vectors
x = LSTM(units=7, return_sequences=True)(x)                # one 7-unit output per time step
x = Flatten()(x)                                           # concatenate all time steps
x = Dense(units=output_length, activation="softmax")(x)    # one probability per tag
model = Model(inputs=i, outputs=x)

# Compiling the model (integer labels, hence the sparse loss)
model.compile(
    optimizer='adam',
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()

In [None]:
# Fit the model on the padded sequences and their encoded tags.
# Note: this rebinds `train` (previously the token sequences) to the object
# returned by model.fit.
train = model.fit(x=x_train, y=y_train, epochs=400)

In [None]:
# Make Input Chat
# Interactive loop: read a message, classify its intent, answer in text and
# speech, and stop on a "goodbye" intent or when the user types "exit".
while True:
    raw_text = input('You: ')

    # Remove punctuation and convert to lowercase
    user_text = ''.join(
        letters.lower() for letters in raw_text if letters not in string.punctuation
    )

    # Tokenization and Padding (pad/truncate to the length the model was built for)
    prediction_input = tokenizer.texts_to_sequences([user_text])
    prediction_input = pad_sequences(prediction_input, maxlen=input_shape)

    # Get the output from the model
    output = model.predict(prediction_input)
    output = output.argmax()

    # Find responses according to tag data and play voice bots
    response_tag = le.inverse_transform([output])[0]
    # Choose the reply once so the printed text and the spoken audio match
    response_text = random.choice(responses[response_tag])
    print("🤖 ChatBot:", response_text)

    # Generate speech response
    # NOTE(review): gTTS writes MP3 data; the '.wav' name is kept for
    # compatibility but does not match the actual audio format.
    tts = gTTS(response_text, lang='id')
    tts.save('ChatBot.wav')

    # Play the generated speech response
    ipd.display(ipd.Audio('ChatBot.wav', autoplay=True))

    print("="*60 + "\n")

    # Exit condition: compare the typed text itself. The padded id array can
    # never equal "exit", so checking it would make this branch unreachable.
    if response_tag.lower() == "goodbye" or user_text.strip() == "exit":
      break

