In [2]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate
from datasets import load_dataset

dataset = load_dataset("naklecha/minecraft-question-answer-700k")

2024-04-26 04:31:40.734045: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 04:31:40.734099: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 04:31:40.735581: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Check if GPU is available
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


In [4]:
df = dataset['train'].to_pandas()
print(len(df))
df = df.drop('source', axis=1)
filtered_df = df[df['question'].str.contains("\?")]
print(len(filtered_df))
filtered_df = filtered_df[filtered_df['answer'] != '']
print(len(filtered_df))
def filter_row(row, max_words=50):
    return len(str(row['question']).split()) <= max_words and len(str(row['answer']).split()) <= max_words and len(str(row['question']).split()) > 1 and len(str(row['answer']).split()) > 1

# Apply custom function to filter rows
filtered_df = filtered_df[filtered_df.apply(filter_row, axis=1)]

# Display the filtered DataFrame
print(len(filtered_df))

694814
666869
666549
471115


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import unicodedata

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')

# Function for preprocessing text
def preprocess_text(text):
    # Convert text to lowercase
    text = unicode_to_ascii(text.lower().strip())
    text = re.sub("(\\W)"," ",text) 
    text = re.sub('\S*\d\S*\s*','', text)
    text =  "<sos> " +  text + "<eos>"
    return text

# Apply preprocessing to question and answer columns
preprocessed_df = filtered_df.copy()
preprocessed_df['question'] = preprocessed_df['question'].apply(preprocess_text)
preprocessed_df['answer'] = preprocessed_df['answer'].apply(preprocess_text)

# Print the preprocessed DataFrame
preprocessed_df

Unnamed: 0,question,answer
0,<sos> what is the first statistic to decrease ...,<sos> saturation is the first statistic to dec...
2,<sos> what is the average hunger restoration v...,<sos> the average hunger restoration value of ...
3,<sos> what foods in minecraft cannot be eaten ...,<sos> the following items cannot be eaten on t...
5,<sos> what is the effect of changing the game ...,<sos> in the bedrock edition a player can sti...
7,<sos> what types of leaves can be found natura...,<sos> in minecraft jungle bushes can generate...
...,...,...
694806,<sos> what happens to the player when they go ...,<sos> in hard and hardcore mode going without...
694807,<sos> what happens to a player s saturation le...,<sos> saturation is the first statistic to dec...
694809,<sos> what is the purpose of the nourishment t...,<sos> the nourishment table below can help by ...
694811,<sos> what is the name of the red sweet and s...,<sos> the red sweet and slightly sour fruit ...


In [6]:
# Preprocessing the data
questions = preprocessed_df['question'].values.tolist()
answers = preprocessed_df['answer'].values.tolist()

# Tokenizing the data
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(np.concatenate((questions, answers), axis=0))

vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
question_seqs = tokenizer.texts_to_sequences(questions)
answer_seqs = tokenizer.texts_to_sequences(answers)
# Padding sequences for equal length
# Pad sequences separately for questions and answers
max_len=32
question_seqs = pad_sequences(question_seqs, maxlen=max_len, padding='post', truncating='post')
answer_seqs = pad_sequences(answer_seqs, maxlen=max_len, padding='post', truncating='post')

In [7]:
tokenizer.texts_to_sequences("<sos>")

[[], [20], [1652], [20], []]

In [8]:
tokenizer.word_index["<sos>"]

2

In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Attention, Concatenate, Dropout

# Define the model architecture
latent_dim = 256  # Dimensionality of the encoding space
max_len=32
# Encoder
encoder_inputs = Input(shape=(max_len,))
encoder_embedding = Embedding(vocab_size, latent_dim, input_shape=(max_len,))
encoder_GRU = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.1, recurrent_dropout=0.1)
encoder_outputs, state_h = encoder_GRU(encoder_embedding(encoder_inputs))
encoder_states = [state_h]

# Decoder
decoder_inputs = Input(shape=(max_len-1,))
decoder_embedding = Embedding(vocab_size, latent_dim, input_shape=(max_len-1,))
decoder_GRU = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.1, recurrent_dropout=0.1)
decoder_outputs, _ = decoder_GRU(decoder_embedding(decoder_inputs), initial_state=encoder_states)

# Attention mechanism
attention_layer = Attention()
attention_output = attention_layer([decoder_outputs, encoder_outputs])

# Concatenate attention output and decoder LSTM output
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_output])

# Add dropout layer for regularization
decoder_concat_input = Dropout(0.1)(decoder_concat_input)

# Output layer
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

  super().__init__(**kwargs)


In [10]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Print model summary
model.summary()

# Train the model
model.fit([question_seqs, answer_seqs[:, :-1]], answer_seqs[:, 1:],
          batch_size=128,
          epochs=15)

Epoch 1/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m813s[0m 219ms/step - loss: 3.4732
Epoch 2/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m806s[0m 219ms/step - loss: 1.8799
Epoch 3/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m804s[0m 219ms/step - loss: 1.6390
Epoch 4/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m800s[0m 217ms/step - loss: 1.5200
Epoch 5/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m799s[0m 217ms/step - loss: 1.4483
Epoch 6/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m807s[0m 219ms/step - loss: 1.3883
Epoch 7/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m815s[0m 221ms/step - loss: 1.3328
Epoch 8/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m807s[0m 219ms/step - loss: 1.2841
Epoch 9/15
[1m3681/3681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m802s[0m 218ms/step - loss: 1.2432
Epoch 10/15
[1m3681/3681[0m [32m━━

<keras.src.callbacks.history.History at 0x789471956920>

In [14]:


# Function to generate a response given a input sentence
def generate_response(input_text):
    # Tokenize the input text
    input_sequence = tokenizer.texts_to_sequences([input_text])
    # Pad the input sequence
    input_sequence = pad_sequences(input_sequence, maxlen=max_len, padding='post')
    
    # Initialize the decoder input sequence with start token
    decoder_input_sequence = np.zeros((1, max_len-1))
    decoder_input_sequence[0, 0] = tokenizer.word_index['<sos>']
    
    # Generate response using the trained model
    for i in range(max_len - 1):
        predictions = model.predict([input_sequence, decoder_input_sequence])
        predicted_id = np.argmax(predictions[0, i, :])
        if predicted_id == tokenizer.word_index['<eos>']:
            break
        decoder_input_sequence[0, i+1] = predicted_id
    
    # Convert output sequence to text
    output_text = ''
    for token_index in decoder_input_sequence[0]:
        if token_index == tokenizer.word_index['<eos>'] or token_index == 0:
            break
        output_text += tokenizer.index_word[token_index] + ' '
    
    return output_text.strip()

# Test the function with input "how are you"
input_text = "What determines the color of bee nests in Minecraft"
response = generate_response(input_text)
print("Response:", response[5:])



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 446ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3

In [18]:
input_text = "How do you craft items in minecraft?"
response = generate_response(input_text)
print("Response:", response[5:])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36

In [16]:
model.save('Minecraft_chatbot.keras')
