<a href="https://colab.research.google.com/github/LokeshM01/Chatbot-using-RNN/blob/main/220591_Lokesh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [None]:
def load_babi_task(task_number, train_file="train_qa.txt", test_file="test_qa.txt"):
    """Load the bAbI-style train and test question-answering files.

    Each file is read line by line; every line is stripped and lowercased.
    Lines containing a tab are treated as questions
    (``question<TAB>answer[<TAB>supporting facts]``); all other non-empty
    lines are story statements that accumulate as context. A line starting
    with ``"1 "`` begins a new story and clears the accumulated context.

    Args:
        task_number: Accepted for interface compatibility; it does not
            influence which files are read.
        train_file: Path of the training file (defaults to ``train_qa.txt``
            in the working directory).
        test_file: Path of the test file (defaults to ``test_qa.txt`` in the
            working directory).

    Returns:
        A ``(train_data, test_data)`` tuple. Each element is a list of
        ``(context, question, answer)`` samples where ``context`` is the list
        of story lines seen so far (leading line numbers included) and
        ``question`` / ``answer`` are the first two tab-separated fields.

    Raises:
        OSError: If either file cannot be opened.
    """

    def read_data(file_path):
        # Explicit encoding so the result does not depend on the locale.
        with open(file_path, "r", encoding="utf-8") as file:
            return [line.strip().lower() for line in file]

    def process_data(lines):
        data = []
        context = []
        for line in lines:
            if not line:
                # Blank separators carry no story content.
                continue
            if line.startswith("1 "):
                # Line number 1 marks the start of a new story.
                context = []
            if "\t" in line:
                # Only the first two fields are needed. Taking them by index
                # tolerates a missing supporting-fact column (stripping the
                # line can remove it) as well as extra columns.
                fields = line.split("\t")
                question, answer = fields[0], fields[1]
                data.append((context.copy(), question, answer))
            else:
                context.append(line)
        return data

    train_data = process_data(read_data(train_file))
    test_data = process_data(read_data(test_file))

    return train_data, test_data

In [None]:
def preprocess_data(data, word2idx):
    """Turn (context, question, answer) samples into padded index arrays.

    Every text is split on whitespace and each token is looked up in
    ``word2idx``; a token missing from the mapping raises ``KeyError``.
    The context lines of a sample are joined with single spaces before
    tokenising.

    Args:
        data: Iterable of ``(context_lines, question, answer)`` samples.
        word2idx: Mapping from token to integer index.

    Returns:
        A 3-tuple ``(contexts, questions, answers)``; each element is the
        result of ``pad_sequences(..., padding='post')`` over the
        corresponding encoded sequences.
    """

    def encode(text):
        # Whitespace tokenisation followed by a strict vocabulary lookup.
        return [word2idx[token] for token in text.split()]

    encoded_contexts, encoded_questions, encoded_answers = [], [], []
    for story_lines, query, reply in data:
        encoded_contexts.append(encode(' '.join(story_lines)))
        encoded_questions.append(encode(query))
        encoded_answers.append(encode(reply))

    # Each group is padded on its own, so the three arrays may differ in width.
    padded_contexts = pad_sequences(encoded_contexts, padding='post')
    padded_questions = pad_sequences(encoded_questions, padding='post')
    padded_answers = pad_sequences(encoded_answers, padding='post')
    return padded_contexts, padded_questions, padded_answers


In [None]:
# Parse the training and test files into lists of
# (context_lines, question, answer) samples. The task number is accepted by
# the loader but does not affect which files are read.
train_data, test_data = load_babi_task(1)

In [None]:
# Collect every distinct whitespace-separated token that appears in any
# context, question or answer of either split.
vocab = set()
for context, question, answer in train_data + test_data:
    vocab.update(' '.join(context).split())
    vocab.update(question.split())
    vocab.update(answer.split())

# Number the tokens in sorted order. Iterating the set directly would give a
# different order (and therefore different ids) on every interpreter run,
# because string hashing is randomised per process; sorting keeps the mapping
# reproducible so saved weights stay meaningful. Index 0 is left unused so it
# can serve as the padding value.
word2idx = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}
# Build the reverse lookup from word2idx itself so the two can never disagree.
idx2word = {idx: word for word, idx in word2idx.items()}

In [None]:
# Encode both splits in one call so they are padded to the same widths.
# Padding each split separately sizes the arrays to that split's own longest
# sequence, which can leave the test arrays with a different shape from the
# training arrays the model is built on.
all_context, all_question, all_answer = preprocess_data(train_data + test_data, word2idx)
num_train = len(train_data)

# The first num_train rows belong to the training split, the rest to test.
train_context, test_context = all_context[:num_train], all_context[num_train:]
train_question, test_question = all_question[:num_train], all_question[num_train:]
train_answer, test_answer = all_answer[:num_train], all_answer[num_train:]

In [None]:
# Word indices start at 1, so one extra slot is needed for index 0
# (presumably the padding value produced by pad_sequences — confirm).
vocab_size = len(word2idx) + 1
# Size of each word embedding vector.
embedding_dim = 50
# Number of units in each SimpleRNN layer.
hidden_units = 50

In [None]:
# Context-only classifier: embed the context tokens, run two stacked simple
# RNN layers and predict a distribution over the vocabulary.
model = Sequential()
# Declare the input shape with an explicit Input layer; the `input_length`
# argument of Embedding is deprecated in current Keras releases.
model.add(tf.keras.Input(shape=(train_context.shape[1],)))
# mask_zero=True marks index 0 as padding so the RNN layers skip the
# post-padded timesteps; otherwise the final state is computed after a run
# of padding inputs instead of after the last real token.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
# First RNN returns the full sequence so the second RNN can consume it.
model.add(SimpleRNN(hidden_units, return_sequences=True))
# Second RNN returns only its final state, one vector per sample.
model.add(SimpleRNN(hidden_units))
model.add(Dense(vocab_size, activation='softmax'))

In [None]:
# The sparse loss takes integer class ids as targets, matching the word
# indices stored in train_answer.
# NOTE(review): this assumes every answer is a single token, so that
# train_answer holds exactly one label per sample — confirm against the data.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Training schedule.
num_epochs = 10
batch_size = 32

# Train on the encoded contexts with the encoded answers as targets;
# validation_split=0.1 asks Keras to hold out 10% of the training arrays for
# validation.
# NOTE(review): train_question is not passed to the model, so predictions
# depend on the context alone — confirm this is intended.
model.fit(train_context, train_answer, epochs=num_epochs, batch_size=batch_size, validation_split=0.1)