# Step 1: Extract and Load Data
Load the movie_lines.txt and movie_conversations.txt files to process the dialogues.

In [1]:
import os
import numpy as np
import re
from collections import defaultdict

# Paths to dataset files
lines_file = "movie_lines.txt"
conversations_file = "movie_conversations.txt"

# Load movie lines
lines = {}
with open(lines_file, encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            lines[parts[0]] = parts[4]

# Load conversations
conversations = []
with open(conversations_file, encoding='utf-8', errors='ignore') as file:
    for line in file:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            conv_ids = eval(parts[3])  # Convert string list to Python list
            conversations.append(conv_ids)

# Step 2: Create Input-Response Pairs
We pair consecutive lines of dialogue as input-response pairs.

In [2]:
# Create pairs of input and responses
input_texts = []
response_texts = []

for conv in conversations:
    for i in range(len(conv) - 1):
        # Check if line IDs exist in the dictionary
        if conv[i] in lines and conv[i + 1] in lines:
            input_texts.append(lines[conv[i]])
            response_texts.append(lines[conv[i + 1]])

print(f"Number of pairs: {len(input_texts)}")


Number of pairs: 221282


# Step 3: Preprocess Text
Clean the text by lowercasing, removing special characters, and handling contractions.

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Preprocess inputs and responses
input_texts = [preprocess_text(text) for text in input_texts]
response_texts = [preprocess_text(text) for text in response_texts]

print("Sample input:", input_texts[0])
print("Sample response:", response_texts[0])


Sample input: can we make this quick roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again
Sample response: well i thought we'd start with pronunciation if that's okay with you


# Step 4: Tokenization and Padding
Tokenize the text and convert it into sequences for training.

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer for input and output
input_tokenizer = Tokenizer(filters='', oov_token='<OOV>')
output_tokenizer = Tokenizer(filters='', oov_token='<OOV>')

# Fit tokenizers on texts
input_tokenizer.fit_on_texts(input_texts)
output_tokenizer.fit_on_texts(response_texts)

# Convert texts to sequences
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
output_sequences = output_tokenizer.texts_to_sequences(response_texts)

# Pad sequences
max_length = 20
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='post')
output_sequences = pad_sequences(output_sequences, maxlen=max_length, padding='post')

# Add special tokens to output sequences
start_token = output_tokenizer.word_index['<start>'] = len(output_tokenizer.word_index) + 1
end_token = output_tokenizer.word_index['<end>'] = len(output_tokenizer.word_index) + 1

output_sequences = [[start_token] + seq + [end_token] for seq in output_sequences]

print(f"Vocabulary size (input): {len(input_tokenizer.word_index)}")
print(f"Vocabulary size (output): {len(output_tokenizer.word_index)}")


2024-12-13 23:54:30.559104: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Vocabulary size (input): 58049
Vocabulary size (output): 58965


# Step 5: Build and Train the Model
We will use the LSTM-based Encoder-Decoder architecture.

### 1: Import Libraries

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

### 2: Define Hyperparameters

In [7]:
# Hyperparameters
embedding_dim = 256
units = 512
batch_size = 16
vocab_size_input = len(input_tokenizer.word_index) + 1  # Vocabulary size of input
vocab_size_output = len(output_tokenizer.word_index) + 1  # Vocabulary size of output

### 3: Encoder

In [8]:
# Ensure vocab_size_input is greater than the maximum index in encoder_input_data
vocab_size_input = 176893  # This should cover the max index + 1
embedding_dim = 128  
units = 256          


# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=vocab_size_input, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]


### 4: Decoder

In [9]:
# Ensure vocab_size_output is greater than the maximum index in decoder_input_data
vocab_size_output = 176893  # Adjust this to cover max index + 1 in your output data
embedding_dim = 128 
units = 256         


# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size_output, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_output, activation='softmax')
output = decoder_dense(decoder_outputs)


### 5: Compile the Model

In [10]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], output)

# Compile
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary
model.summary()

###  6: Train the Model

In [13]:
# Prepare data
encoder_input_data = np.array(input_sequences)
decoder_input_data = np.array([seq[:-1] for seq in output_sequences])
decoder_output_data = np.array([seq[1:] for seq in output_sequences])

# Train
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=batch_size,
    epochs=20,
    validation_split=0.2
)

Epoch 1/20


2024-12-13 23:55:53.651173: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2898214912 exceeds 10% of free system memory.


[1m    1/11065[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24:25:07[0m 8s/step - accuracy: 0.0000e+00 - loss: 12.0826

2024-12-13 23:55:57.812782: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2898214912 exceeds 10% of free system memory.


[1m    2/11065[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m14:09:44[0m 5s/step - accuracy: 0.1579 - loss: 12.0808    

2024-12-13 23:56:02.452088: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2898214912 exceeds 10% of free system memory.


[1m    3/11065[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13:31:25[0m 4s/step - accuracy: 0.2368 - loss: 12.0790

2024-12-13 23:56:06.969132: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2898214912 exceeds 10% of free system memory.


[1m    4/11065[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m14:04:39[0m 5s/step - accuracy: 0.2852 - loss: 12.0767

2024-12-13 23:56:11.554149: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2898214912 exceeds 10% of free system memory.


[1m   63/11065[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13:50:16[0m 5s/step - accuracy: 0.5378 - loss: 8.8439

KeyboardInterrupt: 

# 2. Evaluate the Model
Evaluate the chatbot on unseen dialogue pairs by generating responses using the trained model.

### Inference Functions

In [14]:
# Encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model for inference
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

### Generate Response

In [15]:
def decode_sequence(input_seq):
    # Encode the input sequence
    states_value = encoder_model.predict(input_seq)
    
    # Generate the target sequence
    target_seq = np.array([[output_tokenizer.word_index['<start>']]])
    stop_condition = False
    decoded_sentence = ""
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = output_tokenizer.index_word.get(sampled_token_index, '')
        
        if sampled_word == '<end>' or len(decoded_sentence) > max_length:
            stop_condition = True
        else:
            decoded_sentence += " " + sampled_word
            
        # Update target sequence and states
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]
    
    return decoded_sentence

### Test the Chatbot

In [16]:
test_sentence = "how are you?"
test_sequence = input_tokenizer.texts_to_sequences([preprocess_text(test_sentence)])
test_sequence = pad_sequences(test_sequence, maxlen=max_length, padding='post')
response = decode_sequence(test_sequence)
print("Bot:", response)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 261ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

# Step 7: Create the GUI Interface
Integrate the chatbot with a GUI:

In [17]:
import tkinter as tk
from tkinter import scrolledtext
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to process user input and generate response
def send_message():
    user_message = user_input.get()  # Get user input from the entry field
    if user_message.strip():
        chat_area.insert(tk.END, "You: " + user_message + "\n")
        
        # Preprocess the input
        input_sequence = input_tokenizer.texts_to_sequences([preprocess_text(user_message)])
        input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding='post')
        
        # Generate the response
        bot_response = decode_sequence(input_sequence)
        chat_area.insert(tk.END, "Bot: " + bot_response + "\n\n")
        
        # Clear user input field
        user_input.delete(0, tk.END)

# Preprocess function (same as before)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# GUI setup
root = tk.Tk()
root.title("Chatbot")

# Chat display area
chat_area = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=60, height=20, font=("Arial", 12))
chat_area.pack(pady=10)
chat_area.configure(state='normal')  # Allow user input in the chat area

# User input field
user_input = tk.Entry(root, width=50, font=("Arial", 14))
user_input.pack(pady=10)

# Send button
send_button = tk.Button(root, text="Send", command=send_message, font=("Arial", 14), bg="lightblue")
send_button.pack()

# Start the GUI event loop
root.mainloop()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54