In [1]:
# Task for creating chatbot -> Dataset Movie dialogue -> train over data to learn pattern for appropiate response
# Read data and preprocessing, tokenization, text_to_seq, padding
# vectorization for tokens

# Encoder-Decoder model arch
# Train
# evaluate and predict

In [2]:
!pip install convokit

Collecting convokit
  Downloading convokit-3.1.0.tar.gz (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting scipy>1.14 (from convokit)
  Downloading scipy-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting numpy>=2.0.0 (from convokit)
  Downloading numpy-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.p

In [3]:
!pip install tensorflow --upgrade



In [4]:
!pip install numpy==1.26.4

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
convokit 3.1.0 requires numpy>=2.0.0, but you have numpy 1.26.4 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have

In [5]:
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Embedding, Dense, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from convokit import Corpus, download
tf.config.run_functions_eagerly(True)

In [6]:
# Download the movie corpus
corpus = Corpus(filename=download("movie-corpus"))

No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
Downloading movie-corpus to /root/.convokit/saved-corpora/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [7]:
# Extract conversations
conversations = corpus.get_conversation_ids()

In [8]:
# Prepare input and output texts
input_texts = []
output_texts = []

for conv_id in conversations:
    conversation = corpus.get_conversation(conv_id)
    utterance = conversation.get_utterance_ids()

    # Pairing
    for i in range(len(utterance)-1):
        input_text = corpus.get_utterance(utterance[i]).text
        output_text = corpus.get_utterance(utterance[i + 1]).text
        input_texts.append(input_text)
        output_texts.append(output_text)

In [9]:
input_texts = input_texts[:5000]
output_texts = output_texts[:5000]

In [10]:
# Preprocess text
def preprocess_text(text):
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\'s", "is", text)
    text = re.sub(r"\'ll", "will", text)
    text = re.sub(r"\'re", "are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.lower()

In [11]:
input_texts = [preprocess_text(text) for text in input_texts]
output_texts = [preprocess_text(text) for text in output_texts]

In [12]:
# Tokenization
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(input_texts + output_texts)
input_sequence = tokenizer.texts_to_sequences(input_texts)
output_sequence = tokenizer.texts_to_sequences(output_texts)

In [13]:
# Padding
max_length = 100  #max(len(seq) for seq in input_sequence + output_sequence)
input_sequences = pad_sequences(input_sequence, maxlen=max_length, padding="post")
output_sequences = pad_sequences(output_sequence, maxlen=max_length, padding="post")

In [14]:
# Convert to numpy arrays
input_sequences = np.array(input_sequences, dtype=np.int32)
output_sequences = np.array(output_sequences, dtype=np.int32)

In [15]:
# Define the Encoder-Decoder Model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 256
hidden_units = 256

# Encoder
encoder_input = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_input)
encoder_gru = GRU(hidden_units, return_state=True)
encoder_output, state_h = encoder_gru(encoder_embedding)
encoder_state = [state_h]

# Decoder
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_input)
decoder_gru = GRU(hidden_units, return_sequences=True, return_state=True)
decoder_output, _ = decoder_gru(decoder_embedding, initial_state=encoder_state)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_output = decoder_dense(decoder_output)

# Model
model = Model([encoder_input, decoder_input], decoder_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare Target Data for Training
decoder_target_data = np.zeros_like(output_sequences)
for i, seq in enumerate(output_sequences):
    decoder_target_data[i, :-1] = seq[1:]

# Train the Model
model.fit([input_sequences, output_sequences], decoder_target_data, epochs=5, batch_size=32)

Epoch 1/5




[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 2s/step - loss: 2.3734
Epoch 2/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 2s/step - loss: 0.6059
Epoch 3/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 2s/step - loss: 0.5849
Epoch 4/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 2s/step - loss: 0.5660
Epoch 5/5
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m301s[0m 2s/step - loss: 0.5474


<keras.src.callbacks.history.History at 0x7b20cdafab60>