In [1]:
# Task for creating chatbot -> Dataset Movie dialogue -> train over data to learn pattern for appropiate response
# Read data and preprocessing, tokenization, text_to_seq, padding
# vectorization for tokens

# Encoder-Decoder model arch
# Train
# evaluate and predict

In [2]:
!pip install convokit

Collecting numpy>=2.0.0 (from convokit)
  Using cached numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
  Using cached numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytensor 2.26.4 requires numpy<2,>=1.17.0, but you have numpy 2.0.2 which is incompatible.
cupy-cuda12x 12.2.0 requires numpy<1.27,>=1.20, but you have numpy 2.0.2 which is incompatible.
langchain 0.3.14 requires numpy<2,>=1.22.4; python_version < "3.12", but you have numpy 2.0.2 which is in

In [3]:
!pip install tensorflow --upgrade



In [4]:
!pip install numpy==1.26.4

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
convokit 3.1.0 requires numpy>=2.0.0, but you have numpy 1.26.4 which is incompatible.
en-core-web-sm 3.7.1 requires spacy<3.8.0,>=3.7.2, but you have spacy 3.8.4 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.15.1 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


In [5]:
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Embedding, Dense, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from convokit import Corpus, download
tf.config.run_functions_eagerly(True)

In [6]:
# Download the movie corpus
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to /root/.convokit/saved-corpora/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [7]:
# Extract conversations
conversations = corpus.get_conversation_ids()

In [8]:
# Prepare input and output texts
input_texts = []
output_texts = []

for conv_id in conversations:
    conversation = corpus.get_conversation(conv_id)
    utterance = conversation.get_utterance_ids()

    # Pairing
    for i in range(len(utterance)-1):
        input_text = corpus.get_utterance(utterance[i]).text
        output_text = corpus.get_utterance(utterance[i + 1]).text
        input_texts.append(input_text)
        output_texts.append(output_text)

In [9]:
# Preprocess text
def preprocess_text(text):
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\'s", "is", text)
    text = re.sub(r"\'ll", "will", text)
    text = re.sub(r"\'re", "are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.lower()

In [10]:
input_texts = [preprocess_text(text) for text in input_texts]
output_texts = [preprocess_text(text) for text in output_texts]

In [11]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + output_texts)
input_sequence = tokenizer.texts_to_sequences(input_texts)
output_sequence = tokenizer.texts_to_sequences(output_texts)

In [12]:
# Padding
max_length = max(len(seq) for seq in input_sequence + output_sequence)
input_sequences = pad_sequences(input_sequence, maxlen=max_length, padding="post")
output_sequences = pad_sequences(output_sequence, maxlen=max_length, padding="post")

In [13]:
# Convert to numpy arrays
input_sequences = np.array(input_sequences, dtype=np.int32)
output_sequences = np.array(output_sequences, dtype=np.int32)

In [14]:
# Define the Encoder-Decoder Model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 256
hidden_units = 512

# Encoder
encoder_input = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_input)
encoder_gru = GRU(hidden_units, return_state=True)
encoder_output, state_h = encoder_gru(encoder_embedding)
encoder_state = [state_h]

# Decoder
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_input)
decoder_gru = GRU(hidden_units, return_sequences=True, return_state=True)
decoder_output, _ = decoder_gru(decoder_embedding, initial_state=encoder_state)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_output = decoder_dense(decoder_output)

# Model
model = Model([encoder_input, decoder_input], decoder_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare Target Data for Training
decoder_target_data = np.zeros_like(output_sequences)
for i, seq in enumerate(output_sequences):
    decoder_target_data[i, :-1] = seq[1:]

# Train the Model
model.fit([input_sequences, output_sequences], decoder_target_data, epochs=10, batch_size=32)



Epoch 1/10


ResourceExhaustedError: Exception encountered when calling Dense.call().

[1m{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:AddV2] name: [0m

Arguments received by Dense.call():
  • inputs=tf.Tensor(shape=(32, 552, 512), dtype=float32)
  • training=True