In [1]:
!pip uninstall tensorflow -y



In [2]:
!pip install tensorflow-gpu



In [3]:
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13966761671273704624
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 9331618332426397510
physical_device_desc: "device: XLA_CPU device"
]


In [4]:
import tensorflow as tf
assert tf.__version__.startswith('2')
tf.random.set_seed(1234)

!pip install tensorflow-datasets==1.2.0
import tensorflow_datasets as tfds

import os
import re
import numpy as np

import matplotlib.pyplot as plt





In [5]:
from tensorflow.keras import layers, activations, models, preprocessing
from tensorflow.keras import preprocessing, utils

In [6]:
path_to_zip = tf.keras.utils.get_file(
    'cornell_movie_dialogs.zip',
    origin=
    'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip',
    extract=True)

path_to_dataset = os.path.join(
    os.path.dirname(path_to_zip), "cornell movie-dialogs corpus")

path_to_movie_lines = os.path.join(path_to_dataset, 'movie_lines.txt')
path_to_movie_conversations = os.path.join(path_to_dataset,
                                           'movie_conversations.txt')

In [7]:
# Maximum number of samples to preprocess
MAX_SAMPLES = 100000

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    sentence = sentence.strip()
    # adding a start and an end token to the sentence
    return sentence


def load_conversations():
    # dictionary of line id to text
    id2line = {}
    with open(path_to_movie_lines, errors='ignore') as file:
        lines = file.readlines()
    for line in lines:
        parts = line.replace('\n', '').split(' +++$+++ ')
        id2line[parts[0]] = parts[4]
    
    inputs, outputs = [], []
    with open(path_to_movie_conversations, 'r') as file:
        lines = file.readlines()
    print(len(lines))
    for line in lines:
        parts = line.replace('\n', '').split(' +++$+++ ')
        # get conversation in a list of line ID
        conversation = [line[1:-1] for line in parts[3][1:-1].split(', ')]
        for i in range(len(conversation) - 1):
            inputs.append(preprocess_sentence(id2line[conversation[i]]))
            outputs.append(preprocess_sentence(id2line[conversation[i + 1]]))
            if len(inputs) >= MAX_SAMPLES:
                print("True-----")
                return inputs, outputs
    return inputs, outputs


questions, answers = load_conversations()

83097
True-----


In [8]:
print('Sample question: {}'.format(questions[20]))
print('Sample answer: {}'.format(answers[20]))

Sample question: i really , really , really wanna go , but i can t . not unless my sister goes .
Sample answer: i m workin on it . but she doesn t seem to be goin for him .


In [9]:
for an in range(len(answers)):
    answers[an] = '<START> ' + answers[an] + ' <END>'

In [10]:
print('Sample answer: {}'.format(answers[20]))

Sample answer: <START> i m workin on it . but she doesn t seem to be goin for him . <END>


In [11]:
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions + answers)
word_index = tokenizer.word_index
VOCAB_SIZE = len(word_index) + 1
print('Found %s unique tokens.' % VOCAB_SIZE)

Found 32979 unique tokens.


In [12]:
# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max([len(x) for x in tokenized_questions])
print(maxlen_questions)
# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max([len(x) for x in tokenized_answers])
print(maxlen_answers)

299
505


In [13]:
batch_size = 64  # Batch size for training.
epochs = 1  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
embedding_dim = 100

In [14]:
encoder_inputs = tf.keras.layers.Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(latent_dim, dropout=0.2,
                                                         recurrent_dropout=0.2,
                                                         return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(latent_dim, dropout=0.2, recurrent_dropout=0.2, return_state=True,
                                    return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(VOCAB_SIZE, activation=tf.keras.activations.softmax)
output = decoder_dense(decoder_outputs)

In [15]:
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 100)    3297900     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    3297900     input_2[0][0]                    
______________________________________________________________________________________________

In [16]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=['acc'])

In [16]:
dataToTake = 500

In [17]:
size_all = len(tokenized_answers)
print(size_all)

100000


In [18]:
startIndex =  0
toIndex = dataToTake

In [None]:
while True:
    print(startIndex,toIndex)
    if startIndex >= size_all:
        break
    ansBatchToTake = tokenized_answers[startIndex:toIndex]
    queBatchToTake = tokenized_questions[startIndex:toIndex]
    startIndex += dataToTake
    toIndex += dataToTake
    toIndex = min(toIndex,size_all)
    # encoder_input_data
    encoder_input_data = preprocessing.sequence.pad_sequences(queBatchToTake, maxlen=maxlen_questions, padding='post')
    #print(encoder_input_data.shape)
    # decoder_input_data
    decoder_input_data = preprocessing.sequence.pad_sequences(ansBatchToTake, maxlen=maxlen_answers, padding='post')
    #print(decoder_input_data.shape)
    # decoder_output_data
    for i in range(len(ansBatchToTake)):
        ansBatchToTake[i] = ansBatchToTake[i][1:]
    decoder_output_data = preprocessing.sequence.pad_sequences(ansBatchToTake, maxlen=maxlen_answers, padding='post')
    decoder_output_data = utils.to_categorical(decoder_output_data, VOCAB_SIZE)

    indices = np.arange(len(encoder_input_data))
    np.random.shuffle(indices)
    encoder_input_data = encoder_input_data[indices]
    decoder_input_data = decoder_input_data[indices]
    decoder_output_data = decoder_output_data[indices]    
    #print(decoder_output_data.shape)
    model.fit([encoder_input_data, decoder_input_data],
              decoder_output_data,
              epochs=1)

0 2000


(100000, 299) 299
(100000, 505) 505
