# Chatbot

seq2seq encoder-decoder using tensorflow addons for the decoder


- dataset1 http://www.cs.cornell.edu/~cristian/memorability.html
- dataset2 https://www.kaggle.com/grafstor/simple-dialogs-for-chatbot (plus some small chat conversations)

In [1]:
#some imports

import pandas as pd 
import string
import numpy as np
from tqdm import tqdm

from collections import OrderedDict

In [2]:
#per chat only
import json
import string
import numpy as np

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
#importing tensorflow

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
#select GPU

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected.")

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
tf.config.experimental.set_memory_growth(gpus[1], enable=True)
#tf.config.gpu_options.allow_growth = True
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

## Uploading dataset

In [5]:
df=pd.read_csv('archive/dialogs.txt',sep='\t')

In [6]:
df.head()

Unnamed: 0,Q,ANS
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [7]:
#uploading files

lines= open('dataset_cornel/movie_lines.txt',encoding='utf-8',errors='ignore').read().split('\n')
conversations= open('dataset_cornel/movie_conversations.txt',encoding='utf-8',errors='ignore').read().split('\n')
lines

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.",
 'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow',
 "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.",
 'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No',
 'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?',
 'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?',
 'L868 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ The "real you".',
 'L867 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ What good stuff?',
 "L866 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ I figured yo

In [8]:
#preparing the dataset

conversation_index=[] #nested list of conversation indexes
for conversation in conversations:
    conversation_index.append(conversation.split('+++$+++')[-1][2:-1].replace("'","").split(','))

In [9]:
dict_i2text={} #index to text dictionary
for line in lines[:-1]:   #-1 cause the last one is empty
    key=line.split('+++$+++')[0][1:-1]
    dict_i2text[int(key)]=line.split('+++$+++')[-1]

ord_list_i2text = list(OrderedDict(sorted(dict_i2text.items())).items())

In [10]:
long_threshold=6 #max input dim

In [11]:

cut_dataset=14*1000 #how many items
cut_offset=0*1000   #offset

ord_list_i2text = [x for x in ord_list_i2text if not len(x[1].split())>long_threshold][cut_offset:cut_dataset+cut_offset]

ord_list_i2text


[(49, ' Did you change your hair?'),
 (50, ' No.'),
 (51, ' You might wanna think about it'),
 (59, ' I missed you.'),
 (62, ' With the teeth of your zipper?'),
 (63, ' You the new guy?'),
 (64, ' So they tell me...'),
 (66, ' So -- which Dakota you from?'),
 (67, " North, actually.  How'd you   ?"),
 (71, ' Thirty-two.'),
 (72, ' Get out!'),
 (73, ' How many people go here?'),
 (74, ' Couple thousand. Most of them evil'),
 (77, " That I'm used to."),
 (87, ' That girl -- I --'),
 (88, ' You burn, you pine, you perish?'),
 (89, ' Who is she?'),
 (91, ' Why not?'),
 (108, ' Tempestuous?'),
 (123, " Who's that?"),
 (124, ' Patrick Verona   Random skid.'),
 (127, ' He always look so'),
 (128, ' Block E?'),
 (130, ' Just a little.'),
 (131, " What's this?"),
 (132, ' An attempted slit.'),
 (141, ' He always have that shit-eating grin?'),
 (143, ' You know French?'),
 (158, " That's her?  Bianca's sister?"),
 (159, ' The mewling, rampalian wretch herself.'),
 (161, ' In the microwave.'),
 (

In [12]:
len(ord_list_i2text)

14000

In [13]:
question=[] #defining question and answer lists
answer=[]

In [14]:
#uploading q&a first dataset
for i in range(len(ord_list_i2text)-1):
    if ord_list_i2text[i][0]-ord_list_i2text[i+1][0]!=1:
        question.append(ord_list_i2text[i][1])
        answer.append(ord_list_i2text[i+1][1])



In [15]:
#uploading q&a second dataset
dataset2=[df['Q'],df['ANS']]

for j in range(len(dataset2[0])):
    if not (len(dataset2[0][j].split())>long_threshold or (len(dataset2[1][j].split())>long_threshold)):
        question.append(dataset2[0][j])
        answer.append(dataset2[1][j])

## Prepocessing dataset

- Set to lower case
- Remove contract form
- Remove punctuation
- add sos and eos tokens to answers


In [17]:
contractions_dict = {
"im": "i am",
"dont": "do not",
"doesnt": "does not",
"theres": "there is",
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "I would",
"i'd've": "I would have",
"i'll": "I will",
"i'll've": "I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": " what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}


In [18]:
question


[' Did you change your hair?',
 ' No.',
 ' You might wanna think about it',
 ' I missed you.',
 ' With the teeth of your zipper?',
 ' You the new guy?',
 ' So they tell me...',
 ' So -- which Dakota you from?',
 " North, actually.  How'd you   ?",
 ' Thirty-two.',
 ' Get out!',
 ' How many people go here?',
 ' Couple thousand. Most of them evil',
 " That I'm used to.",
 ' That girl -- I --',
 ' You burn, you pine, you perish?',
 ' Who is she?',
 ' Why not?',
 ' Tempestuous?',
 " Who's that?",
 ' Patrick Verona   Random skid.',
 ' He always look so',
 ' Block E?',
 ' Just a little.',
 " What's this?",
 ' An attempted slit.',
 ' He always have that shit-eating grin?',
 ' You know French?',
 " That's her?  Bianca's sister?",
 ' The mewling, rampalian wretch herself.',
 ' In the microwave.',
 ' Make anyone cry today?',
 " Where've you been?",
 ' Nowhere... Hi, Daddy.',
 " What's a synonym for throbbing?",
 ' I know.',
 ' You decided.',
 ' What if she never starts dating?',
 ' Enough!',
 " 

In [20]:
#function to expand the contractions and remove punctuation

def exp_remPunt(l):
    '''
    params: l is a list (of quest or ans)
    '''
    clean_l=[]
    table = str.maketrans(dict.fromkeys(string.punctuation))
    remove_digits = str.maketrans('', '', string.digits)

    for sent in l:
        for word in sent.split():
            if word in contractions_dict:
                sent = sent.replace(word, contractions_dict[word])  #expand
                
        sent = sent.translate(remove_digits)
        clean_l.append(sent.translate(table).lower()) #remove punt and set to lower
    
    return clean_l


In [21]:
question=exp_remPunt(question)
answer=exp_remPunt(answer)

In [22]:
answer

[' no',
 ' you might wanna think about it',
 ' i missed you',
 ' with the teeth of your zipper',
 ' you the new guy',
 ' so they tell me',
 ' so  which dakota you from',
 ' north actually  howd you   ',
 ' thirtytwo',
 ' get out',
 ' how many people go here',
 ' couple thousand most of them evil',
 ' that im used to',
 ' that girl  i ',
 ' you burn you pine you perish',
 ' who is she',
 ' why not',
 ' tempestuous',
 ' whos that',
 ' patrick verona   random skid',
 ' he always look so',
 ' block e',
 ' just a little',
 ' whats this',
 ' an attempted slit',
 ' he always have that shiteating grin',
 ' you know french',
 ' thats her  biancas sister',
 ' the mewling rampalian wretch herself',
 ' in the microwave',
 ' make anyone cry today',
 ' whereve you been',
 ' nowhere hi daddy',
 ' whats a synonym for throbbing',
 ' i know',
 ' you decided',
 ' what if she never starts dating',
 ' enough',
 ' but she does not want to date',
 ' exactly my point',
 ' tumescent',
 ' youre not helping',
 '

## Creating embeddings

- create list with all the words
- clean the list: each word must have just one occurrence (set)
- create dicts for word to index and index to word

In [23]:
#create list with all the words
words_list=[]

for sent in question:
    for word in sent.split():
        words_list.append(word)
        
for sent in answer:
    for word in sent.split():
        words_list.append(word)
        
words_list = set(words_list)
vocab_len=len(words_list) #length of the vocabulary without offset

vocab_len

5857

In [24]:
#define funct to create dicts
def index_to_word(words_list):
    
    d= { (index +3) : word for index,word in enumerate(words_list)}
    
    d[0]='<pad>'
    d[1]='<sos>'
    d[2]='<eos>'
    return d

def word_to_index(words_list):
    d= { word : (index +3) for index,word in enumerate(words_list)}
    
    d['<pad>']=0
    d['<sos>']=1
    d['<eos>']=2
    return d

In [25]:
dict_i2w=index_to_word(words_list) #index to word
dict_w2i=word_to_index(words_list) #word to index

In [26]:
#sent translation to embedding
encoder_input_data=[]
decoder_input_data=[]
decoder_output_data=[]
for sent in question:
    emb_str=[]
    for word in sent.split():
        emb_str.append(dict_w2i[word])
    encoder_input_data.append(emb_str) 
    
for sent in answer:
    emb_str=[]
    for word in sent.split():
        emb_str.append(dict_w2i[word])
    decoder_input_data.append([1]+emb_str+[2]) #ans with sos ans eos
    decoder_output_data.append(emb_str+[2]) #ans with just eos


In [27]:
encoder_input_data

[[3762, 2900, 5157, 4419, 5682],
 [1440],
 [2900, 2951, 2258, 5820, 2423, 3414],
 [2459, 2873, 2900],
 [1265, 3542, 3144, 2004, 4419, 301],
 [2900, 3542, 1153, 5344],
 [695, 4520, 1680, 5176],
 [695, 1041, 1462, 2900, 4227],
 [5508, 3255, 3498, 2900],
 [4042],
 [5823, 4808],
 [3638, 2770, 1158, 596, 5460],
 [1738, 1884, 993, 2004, 2426, 59],
 [1159, 3537, 4872, 5488],
 [1159, 509, 2459],
 [2900, 1080, 2900, 2562, 2900, 2273],
 [1377, 3907, 3328],
 [4895, 4259],
 [910],
 [3820, 1159],
 [1406, 4412, 757, 3570],
 [1803, 2324, 219, 695],
 [659, 2339],
 [5540, 1054, 5064],
 [253, 1871],
 [3962, 2734, 536],
 [1803, 2324, 3754, 1159, 1395, 2094],
 [2900, 1346, 1096],
 [1649, 3005, 5662, 30],
 [3542, 707, 4668, 1329, 2514],
 [1026, 3542, 2191],
 [3678, 835, 3781, 4142],
 [1564, 2900, 16],
 [330, 2234, 5264],
 [253, 1054, 5165, 1465, 4079],
 [2459, 1346],
 [2900, 920],
 [2955, 1492, 3328, 4414, 2976, 4287],
 [3257],
 [3878, 3328, 2441, 4259, 3252, 5488, 512],
 [5187, 2947, 1226],
 [5458],
 [146

In [28]:
#bucketing threshold

#here it is possibile to add as many buckets as needed
buck_t1=long_threshold

#buck_tmax=max([len(x) for x in encoder_input_data]) 


b1_enc_in=[]  #bucket 1 encoder input
b1_dec_in=[]  #bucket 1 decoder input
b1_dec_out=[] #bucket 1 decoder output

for index in range(len(encoder_input_data)):
    if len(encoder_input_data[index])<=buck_t1:
        b1_enc_in.append(encoder_input_data[index])
        b1_dec_in.append(decoder_input_data[index])
        b1_dec_out.append(decoder_output_data[index])
        
b1_dec_out

[[1440, 2],
 [2900, 2951, 2258, 5820, 2423, 3414, 2],
 [2459, 2873, 2900, 2],
 [1265, 3542, 3144, 2004, 4419, 301, 2],
 [2900, 3542, 1153, 5344, 2],
 [695, 4520, 1680, 5176, 2],
 [695, 1041, 1462, 2900, 4227, 2],
 [5508, 3255, 3498, 2900, 2],
 [4042, 2],
 [5823, 4808, 2],
 [3638, 2770, 1158, 596, 5460, 2],
 [1738, 1884, 993, 2004, 2426, 59, 2],
 [1159, 3537, 4872, 5488, 2],
 [1159, 509, 2459, 2],
 [2900, 1080, 2900, 2562, 2900, 2273, 2],
 [1377, 3907, 3328, 2],
 [4895, 4259, 2],
 [910, 2],
 [3820, 1159, 2],
 [1406, 4412, 757, 3570, 2],
 [1803, 2324, 219, 695, 2],
 [659, 2339, 2],
 [5540, 1054, 5064, 2],
 [253, 1871, 2],
 [3962, 2734, 536, 2],
 [1803, 2324, 3754, 1159, 1395, 2094, 2],
 [2900, 1346, 1096, 2],
 [1649, 3005, 5662, 30, 2],
 [3542, 707, 4668, 1329, 2514, 2],
 [1026, 3542, 2191, 2],
 [3678, 835, 3781, 4142, 2],
 [1564, 2900, 16, 2],
 [330, 2234, 5264, 2],
 [253, 1054, 5165, 1465, 4079, 2],
 [2459, 1346, 2],
 [2900, 920, 2],
 [2955, 1492, 3328, 4414, 2976, 4287, 2],
 [3257, 2]

In [29]:
len(b1_dec_out)

14873

In [30]:
#function to find max len of decoder input
def get_max_(l):
    return max([len(sent) for sent in l])

In [31]:
#padding encoder and decoder inputs
b1_dec_len=get_max_(b1_dec_in)

b1_enc_in = pad_sequences(b1_enc_in,buck_t1, padding='post')
b1_dec_in = pad_sequences(b1_dec_in,b1_dec_len, padding='post')
b1_dec_out = pad_sequences(b1_dec_out,b1_dec_len, padding='post')

b1_dec_len

10

In [32]:
b1_dec_in.shape,b1_enc_in.shape

((14873, 10), (14873, 6))

## Create dataset

In [34]:
#imports and variables to create the dataset

batch_size=64 #batch size for training


In [37]:
dataset = tf.data.Dataset.from_tensor_slices((b1_enc_in, b1_dec_in,b1_dec_out))#
dataset= dataset.shuffle(len(b1_enc_in)).batch(batch_size, drop_remainder=True)
dataset

<BatchDataset shapes: ((64, 6), (64, 10), (64, 10)), types: (tf.int32, tf.int32, tf.int32)>

## Building models

- define model for training
- define model for inference
 DA MODIFICAER


In [74]:
lat_dim=512  #used later

In [75]:
import chatbot_models_class as cb

In [77]:
encoder_=cb.encoder_model(lat_dim=lat_dim,vocab_len=vocab_len)
decoder_=cb.decoder_model(lat_dim=lat_dim,batch_size=batch_size,vocab_len=vocab_len,buck_t1=buck_t1)

## Training the models

In [78]:
#define training vaiables
n_epochs=20
steps_per_epoch= len(b1_enc_in) //batch_size

In [79]:
#get spare categorical crossentropy from logits and mask the pads so they are not considered in the loss
def custom_loss(labels,pred):

    
    loss_fun=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')
    loss=loss_fun(y_true=labels, y_pred=pred)
    
    #masking zeros
    mask = tf.cast(tf.logical_not(tf.math.equal(labels,0)),dtype=tf.float32)
    
    return tf.reduce_mean(mask*loss)

In [80]:
#define optimizers
opt_a=tf.keras.optimizers.Adam()
opt_r=tf.keras.optimizers.RMSprop()

In [81]:
metrics_=[tf.keras.metrics.Accuracy()]
mean_loss=tf.keras.metrics.Mean()

In [82]:
#custom training loop

for epoch in range(1,n_epochs+1):
    print("Epoch {}/{} \n".format(epoch,n_epochs))
    step_counter=0

    for enc_inp_batch,dec_in_batch, dec_out_batch in dataset.take(steps_per_epoch):
        step_counter+=1
        #print("\r", end="")
        with tf.GradientTape() as tape:
            encoder_embedding=encoder_.encoder_embedding_layer(enc_inp_batch)
            encoder_output,enc_state_h,enc_state_c =encoder_.encoder(encoder_embedding,
                                                                    initial_state=[tf.zeros((batch_size, lat_dim*2)), tf.zeros((batch_size, lat_dim*2))])
            
            decoder_embedding=decoder_.decoder_embedding_layer(dec_in_batch)
            
            #setup memory 
            decoder_.attention_mechanism.setup_memory(encoder_output)
            
            decoder_initial_state = decoder_.get_initial_states(batch_size,enc_state_h,enc_state_c)
                
            #compute model output    
            model_output,_,_=decoder_.training_decoder(decoder_embedding,
                                                       initial_state=decoder_initial_state,
                                                       sequence_length=batch_size*[len(b1_dec_out[0])]
                                                      )
            logits=model_output.rnn_output
            
            main_loss = custom_loss(dec_out_batch,logits)
            loss= tf.add_n([main_loss]+encoder_.losses+decoder_.losses)
        
        variables = encoder_.trainable_variables + decoder_.trainable_variables  
        gradients=tape.gradient(loss,variables)
        
        #optimizer
        opt_r.apply_gradients(zip(gradients,variables))
        
        
        mean_loss(loss)
        

        
        #metrics
        for metric in metrics_:
            metric(dec_out_batch,tf.argmax(logits,axis=2))
        
        
        display_print_loss="".join(["{:.4f}".format(m.result()) for m in [mean_loss]])
        display_print_acc="".join("{:.4f}".format(m.result().numpy()) for m in metrics_)
        
        print("\r"+"loss:"+display_print_loss+"  _  metrics:"+display_print_acc+"  -  iteration:"+str(step_counter), end="  ")
        
        #print("mean_loss",mean_loss)#,"metrics",metrics, "\n")
        
    for metric in [mean_loss]+metrics_:
        metric.reset_states()
            

Epoch 1/20 

loss:2.4991  _  metrics:0.1091  -  iteration:232  Epoch 2/20 

loss:2.2216  _  metrics:0.1228  -  iteration:232  Epoch 3/20 

loss:2.1250  _  metrics:0.1301  -  iteration:232  Epoch 4/20 

loss:2.0447  _  metrics:0.1374  -  iteration:232  Epoch 5/20 

loss:1.9627  _  metrics:0.1449  -  iteration:232  Epoch 6/20 

loss:1.8718  _  metrics:0.1550  -  iteration:232  Epoch 7/20 

loss:1.7664  _  metrics:0.1668  -  iteration:232  Epoch 8/20 

loss:1.6547  _  metrics:0.1803  -  iteration:232  Epoch 9/20 

loss:1.5397  _  metrics:0.1955  -  iteration:232  Epoch 10/20 

loss:1.4218  _  metrics:0.2110  -  iteration:232  Epoch 11/20 

loss:1.3118  _  metrics:0.2277  -  iteration:232  Epoch 12/20 

loss:1.2158  _  metrics:0.2425  -  iteration:232  Epoch 13/20 

loss:1.1312  _  metrics:0.2560  -  iteration:232  Epoch 14/20 

loss:1.0502  _  metrics:0.2695  -  iteration:232  Epoch 15/20 

loss:0.9835  _  metrics:0.2814  -  iteration:232  Epoch 16/20 

loss:0.9291  _  metrics:0.2898  -  

In [93]:
#saving weights

encoder_.save_weights('models_Att/encoder_att_weights_inv_input')
decoder_.save_weights('models_Att/decoder_att_weights_inv_input')

## Saving Dictionaries

In [97]:
import json

a_file = open("dict_w2i_chatbot.json", "w")

json.dump(dict_w2i, a_file)

a_file.close()

a_file = open("dict_i2w_chatbot.json", "w")

json.dump(dict_i2w, a_file)

a_file.close()

a_file = open("contractions_dict.json", "w")

json.dump(contractions_dict, a_file)

a_file.close()
