In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
import torch.autograd as autograd
from torch.nn import init
import torch.nn.utils.rnn 
from torch.nn.utils.rnn import pad_sequence
import datetime
import operator
import codecs
from datasets import load_dataset
import unicodedata
import string
import re
import random
import itertools

np.random.seed(0)

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# Dialog pairs consist of [context, response, label]
def get_dialog_pairs(dataset, evalset):
    dataset_length = len(dataset)
    dialog_pairs = []
    for i in range(dataset_length):
        conv_length = len(dataset[i])
        for j in range(1,conv_length):
            if isinstance(dataset[i][j-1], str):
                inputLine = dataset[i][j-1].strip()
            else:
                inputLine = dataset[i][j-1]['text'].strip() 
            if not inputLine or not isinstance(inputLine, str): 
                inputLine = 'Nothing'
            if isinstance(dataset[i][j], str):
                targetLine = dataset[i][j].strip()
            else:
                targetLine = dataset[i][j]['text'].strip() 
            if not targetLine or not isinstance(targetLine, str): 
                targetLine = 'Nothing'
            if isinstance(inputLine, str) and isinstance(targetLine, str) and inputLine and targetLine:
                if evalset[i] >= 2:
                    label = 1
                else:
                    label = 0
                dialog_pairs.append([inputLine, targetLine, label])

    return dialog_pairs

In [16]:
# Dialog responses consist of [response]
def get_dialog_responses(dataset):
    dataset_length = len(dataset)
    responses = []
    for i in range(dataset_length):
        conv_length = len(dataset[i])
        for j in range(1,conv_length):
            if isinstance(dataset[i][j-1], str):
                inputLine = dataset[i][j-1].strip()
            else:
                inputLine = dataset[i][j-1]['text'].strip() 
            if not inputLine or not isinstance(inputLine, str): 
                inputLine = 'Nothing'
            if isinstance(dataset[i][j], str):
                targetLine = dataset[i][j].strip()
            else:
                targetLine = dataset[i][j]['text'].strip() 
            if not targetLine or not isinstance(targetLine, str): 
                targetLine = 'Nothing'
            if isinstance(inputLine, str) and isinstance(targetLine, str) and inputLine and targetLine:
                responses.append([targetLine])

    return responses

In [17]:
# Get list of individual sentences
def get_dialog(dataset):
    dataset_length = len(dataset)
    dialog = []
    for i in range(dataset_length):
        conv_length = len(dataset[i])
        for j in range(1,conv_length):
            if isinstance(dataset[i][j-1], str):
                inputLine = dataset[i][j-1].strip()
            else:
                inputLine = dataset[i][j-1]['text'].strip() 
            if not isinstance(inputLine, str) or not inputLine: 
                inputLine = 'Nothing'
            dialog.append([inputLine])

    return dialog

In [18]:
MAX_LENGTH = 15
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [19]:
def normalizeString(s):
    if isinstance(s, str): 
        s = unicodeToAscii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        s = re.sub(r"\s+", r" ", s).strip()
    return s

In [20]:
def shuffle_list(list):
    random.shuffle(list)

def create_vocab(dialog):
    vocab = []
    word_freq = {}
    
    for sentence in dialog:
        
        train_words = str(sentence).split(" ")
        
        for word in train_words:
          
            if word.lower() not in vocab:
                vocab.append(word.lower())         
                       
            if word.lower() not in word_freq:
                word_freq[word.lower()] = 1
            else:
                #type(word)
                word_freq[word] += 1
    
    word_freq_sorted = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
    vocab = ["<UNK>"] + [pair[0] for pair in word_freq_sorted]
    
    return vocab


def create_word_to_id(vocab):             
    word_to_id = {word: id for id, word in enumerate(vocab)}
    
    return word_to_id


def create_id_to_vec(word_to_id, glovefile): 
    lines = open(glovefile, 'r', encoding='utf-8').readlines()
    id_to_vec = {}
    vector = None
    
    for line in lines:
        word = line.split()[0]
        vector = np.array(line.split()[1:], dtype='float32') #32
        
        if word in word_to_id:
            id_to_vec[word_to_id[word]] = torch.FloatTensor(torch.from_numpy(vector))
            
    for word, id in word_to_id.items(): 
        if word_to_id[word] not in id_to_vec:
            v = np.zeros(*vector.shape, dtype='float32')
            v[:] = np.random.randn(*v.shape)*0.01
            id_to_vec[word_to_id[word]] = torch.FloatTensor(torch.from_numpy(v))
            
    embedding_dim = id_to_vec[0].shape[0]
    
    return id_to_vec, embedding_dim

def load_id(sentence, word_to_id):
    sentence_ids = []

    max_sentence_len = 160
    
    sentence_words = sentence.split()
    if len(sentence_words) > max_sentence_len:
        sentence_words = sentence_words[:max_sentence_len]
    for word in sentence_words:
        if word in word_to_id:
            sentence_ids.append(word_to_id[word])
        else: 
            sentence_ids.append(0) #UNK

    return sentence_ids

def load_ids(pair, word_to_id):
    context_ids = []
    response_ids = []

    context_cell = pair[0]
    response_cell = pair[1]

    max_context_len = 160
    
    context_words = context_cell.split()
    if len(context_words) > max_context_len:
        context_words = context_words[:max_context_len]
    for word in context_words:
        if word in word_to_id:
            context_ids.append(word_to_id[word])
        else: 
            context_ids.append(0) #UNK
    
    response_words = response_cell.split()
    for word in response_words:
        if word in word_to_id:
            response_ids.append(word_to_id[word])
        else: 
            response_ids.append(0)

    return context_ids, response_ids

def load_ids_and_labels(pair, word_to_id):
    context_ids = []
    response_ids = []

    context_cell = pair[0]
    response_cell = pair[1]
    label_cell = pair[2]

    max_context_len = 160
    
    context_words = context_cell.split()
    if len(context_words) > max_context_len:
        context_words = context_words[:max_context_len]
    for word in context_words:
        if word in word_to_id:
            context_ids.append(word_to_id[word])
        else: 
            context_ids.append(0) #UNK
    
    response_words = response_cell.split()
    for word in response_words:
        if word in word_to_id:
            response_ids.append(word_to_id[word])
        else: 
            response_ids.append(0)
    
    label = np.array(label_cell).astype(np.float32)

    return context_ids, response_ids, label

In [21]:
def prepareData(embedding_dim):
    print("Start preparing training data ...")
    raw_dataset = load_dataset("conv_ai_2")
    raw_dataset1 = load_dataset("daily_dialog")
    train_dataset = raw_dataset["train"]
    train_dataset1 = raw_dataset1["train"]
    raw_dialog_list = train_dataset["dialog"]
    raw_dialog_list1 = train_dataset1["dialog"]
    
    eval_list = train_dataset["eval_score"]
    eval_list1 = [2 for i in range(len(raw_dialog_list1))]
    
    dialog_pairs = get_dialog_pairs(raw_dialog_list, eval_list)
    dialog_pairs1 = get_dialog_pairs(raw_dialog_list1, eval_list1)
    dialog_pairs.extend(dialog_pairs1)
    dialog_pairs = [[normalizeString(s) for s in l] for l in dialog_pairs]
    only_dialog_pairs = []
    for i in range(len(dialog_pairs)):
        only_dialog_pairs.append(dialog_pairs[i][0:2])
    
    dialog_indiv = get_dialog(raw_dialog_list)
    dialog_indiv1 = get_dialog(raw_dialog_list1)
    dialog_indiv.extend(dialog_indiv1)
    
    dialog_indiv = [[normalizeString(s) for s in l] for l in dialog_indiv]
    vocab = create_vocab(dialog_indiv)
            
    shuffle_list(dialog_pairs)
    
    for pair in dialog_pairs:
        if len(pair[0].split()) <= 0:
            pair[0] = 'Oh'
        if len(pair[1].split()) <= 0:
            pair[1] = 'Oh'
    
    training_data = dialog_pairs[:-(int(len(dialog_pairs) / 10))]
    
    word_to_id = create_word_to_id(vocab)
    id_to_vec, emb_dim = create_id_to_vec(word_to_id, 'dataset/GloVe/glove.6B.%dd.txt' %embedding_dim)

    validation_data = dialog_pairs[-(int(len(dialog_pairs) / 10)):]
    print('Training data: ')
    
    return training_data, validation_data
    #voc, pairs = readVocs(dialog_pairs, name)
    #print("Read {!s} sentence pairs".format(len(pairs)))
    #pairs = filterPairs(pairs)
    #print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    #print("Counting words...")
    #for pair in pairs:
       # voc.addSentence(pair[0])
       # voc.addSentence(pair[1])
    #print("Counted words:", voc.num_words)
    #return voc, pairs

In [38]:
def prepare_responses():
    raw_dataset = load_dataset("conv_ai_2")
    raw_dataset1 = load_dataset("daily_dialog")
    dataset = raw_dataset["train"]
    dataset1 = raw_dataset1["train"]
    dialog_list = dataset["dialog"]
    dialog_list1 = dataset1["dialog"]
    
    dialog_indiv = get_dialog(dialog_list)
    dialog_indiv1 = get_dialog(dialog_list1)
    dialog_indiv.extend(dialog_indiv1)
    
    dialog_indiv = [[normalizeString(s) for s in l] for l in dialog_indiv]
    vocab = create_vocab(dialog_indiv)
    
    responses = get_dialog_responses(dialog_list)
    responses1 = get_dialog_responses(dialog_list1)
    responses.append(responses1)
    
    word_to_id = create_word_to_id(vocab)
    id_to_vec, emb_dim = create_id_to_vec(word_to_id, 'dataset/GloVe/glove.6B.50d.txt')
    
    return responses, vocab, word_to_id, id_to_vec, emb_dim

In [23]:
PAD_token = 0  
SOS_token = 1 
EOS_token = 2

def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# Returns padded input sequence tensor and lengths
def contextVar(l):
    indexes_batch = [load_id(sentence, word_to_id) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def responseVar(l):
    indexes_batch = [load_id(sentence, word_to_id) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch, labels = [], [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
        labels.append(pair[2])
    inp, lengths = contextVar(input_batch)
    output, lengths1 = contextVar(output_batch)
    return inp, lengths, output, lengths1, labels
    #output, mask, max_target_len = responseVar(output_batch)
    #return inp, lengths, output, mask, max_target_len


# Example for validation
#small_batch_size = 5
#batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
#input_variable, lengths, target_variable, mask, max_target_len = batches

#print("input_variable:", input_variable)
#print("lengths:", lengths)
#print("target_variable:", target_variable)
#print("mask:", mask)
#print("max_target_len:", max_target_len)

In [59]:
class Encoder(nn.Module):

    def __init__(self, 
            emb_size, 
            hidden_size, 
            vocab_size, 
            p_dropout): 
    
            super(Encoder, self).__init__()
             
            self.emb_size = emb_size
            self.hidden_size = hidden_size
            self.vocab_size = vocab_size
            self.p_dropout = p_dropout
       
            self.embedding = nn.Embedding(self.vocab_size, self.emb_size)
            self.lstm = nn.LSTM(self.emb_size, self.hidden_size)
            self.dropout_layer = nn.Dropout(self.p_dropout) 

            self.init_weights()
             
    def init_weights(self):
        init.uniform(self.lstm.weight_ih_l0, a = -0.01, b = 0.01)
        init.orthogonal(self.lstm.weight_hh_l0)
        self.lstm.weight_ih_l0.requires_grad = True
        self.lstm.weight_hh_l0.requires_grad = True
        
        embedding_weights = torch.FloatTensor(self.vocab_size, self.emb_size)
        embedding_weights = embedding_weights.to(device)
            
        for id, vec in id_to_vec.items():
            embedding_weights[id] = vec
        
        self.embedding.weight = nn.Parameter(embedding_weights, requires_grad = True)
            
    def forward(self, input_seq, input_lengths, hidden=None):
        embeddings = self.embedding(input_seq)
        embeddings = embeddings.to(device)
        packed = nn.utils.rnn.pack_padded_sequence(embeddings, input_lengths, enforce_sorted = False)
        
        _, (last_hidden, _) = self.lstm(packed, hidden)
        last_hidden = self.dropout_layer(last_hidden[-1])
        return last_hidden
    
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        embeddings = embeddings.to(device)
        _, (last_hidden, _) = self.lstm(embeddings)
        last_hidden = self.dropout_layer(last_hidden[-1])

        return last_hidden

    
class DualEncoder(nn.Module):
     
    def __init__(self, encoder):
        super(DualEncoder, self).__init__()
        self.encoder = encoder
        self.hidden_size = self.encoder.hidden_size
        M = torch.FloatTensor(self.hidden_size, self.hidden_size)
        M = M.to(device)
        init.xavier_normal(M)
        self.M = nn.Parameter(M, requires_grad = True)

    def forward(self, input_seq, response_seq, input_lengths, response_lengths):
        context_last_hidden = self.encoder(input_seq, input_lengths) #dimensions: (batch_size x hidden_size)
        context_last_hidden = context_last_hidden.to(device)
        response_last_hidden = self.encoder(response_seq, response_lengths) #dimensions: (batch_size x hidden_size)
        response_last_hidden = response_last_hidden.to(device)
        
        context = context_last_hidden.mm(self.M) #dimensions: (batch_size x hidden_size)
        context = context.view(-1, 1, self.hidden_size) #dimensions: (batch_size x 1 x hidden_size)
        context = context.to(device)
        
        response = response_last_hidden.view(-1, self.hidden_size, 1) #dimensions: (batch_size x hidden_size x 1)
        response = response.to(device)
        
        score = torch.bmm(context, response).view(-1, 1) #dimensions: (batch_size x 1 x 1) and lastly --> (batch_size x 1)
        score = score.to(device)

        return score
    
    def forward(self, context_tensor, response_tensor):
        
        context_last_hidden = self.encoder(context_tensor) #dimensions: (batch_size x hidden_size)
        context_last_hidden = context_last_hidden.to(device)
        
        response_last_hidden = self.encoder(response_tensor) #dimensions: (batch_size x hidden_size)
        context = context_last_hidden.mm(self.M) #dimensions: (batch_size x hidden_size)
        context = context.view(-1, 1, self.hidden_size) #dimensions: (batch_size x 1 x hidden_size)
        context = context.to(device)
        
        response = response_last_hidden.view(-1, self.hidden_size, 1) #dimensions: (batch_size x hidden_size x 1)
        response = response.to(device)
        
        score = torch.bmm(context, response).view(-1, 1) #dimensions: (batch_size x 1 x 1) and lastly --> (batch_size x 1)
        score = score.to(device)

        return score

In [41]:
def creating_model(emb_dim, hidden_size, vocab_size, p_dropout):

    #print(str(datetime.datetime.now()).split('.')[0], "Calling model...")

    encoder = Encoder(
            emb_size = emb_dim,
            hidden_size = hidden_size,
            vocab_size = vocab_size,
            p_dropout = p_dropout)

    dual_encoder = DualEncoder(encoder)

    #print(str(datetime.datetime.now()).split('.')[0], "Model created.\n")
    #print(dual_encoder)
    
    return encoder, dual_encoder

In [26]:
def increase_count(score, label):
    corr = False
    if ((score >= 0.5) and (label >= 1)) or ((score < 0.5) and (label  <= 0)):
        corr = True
    return corr


def get_accuracy(correct_count, length):
    accuracy = correct_count/length
        
    return accuracy

In [27]:
def train(context, response, lengths, lengths1, labels, c_encoder, optimizer, loss_func, is_train = True):

    if is_train:
        # Zero gradients
        optimizer.zero_grad()

    # Set device options
    context = context.to(device)
    response = response.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")
    lengths1 = lengths1.to("cpu")

    # Initialize variables
    loss = 0
    sum_loss = 0.0
    correct_count = 0

    # Forward pass through encoder
    score = c_encoder(context, response, lengths, lengths1)
    total_count = 0
    for n in range(len(score)):
        label = autograd.Variable(torch.DoubleTensor(torch.from_numpy(np.array(labels[n]).astype(float).reshape(1,1))), requires_grad = False)
        label = label.to(device)
        #print(score[n].reshape(1,1))
        loss = loss_func(score[n].reshape(1,1), label)
        sum_loss += loss.item()
        if increase_count(score[n].item(), label.item()):
            correct_count += 1

    if is_train:
        # Perform backpropatation
        loss.backward()
        # Adjust model weights
        optimizer.step()

    return loss, correct_count

In [28]:
def trainIterAll(training_data, validation_data, c_encoder, learning_rate, l2_penalty, n_iteration, batch_size, epochs):

    optimizer = torch.optim.Adam(c_encoder.parameters(), lr = learning_rate, weight_decay = l2_penalty)
    loss_func = torch.nn.BCEWithLogitsLoss()
    loss_func = loss_func.to(device)
    # Load batches for each iteration
    
    best_validation_accuracy = 0.0
     
    for epoch in range(epochs):
        training_batches = [batch2TrainData([random.choice(training_data) for _ in range(batch_size)])
                          for _ in range(n_iteration)]

        validation_batches = [batch2TrainData([random.choice(validation_data) for _ in range(batch_size)])
                          for _ in range(n_iteration)]

        # Initializations
        print('Initializing ...')
        start_iteration = 1
        print_loss = 0

        sum_loss_training = 0.0
        training_correct_count = 0
        training_total_count = 0
        c_encoder.train()

        # Training loop
        print("Training...")
        i = 0
        for iteration in range(start_iteration, n_iteration + 1):
            if (i % 1000 == 0):
                print('Iteration ', i)
            i += 1
            training_batch = training_batches[iteration - 1]
            # Extract fields from batch
            context, lengths, response, lengths1, labels = training_batch

            # Run a training iteration with batch
            loss, correct_count = train(context, response, lengths, lengths1, labels, c_encoder, optimizer, loss_func)
            training_correct_count += correct_count
            training_total_count += 64
            sum_loss_training += loss
            print_loss += loss

            # Print progress
            if iteration % 1000 == 0:
                print_loss_avg = print_loss / 1000
                print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
                print('Correct count: ', training_correct_count)
                print('Total count: ', training_total_count)
                print_loss = 0

        training_accuracy = get_accuracy(training_correct_count, training_total_count)
        c_encoder.eval()
        
        # Iterate through validation set
        validation_correct_count = 0
        validation_total_count = 0
        sum_loss_validation = 0.0
        i = 0
        for iteration in range(start_iteration, n_iteration + 1):
            if (i % 1000 == 0):
                print('Iteration val ', i)
            i += 1
            validation_batch = validation_batches[iteration - 1]
            # Extract fields from batch
            context, lengths, response, lengths1, labels = validation_batch

            # Run a training iteration with batch
            loss, correct_count = train(context, response, lengths, lengths1, labels, c_encoder, optimizer, loss_func, False)
            sum_loss_validation += loss
            validation_correct_count += correct_count
            validation_total_count += 64

        validation_accuracy = get_accuracy(validation_correct_count, validation_total_count)

        print(str(datetime.datetime.now()).split('.')[0], 
              "Epoch: %d/%d" %(epoch,epochs),  
              "TrainLoss: %.3f" %(sum_loss_training/validation_total_count), 
              "TrainAccuracy: %.3f" %(training_accuracy), 
              "ValLoss: %.3f" %(sum_loss_validation/validation_total_count), 
              "ValAccuracy: %.3f" %(validation_accuracy))

        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
            torch.save(dual_encoder.state_dict(), 'dataset/models/retrieval_encoder_dict.pt')
            torch.save(optimizer.state_dict(), 'dataset/models/retrieval_optimizer_dict.pt')
            print("New best found and saved.")
                
    print(str(datetime.datetime.now()).split('.')[0], "Training and validation epochs finished.")
    

In [342]:
n_iteration = 4000
batch_size = 64
training_data, validation_data = prepareData(50)

In [343]:
encoder, dual_encoder = creating_model(hidden_size = 50, 
                                       p_dropout = 0.85)

encoder = encoder.to(device)
dual_encoder = dual_encoder.to(device)
#encoder.cuda()
#dual_encoder.cuda

for name, param in dual_encoder.named_parameters():
    if param.requires_grad:
        print(name)

2021-11-28 19:34:46 Calling model...


  init.uniform(self.lstm.weight_ih_l0, a = -0.01, b = 0.01)
  init.orthogonal(self.lstm.weight_hh_l0)


2021-11-28 19:34:47 Model created.

DualEncoder(
  (encoder): Encoder(
    (embedding): Embedding(23978, 50)
    (lstm): LSTM(50, 50)
    (dropout_layer): Dropout(p=0.85, inplace=False)
  )
)
M
encoder.embedding.weight
encoder.lstm.weight_ih_l0
encoder.lstm.weight_hh_l0
encoder.lstm.bias_ih_l0
encoder.lstm.bias_hh_l0


  init.xavier_normal(M)


In [435]:
trainIterAll(training_data = training_data, validation_data = validation_data, c_encoder = dual_encoder, learning_rate = 0.0001, l2_penalty = 0.000001, 
            n_iteration = n_iteration, batch_size = batch_size, epochs = 100)

Initializing ...
Training...
Iteration  0


RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

In [None]:
responses, vocab, word_to_id, id_to_vec, emb_dim = prepare_responses()
encoder, dual_encoder = creating_model(emb_dim = emb_dim, hidden_size = 50, vocab_size = len(vocab), p_dropout = 0.85)
encoder = encoder.to(device)
dual_encoder = dual_encoder.to(device)
dual_encoder.load_state_dict(torch.load('dataset/models/retrieval_encoder_dict.pt'))
dual_encoder.eval()

context_ids = load_id(context, word_to_id)

In [86]:
def getResponse(context):
    
    i = 0
    best_pos = 0
    best_score = 0
    for response in responses:
        if not isinstance(response[0], str):
            continue
        response_ids = load_id(response[0], word_to_id)
        contextt = autograd.Variable(torch.LongTensor(context_ids).view(-1,1))
        responset = autograd.Variable(torch.LongTensor(response_ids).view(-1, 1))
        contextt, responset = contextt.to(device), responset.to(device)
        score = dual_encoder(contextt, responset)
        if score > best_score:
            best_score = score
            best_pos = i
        i += 1
    return responses[best_pos][0]

In [88]:
response = getResponse('how do you do today')
print(response)

i like country music , but i like country music , but i like country music
