In [1]:
import numpy as np
import torch.nn as nn
import torch
import torch.autograd as autograd
from torch.nn import init
import torch.nn.utils.rnn 
from torch.nn.utils.rnn import pad_sequence
import datetime
from datasets import load_dataset
import unicodedata
import string
import re
import random
import os
import itertools
import warnings
warnings.filterwarnings('ignore')

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Dialog pairs consist of [context, response, label]
def get_dialog_pairs(dataset, evalset):
    dataset_length = len(dataset)
    dialog_pairs = []
    for i in range(dataset_length):
        conv_length = len(dataset[i])
        for j in range(1,conv_length):
            if isinstance(dataset[i][j-1], str):
                inputLine = dataset[i][j-1].strip()
            else:
                inputLine = dataset[i][j-1]['text'].strip() 
            if not inputLine or not isinstance(inputLine, str): 
                inputLine = 'Nothing'
            if isinstance(dataset[i][j], str):
                targetLine = dataset[i][j].strip()
            else:
                targetLine = dataset[i][j]['text'].strip() 
            if not targetLine or not isinstance(targetLine, str): 
                targetLine = 'Nothing'
            if isinstance(inputLine, str) and isinstance(targetLine, str) and inputLine and targetLine:
                if evalset[i] >= 2:
                    label = 1
                else:
                    label = 0
                inputLine = normalizeString(inputLine)
                targetLine = normalizeString(targetLine)
                dialog_pairs.append([inputLine, targetLine, label])

    return dialog_pairs

In [4]:
# Get list of individual sentences
def get_dialog(dataset):
    dataset_length = len(dataset)
    dialog = []
    for i in range(dataset_length):
        conv_length = len(dataset[i])
        
        for j in range(1,conv_length):
            if isinstance(dataset[i][j-1], str):
                inputLine = dataset[i][j-1].strip()
            else:
                inputLine = dataset[i][j-1]['text'].strip()
                
            if not isinstance(inputLine, str) or not inputLine or len(inputLine.split()) <= 0: 
                continue
            inputLine = normalizeString(inputLine)
            dialog.append([inputLine])

    return dialog

In [5]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [6]:
def normalizeString(s):
    if isinstance(s, str): 
        s = unicodeToAscii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        s = re.sub(r"\s+", r" ", s).strip()
    return s

In [7]:
def shuffle_list(list):
    random.shuffle(list)

def create_vocab(dialog):
    vocab = []
    word_freq = {}
    
    for sentence in dialog:
        sen = "".join(c for c in sentence)
        train_words = str(sen).split(" ")
        
        for word in train_words:
          
            if word.lower() not in vocab:
                vocab.append(word.lower())         
                       
            if word.lower() not in word_freq:
                word_freq[word.lower()] = 1
            else:
                #type(word)
                word_freq[word] += 1
    
    word_freq_sorted = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
    vocab = ["<UNK>"] + [pair[0] for pair in word_freq_sorted]
    
    return vocab


def create_word_to_id(vocab):             
    word_to_id = {word: id for id, word in enumerate(vocab)}
    
    return word_to_id


def create_id_to_vec(word_to_id, glovefile): 
    lines = open(glovefile, 'r', encoding='utf-8').readlines()
    id_to_vec = {}
    vector = None
    
    for line in lines:
        word = line.split()[0]
        vector = np.array(line.split()[1:], dtype='float32') #32
        
        if word in word_to_id:
            id_to_vec[word_to_id[word]] = torch.FloatTensor(torch.from_numpy(vector))
            
    for word, id in word_to_id.items(): 
        if word_to_id[word] not in id_to_vec:
            v = np.zeros(*vector.shape, dtype='float32')
            v[:] = np.random.randn(*v.shape)*0.01
            id_to_vec[word_to_id[word]] = torch.FloatTensor(torch.from_numpy(v))
            
    embedding_dim = id_to_vec[0].shape[0]
    
    return id_to_vec, embedding_dim

def load_id(sentence, word_to_id):
    sentence_ids = []

    max_sentence_len = 160
    
    sentence_words = sentence.split()
    if len(sentence_words) > max_sentence_len:
        sentence_words = sentence_words[:max_sentence_len]
    for word in sentence_words:
        if word in word_to_id:
            sentence_ids.append(word_to_id[word])
        else: 
            sentence_ids.append(0) #UNK

    return sentence_ids

In [8]:
class Voc:
    def __init__(self):
        self.vocab = {}
        self.sentences = []
        self.word2id = {}
        self.id2vec = None
        
    def save(self):
        torch.save({
                'voc_dict': self.__dict__,
            }, os.path.join('saveDir', 'save_voc.tar'))
    
    def load(self, filename):
        checkpoint = torch.load(filename)
        self.__dict__ = checkpoint['voc_dict']

In [9]:
def prepareData(embedding_dim):
    print("Start preparing training data ...")
    raw_dataset = load_dataset("conv_ai_2")
    raw_dataset1 = load_dataset("daily_dialog")
    train_dataset = raw_dataset["train"]
    train_dataset1 = raw_dataset1["train"]
    raw_dialog_list = train_dataset["dialog"]
    raw_dialog_list1 = train_dataset1["dialog"]
    
    eval_list = train_dataset["eval_score"]
    eval_list1 = [2 for i in range(len(raw_dialog_list1))]
    
    dialog_pairs = get_dialog_pairs(raw_dialog_list, eval_list)
    dialog_pairs1 = get_dialog_pairs(raw_dialog_list1, eval_list1)
    dialog_pairs.extend(dialog_pairs1)
    #dialog_pairs = [[normalizeString(s) for s in l] for l in dialog_pairs]
    only_dialog_pairs = []
    for i in range(len(dialog_pairs)):
        only_dialog_pairs.append(dialog_pairs[i][0:2])
    
    dialog_indiv = get_dialog(raw_dialog_list)
    dialog_indiv1 = get_dialog(raw_dialog_list1)
    dialog_indiv.extend(dialog_indiv1)
    
    #dialog_indiv = [[normalizeString(s) for s in l] for l in dialog_indiv]
    trimmed_sentences = []
    for s in dialog_indiv:
        if isinstance(s[0], str) and len(s[0].split()) < 15:
            trimmed_sentences.append(s)
    
    vocab = create_vocab(trimmed_sentences)
    voc = Voc()
    voc.vocab = vocab
    voc.sentences = trimmed_sentences
    shuffle_list(dialog_pairs)
    
    for pair in dialog_pairs:
        if len(pair[0].split()) <= 0:
            pair[0] = 'Oh'
        if len(pair[1].split()) <= 0:
            pair[1] = 'Oh'
            
    # Trim pairs to max 14 words and under
    trimmed_pairs = []
    for pair in dialog_pairs:
        if isinstance(pair[0], str) and len(pair[0].split()) < 15 and isinstance(pair[1], str) and len(pair[1].split()) < 15:
            trimmed_pairs.append(pair)
    
    #training_data = trimmed_pairs
    training_data = trimmed_pairs[:-(int(len(trimmed_pairs) / 10))]
    
    word_to_id = create_word_to_id(vocab)
    voc.word2id = word_to_id
    id_to_vec, emb_dim = create_id_to_vec(word_to_id, 'saveDir/GloVe/glove.6B.%dd.txt' %embedding_dim)
    voc.id2vec = id_to_vec
    voc.save()

    validation_data = trimmed_pairs[-(int(len(trimmed_pairs) / 10)):]
    
    return training_data, validation_data, voc

In [10]:
PAD_token = 0  
SOS_token = 1 
EOS_token = 2

def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

# Returns padded sequence tensor and lengths
def sequenceVar(l, word_to_id):
    indexes_batch = [load_id(sentence, word_to_id) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns all items for a given batch of pairs
def batch2TrainData(pair_batch, word_to_id):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch, labels = [], [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
        labels.append(pair[2])
    inp, lengths = sequenceVar(input_batch, word_to_id)
    output, lengths1 = sequenceVar(output_batch, word_to_id)
    return inp, lengths, output, lengths1, labels

In [11]:
class Encoder(nn.Module):

    def __init__(self, 
            hidden_size,
            vocab_size,
            id_to_vec,
            embedding,
            p_dropout): 
    
            super(Encoder, self).__init__()
            self.hidden_size = hidden_size
            self.vocab_size = vocab_size
            self.id_to_vec = id_to_vec
            self.p_dropout = p_dropout
            self.embedding = embedding
            self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
            self.dropout_layer = nn.Dropout(self.p_dropout) 

            self.init_weights()
            
    def init_weights(self):
        init.uniform(self.lstm.weight_ih_l0, a = -0.01, b = 0.01)
        init.orthogonal(self.lstm.weight_hh_l0)
        self.lstm.weight_ih_l0.requires_grad = True
        self.lstm.weight_hh_l0.requires_grad = True
        
        embedding_weights = torch.FloatTensor(self.vocab_size, self.hidden_size)
        embedding_weights = embedding_weights.to(device)
        for id, vec in self.id_to_vec.items():
            embedding_weights[id] = vec
        
        self.embedding.weight = nn.Parameter(embedding_weights, requires_grad = True)
            
    def forward(self, input_seq, input_lengths, hidden=None):
        embeddings = self.embedding(input_seq)
        embeddings = embeddings.to(device)
        packed = nn.utils.rnn.pack_padded_sequence(embeddings, input_lengths, enforce_sorted = False)
        
        _, (last_hidden, _) = self.lstm(packed, hidden)
        last_hidden = self.dropout_layer(last_hidden[-1])
        return last_hidden
    
class DualEncoder(nn.Module):
     
    def __init__(self, encoder):
        super(DualEncoder, self).__init__()
        self.encoder = encoder
        self.hidden_size = self.encoder.hidden_size
        M = torch.FloatTensor(self.hidden_size, self.hidden_size)
        M = M.to(device)
        init.xavier_normal(M)
        self.M = nn.Parameter(M, requires_grad = True)

    def forward(self, input_seq, response_seq, input_lengths, response_lengths):
        context_last_hidden = self.encoder(input_seq, input_lengths)
        context_last_hidden = context_last_hidden.to(device)
        response_last_hidden = self.encoder(response_seq, response_lengths)
        response_last_hidden = response_last_hidden.to(device)
        
        context = context_last_hidden.mm(self.M)
        context = context.view(-1, 1, self.hidden_size)
        context = context.to(device)
        
        response = response_last_hidden.view(-1, self.hidden_size, 1) 
        response = response.to(device)
        
        score = torch.bmm(context, response).view(-1, 1)
        score = score.to(device)

        return score

In [12]:
def creating_model(hidden_size, vocab_size, id_to_vec, embedding, p_dropout):

    encoder = Encoder(
            hidden_size = hidden_size,
            vocab_size = vocab_size,
            id_to_vec = id_to_vec,
            embedding = embedding,
            p_dropout = p_dropout)

    dual_encoder = DualEncoder(encoder)
    
    return encoder, dual_encoder

In [13]:
def increase_count(score, label):
    corr = False
    if ((score >= 0.5) and (label >= 1)) or ((score < 0.5) and (label  <= 0)):
        corr = True
    return corr


def get_accuracy(correct_count, length):
    accuracy = correct_count/length
        
    return accuracy

In [14]:
def train(context, response, lengths, lengths1, labels, c_encoder, optimizer, loss_func, is_train = True):

    if is_train:
        # Zero gradients
        optimizer.zero_grad()

    # Set device options
    context = context.to(device)
    response = response.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")
    lengths1 = lengths1.to("cpu")

    # Initialize variables
    loss = 0
    sum_loss = 0.0
    correct_count = 0

    # Forward pass through encoder
    score = c_encoder(context, response, lengths, lengths1)
    total_count = 0
    for n in range(len(score)):
        total_count += 1
        label = autograd.Variable(torch.DoubleTensor(torch.from_numpy(np.array(labels[n]).astype(float).reshape(1,1))), requires_grad = False)
        label = label.to(device)
        loss = loss_func(score[n].reshape(1,1), label)
        sum_loss += loss.item()
        if increase_count(score[n].item(), label.item()):
            correct_count += 1

    if is_train:
        # Perform backpropatation
        loss.backward()
        # Adjust model weights
        optimizer.step()

    return loss, correct_count, total_count

In [15]:
def trainIterAll(training_data, validation_data, voc, c_encoder, learning_rate, l2_penalty, n_iteration, batch_size, epochs):

    optimizer = torch.optim.Adam(c_encoder.parameters(), lr = learning_rate, weight_decay = l2_penalty)
    loss_func = torch.nn.BCEWithLogitsLoss()
    loss_func = loss_func.to(device)
    total_training_accuracy = 0
    best_validation_accuracy = 0
    best_training_accuracy = 0
    
    word_to_id = voc.word2id
    
    for epoch in range(epochs):
        
        training_batches = [batch2TrainData([random.choice(training_data) for _ in range(batch_size)], word_to_id)
                          for _ in range(n_iteration)]

        validation_batches = [batch2TrainData([random.choice(validation_data) for _ in range(batch_size)], word_to_id)
                          for _ in range(n_iteration)]

        # Initializations
        start_iteration = 1
        print_loss = 0

        sum_loss_training = 0.0
        training_correct_count = 0
        training_total_count = 0
        #c_encoder.train()

        # Training loop
        c_encoder.train()
        i = 0
        for iteration in range(start_iteration, n_iteration + 1):
            if (i % 1000 == 0):
                print('Iteration ', i)
            i += 1
            training_batch = training_batches[iteration - 1]
            # Extract fields from batch
            context, lengths, response, lengths1, labels = training_batch

            # Run a training iteration with batch
            loss, correct_count, total_count = train(context, response, lengths, lengths1, labels, c_encoder, optimizer, loss_func)
            training_correct_count += correct_count
            training_total_count += total_count
            sum_loss_training += loss
            print_loss += loss

            # Print progress
            if iteration % 1000 == 0:
                print_loss_avg = print_loss / 1000
                print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
                print('Correct count: ', training_correct_count)
                print('Total count: ', training_total_count)
                print_loss = 0

        training_accuracy = get_accuracy(training_correct_count, training_total_count)
        
        c_encoder.eval()
        
        # Iterate through validation set
        validation_correct_count = 0
        validation_total_count = 0
        sum_loss_validation = 0.0
        i = 0
        for iteration in range(start_iteration, n_iteration + 1):
            if (i % 1000 == 0):
                print('Iteration val ', i)
            i += 1
            validation_batch = validation_batches[iteration - 1]
            # Extract fields from batch
            context, lengths, response, lengths1, labels = validation_batch

            # Run a training iteration with batch
            loss, correct_count, total_count = train(context, response, lengths, lengths1, labels, c_encoder, optimizer, loss_func, False)
            sum_loss_validation += loss
            validation_correct_count += correct_count
            validation_total_count += total_count

        validation_accuracy = get_accuracy(validation_correct_count, validation_total_count)

        print(str(datetime.datetime.now()).split('.')[0], 
              "Epoch: %d/%d" %(epoch,epochs),  
              "Train Accuracy: %.3f" %(training_accuracy), 
              "Validation Accuracy: %.3f" %(validation_accuracy))

        # Saving best result
        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
            print("Saving new best")
            torch.save({
                    'en': c_encoder.state_dict(),
                    'opt': optimizer.state_dict(),
                    'voc_dict': voc.__dict__
                }, os.path.join('saveDir', 'retrieval_model_Val4.tar'))
                
        # Saving best train result
        if training_accuracy > best_training_accuracy:
            best_training_accuracy = training_accuracy
            torch.save({
                        'en': c_encoder.state_dict(),
                        'opt': optimizer.state_dict(),
                        'voc_dict': voc.__dict__
                }, os.path.join('saveDir', 'retrieval_model_Train4.tar'))
        
    print(str(datetime.datetime.now()).split('.')[0], "Training and validation epochs finished.")
    

In [16]:
n_iteration = 8000
batch_size = 64
training_data, validation_data, voc = prepareData(50)

Start preparing training data ...


Reusing dataset conv_ai_2 (C:\Users\justi\.cache\huggingface\datasets\conv_ai_2\conv_ai_2\1.0.0\11d600ddce66bb9d07ca50d1b55b488145ef0d5d0206168c32f1043677875865)
Using custom data configuration default
Reusing dataset daily_dialog (C:\Users\justi\.cache\huggingface\datasets\daily_dialog\default\1.0.0\c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c)


In [17]:
hidden_size = 100
vocab_size = len(voc.vocab)
id_to_vec = voc.id2vec
embedding = nn.Embedding(vocab_size, hidden_size)
encoder, dual_encoder = creating_model(hidden_size, vocab_size, id_to_vec, embedding, p_dropout = 0.85)

encoder = encoder.to(device)
dual_encoder = dual_encoder.to(device)

In [18]:
trainIterAll(training_data = training_data, validation_data = validation_data, voc = voc, c_encoder = dual_encoder, learning_rate = 0.0001, l2_penalty = 0.000001, n_iteration = n_iteration, batch_size = batch_size, epochs = 5)

Iteration  0
Iteration: 1000; Percent complete: 12.5%; Average loss: 0.5666
Correct count:  2441
Total count:  4000
Iteration  1000
Iteration: 2000; Percent complete: 25.0%; Average loss: 0.4819
Correct count:  5572
Total count:  8000
Iteration  2000
Iteration: 3000; Percent complete: 37.5%; Average loss: 0.4894
Correct count:  8730
Total count:  12000
Iteration  3000
Iteration: 4000; Percent complete: 50.0%; Average loss: 0.5043
Correct count:  11907
Total count:  16000
Iteration  4000
Iteration: 5000; Percent complete: 62.5%; Average loss: 0.4588
Correct count:  15176
Total count:  20000
Iteration  5000
Iteration: 6000; Percent complete: 75.0%; Average loss: 0.4486
Correct count:  18485
Total count:  24000
Iteration  6000
Iteration: 7000; Percent complete: 87.5%; Average loss: 0.4496
Correct count:  21810
Total count:  28000
Iteration  7000
Iteration: 8000; Percent complete: 100.0%; Average loss: 0.4486
Correct count:  25160
Total count:  32000
Iteration val  0
Iteration val  1000
It