# Dialogue Graph Auto Construction based on data with a regular structure


Goal: Extract regular structures from the data by building a dialogue graph
    
Tasks: 
* Cluster dialog data using embeddings of pre-trained models (BERT, ConveRT, S-BERT…)
* Evaluate the quality of the clustering using the intent labels of the MultiWOZ dataset
* Link clusters of dialogs using naive approaches (frequency-based probability estimation)
* Try other approaches (deep neural networks) for linking clusters and improve on the naive approach


In [1]:
from collections import Counter
from datasets import load_dataset
from dgl.dataloading import GraphDataLoader
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from torch import nn
from torch.nn import Linear
from torch.utils.data import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.nn import MessagePassing
from torch_scatter import scatter_add
import dgl
import dgl.nn.pytorch as dglnn
import math
import networkx as nx
import numpy as np
import os
import pandas as pd
import random
import sys
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F

2023-02-22 17:48:39.350488: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-22 17:48:40.544845: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-22 17:48:40.544961: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
# Order GPUs by PCI bus id and expose devices 0-3 to this process, then
# report how many devices torch can see.
# NOTE(review): these variables are presumably only honoured if they are set
# before CUDA is first initialised in this kernel — confirm nothing earlier
# touches CUDA.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0,1,2,3",
})
print(torch.cuda.device_count())

4


In [3]:
# Make the project's helper modules importable. The directory is inserted at
# position 1, i.e. right after the first entry of the module search path.
# NOTE(review): absolute path on the author's cluster — adjust when running elsewhere.
sys.path.insert(
    1,
    '/cephfs/home/ledneva/final_work/common_utils/',
)

In [4]:
from data_function import get_data
from functions_GTN import preprocessing
from early_stopping_tools import LRScheduler, EarlyStopping
from preprocess import Clusters, get_accuracy_k, get_all_accuracy_k

In [5]:
from model_fastgtn import FastGTNs

In [6]:
# Cluster counts for the two clustering stages. Both are passed to Clusters(...);
# the second one is also used as the number of target classes of the models.
first_num_clusters, second_num_clusters = 400, 60

In [7]:
# Location of the embeddings CSV consumed by Clusters.
# NOTE(review): absolute path on the author's cluster; presumably holds
# precomputed ConveRT embeddings (judging by the file name) — confirm.
path = "/cephfs/home/ledneva/final_work/convert_one_prev_embeddings.csv"

# Build the clustering object from both cluster counts and the embeddings
# file, then run the clustering itself.
clusters = Clusters(
    first_num_clusters,
    second_num_clusters,
    path,
)
clusters.form_clusters()

The data is loading...


No config specified, defaulting to: multi_woz_v22/v2.2_active_only
Found cached dataset multi_woz_v22 (/home/ledneva/.cache/huggingface/datasets/multi_woz_v22/v2.2_active_only/2.2.0/6719c8b21478299411a0c6fdb7137c3ebab2e6425129af831687fb7851c69eb5)


  0%|          | 0/3 [00:00<?, ?it/s]

The embeddings are loading...
The first stage of clustering has begun...
The second stage of clustering has begun...
The searching clusters for test and validation has begun...
Intent metric conveRT-one-prev, 60 two-stage clusters,                 user: 0.7370199324793312
Intent metric conveRT-one-prev, 60 two-stage clusters,                 system: 0.7554430398668559


## 4.1 GTN

Functions that generate batches for the two types of graphs, and the metric function

In [8]:
# Device handle used when moving the user model and the label tensors.
# NOTE(review): hard-coded to CUDA; there is no CPU fallback in this notebook.
device = torch.device('cuda')

In [9]:
# Shared settings forwarded to get_data(...) / preprocessing(...).
top_k, batch_size = 10, 512

# Embedding width, read off the first user-cluster embedding.
embs_dim = len(clusters.user_cluster_embs[0])

In [10]:
# Two all-zero placeholder vectors ("null" and "fake" clusters) with the same
# width as the real cluster embeddings.
null_cluster_emb = np.zeros(embs_dim)
fake_cluster_emb = np.zeros_like(null_cluster_emb)

# Embedding table: user clusters first, then system clusters, then the two
# placeholder rows at the very end.
embs = np.concatenate(
    [
        clusters.user_cluster_embs,
        clusters.system_cluster_embs,
        [null_cluster_emb, fake_cluster_emb],
    ]
)

In [11]:
# Build user/system inputs and targets for every split. Each call receives the
# split's dataset, top_k, the number of second-stage clusters and the split's
# user/system data frames.
user_train_x, user_train_y, sys_train_x, sys_train_y = get_data(
    clusters.train_dataset, top_k, second_num_clusters,
    clusters.train_user_df, clusters.train_system_df,
)
user_test_x, user_test_y, sys_test_x, sys_test_y = get_data(
    clusters.test_dataset, top_k, second_num_clusters,
    clusters.test_user_df, clusters.test_system_df,
)
user_valid_x, user_valid_y, sys_valid_x, sys_valid_y = get_data(
    clusters.validation_dataset, top_k, second_num_clusters,
    clusters.valid_user_df, clusters.valid_system_df,
)

In [12]:
# Turn the user training data into per-batch adjacency matrices, node
# embeddings and labels.
# NOTE(review): the trailing positional flag (1 here) is interpreted inside
# functions_GTN.preprocessing; the test split passes 0 — confirm what it toggles.
user_train_matrices, user_train_node_embs, user_train_labels = preprocessing(
    user_train_x, user_train_y, batch_size, top_k, embs, second_num_clusters, 1
)

100%|█████████████████████████████████████████| 111/111 [03:20<00:00,  1.80s/it]


In [13]:
# Turn the system training data into per-batch adjacency matrices, node
# embeddings and labels.
# NOTE(review): the trailing positional flag (1 here) is interpreted inside
# functions_GTN.preprocessing; the test split passes 0 — confirm what it toggles.
sys_train_matrices, sys_train_node_embs, sys_train_labels = preprocessing(
    sys_train_x, sys_train_y, batch_size, top_k, embs, second_num_clusters, 1
)

100%|█████████████████████████████████████████| 111/111 [03:17<00:00,  1.78s/it]


In [14]:
# Batch the test split for both speakers.
# NOTE(review): the trailing positional flag is 0 for the test split (train and
# validation pass 1); its meaning lives in functions_GTN.preprocessing — confirm.
user_test_matrices, user_test_node_embs, user_test_labels = preprocessing(
    user_test_x, user_test_y, batch_size, top_k, embs, second_num_clusters, 0
)
sys_test_matrices, sys_test_node_embs, sys_test_labels = preprocessing(
    sys_test_x, sys_test_y, batch_size, top_k, embs, second_num_clusters, 0
)

100%|███████████████████████████████████████████| 15/15 [00:26<00:00,  1.77s/it]
100%|███████████████████████████████████████████| 15/15 [00:25<00:00,  1.69s/it]


In [15]:
# Batch the validation split for both speakers.
# NOTE(review): the trailing positional flag is 1 here, as for the training
# split; its meaning lives in functions_GTN.preprocessing — confirm.
user_valid_matrices, user_valid_node_embs, user_valid_labels = preprocessing(
    user_valid_x, user_valid_y, batch_size, top_k, embs, second_num_clusters, 1
)
sys_valid_matrices, sys_valid_node_embs, sys_valid_labels = preprocessing(
    sys_valid_x, sys_valid_y, batch_size, top_k, embs, second_num_clusters, 1
)

100%|███████████████████████████████████████████| 15/15 [00:27<00:00,  1.81s/it]
100%|███████████████████████████████████████████| 15/15 [00:26<00:00,  1.76s/it]


## User model

In [43]:
class user_GTN_arguments:
    """Hyper-parameter namespace passed to ``FastGTNs`` as ``args`` (user model).

    Every setting is a plain class attribute. How the architecture-related
    values are interpreted is decided inside ``model_fastgtn``, which is not
    part of this notebook, so the grouping below is descriptive only.
    """

    # Model selection and architecture sizes.
    model = 'FastGTN'
    node_dim = 512
    num_channels = 5
    num_layers = 3
    num_FastGTN_layers = 2
    channel_agg = 'mean'

    # Graph-handling switches.
    remove_self_loops = False
    beta = 1
    non_local = False
    non_local_weight = 0

    # Optimisation schedule.
    epoch = 30  # upper bound on epochs; the training loop iterates range(epoch)
    lr = 0.0005  # learning rate handed to Adam
    # NOTE(review): weight_decay is not forwarded to the optimizer in the
    # cells of this notebook — confirm whether that is intended.
    weight_decay = 0.0005

    top_k = 10

In [44]:
# Instantiate the settings object and record the node count, taken from the
# first dimension of the first training batch of node embeddings.
first_user_batch = user_train_node_embs[0]
user_args = user_GTN_arguments()
user_args.num_nodes = first_user_batch.shape[0]

In [45]:
# Classification loss for the user model.
user_loss = nn.CrossEntropyLoss()

# User-side FastGTNs classifier. Input width and node count are read from the
# first training batch; the number of classes equals the number of
# second-stage clusters.
user_model = FastGTNs(
    num_edge_type=4,
    w_in=user_train_node_embs[0].shape[1],
    num_class=second_num_clusters,
    num_nodes=user_train_node_embs[0].shape[0],
    args=user_args,
)
user_model.to(device)

In [46]:
# NOTE(review): ReduceLROnPlateau is imported here but not referenced in the
# cells of this notebook; LRScheduler comes from early_stopping_tools.
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Adam over all parameters of the user model, using the learning rate from
# user_args.
user_optimizer = torch.optim.Adam(
    params=user_model.parameters(),
    lr=user_args.lr,
)

# Project helpers that are fed the validation loss after every epoch.
user_lr_scheduler = LRScheduler(user_optimizer)
user_early_stopping = EarlyStopping()

In [47]:
# Train the user-side model for at most user_args.epoch epochs, validating
# after every epoch and stopping as soon as the validation loss plateaus or
# gets worse.
train_num_batches = len(user_train_matrices)
valid_num_batches = len(user_valid_matrices)
old_valid_loss = np.inf

for epoch in range(user_args.epoch):
    # ---- training pass ----
    user_model.train()
    train_epoch_loss = 0.0
    train_batches_used = 0

    for num_iter in range(train_num_batches):
        A = user_train_matrices[num_iter]
        node_features = user_train_node_embs[num_iter]
        y_true = torch.from_numpy(user_train_labels[num_iter]).to(device)

        # Label -1 marks fake targets; they are dropped before the loss.
        real_mask = y_true != -1
        if not real_mask.any():
            # A mean-reduced cross-entropy over zero rows is NaN, which would
            # poison the epoch average — skip batches without real targets.
            continue

        user_optimizer.zero_grad()
        y_train = user_model(A, node_features, epoch=epoch)
        train_loss = user_loss(y_train[real_mask], y_true[real_mask])

        train_loss.backward()
        user_optimizer.step()

        train_epoch_loss += train_loss.item()
        train_batches_used += 1

    # Average over the batches that actually contributed a loss value.
    train_epoch_loss /= max(train_batches_used, 1)

    # ---- validation pass ----
    # Switch to evaluation mode so the validation loss is measured under the
    # same conditions as the test pass (which also calls .eval()).
    user_model.eval()
    valid_epoch_loss = 0.0
    valid_batches_used = 0

    with torch.no_grad():
        for num_iter in range(valid_num_batches):
            A = user_valid_matrices[num_iter]
            node_features = user_valid_node_embs[num_iter]
            y_true = torch.from_numpy(user_valid_labels[num_iter]).to(device)

            # Same fake-target filtering as in training.
            real_mask = y_true != -1
            if not real_mask.any():
                continue

            y_valid = user_model(A, node_features, epoch=epoch)
            valid_loss = user_loss(y_valid[real_mask], y_true[real_mask])

            valid_epoch_loss += valid_loss.item()
            valid_batches_used += 1

    valid_epoch_loss /= max(valid_batches_used, 1)

    # Log before any stop check so the epoch that triggers the stop is visible.
    print(f'Epoch {epoch}, train loss {train_epoch_loss:.4f}, valid loss {valid_epoch_loss:.4f}')

    # Stop on the first epoch whose validation loss changed by less than 1e-4
    # or increased.
    # NOTE(review): this check fires on the first non-improving epoch, so it
    # can pre-empt the LRScheduler / EarlyStopping helpers below — confirm
    # which criterion is intended.
    if abs(valid_epoch_loss - old_valid_loss) < 1e-4 or old_valid_loss < valid_epoch_loss:
        break
    old_valid_loss = valid_epoch_loss

    user_lr_scheduler(valid_epoch_loss)
    user_early_stopping(valid_epoch_loss)

    if user_early_stopping.early_stop:
        break

Epoch 0, train loss 3.5486, valid loss 3.0496
Epoch 1, train loss 2.6952, valid loss 2.5077
Epoch 2, train loss 2.4510, valid loss 2.4064
Epoch 3, train loss 2.3867, valid loss 2.3713
Epoch 4, train loss 2.3542, valid loss 2.3506
Epoch 5, train loss 2.3312, valid loss 2.3346
Epoch 6, train loss 2.3123, valid loss 2.3202
Epoch 7, train loss 2.2955, valid loss 2.3060
Epoch 8, train loss 2.2797, valid loss 2.2917
Epoch 9, train loss 2.2647, valid loss 2.2780
Epoch 10, train loss 2.2505, valid loss 2.2653
Epoch 11, train loss 2.2375, valid loss 2.2541
Epoch 12, train loss 2.2256, valid loss 2.2442
Epoch 13, train loss 2.2146, valid loss 2.2352
Epoch 14, train loss 2.2044, valid loss 2.2270
Epoch 15, train loss 2.1947, valid loss 2.2196
Epoch 16, train loss 2.1857, valid loss 2.2124
Epoch 17, train loss 2.1771, valid loss 2.2057
Epoch 18, train loss 2.1690, valid loss 2.1999
Epoch 19, train loss 2.1613, valid loss 2.1939
Epoch 20, train loss 2.1540, valid loss 2.1881
Epoch 21, train loss 2.

In [48]:
# Run the user model over the test batches and collect, for every real
# (label != -1) target, the softmax class probabilities and the true label.
user_model.eval()
test_num_batches = len(user_test_matrices)
user_true, user_test = [], []

with torch.no_grad():
    for num_iter in range(test_num_batches):
        A = user_test_matrices[num_iter]
        node_features = user_test_node_embs[num_iter]
        y_true = torch.from_numpy(user_test_labels[num_iter])
        y_test = torch.softmax(user_model.forward(A, node_features), dim=1)

        # Selecting with the mask is a no-op when the batch holds no -1
        # labels, so a single code path covers both cases.
        keep = y_true != -1
        user_test.extend(y_test[keep].tolist())
        user_true.extend(y_true[keep].tolist())

In [49]:
# Report top-k accuracy of the user model on the test split.
# NOTE(review): the trailing 0 is presumably the speaker flag (the system
# report passes 1) — confirm in preprocess.get_accuracy_k.
print("USER metric")

for k in (1, 3, 5, 10):
    print(f"Acc@{k}:", get_accuracy_k(k, clusters.test_user_df, user_test, clusters.test_dataset, 0))

USER metric
Acc@1: 0.39411810411810416
Acc@3: 0.6570222166722167
Acc@5: 0.7709083694083694
Acc@10: 0.9035682484182485


## System model

In [50]:
class sys_GTN_arguments:
    """Hyper-parameter namespace passed to ``FastGTNs`` as ``args`` (system model).

    Every setting is a plain class attribute. How the architecture-related
    values are interpreted is decided inside ``model_fastgtn``, which is not
    part of this notebook, so the grouping below is descriptive only.
    """

    # Model selection and architecture sizes.
    model = 'FastGTN'
    node_dim = 512
    num_channels = 5
    num_layers = 3
    num_FastGTN_layers = 2
    channel_agg = 'mean'

    # Graph-handling switches.
    remove_self_loops = False
    beta = 1
    non_local = False
    non_local_weight = 0

    # Optimisation schedule.
    epoch = 30  # upper bound on epochs; the training loop iterates range(epoch)
    lr = 0.0005  # learning rate handed to Adam
    # NOTE(review): weight_decay is not forwarded to the optimizer in the
    # cells of this notebook — confirm whether that is intended.
    weight_decay = 0.0005

    top_k = 10

In [51]:
# Instantiate the settings object and record the node count, taken from the
# first dimension of the first training batch of node embeddings.
first_sys_batch = sys_train_node_embs[0]
sys_args = sys_GTN_arguments()
sys_args.num_nodes = first_sys_batch.shape[0]

In [52]:
# System-side FastGTNs classifier. Input width and node count are read from
# the first training batch; the number of classes equals the number of
# second-stage clusters.
sys_model = FastGTNs(num_edge_type = 4,
                w_in = sys_train_node_embs[0].shape[1],
                num_class=second_num_clusters, # TODO: figure out how the fake targets should be handled
                num_nodes = sys_train_node_embs[0].shape[0],
                args = sys_args)

# Move the model to the notebook-wide device *before* building the optimizer
# (torch.optim guidance), and use `device` rather than a hard-coded .cuda()
# so this cell matches the user-model setup.
sys_model.to(device)

# Adam over all parameters of the system model, plus the project helpers that
# are fed the validation loss after every epoch.
sys_optimizer = torch.optim.Adam(sys_model.parameters(), lr=sys_args.lr)
sys_lr_scheduler = LRScheduler(sys_optimizer)
sys_early_stopping = EarlyStopping()

# Classification loss for the system model.
sys_loss = nn.CrossEntropyLoss()

In [53]:
# Train the system-side model for at most sys_args.epoch epochs, validating
# after every epoch and stopping as soon as the validation loss plateaus or
# gets worse.
train_num_batches = len(sys_train_matrices)
valid_num_batches = len(sys_valid_matrices)
old_valid_loss = np.inf

for epoch in range(sys_args.epoch):
    # ---- training pass ----
    sys_model.train()
    train_epoch_loss = 0.0
    train_batches_used = 0

    for num_iter in range(train_num_batches):
        A = sys_train_matrices[num_iter]
        node_features = sys_train_node_embs[num_iter]
        y_true = torch.from_numpy(sys_train_labels[num_iter]).to(device)

        # Label -1 marks fake targets; they are dropped before the loss.
        real_mask = y_true != -1
        if not real_mask.any():
            # A mean-reduced cross-entropy over zero rows is NaN, which would
            # poison the epoch average — skip batches without real targets.
            continue

        sys_optimizer.zero_grad()
        y_train = sys_model(A, node_features, epoch=epoch)
        train_loss = sys_loss(y_train[real_mask], y_true[real_mask])

        train_loss.backward()
        sys_optimizer.step()

        train_epoch_loss += train_loss.item()
        train_batches_used += 1

    # Average over the batches that actually contributed a loss value.
    train_epoch_loss /= max(train_batches_used, 1)

    # ---- validation pass ----
    # Switch to evaluation mode so the validation loss is measured under the
    # same conditions as the test pass (which also calls .eval()).
    sys_model.eval()
    valid_epoch_loss = 0.0
    valid_batches_used = 0

    with torch.no_grad():
        for num_iter in range(valid_num_batches):
            A = sys_valid_matrices[num_iter]
            node_features = sys_valid_node_embs[num_iter]
            y_true = torch.from_numpy(sys_valid_labels[num_iter]).to(device)

            # Same fake-target filtering as in training.
            real_mask = y_true != -1
            if not real_mask.any():
                continue

            y_valid = sys_model(A, node_features, epoch=epoch)
            valid_loss = sys_loss(y_valid[real_mask], y_true[real_mask])

            valid_epoch_loss += valid_loss.item()
            valid_batches_used += 1

    valid_epoch_loss /= max(valid_batches_used, 1)

    # Log before any stop check so the epoch that triggers the stop is visible.
    print(f'Epoch {epoch}, train loss {train_epoch_loss:.4f}, valid loss {valid_epoch_loss:.4f}')

    # Stop on the first epoch whose validation loss changed by less than 1e-4
    # or increased.
    # NOTE(review): this check fires on the first non-improving epoch, so it
    # can pre-empt the LRScheduler / EarlyStopping helpers below — confirm
    # which criterion is intended.
    if abs(valid_epoch_loss - old_valid_loss) < 1e-4 or old_valid_loss < valid_epoch_loss:
        break
    old_valid_loss = valid_epoch_loss

    sys_lr_scheduler(valid_epoch_loss)
    sys_early_stopping(valid_epoch_loss)

    if sys_early_stopping.early_stop:
        break

Epoch 0, train loss 3.4166, valid loss 2.8127
Epoch 1, train loss 2.3104, valid loss 2.1885
Epoch 2, train loss 2.0958, valid loss 2.1332
Epoch 3, train loss 2.0651, valid loss 2.1182
Epoch 4, train loss 2.0506, valid loss 2.1131
Epoch 5, train loss 2.0414, valid loss 2.1103
Epoch 6, train loss 2.0344, valid loss 2.1083
Epoch 7, train loss 2.0285, valid loss 2.1062
Epoch 8, train loss 2.0232, valid loss 2.1045
Epoch 9, train loss 2.0184, valid loss 2.1025
Epoch 10, train loss 2.0137, valid loss 2.1007
Epoch 11, train loss 2.0091, valid loss 2.0984
Epoch 12, train loss 2.0044, valid loss 2.0961
Epoch 13, train loss 1.9995, valid loss 2.0934
Epoch 14, train loss 1.9944, valid loss 2.0906
Epoch 15, train loss 1.9892, valid loss 2.0876
Epoch 16, train loss 1.9838, valid loss 2.0846
Epoch 17, train loss 1.9784, valid loss 2.0813
Epoch 18, train loss 1.9730, valid loss 2.0779
Epoch 19, train loss 1.9677, valid loss 2.0746
Epoch 20, train loss 1.9624, valid loss 2.0713
Epoch 21, train loss 1.

In [54]:
# Run the system model over the test batches and collect, for every real
# (label != -1) target, the softmax class probabilities and the true label.
sys_model.eval()
test_num_batches = len(sys_test_matrices)
sys_true, sys_test = [], []

with torch.no_grad():
    for num_iter in range(test_num_batches):
        A = sys_test_matrices[num_iter]
        node_features = sys_test_node_embs[num_iter]
        y_true = torch.from_numpy(sys_test_labels[num_iter])
        y_test = torch.softmax(sys_model.forward(A, node_features), dim=1)

        # Selecting with the mask is a no-op when the batch holds no -1
        # labels, so a single code path covers both cases.
        keep = y_true != -1
        sys_test.extend(y_test[keep].tolist())
        sys_true.extend(y_true[keep].tolist())

In [55]:
# Report top-k accuracy of the system model on the test split.
# NOTE(review): the trailing 1 is presumably the speaker flag (the user
# report passes 0) — confirm in preprocess.get_accuracy_k.
print("SYSTEM metric")

for k in (1, 3, 5, 10):
    print(f"Acc@{k}:", get_accuracy_k(k, clusters.test_system_df, sys_test, clusters.test_dataset, 1))

SYSTEM metric
Acc@1: 0.3633423909423909
Acc@3: 0.7104068958818959
Acc@5: 0.8322004134754134
Acc@10: 0.9161302558552559


In [56]:
# Report top-k accuracy over user and system predictions together.
print("ALL metric")
for k in (1, 3, 5, 10):
    print(
        f"Acc@{k}:",
        get_all_accuracy_k(
            k,
            clusters.test_user_df,
            clusters.test_system_df,
            user_test,
            sys_test,
            clusters.test_dataset,
        ),
    )


ALL metric
Acc@1: 0.37873024753024753
Acc@3: 0.6837145562770564
Acc@5: 0.8015543914418916
Acc@10: 0.9098492521367522
