# Dialogue Graph Auto Construction based on data with a regular structure


Goal: Extract regular structures from the data by building a dialogue graph
    
Tasks: 
* Cluster dialogue data using embeddings from pre-trained models (BERT, ConveRT, S-BERT…)
* Evaluate the quality of the clustering using the intent labels of the MultiWOZ dataset
* Link dialogue clusters using naive approaches (frequency-based probability estimates)
* Try other approaches (deep neural networks) for linking clusters and improve on the naive approach


In [None]:
from datasets import load_dataset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from collections import Counter
from torch import nn
from torch_geometric.data import Data
from torch_scatter import scatter_add
from torch_geometric.nn import MessagePassing
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from dgl.dataloading import GraphDataLoader
from torch.utils.data import DataLoader
from tqdm import tqdm

import dgl
import dgl.nn.pytorch as dglnn
import torch.nn as nn
import pandas as pd
import numpy as np
import networkx as nx
import sys
import os
import torch
import math
import tensorflow as tf
import random
import torch.nn.functional as F
import time

In [None]:
# Order GPUs by PCI bus id and restrict this process to devices 1, 2 and 3,
# then print how many CUDA devices PyTorch reports.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "1,2,3",
    }
)
print(torch.cuda.device_count())

In [None]:
# Put the directory with the project's helper modules on the import path.
# The path is absolute and machine-specific; adjust it when running elsewhere.
sys.path.insert(1, '/cephfs/home/ledneva/final_work/common_utils/')

In [None]:
from data_function import get_data
from HGT_functions import get_data_dgl
from HGT_model import HGT
from early_stopping_tools import LRScheduler, EarlyStopping

In [None]:
# Cluster counts passed to `Clusters(...)`. `second_num_clusters` is also used
# further down to size the model output and to derive the null-cluster index.
first_num_clusters, second_num_clusters = 400, 60

In [None]:
from preprocess import Clusters, get_accuracy_k, get_all_accuracy_k
# CSV handed to `Clusters`; judging by its name it holds precomputed
# embeddings — TODO confirm. Absolute, machine-specific path.
path = "/cephfs/home/ledneva/final_work/convert_one_prev_embeddings.csv"
# `clusters` supplies the datasets, dataframes and embeddings read by the
# cells below once `form_clusters()` has run.
clusters = Clusters(first_num_clusters, second_num_clusters, path)
clusters.form_clusters()

## 4.1 HGT

Functions generating batches for the two types of graphs, and the metric function

In [None]:
import dgl
from torch.utils.data import DataLoader
import dgl.nn.pytorch as dglnn
import torch.nn as nn
from tqdm import tqdm

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Settings shared by the data preparation cells and both models.
top_k, batch_size = 10, 256
# Length of one cluster embedding, read off the first user cluster.
embs_dim = len(clusters.user_cluster_embs[0])
# Twice the number of clusters; node types are later created for
# `range(null_cluster + 2)`.
null_cluster = second_num_clusters * 2

In [None]:
def collate(samples):
    """Merge ``(graph, label)`` samples into one batched graph and a label tensor.

    Args:
        samples: Iterable of ``(graph, label)`` pairs.

    Returns:
        A tuple ``(batched_graph, labels)``: the result of ``dgl.batch`` over
        the graphs, and the labels as a tensor moved to the global ``device``.
    """
    graph_seq, label_seq = zip(*samples)
    merged = dgl.batch(list(graph_seq))
    label_tensor = torch.tensor(list(label_seq))
    return merged, label_tensor.to(device)

In [None]:
# All-zero vector used as the embedding of the two extra "null" rows.
null_cluster_emb = np.zeros(embs_dim)

# Embedding table: user cluster rows, then system cluster rows, then two
# all-zero rows.
embs = np.concatenate(
    (
        clusters.user_cluster_embs,
        clusters.system_cluster_embs,
        [null_cluster_emb] * 2,
    )
)

## 4.4 Preprocessing data

Data generation and preparation

In [None]:
# Run `get_data` once per split (train / test / validation). Each call yields
# six values: x, y and embs for the user side, then the same for the system side.
(
    user_train_x, user_train_y, user_train_embs,
    sys_train_x, sys_train_y, sys_train_embs,
) = get_data(
    clusters.train_dataset, top_k, second_num_clusters,
    clusters.train_user_df, clusters.train_system_df,
    clusters.train_user_embs, clusters.train_system_embs,
)

(
    user_test_x, user_test_y, user_test_embs,
    sys_test_x, sys_test_y, sys_test_embs,
) = get_data(
    clusters.test_dataset, top_k, second_num_clusters,
    clusters.test_user_df, clusters.test_system_df,
    clusters.test_user_embs, clusters.test_system_embs,
)

(
    user_valid_x, user_valid_y, user_valid_embs,
    sys_valid_x, sys_valid_y, sys_valid_embs,
) = get_data(
    clusters.validation_dataset, top_k, second_num_clusters,
    clusters.valid_user_df, clusters.valid_system_df,
    clusters.valid_user_embs, clusters.valid_system_embs,
)

In [None]:
# Node type name -> integer id. Names are the stringified ids
# "0" .. str(null_cluster + 1), so every id equals the number in its name.
node_dict = {str(ntype): ntype for ntype in range(null_cluster + 2)}

# Edge type name -> integer id.
edge_dict = {'user': 0, 'system': 1, 'null': 2, 'self': 3}

In [None]:
# Prepare the training split for the user and system models.
# NOTE(review): the trailing flag is 1 here and 0 for the test split; its
# meaning is defined in HGT_functions.get_data_dgl — confirm.
user_train = get_data_dgl(
    user_train_x, user_train_y, batch_size, top_k, embs,
    np.array(user_train_embs), second_num_clusters, 1,
)
sys_train = get_data_dgl(
    sys_train_x, sys_train_y, batch_size, top_k, embs,
    np.array(sys_train_embs), second_num_clusters, 1,
)

In [None]:
# Prepare the test split for the user and system models.
# NOTE(review): the trailing flag is 0 here and 1 for train/validation; its
# meaning is defined in HGT_functions.get_data_dgl — confirm.
user_test = get_data_dgl(
    user_test_x, user_test_y, batch_size, top_k, embs,
    np.array(user_test_embs), second_num_clusters, 0,
)
sys_test = get_data_dgl(
    sys_test_x, sys_test_y, batch_size, top_k, embs,
    np.array(sys_test_embs), second_num_clusters, 0,
)

In [None]:
# Prepare the validation split for the user and system models.
# NOTE(review): the trailing flag is 1 here, as for the training split; its
# meaning is defined in HGT_functions.get_data_dgl — confirm.
user_valid = get_data_dgl(
    user_valid_x, user_valid_y, batch_size, top_k, embs,
    np.array(user_valid_embs), second_num_clusters, 1,
)
sys_valid = get_data_dgl(
    sys_valid_x, sys_valid_y, batch_size, top_k, embs,
    np.array(sys_valid_embs), second_num_clusters, 1,
)

## User_model

In [None]:
class user_HGT_arguments:
    """Training settings for the user-side HGT model."""

    # Upper bound on the number of training epochs.
    epoch: int = 20

In [None]:
# Settings object whose `epoch` attribute bounds the user training loop.
user_args = user_HGT_arguments()

In [None]:
# User-side HGT classifier with `second_num_clusters + 1` output classes; the
# extra class id (`second_num_clusters`) is filtered out when the loss is computed.
user_model = HGT(
    node_dict=node_dict,
    edge_dict=edge_dict,
    n_inp=256,
    n_hid=512,
    n_out=second_num_clusters + 1,
    n_layers=3,
    n_heads=1,
    top_k=top_k,
    use_norm=True,
)

# Adam optimiser plus the project's LR scheduler wrapper (floor of 1e-6).
user_optimizer = torch.optim.Adam(user_model.parameters(), lr=1e-3)
user_lr_scheduler = LRScheduler(user_optimizer, min_lr=1e-6)
# user_early_stopping = EarlyStopping(1, 0.5)

# Move the model to the selected device and create the classification loss.
user_model.to(device)
user_loss = nn.CrossEntropyLoss()

In [None]:
from warnings import filterwarnings

# Suppress every warning for the rest of the session.
filterwarnings(action="ignore")

In [None]:
# Train the user-side model. Every epoch runs one pass over the training
# batches followed by a validation pass; training stops as soon as the
# validation loss is higher than in the previous epoch.
train_num_batches = len(user_train)
valid_num_batches = len(user_valid)
train_step = 1500
node_features = torch.from_numpy(embs).float().to(device)
pred_valid_loss = None

for epoch in range(user_args.epoch):
    start_time = time.time()
    train_epoch_loss = 0

    # Release cached GPU blocks once per epoch. Doing it on every batch only
    # slows training: the allocator already frees its cache and retries
    # before reporting an out-of-memory error.
    torch.cuda.empty_cache()

    # Re-enter training mode each epoch, because validation below switches
    # the model to eval mode.
    user_model.train()

    for num_iter in tqdm(range(train_num_batches)):
        batch = user_train[num_iter]
        g = batch[0].to(device)
        graphs = torch.tensor(batch[1]).to(device)
        y_true = torch.from_numpy(batch[2]).to(device)
        etypes = batch[3]
        vtypes = batch[4]

        # The optimizer owns exactly the model's parameters, so one
        # zero_grad call clears every gradient.
        user_optimizer.zero_grad()

        y_train = user_model(g, graphs, etypes, vtypes)

        # Targets equal to `second_num_clusters` are fake and are excluded
        # from the loss. When none are present the mask is all True.
        real = y_true != second_num_clusters
        train_loss = user_loss(y_train[real], y_true[real])

        train_loss.backward()
        torch.nn.utils.clip_grad_norm_(user_model.parameters(), 0.25)
        user_optimizer.step()
        train_epoch_loss += train_loss.item()
        train_step += 1
        # NOTE(review): `user_lr_scheduler` is never stepped; the original
        # per-step call `user_lr_scheduler.step(train_step)` was disabled.

    train_epoch_loss /= train_num_batches

    # Validate in eval mode so layers that behave differently in training
    # (e.g. dropout) do not add noise to the loss used for early stopping.
    user_model.eval()
    valid_epoch_loss = 0
    with torch.no_grad():
        for num_iter in tqdm(range(valid_num_batches)):
            batch = user_valid[num_iter]
            g = batch[0].to(device)
            graphs = torch.tensor(batch[1]).to(device)
            y_true = torch.from_numpy(batch[2]).to(device)
            etypes = batch[3]
            vtypes = batch[4]

            # Call the module itself rather than `.forward` so registered
            # hooks run.
            y_valid = user_model(g, graphs, etypes, vtypes)

            # Compute the loss here, dropping the fake targets.
            real = y_true != second_num_clusters
            valid_loss = user_loss(y_valid[real], y_true[real])
            valid_epoch_loss += valid_loss.item()

        valid_epoch_loss /= valid_num_batches

    print(f'Epoch {epoch}, train loss {train_epoch_loss:.4f}, \
           valid loss {valid_epoch_loss:.4f}, time {time.time() - start_time}')

    # Early stopping: quit at the first epoch whose validation loss rises.
    if pred_valid_loss is not None and valid_epoch_loss > pred_valid_loss:
        break

    pred_valid_loss = valid_epoch_loss

In [None]:
# Evaluate the user-side model on the test batches, collecting the softmax
# class probabilities and true labels for every non-fake target.
user_model.eval()
test_num_batches = len(user_test)
user_true = []
user_pred = []

with torch.no_grad():
    for num_iter in tqdm(range(test_num_batches)):
        batch = user_test[num_iter]
        g = batch[0].to(device)
        graphs = torch.tensor(batch[1]).to(device)
        y_true = torch.from_numpy(batch[2]).to(device)
        etypes = batch[3]
        vtypes = batch[4]

        # Call the module itself rather than `.forward` so registered hooks
        # run. The output is raw logits; softmax turns them into probabilities.
        logits = user_model(g, graphs, etypes, vtypes)
        y_pred = torch.softmax(logits, 1)

        # Drop rows whose target is the fake label `second_num_clusters`.
        # When none are present the mask is all True and nothing is dropped.
        real = y_true != second_num_clusters
        user_pred += y_pred[real].tolist()
        user_true += y_true[real].tolist()

In [None]:
# Report Acc@k of the user-side predictions for k = 1, 3, 5, 10.
print("USER metric")

for k in (1, 3, 5, 10):
    print(f"Acc@{k}:", get_accuracy_k(k, clusters.test_user_df, user_pred, clusters.test_dataset, 0))

In [None]:
# stopper

## System

In [None]:
class sys_HGT_arguments:
    """Training settings for the system-side HGT model."""

    # Upper bound on the number of training epochs.
    epoch: int = 5
    # Learning rate passed to the Adam optimiser.
    lr: float = 0.001

In [None]:
# Settings object supplying `epoch` and `lr` for the system model.
sys_args = sys_HGT_arguments()

In [None]:
# System-side HGT classifier with `second_num_clusters + 1` output classes.
# NOTE(review): n_inp=612 / n_hid=16 differ from the user model's 256 / 512 —
# confirm this is intentional.
sys_model = HGT(
    node_dict=node_dict,
    edge_dict=edge_dict,
    n_inp=612,
    n_hid=16,
    n_out=second_num_clusters + 1,
    n_layers=3,
    n_heads=1,
    top_k=top_k,
    use_norm=True,
)

# Adam optimiser plus the project's LR scheduler and early-stopping helpers.
# NOTE(review): the meaning of EarlyStopping's two positional arguments is
# defined in early_stopping_tools — confirm.
sys_optimizer = torch.optim.Adam(sys_model.parameters(), lr=sys_args.lr)
sys_lr_scheduler = LRScheduler(sys_optimizer, min_lr=1e-6)
sys_early_stopping = EarlyStopping(1, 0.1)

# Move the model to the selected device and create the classification loss.
sys_model.to(device)
sys_loss = nn.CrossEntropyLoss()

In [None]:
# Train the system-side model. Every epoch runs one pass over the training
# batches followed by a validation pass; the validation loss is fed to the
# LR scheduler and the early-stopping helper.
train_num_batches = len(sys_train)
valid_num_batches = len(sys_valid)
node_features = torch.from_numpy(embs).float().to(device)

for epoch in range(sys_args.epoch):
    start_time = time.time()
    train_epoch_loss = 0

    # Re-enter training mode each epoch, because validation below switches
    # the model to eval mode.
    sys_model.train()

    for num_iter in range(train_num_batches):
        print(f"Train: batch {num_iter}, epoch {epoch}")
        batch = sys_train[num_iter]
        g = batch[0].to(device)
        graphs = torch.tensor(batch[1]).to(device)
        y_true = torch.from_numpy(batch[2]).to(device)
        etypes = batch[3]
        vtypes = batch[4]

        # The optimizer owns exactly the model's parameters, so this clears
        # the same gradients as `sys_model.zero_grad()`.
        sys_optimizer.zero_grad()

        y_train = sys_model(g, graphs, etypes, vtypes)

        # Targets equal to `second_num_clusters` are fake and are excluded
        # from the loss. When none are present the mask is all True.
        real = y_true != second_num_clusters
        train_loss = sys_loss(y_train[real], y_true[real])

        train_loss.backward()
        sys_optimizer.step()
        train_epoch_loss += train_loss.item()

    train_epoch_loss /= train_num_batches

    # Validate in eval mode so layers that behave differently in training
    # (e.g. dropout) do not add noise to the loss used for scheduling and
    # early stopping.
    sys_model.eval()
    valid_epoch_loss = 0
    with torch.no_grad():
        for num_iter in range(valid_num_batches):
            print("Valid", num_iter, "epoch", epoch)
            batch = sys_valid[num_iter]
            g = batch[0].to(device)
            graphs = torch.tensor(batch[1]).to(device)
            y_true = torch.from_numpy(batch[2]).to(device)
            etypes = batch[3]
            vtypes = batch[4]

            # Call the module itself rather than `.forward` so registered
            # hooks run.
            y_valid = sys_model(g, graphs, etypes, vtypes)

            # Compute the loss here, dropping the fake targets.
            real = y_true != second_num_clusters
            valid_loss = sys_loss(y_valid[real], y_true[real])
            valid_epoch_loss += valid_loss.item()

        valid_epoch_loss /= valid_num_batches

    print(f'Epoch {epoch}, train loss {train_epoch_loss:.4f}, \
          valid loss {valid_epoch_loss:.4f}, \
          time {time.time() - start_time}')

    sys_lr_scheduler(valid_epoch_loss)
    sys_early_stopping(valid_epoch_loss)

    if sys_early_stopping.early_stop:
        break

In [None]:
# Evaluate the system-side model on the test batches, collecting the softmax
# class probabilities and true labels for every non-fake target.
sys_model.eval()
test_num_batches = len(sys_test)
sys_true = []
sys_pred = []

with torch.no_grad():
    for num_iter in range(test_num_batches):
        batch = sys_test[num_iter]
        g = batch[0].to(device)
        graphs = torch.tensor(batch[1]).to(device)
        y_true = torch.from_numpy(batch[2]).to(device)
        etypes = batch[3]
        vtypes = batch[4]

        # Call the module itself rather than `.forward` so registered hooks
        # run. The output is raw logits; softmax turns them into probabilities.
        logits = sys_model(g, graphs, etypes, vtypes)
        y_pred = torch.softmax(logits, 1)

        # Drop rows whose target is the fake label `second_num_clusters`.
        # When none are present the mask is all True and nothing is dropped.
        real = y_true != second_num_clusters
        sys_pred += y_pred[real].tolist()
        sys_true += y_true[real].tolist()

In [None]:
# Report Acc@k of the system-side predictions for k = 1, 3, 5, 10.
print("SYSTEM metric")

for k in (1, 3, 5, 10):
    print(f"Acc@{k}:", get_accuracy_k(k, clusters.test_system_df, sys_pred, clusters.test_dataset, 1))

In [None]:
# Report the combined user + system Acc@k for k = 1, 3, 5, 10.
print("ALL metric")
for k in (1, 3, 5, 10):
    print(
        f"Acc@{k}:",
        get_all_accuracy_k(
            k,
            clusters.test_user_df,
            clusters.test_system_df,
            user_pred,
            sys_pred,
            clusters.test_dataset,
        ),
    )