# Dialogue Graph Auto Construction based on data with a regular structure


Goal: Extract regular structures from the data by building a dialogue graph
    
Tasks: 
* Cluster dialog data using embeddings of pre-trained models (BERT, ConveRT, S-BERT…)
* Evaluate the quality of clustering using the intent labels of the MultiWOZ dataset
* Link clusters of dialogs using naive approaches (frequency-based probability estimation)
* Try other approaches (Deep Neural Networks) for linking clusters and improve the naive approach


In [None]:
from datasets import load_dataset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from collections import Counter
from torch import nn
from torch_geometric.data import Data
from torch_scatter import scatter_add
from torch_geometric.nn import MessagePassing
from torch.nn import Linear
from torch_geometric.nn import GCNConv
from dgl.dataloading import GraphDataLoader
from torch.utils.data import DataLoader
from tqdm import tqdm

import pandas as pd
import numpy as np
import networkx as nx
import sys
import os
import torch
import math
import tensorflow as tf
import random
import dgl
import torch.nn.functional as F
import dgl.nn.pytorch as dglnn
import torch.nn as nn

from preprocess import Clusters, get_accuracy_k, get_all_accuracy_k, get_all_accuracy_printer

In [None]:
# Order GPUs by PCI bus id and make devices 0-3 visible to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0,1,2,3",
})
# Report how many CUDA devices torch can see.
print(torch.cuda.device_count())

In [None]:
import wandb
# Start a Weights & Biases run in the "MP-tuning" project; the training loops
# further down log their per-epoch losses to it via wandb.log.
wandb.init(project="MP-tuning")

In [None]:
# Make the shared helper modules (imported in the next cell) importable.
# Inserting at index 1 keeps sys.path[0] ahead of this directory.
# NOTE(review): hard-coded absolute path; only valid on the original machine.
sys.path.insert(1, '/cephfs/home/ledneva/final_work/common_utils/')

In [None]:
from data_function_uttr_embs import get_data
from GAT_functions_uttr_embs import get_data_dgl_no_cycles
from early_stopping_tools import LRScheduler, EarlyStopping

In [None]:
# Cluster counts handed to Clusters(...) below; presumably the sizes of a first
# and a second clustering stage (TODO confirm in preprocess.Clusters).
first_num_clusters = 400
# Also used below as the number of clusters per speaker and as the size of the
# classifier output layer.
second_num_clusters = 60

In [None]:
# CSV file with precomputed utterance embeddings.
# NOTE(review): hard-coded absolute path; only valid on the original machine.
embs_path = "/cephfs/home/ledneva/final_work/convert_one_prev_embeddings.csv"
clusters = Clusters(first_num_clusters, second_num_clusters, embs_path)
# Presumably computes the per-split dataframes, embeddings and cluster centres
# that the cells below read from `clusters` (TODO confirm in preprocess).
clusters.form_clusters()

In [None]:
# Build one intent distribution per cluster.
# `all_intents` holds the user clusters first (indices 0 .. second_num_clusters - 1),
# followed by the system clusters, each as a vector of length `num_intents`.

# Vocabulary of intents seen in user and system utterances.
unique_intent = []

for intents in clusters.user_df['intent']:
    unique_intent += intents
for intents in clusters.system_df['intent']:
    unique_intent += intents

unique_intent = list(set(unique_intent))
num_intents = len(unique_intent)


def _intent_distribution(cluster_df):
    """Return the relative frequency of every intent within `cluster_df`.

    The result is a float array aligned with `unique_intent`. If the cluster
    contains no intents at all, an all-zero vector is returned instead of the
    NaN vector that a 0 / 0 normalisation would produce.
    """
    counts = Counter()
    for intent_arr in cluster_df['intent']:
        counts.update(intent_arr)

    intent_count = np.array([counts[intent] for intent in unique_intent], dtype=float)
    total = intent_count.sum()
    if total > 0:
        intent_count /= total
    return intent_count


all_intents = []
# User clusters first, then system clusters; both taken from the training split.
for speaker_df in (clusters.train_user_df, clusters.train_system_df):
    for i in range(second_num_clusters):
        all_intents.append(_intent_distribution(speaker_df[speaker_df['cluster'] == i]))

## 4.3 Functions

Functions generating batches for two types of graphs, and the metric function

In [None]:
# GPU on which the models and the constant tensors below are placed.
# With CUDA_VISIBLE_DEVICES="0,1,2,3" set above, 'cuda:2' is the third visible device.
device = torch.device('cuda:2')

In [None]:
# Number of nodes per graph: the models reshape node features to
# (num_nodes // top_k, top_k, ...) and learn one pooling weight per node slot.
top_k = 10
# Passed to get_data_dgl_no_cycles; presumably the number of graphs per batch (TODO confirm).
batch_size = 256

## 4.4 Preprocessing data

Data generation and preparation

In [None]:
def _load_split(dataset, user_df, system_df, user_embs, system_embs):
    """Run get_data on one dataset split and return its four outputs.

    The embeddings are converted to numpy arrays before being handed over;
    `top_k` and `second_num_clusters` are taken from the notebook globals.
    """
    return get_data(dataset, top_k, second_num_clusters,
                    user_df, system_df,
                    np.array(user_embs), np.array(system_embs))


# Same call order as before: train, test, validation.
user_train_x, user_train_y, sys_train_x, sys_train_y = _load_split(
    clusters.train_dataset,
    clusters.train_user_df, clusters.train_system_df,
    clusters.train_user_embs, clusters.train_system_embs,
)
user_test_x, user_test_y, sys_test_x, sys_test_y = _load_split(
    clusters.test_dataset,
    clusters.test_user_df, clusters.test_system_df,
    clusters.test_user_embs, clusters.test_system_embs,
)
user_valid_x, user_valid_y, sys_valid_x, sys_valid_y = _load_split(
    clusters.validation_dataset,
    clusters.valid_user_df, clusters.valid_system_df,
    clusters.valid_user_embs, clusters.valid_system_embs,
)

In [None]:
# Batches for user-cluster prediction on the training split; iterated below as
# (batched_graph, labels) pairs. The third argument is 1 for train/validation
# and 0 for test; presumably a shuffle flag (TODO confirm in GAT_functions_uttr_embs).
user_train_data = get_data_dgl_no_cycles(user_train_x, user_train_y, 1, top_k, batch_size)

In [None]:
# Batches for system-cluster prediction on the training split; iterated below as
# (batched_graph, labels) pairs. The third argument is 1 for train/validation
# and 0 for test; presumably a shuffle flag (TODO confirm in GAT_functions_uttr_embs).
sys_train_data = get_data_dgl_no_cycles(sys_train_x, sys_train_y, 1, top_k, batch_size)

In [None]:
# Batches for user-cluster prediction on the test split. The third argument is 0
# here (1 for train/validation); presumably it disables shuffling so predictions
# stay aligned with the test dataframe (TODO confirm in GAT_functions_uttr_embs).
user_test_data = get_data_dgl_no_cycles(user_test_x, user_test_y, 0, top_k, batch_size)

In [None]:
# Batches for system-cluster prediction on the test split. The third argument is 0
# here (1 for train/validation); presumably it disables shuffling so predictions
# stay aligned with the test dataframe (TODO confirm in GAT_functions_uttr_embs).
sys_test_data = get_data_dgl_no_cycles(sys_test_x, sys_test_y, 0, top_k, batch_size)

In [None]:
# Batches for user-cluster prediction on the validation split; used for the
# validation loss that drives the LR scheduler and early stopping.
# NOTE(review): the third argument is 1, as for training; confirm that is intended.
user_valid_data = get_data_dgl_no_cycles(user_valid_x, user_valid_y, 1, top_k, batch_size)

In [None]:
# Batches for system-cluster prediction on the validation split; used for the
# validation loss that drives the LR scheduler and early stopping.
# NOTE(review): the third argument is 1, as for training; confirm that is intended.
sys_valid_data = get_data_dgl_no_cycles(sys_valid_x, sys_valid_y, 1, top_k, batch_size)

In [None]:
# Initial values of the trainable weights used for the weighted sum over the
# top_k nodes of a graph: uniform 1 / top_k, shaped (1, top_k), placed on `device`.
uniform_weights = np.full(top_k, 1 / top_k)
linear_weights = torch.tensor(uniform_weights).view(1, -1).to(device)

In [None]:
# Length of one per-cluster intent distribution vector.
intent_embs_dim = len(all_intents[0])
# Length of one user cluster embedding taken from `clusters.user_cluster_embs`.
centre_embs_dim = len(clusters.user_cluster_embs[0])

In [None]:
# Width of the trainable per-cluster embedding (assigned to learn_embs_dim below).
num_comps = 512

In [None]:
# Xavier-uniform initialised embedding table with one row per user cluster, one
# per system cluster and one extra row (matching the extra "null" row appended to
# centre_mass and intent_embs below). The models wrap it with
# nn.Embedding.from_pretrained and make it trainable.
learn_embs_dim = num_comps
learn_emb = nn.Parameter(
            torch.Tensor(2 * second_num_clusters + 1, learn_embs_dim), requires_grad=False
)
learn_emb = torch.Tensor(nn.init.xavier_uniform_(learn_emb))
# trainable embedding
# weights = torch.Tensor(emb)

In [None]:
# All-zero vectors appended below as the last row of centre_mass and intent_embs,
# i.e. the features of the extra "null" cluster.
null_cluster_centre_emb = np.zeros(centre_embs_dim)
null_cluster_intent_emb = np.zeros(intent_embs_dim)

In [None]:
# Cluster-centre features on `device`: user clusters, then system clusters,
# then the zero "null" row; same row order as intent_embs.
centre_mass = torch.Tensor(np.concatenate([clusters.user_cluster_embs, 
                                           clusters.system_cluster_embs, 
                                           [null_cluster_centre_emb]])).to(device)

In [None]:
# Per-cluster intent distributions on `device`, with the zero "null" row appended last.
intent_embs = torch.Tensor(np.concatenate([all_intents, [null_cluster_intent_emb]])).to(device)

In [None]:
# user_cluster_intents, system_cluster_intents - intents
# clusters.user_cluster_embs, clusters.system_cluster_embs - center of mass

## 4.5 Prediction of user clusters

In [None]:
# Output width of each attention head in both GATv2 layers.
hidden_dim = 2048
# Input width of the first GAT layer: centre-of-mass + trainable + intent features,
# plus 512 for the per-node embedding concatenated in forward().
# NOTE(review): 512 is presumably the width of bg.ndata['emb']; confirm.
embs_dim = 512 + len(centre_mass[0]) + learn_embs_dim + len(intent_embs[0])
# Number of attention heads per GATv2 layer.
num_heads = 2

In [None]:
from dgl import nn as dgl_nn
from torch import nn

class GAT_user(nn.Module):
    """Two-layer GATv2 graph classifier over the `second_num_clusters` user clusters.

    Each node's input feature is the concatenation of
    a trainable per-cluster embedding, the cluster centre of mass and the
    cluster intent distribution (all three looked up by the cluster id stored
    in ``ndata['attr']``), followed by the per-node embedding ``ndata['emb']``.
    After two attention layers the `top_k` nodes of every graph are pooled
    with a learned weighted sum and passed to a linear classifier.

    Relies on the notebook globals `learn_emb`, `linear_weights`, `embs_dim`,
    `centre_mass`, `intent_embs`, `top_k` and `second_num_clusters`.

    Args:
        hidden_dim: output width of each attention head.
        num_heads: number of attention heads per GATv2 layer.
    """

    def __init__(self, hidden_dim, num_heads):
        super().__init__()

        # Keep the constructor arguments so forward() does not depend on
        # same-named notebook globals.
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # Clone so this model owns its embedding table instead of aliasing the
        # shared global tensor (which the other model is initialised from too).
        self.embs = nn.Embedding.from_pretrained(learn_emb.detach().clone()).requires_grad_(True)
        self.layer1 = dgl_nn.GATv2Conv(embs_dim, hidden_dim, num_heads)
        self.layer2 = dgl_nn.GATv2Conv(hidden_dim * num_heads, hidden_dim, num_heads)

        self.do1 = nn.Dropout(0.4)
        self.do2 = nn.Dropout(0.4)

        # One trainable pooling weight per node slot, stored as a (1, top_k) table.
        self.linear_weights = nn.Embedding.from_pretrained(linear_weights.float()).requires_grad_(True)

        self.classify = nn.Linear(hidden_dim * num_heads, second_num_clusters)

    def forward(self, bg):
        """Return class logits, one row per graph in the batched graph `bg`.

        `bg` must provide ``ndata['attr']`` (cluster ids used as row indices)
        and ``ndata['emb']`` (per-node embeddings), and its node count must be
        a multiple of `top_k`. As a side effect the final node representations
        are stored in ``bg.ndata['h']``.
        """
        x = bg.ndata['attr']
        x_emb = bg.ndata['emb']

        # Per-cluster feature table: trainable | centre of mass | intent distribution.
        all_embs = torch.concat((self.embs.weight, centre_mass, intent_embs), dim = 1)
        node_embs = all_embs[x]

        h = torch.concat((node_embs, x_emb), dim = 1).to(torch.float32)

        heads_width = self.num_heads * self.hidden_dim

        h = self.do1(self.layer1(bg, h))
        # Flatten the per-head outputs into one feature vector per node.
        h = torch.reshape(h, (h.shape[0], heads_width))
        h = self.do2(self.layer2(bg, h))

        bg.ndata['h'] = h
        h = torch.reshape(h, (h.shape[0] // top_k, top_k, heads_width))

        # Weighted sum over the top_k nodes of every graph in one batched
        # matmul: (top_k,) @ (graphs, top_k, features) -> (graphs, features).
        pooling_weights = torch.reshape(self.linear_weights.weight, (top_k, ))
        hg = torch.matmul(pooling_weights, h)
        return self.classify(hg)

In [None]:
# Train the user-cluster model with Adam and cross entropy; the validation loss
# drives the learning-rate scheduler and early stopping.
user_model = GAT_user(hidden_dim, num_heads).to(device)
user_train_epoch_losses = []
user_valid_epoch_losses = []

for param in user_model.parameters():
    param.requires_grad = True

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(user_model.parameters(), lr = 0.0001)
lr_scheduler = LRScheduler(optimizer)
# NOTE(review): 10 is presumably the patience in epochs; confirm in early_stopping_tools.
early_stopping = EarlyStopping(10)

user_num_epochs = 100

for epoch in range(user_num_epochs):
    # Enable dropout for the training pass (eval() below switches it off).
    user_model.train()
    train_epoch_loss = 0
    num_train_batches = 0

    for step, (batched_graph, labels) in enumerate(user_train_data):
        print(f"{step} / {len(user_train_data)}")
        out = user_model(batched_graph.to(device))
        loss = criterion(out, labels.to(device))
        optimizer.zero_grad()

        loss.backward()
        optimizer.step()
        train_epoch_loss += loss.detach().item()
        num_train_batches += 1

    # Average over the batches actually seen; guard against an empty loader.
    train_epoch_loss /= max(num_train_batches, 1)
    user_train_epoch_losses.append(train_epoch_loss)

    # Disable dropout so the validation loss is deterministic and comparable
    # across epochs.
    user_model.eval()
    valid_epoch_loss = 0
    num_valid_batches = 0
    with torch.no_grad():
        for batched_graph, labels in user_valid_data:
            out = user_model(batched_graph.to(device))
            loss = criterion(out, labels.to(device))
            valid_epoch_loss += loss.detach().item()
            num_valid_batches += 1

    valid_epoch_loss /= max(num_valid_batches, 1)
    user_valid_epoch_losses.append(valid_epoch_loss)
    print(f'Epoch {epoch}, train loss {train_epoch_loss:.4f}, valid loss {valid_epoch_loss:.4f}')
    wandb.log({'Epoch' : epoch, 'user train loss' : train_epoch_loss, 'user valid loss' : valid_epoch_loss})

    lr_scheduler(valid_epoch_loss)
    early_stopping(valid_epoch_loss)

    if early_stopping.early_stop:
        break

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch train and validation loss curves of the user model.
plt.figure(figsize=(12, 6))
plt.title('user model cross entropy averaged over minibatches')
for curve_name, epoch_losses in (("train", user_train_epoch_losses),
                                 ("valid", user_valid_epoch_losses)):
    plt.plot(epoch_losses, label = curve_name)
plt.legend()

In [None]:
# Run the trained user model over the test batches and collect, in batch order,
# the true labels (user_test) and the softmax class probabilities (user_probs).
user_model.eval()
user_test_X, user_test_Y = map(list, zip(*user_test_data))

user_probs = []
user_test = []

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for g, labels in zip(user_test_X, user_test_Y):
        user_test += labels.tolist()
        user_probs += torch.softmax(user_model(g.to(device)), 1).tolist()

In [None]:
print("USER metric")

# Accuracy@k of the user model on the test split for several cut-offs.
# NOTE(review): the trailing 0 presumably selects user turns (the system cell passes 1); confirm.
for k in (1, 3, 5, 10):
    print(f"Acc@{k}:", get_accuracy_k(k, clusters.test_user_df, user_probs, clusters.test_dataset, 0))

## 4.6 Prediction of system clusters

In [None]:
from dgl import nn as dgl_nn
from torch import nn

class GAT_system(nn.Module):
    """Two-layer GATv2 graph classifier over the `second_num_clusters` system clusters.

    Each node's input feature is the concatenation of
    a trainable per-cluster embedding, the cluster centre of mass and the
    cluster intent distribution (all three looked up by the cluster id stored
    in ``ndata['attr']``), followed by the per-node embedding ``ndata['emb']``.
    After two attention layers the `top_k` nodes of every graph are pooled
    with a learned weighted sum and passed to a linear classifier.

    Relies on the notebook globals `learn_emb`, `linear_weights`, `embs_dim`,
    `centre_mass`, `intent_embs`, `top_k` and `second_num_clusters`.

    Args:
        hidden_dim: output width of each attention head.
        num_heads: number of attention heads per GATv2 layer.
    """

    def __init__(self, hidden_dim, num_heads):
        super().__init__()

        # Keep the constructor arguments so forward() does not depend on
        # same-named notebook globals.
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # Clone so this model owns its embedding table instead of aliasing the
        # shared global tensor (which the other model is initialised from too).
        self.embs = nn.Embedding.from_pretrained(learn_emb.detach().clone()).requires_grad_(True)
        self.layer1 = dgl_nn.GATv2Conv(embs_dim, hidden_dim, num_heads)
        self.layer2 = dgl_nn.GATv2Conv(hidden_dim * num_heads, hidden_dim, num_heads)

        self.do1 = nn.Dropout(0.4)
        self.do2 = nn.Dropout(0.4)

        # One trainable pooling weight per node slot, stored as a (1, top_k) table.
        self.linear_weights = nn.Embedding.from_pretrained(linear_weights.float()).requires_grad_(True)

        self.classify = nn.Linear(hidden_dim * num_heads, second_num_clusters)

    def forward(self, bg):
        """Return class logits, one row per graph in the batched graph `bg`.

        `bg` must provide ``ndata['attr']`` (cluster ids used as row indices)
        and ``ndata['emb']`` (per-node embeddings), and its node count must be
        a multiple of `top_k`. As a side effect the final node representations
        are stored in ``bg.ndata['h']``.
        """
        x = bg.ndata['attr']
        x_emb = bg.ndata['emb']

        # Per-cluster feature table: trainable | centre of mass | intent distribution.
        all_embs = torch.concat((self.embs.weight, centre_mass, intent_embs), dim = 1)
        node_embs = all_embs[x]

        h = torch.concat((node_embs, x_emb), dim = 1).to(torch.float32)

        heads_width = self.num_heads * self.hidden_dim

        h = self.do1(self.layer1(bg, h))
        # Flatten the per-head outputs into one feature vector per node.
        h = torch.reshape(h, (h.shape[0], heads_width))
        h = self.do2(self.layer2(bg, h))

        bg.ndata['h'] = h
        h = torch.reshape(h, (h.shape[0] // top_k, top_k, heads_width))

        # Weighted sum over the top_k nodes of every graph in one batched
        # matmul: (top_k,) @ (graphs, top_k, features) -> (graphs, features).
        pooling_weights = torch.reshape(self.linear_weights.weight, (top_k, ))
        hg = torch.matmul(pooling_weights, h)
        return self.classify(hg)

In [None]:
# Train the system-cluster model with Adam and cross entropy; the validation
# loss drives the learning-rate scheduler and early stopping.
system_model = GAT_system(hidden_dim, num_heads).to(device)

for param in system_model.parameters():
    param.requires_grad = True

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(system_model.parameters(), lr = 0.0001)
lr_scheduler = LRScheduler(optimizer)
# NOTE(review): 10 is presumably the patience in epochs; confirm in early_stopping_tools.
early_stopping = EarlyStopping(10)

sys_num_epochs = 100

for epoch in range(sys_num_epochs):
    # Enable dropout for the training pass (eval() below switches it off).
    system_model.train()
    train_epoch_loss = 0
    num_train_batches = 0

    for step, (batched_graph, labels) in enumerate(sys_train_data):
        print(f"{step}/{len(sys_train_data)}")
        out = system_model(batched_graph.to(device))
        loss = criterion(out, labels.to(device))
        optimizer.zero_grad()

        loss.backward()
        optimizer.step()
        train_epoch_loss += loss.detach().item()
        num_train_batches += 1

    # Average over the batches actually seen; guard against an empty loader.
    train_epoch_loss /= max(num_train_batches, 1)

    # Disable dropout so the validation loss is deterministic and comparable
    # across epochs.
    system_model.eval()
    valid_epoch_loss = 0
    num_valid_batches = 0
    with torch.no_grad():
        for batched_graph, labels in sys_valid_data:
            out = system_model(batched_graph.to(device))
            loss = criterion(out, labels.to(device))
            valid_epoch_loss += loss.detach().item()
            num_valid_batches += 1

    valid_epoch_loss /= max(num_valid_batches, 1)

    print(f'Epoch {epoch}, train loss {train_epoch_loss:.4f}, valid loss {valid_epoch_loss:.4f}')
    wandb.log({'system train loss' : train_epoch_loss, 'system valid loss' : valid_epoch_loss})
    lr_scheduler(valid_epoch_loss)
    early_stopping(valid_epoch_loss)

    if early_stopping.early_stop:
        break

In [None]:
# Run the trained system model over the test batches and collect, in batch order,
# the true labels (system_test) and the softmax class probabilities (system_probs).
system_model.eval()
system_test_X, system_test_Y = map(list, zip(*sys_test_data))

system_probs = []
system_test = []

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for g, labels in zip(system_test_X, system_test_Y):
        system_test += labels.tolist()
        system_probs += torch.softmax(system_model(g.to(device)), 1).tolist()

In [None]:
print("SYSTEM metric")

# Accuracy@k of the system model on the test split for several cut-offs.
# NOTE(review): the trailing 1 presumably selects system turns (the user cell passes 0); confirm.
for k in (1, 3, 5, 10):
    print(f"Acc@{k}:", get_accuracy_k(k, clusters.test_system_df, system_probs, clusters.test_dataset, 1))

In [None]:
print("ALL metric")
# Accuracy@k computed from the user and system predictions together on the test split.
for k in (1, 3, 5, 10):
    print(f"Acc@{k}:", get_all_accuracy_k(k, clusters.test_user_df, clusters.test_system_df,
                                          user_probs, system_probs, clusters.test_dataset))
