# 

In [8]:
import pandas as pd
import torch
import ast
import networkx as nx
from torch_geometric.data import Data
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.nn import GATConv, BatchNorm

In [9]:
# Load the dataset
df_train = pd.read_csv("combined_features.csv")

# Initialize a directed graph
dialogue_graph = nx.DiGraph()

for idx, row in df_train.iterrows():
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    text = row["text"]
    role = row["role"]

    # Add node with attributes
    dialogue_graph.add_node(message_id, text=text, role=role)

    if parent_id:
        dialogue_graph.add_edge(parent_id, message_id)

In [10]:
def string_to_tensor(tensor_string):
    tensor_string = tensor_string.replace("tensor(", "").replace(")", "")
    tensor_list = ast.literal_eval(tensor_string)
    return torch.tensor(tensor_list, dtype=torch.float)

df_train["combined_features"] = df_train["combined_features"].apply(string_to_tensor)

node_id_to_index = {node_id: idx for idx, node_id in enumerate(dialogue_graph.nodes)}

edges = [(node_id_to_index[src], node_id_to_index[dst]) for src, dst in dialogue_graph.edges]

# Create edge index
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

node_features = torch.stack(df_train["combined_features"].tolist())

# Create PyTorch Geometric data object
graph_data = Data(x=node_features, edge_index=edge_index)

In [11]:

# Filter out invalid edges
num_nodes = graph_data.x.size(0)
valid_mask = (graph_data.edge_index[0] < num_nodes) & (graph_data.edge_index[1] < num_nodes)
graph_data.edge_index = graph_data.edge_index[:, valid_mask]

original_message_ids = list(dialogue_graph.nodes)

unique_nodes, edge_index = torch.unique(graph_data.edge_index, return_inverse=True)
edge_index = edge_index.reshape(2, -1)

# Update graph_data
graph_data.edge_index = edge_index
graph_data.x = graph_data.x[unique_nodes]

In [12]:

class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout, heads=1):
        super(GAT, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.bn1 = BatchNorm(hidden_dim * heads)
        self.conv2 = GATConv(hidden_dim * heads, hidden_dim, heads=heads, dropout=dropout)
        self.bn2 = BatchNorm(hidden_dim * heads)
        self.conv3 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=dropout)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        x = self.conv3(x, edge_index)
        return x

In [13]:



input_dim = graph_data.x.size(1)

# Hyperparameters
hidden_dim = 256 
output_dim = 64  
dropout = 0.5  
learning_rate = 0.001  
weight_decay = 5e-4  
num_epochs = 100  
patience = 3  
min_delta = 0.01  # Minimum change in the monitored quantity to qualify as an improvement

model = GAT(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, dropout=dropout)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
graph_data = graph_data.to(device)

# Normalize input features (to improve training stability)
graph_data.x = (graph_data.x - graph_data.x.mean(dim=0)) / (graph_data.x.std(dim=0) + 1e-8)

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Loss function for graph reconstruction using MSE


print(original_message_ids[:10])

# Create a mapping of original message IDs to their corresponding embeddings
# node_id_to_embedding = {msg_id: node_embeddings[unique_nodes.tolist().index(node_id_to_index[msg_id])] for msg_id in original_message_ids}


['6ab24d72-0181-4594-a9cd-deaf170242fb', nan, 'c8e83833-ecbc-44fe-b6db-735228c25a1c', '6708c47f-05c9-4346-b3d2-40b2bd24fde4', '343ee2d4-87ae-41fd-a768-bdd65959dc4a', '18145bf4-37fd-4ac0-80f5-6108b5f2b365', '636dd191-50df-4894-ba9a-cd7f00767258', 'ac94bfcf-7f25-4084-8755-dde345ac2323', '73d6f715-3787-409c-81e4-fde0e5ef60cd', 'b280ccbc-b68f-42b9-9fc2-d7ac89b88022']


In [14]:
def compute_loss(out):
    adj_reconstructed = torch.sigmoid(torch.mm(out, out.t()))
    adj_matrix = torch.sparse_coo_tensor(
        graph_data.edge_index,
        torch.ones(graph_data.edge_index.size(1)),
        size=(graph_data.num_nodes, graph_data.num_nodes),
    ).to_dense().to(device)
    loss = F.mse_loss(adj_reconstructed, adj_matrix)
    return loss

# Early stopping criteria
best_loss = float('inf')
patience_counter = 0

# Training loop
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    out = model(graph_data.x, graph_data.edge_index)
    loss = compute_loss(out)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
    if best_loss - loss.item() > min_delta:
        best_loss = loss.item()
        patience_counter = 0
    else:
        patience_counter += 1
    if patience_counter >= patience:
        print("Early stopping due to no significant improvement in loss")
        break

# Extract node embeddings
node_embeddings = out.detach().cpu()

# Save node embeddings
torch.save(node_embeddings, "node_embeddings.pt")

Epoch 1, Loss: 0.4016643762588501
Epoch 2, Loss: 0.35843244194984436
Epoch 3, Loss: 0.3413124680519104
Epoch 4, Loss: 0.33853679895401
Epoch 5, Loss: 0.3158513903617859
Epoch 6, Loss: 0.309251993894577
Epoch 7, Loss: 0.3082663416862488
Epoch 8, Loss: 0.2996441721916199
Epoch 9, Loss: 0.2912551164627075
Epoch 10, Loss: 0.2846744656562805
Epoch 11, Loss: 0.28691428899765015
Epoch 12, Loss: 0.28918108344078064
Epoch 13, Loss: 0.288006454706192
Early stopping due to no significant improvement in loss


In [15]:
print("Sample of node_id_to_index:", list(node_id_to_index.items())[:10])
print("unique_nodes:", unique_nodes)
missing_indices = [idx for idx in node_id_to_index.values() if idx not in unique_nodes.tolist()]
print("Missing indices in unique_nodes:", missing_indices)


Sample of node_id_to_index: [('6ab24d72-0181-4594-a9cd-deaf170242fb', 0), (nan, 1), ('c8e83833-ecbc-44fe-b6db-735228c25a1c', 2), ('6708c47f-05c9-4346-b3d2-40b2bd24fde4', 3), ('343ee2d4-87ae-41fd-a768-bdd65959dc4a', 4), ('18145bf4-37fd-4ac0-80f5-6108b5f2b365', 5), ('636dd191-50df-4894-ba9a-cd7f00767258', 6), ('ac94bfcf-7f25-4084-8755-dde345ac2323', 7), ('73d6f715-3787-409c-81e4-fde0e5ef60cd', 8), ('b280ccbc-b68f-42b9-9fc2-d7ac89b88022', 9)]
unique_nodes: tensor([    0,     1,     2,  ..., 39280, 39281, 39282])
Missing indices in unique_nodes: [39283, 39284, 39285, 39286, 39287, 39288, 39289, 39290, 39291, 39292, 39293, 39294, 39295, 39296, 39297, 39298, 39299, 39300, 39301, 39302, 39303, 39304, 39305, 39306, 39307, 39308, 39309, 39310, 39311, 39312, 39313, 39314, 39315, 39316, 39317, 39318, 39319, 39320, 39321, 39322, 39323, 39324, 39325, 39326, 39327, 39328, 39329, 39330, 39331, 39332, 39333, 39334, 39335, 39336, 39337, 39338, 39339, 39340, 39341, 39342, 39343, 39344, 39345, 39346, 393

In [16]:
problematic_msg_id = None
for msg_id in original_message_ids:
    try:
        node_index = node_id_to_index[msg_id]
        unique_nodes.tolist().index(node_index)
    except ValueError:
        problematic_msg_id = msg_id
        break

print("Problematic message ID:", problematic_msg_id)


Problematic message ID: 0e06098e-7b5c-471b-8ee2-853cebc3ed58


In [17]:
node_id_to_embedding = {}
for msg_id in original_message_ids:
    node_index = node_id_to_index[msg_id]
    if node_index in unique_nodes.tolist():
        embedding = node_embeddings[unique_nodes.tolist().index(node_index)]
    else:
        embedding = torch.zeros(output_dim)  # Default embedding if not found
    node_id_to_embedding[msg_id] = embedding


In [18]:
import pandas as pd
import torch

# Define the dimensions of the embeddings
output_dim = 64  

df_train = pd.read_csv("OpenAssistant_English_Train.csv")

# node_id_to_embedding = torch.load("path_to_node_embeddings.pth")  # Load your embeddings dictionary

# Add embeddings to the DataFrame
df_train["embedding"] = df_train["message_id"].apply(lambda x: node_id_to_embedding.get(x, torch.zeros(output_dim)))

print(df_train.head())

df_train.to_csv("OpenAssistant_English_Train_with_Embeddings.csv", index=False)

print("Updated DataFrame saved successfully!")


                             message_id                             parent_id  \
0  6ab24d72-0181-4594-a9cd-deaf170242fb                                   NaN   
1  c8e83833-ecbc-44fe-b6db-735228c25a1c  6ab24d72-0181-4594-a9cd-deaf170242fb   
2  6708c47f-05c9-4346-b3d2-40b2bd24fde4  c8e83833-ecbc-44fe-b6db-735228c25a1c   
3  343ee2d4-87ae-41fd-a768-bdd65959dc4a  6ab24d72-0181-4594-a9cd-deaf170242fb   
4  18145bf4-37fd-4ac0-80f5-6108b5f2b365  343ee2d4-87ae-41fd-a768-bdd65959dc4a   

                                                text       role  \
0  Can you write a short introduction about the r...   prompter   
1  "Monopsony" refers to a market structure where...  assistant   
2                            Now explain it to a dog   prompter   
3  Monopsony is a market structure in which there...  assistant   
4  How can one fight back when a monospony had be...   prompter   

                                           embedding  
0  [tensor(0.1143), tensor(0.0116), tensor(0.0887...  


In [19]:
import pandas as pd
import torch
from langchain_ollama import OllamaLLM
import ast

In [20]:


# Initialize the OllamaLLM with the LLaMA 3 model
llm = OllamaLLM(model="llama3")

def get_assistant_response(prompt, embedding=None):
    if embedding is not None:
        # Convert embedding to a string format
        embedding_str = " ".join([f"{x:.4f}" for x in embedding])
        # Combine prompt and embedding with an instruction for a short response
        combined_prompt = f"{prompt}\n\n[Embedding: {embedding_str}]\nPlease provide a short response."
        response = llm.invoke(combined_prompt)
    else:
        # Add an instruction for a short response
        combined_prompt = f"{prompt}\nPlease provide a short response."
        response = llm.invoke(combined_prompt)
    return response

# Create a new DataFrame to store the results
results = []

In [None]:
# Iterate through the dataset and pair prompter messages with their respective assistant responses

df = df_train

for index, row in df.iterrows():
    if row['role'] == 'prompter':
        prompt = row['text']
        # Find the corresponding assistant response
        assistant_response = df.loc[(df['parent_id'] == row['message_id']) & (df['role'] == 'assistant'), 'text'].values
        if len(assistant_response) > 0:
            assistant_response = assistant_response[0]
        else:
            assistant_response = None
        if(index>100):
            break
        print(index)
        
        # Generate responses with and without embeddings
        node_embedding = torch.tensor(row['embedding'])  # Convert the list to a tensor if necessary
        response_with_embedding = get_assistant_response(prompt, node_embedding)
        response_without_embedding = get_assistant_response(prompt)
        
        # Append the results to the new DataFrame
        results.append({
            'message_id': row['message_id'],
            'parent_id': row['parent_id'],
            'prompt': prompt,
            'assistant_response': assistant_response,
            'generated_response_with_embedding': response_with_embedding,
            'generated_response_without_embedding': response_without_embedding
        })

# Convert the results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a new CSV file
results_df.to_csv("results2.csv", index=False)

print("Results saved successfully!")

0


  node_embedding = torch.tensor(row['embedding'])  # Convert the list to a tensor if necessary


2
4
6
10
12
14
