In [1]:
import pandas as pd
df_train = pd.read_csv("combined_features.csv")

In [2]:
import networkx as nx

# Initialize a directed graph
dialogue_graph = nx.DiGraph()

# Add nodes and edges
for idx, row in df_train.iterrows():
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    text = row["text"]
    role = row["role"]

    # Add node with attributes
    dialogue_graph.add_node(message_id, text=text, role=role)

    # Add edge if parent_id exists
    if parent_id:
        dialogue_graph.add_edge(parent_id, message_id)

In [3]:
print(df_train["combined_features"].apply(type).value_counts())

combined_features
<class 'str'>    39283
Name: count, dtype: int64


In [4]:
import torch
import ast

def string_to_tensor(tensor_string):
    # Remove the "tensor(" prefix and ")" suffix
    tensor_string = tensor_string.replace("tensor(", "").replace(")", "")

    # Convert the string to a list of floats
    tensor_list = ast.literal_eval(tensor_string)

    # Convert the list to a PyTorch tensor
    return torch.tensor(tensor_list, dtype=torch.float)

# Apply the conversion to the "combined_features" column
df_train["combined_features"] = df_train["combined_features"].apply(string_to_tensor)

# Verify the conversion
print(df_train["combined_features"].apply(type).value_counts())

combined_features
<class 'torch.Tensor'>    39283
Name: count, dtype: int64


In [5]:
combined_features_list=[]
for _, row in df_train.iterrows():
    combined_features_list.append(row["combined_features"])


In [6]:
from torch_geometric.data import Data

# Create a mapping from string node IDs to integer indices
node_id_to_index = {node_id: idx for idx, node_id in enumerate(dialogue_graph.nodes)}
# Convert edges to integer indices
edges = [(node_id_to_index[src], node_id_to_index[dst]) for src, dst in dialogue_graph.edges]

# Example output: [(0, 1), (2, 3), ..
# Create edge index
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
# Create node features tensor
node_features = torch.stack(combined_features_list)

# Create PyTorch Geometric data object
graph_data = Data(x=node_features, edge_index=edge_index)

In [7]:
num_nodes = graph_data.x.size(0)
print(f"Number of nodes: {num_nodes}")

Number of nodes: 39283


In [8]:
max_index = torch.max(graph_data.edge_index).item()
print(f"Maximum node index in edge_index: {max_index}")

Maximum node index in edge_index: 39378


In [9]:
# Filter out invalid edges
valid_mask = (graph_data.edge_index[0] < num_nodes) & (graph_data.edge_index[1] < num_nodes)
graph_data.edge_index = graph_data.edge_index[:, valid_mask]

# Update the number of edges
num_edges = graph_data.edge_index.size(1)
print(f"Number of edges after filtering: {num_edges}")

Number of edges after filtering: 39187


In [10]:
# Check the number of nodes and maximum node index
num_nodes = graph_data.x.size(0)
max_index = torch.max(graph_data.edge_index).item()
print(f"Number of nodes: {num_nodes}")
print(f"Maximum node index in edge_index: {max_index}")

# Ensure max_index < num_nodes
assert max_index < num_nodes, "Invalid node indices in edge_index!"

Number of nodes: 39283
Maximum node index in edge_index: 39282


In [11]:
from torch_geometric.utils import coalesce

# Reindex the nodes in edge_index
unique_nodes, edge_index = torch.unique(graph_data.edge_index, return_inverse=True)
edge_index = edge_index.reshape(2, -1)

# Update the number of nodes
num_nodes = unique_nodes.size(0)
print(f"Number of nodes after reindexing: {num_nodes}")

# Update graph_data
graph_data.edge_index = edge_index
graph_data.x = graph_data.x[unique_nodes]  # Reindex node features if necessary

Number of nodes after reindexing: 39283


In [12]:
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)  # First GCN layer
        self.conv2 = GCNConv(hidden_dim, output_dim)  # Second GCN layer

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)  # First convolution
        x = F.relu(x)  # Apply ReLU activation
        x = self.conv2(x, edge_index)  # Second convolution
        return x  # Output node embeddings

In [13]:
# Input dimension: Size of the combined feature vector for each node
input_dim = graph_data.x.size(1)

# Hidden dimension: Size of the hidden layer (can be tuned)
hidden_dim = 128

# Output dimension: Size of the final node embeddings (can be tuned)
output_dim = 64

# Initialize the GCN model
model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [14]:
def compute_loss(out):
    # Reconstruct the adjacency matrix
    adj_reconstructed = torch.sigmoid(torch.mm(out, out.t()))

    # Compute reconstruction loss
    loss = F.binary_cross_entropy(adj_reconstructed, torch.eye(graph_data.x.size(0), device=device))
    return loss

In [12]:
import torch
import torch.optim as optim
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.nn.norm import BatchNorm

# Define the GCN model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.bn1 = BatchNorm(hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.bn2 = BatchNorm(hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv3(x, edge_index)
        return x

# Input dimension: Size of the combined feature vector for each node
input_dim = graph_data.x.size(1)

# Hyperparameters
hidden_dim = 256  # Hidden dimension
output_dim = 64  # Output dimension
dropout = 0.5  # Dropout rate
learning_rate = 0.001  # Learning rate
weight_decay = 5e-4  # L2 regularization
num_epochs = 100  # Number of epochs
patience = 3  # Early stopping patience
min_delta = 1  # Minimum change in the monitored quantity to qualify as an improvement

# Initialize the GCN model
model = GCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, dropout=dropout)

# Move model and data to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
graph_data = graph_data.to(device)

# Normalize input features (to improve training stability)
graph_data.x = (graph_data.x - graph_data.x.mean(dim=0)) / (graph_data.x.std(dim=0) + 1e-8)

# Initialize the optimizer with weight decay (L2 regularization)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Loss function for graph reconstruction using MSE
def compute_loss(out):
    # Reconstruct the adjacency matrix
    adj_reconstructed = torch.sigmoid(torch.mm(out, out.t()))

    # Use the actual adjacency matrix as the target (if available)
    adj_matrix = torch.sparse_coo_tensor(
        graph_data.edge_index,
        torch.ones(graph_data.edge_index.size(1)),
        size=(graph_data.num_nodes, graph_data.num_nodes),
    ).to_dense().to(device)

    # Compute reconstruction loss using MSE
    loss = F.mse_loss(adj_reconstructed, adj_matrix)
    return loss

# Early stopping criteria
best_loss = float('inf')
patience_counter = 0

# Training loop
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Reset gradients

    out = model(graph_data.x, graph_data.edge_index)  # Forward pass
    loss = compute_loss(out)  # Compute loss

    loss.backward()  # Backward pass
    optimizer.step()  # Update model parameters

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")  # Print loss

    # Early stopping
    if best_loss - loss.item() > min_delta:
        best_loss = loss.item()
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping due to no significant improvement in loss")
        break


Epoch 1, Loss: 0.9450471997261047
Epoch 2, Loss: 0.7464883923530579
Epoch 3, Loss: 0.6054078340530396
Epoch 4, Loss: 0.5619692206382751
Early stopping due to no significant improvement in loss


In [13]:
import torch
import torch.optim as optim
from torch_geometric.nn import GATConv
import torch.nn.functional as F
from torch_geometric.nn.norm import BatchNorm

# Define the GAT model
class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout, heads=1):
        super(GAT, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.bn1 = BatchNorm(hidden_dim * heads)
        self.conv2 = GATConv(hidden_dim * heads, hidden_dim, heads=heads, dropout=dropout)
        self.bn2 = BatchNorm(hidden_dim * heads)
        self.conv3 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=dropout)
        self.dropout = dropout

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        
        x = self.conv3(x, edge_index)
        return x

# Input dimension: Size of the combined feature vector for each node
input_dim = graph_data.x.size(1)

# Hyperparameters
hidden_dim = 256  # Hidden dimension
output_dim = 64  # Output dimension
dropout = 0.5  # Dropout rate
learning_rate = 0.001  # Learning rate
weight_decay = 5e-4  # L2 regularization
num_epochs = 100  # Number of epochs
patience = 3  # Early stopping patience
min_delta = 0.01  # Minimum change in the monitored quantity to qualify as an improvement

# Initialize the GAT model
model = GAT(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, dropout=dropout)

# Move model and data to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
graph_data = graph_data.to(device)

# Normalize input features (to improve training stability)
graph_data.x = (graph_data.x - graph_data.x.mean(dim=0)) / (graph_data.x.std(dim=0) + 1e-8)

# Initialize the optimizer with weight decay (L2 regularization)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Loss function for graph reconstruction using MSE
def compute_loss(out):
    # Reconstruct the adjacency matrix
    adj_reconstructed = torch.sigmoid(torch.mm(out, out.t()))

    # Use the actual adjacency matrix as the target (if available)
    adj_matrix = torch.sparse_coo_tensor(
        graph_data.edge_index,
        torch.ones(graph_data.edge_index.size(1)),
        size=(graph_data.num_nodes, graph_data.num_nodes),
    ).to_dense().to(device)

    # Compute reconstruction loss using MSE
    loss = F.mse_loss(adj_reconstructed, adj_matrix)
    return loss

# Early stopping criteria
best_loss = float('inf')
patience_counter = 0

# Training loop
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Reset gradients

    out = model(graph_data.x, graph_data.edge_index)  # Forward pass
    loss = compute_loss(out)  # Compute loss

    loss.backward()  # Backward pass
    optimizer.step()  # Update model parameters

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")  # Print loss

    # Early stopping
    if best_loss - loss.item() > min_delta:
        best_loss = loss.item()
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping due to no significant improvement in loss")
        break


Epoch 1, Loss: 0.40867576003074646
Epoch 2, Loss: 0.354690283536911
Epoch 3, Loss: 0.3287122845649719
Epoch 4, Loss: 0.3120560944080353
Epoch 5, Loss: 0.3123176395893097
Epoch 6, Loss: 0.30674678087234497
Epoch 7, Loss: 0.29892855882644653
Epoch 8, Loss: 0.2994576096534729
Epoch 9, Loss: 0.31298717856407166
Epoch 10, Loss: 0.28839296102523804
Epoch 11, Loss: 0.2882095277309418
Epoch 12, Loss: 0.28659510612487793
Epoch 13, Loss: 0.28482022881507874
Early stopping due to no significant improvement in loss


In [14]:
node_embeddings = out.detach().cpu()
torch.save(node_embeddings, "node_embeddings.pt")


In [15]:
loaded_node_embeddings = torch.load("node_embeddings.pt")

# Assuming df_train is a DataFrame containing the message IDs
df_train = pd.DataFrame({"message_id": range(len(loaded_node_embeddings))})  # Example DataFrame

# Create the message_id_to_embedding dictionary
message_id_to_embedding = {row["message_id"]: loaded_node_embeddings[i] for i, row in df_train.iterrows()}


In [17]:
pip install transformers

Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Using cached huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.50.0-py3-none-any.whl (10.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m1m22.8 MB/s[0m eta [36m0:00:01[0m
[?25hUsing cached huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
Using cached regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl (284 kB)
Using ca

In [20]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Load a pre-trained LLM
llm = AutoModelForCausalLM.from_pretrained("gpt2")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=llm,
    args=training_args,
    train_dataset=combined_inputs,  # Use combined inputs
)

# Fine-tune the LLM
trainer.train()

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/Users/mayur/.cache/huggingface'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/Users/mayur/.cache/huggingface'


OSError: gpt2 does not appear to have a file named pytorch_model.bin but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.

In [None]:
# Generate responses
generated_responses = llm.generate(input_ids=combined_inputs["input_ids"], max_length=512)

# Decode the generated responses
decoded_responses = [tokenizer.decode(response, skip_special_tokens=True) for response in generated_responses]

# Print the first few responses
for i, response in enumerate(decoded_responses[:5]):
    print(f"Response {i + 1}: {response}")

In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Example: Calculate BLEU score
reference = df_train["text"].tolist()  # Ground truth responses
bleu_scores = [sentence_bleu([ref], gen) for ref, gen in zip(reference, decoded_responses)]
average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score: {average_bleu}")