In [78]:
import argparse 
import numpy as np 
import time 
import random
import torch
import torch.nn as nn
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM
import pickle
import math
import torch.nn.functional as F
import dgl

class LLMGraphTransformer(nn.Module):
    def __init__(self, model_name="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", in_feats=9, hidden_size=16, out_feats=9, num_heads=4, device="cpu"):
        super().__init__()
        self.device = device

        # Load the tokenizer and model for TinyLlama
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)

        # Ensure padding token is set for TinyLlama
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Graph Transformer Layers for additional processing
        self.graph_transformer1 = GraphTransformerLayer(in_feats, hidden_size, num_heads=num_heads)
        self.graph_transformer2 = GraphTransformerLayer(hidden_size * num_heads, out_feats, num_heads=1)

        # Projection Layer to match text and graph embedding dimensions
        self.text_embedding_dim = 64  # Match this to text embeddings size (64)
        self.graph_embedding_dim = 9  # Match this to graph embeddings size (9)
        self.projection = nn.Linear(self.text_embedding_dim + self.graph_embedding_dim, out_feats).to(self.device)
        
        # Final classification layer
        self.classifier = nn.Linear(out_feats, out_feats).to(self.device)

    def forward(self, input_texts, graph_data):
        batch_graphs, node_features, edge_features = graph_data

        # Move the graph to the correct device
        batch_graphs = batch_graphs

        # Process input text with TinyLlama to generate embeddings
        inputs = self.tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)  # Ensure inputs are on the same device
        outputs = self.model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=10)

        # Convert generated tokens back to embeddings
        generated_texts = [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        text_embeddings = self.tokenizer(generated_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)['input_ids'].to(self.device)  # Move to the correct device

        # Project text_embeddings to the correct size
        text_embeddings = text_embeddings.float()  # Ensure text embeddings are of type float
        text_embedding_dim = text_embeddings.size(1)
        linear_text_projection = nn.Linear(text_embedding_dim, self.text_embedding_dim).to(self.device)
        text_embeddings = linear_text_projection(text_embeddings)

        # Process graph data using Graph Transformer
        x = self.graph_transformer1(batch_graphs, node_features)  # Outputs [1000, 64]
        linear_layer = nn.Linear(16, 64) # 16 dimensions to 64 dimensions
        x = linear_layer(x)
        x = self.graph_transformer2(batch_graphs, x)  # Input is now [1000, 64]
        x = x.to(self.device)

        # Adjust the batch size of text embeddings to match graph embeddings
        text_embeddings = text_embeddings.repeat(x.size(0) // text_embeddings.size(0), 1)

        # Combine text embeddings and graph embeddings
        combined_embeddings = torch.cat([text_embeddings, x], dim=1).to(self.device)  # Concatenate along dimension 1

        # Check combined embeddings size
#         print(f"combined_embeddings shape: {combined_embeddings.shape}")  # Should be [1000, 73]

        # Apply projection layer to match output dimension
        combined_embeddings = self.projection(combined_embeddings)

        # Final classification layer
        logits = self.classifier(combined_embeddings)
        return logits



    
    def generate_text(self, graph_data, labels, max_new_tokens=50):
        # Convert the graph adjacency list to text directly within this method
        batch_text = []
        for node, neighbors in enumerate(graph_data):
            if isinstance(neighbors, (list, set, np.ndarray)):
                for neighbor in neighbors:
                    question = f"What is the relationship between Node {node} and Node {neighbor}? Choices: {', '.join(labels)}."
                    batch_text.append(question)
            else:
                question = f"What is the relationship between Node {node} and Node {neighbors}? Choices: {', '.join(labels)}."
                batch_text.append(question)

        # Tokenize and generate predictions
        inputs = self.tokenizer(batch_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)  # Move to the correct device
#         print("inputs generated")
        outputs = self.model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
#         print("outputs generated")
        generated_text = [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        return generated_text



class GraphTransformerLayer(nn.Module):
    def __init__(self, in_dim, out_dim, num_heads=4, dropout=0.1):
        super(GraphTransformerLayer, self).__init__()
        self.num_heads = num_heads

        # Each head should process a fraction of the input features
        assert out_dim % num_heads == 0, "Output dimension must be divisible by the number of heads"
        self.head_dim = out_dim // num_heads  # Calculate dimension per head

        # Multi-head graph attention mechanism
        self.attn_fc = nn.ModuleList([nn.Linear(in_dim, self.head_dim, bias=False) for _ in range(num_heads)])

        # Feedforward layer
        self.ff = nn.Sequential(
            nn.Linear(out_dim, out_dim),
            nn.ReLU(),
            nn.Linear(out_dim, out_dim)
        )

        self.norm1 = nn.LayerNorm(out_dim)
        self.norm2 = nn.LayerNorm(out_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, g, h):
        with g.local_scope():
            # Initialize attention values for edges
            g.edata['attn'] = torch.ones(g.number_of_edges(), self.num_heads).to(h.device)  # Initialize attention scores

            # Multi-head attention
            head_outputs = []
            for i in range(self.num_heads):
                Wh = self.attn_fc[i](h)  # Ensure Wh is computed per-head
                g.ndata['Wh'] = Wh
                # Ensure dimensions match between Wh (nodes) and attn (edges)
                g.update_all(dgl.function.u_mul_e('Wh', 'attn', 'm'), dgl.function.sum('m', 'h'))
                head_outputs.append(g.ndata['h'])

            # Concatenate the heads
            h = torch.cat(head_outputs, dim=1)

            # Add & Norm
            h = self.norm1(h)
            h = F.relu(h)

            # Feedforward
            h = self.ff(h)

            # Add & Norm
            h = self.norm2(h)
            h = self.dropout(h)

        return h

# Prepare graph data for graph transformer layers
def prepare_graph_data(graphs, num_node_features=9, device="cpu"):
    for graph in graphs:
        num_nodes = graph.number_of_nodes()
        if 'feat' not in graph.ndata:
            graph.ndata['feat'] = torch.randn(num_nodes, num_node_features).to(device)  # Move to correct device

        num_edges = graph.number_of_edges()
        if 'feat' not in graph.edata:
            graph.edata['feat'] = torch.randn(num_edges, num_node_features).to(device)  # Move to correct device

    batch_graphs = dgl.batch(graphs)
    node_features = batch_graphs.ndata['feat']
    edge_features = batch_graphs.edata['feat']
    return batch_graphs, node_features, edge_features


# Data balancing function
def balance_data(data, labels, n_samples_per_label):
    label_groups = {}
    for label in np.unique(labels):
        label_indices = np.where(labels == label)[0]
        sampled_indices = np.random.choice(label_indices, size=n_samples_per_label, replace=(len(label_indices) < n_samples_per_label))
        label_groups[label] = sampled_indices

    balanced_indices = np.concatenate(list(label_groups.values()))
    balanced_data = data[balanced_indices]
    balanced_labels = labels[balanced_indices]

    return balanced_data, balanced_labels




# Training loop
def fit(args):
    data = args["dataset"]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available

    # Load the data manually (edge_feat, label, adj, adj_lists, config)
    path = "datasets/" + data + "/"
    edge_feat = np.load(path + "edge_feat_scaled.npy", allow_pickle=True)
    label = np.load(path + "label_mul.npy", allow_pickle=True)
    adj = np.load(path + "adj_random.npy", allow_pickle=True)
    with open(path + 'adj_random_list.dict', 'rb') as file:
        adj_lists = pickle.load(file)

    # Initialize LLMGraphTransformer using TinyLlama
    model = LLMGraphTransformer(device=device)
    labels = ['Normal', 'Audio-Streaming', 'Browsing', 'Chat', 'File-Transfer', 'Email', 'P2P', 'Video-Streaming', 'VOIP']

    # Optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    loss_fn = nn.CrossEntropyLoss()

    # Balance the dataset
    balanced_data, balanced_labels = balance_data(np.arange(len(edge_feat)), label, n_samples_per_label=30)

    # Split the data into train, validation, and test sets
    train_val, test, train_val_labels, test_labels = train_test_split(balanced_data, balanced_labels, test_size=0.2, stratify=balanced_labels)
    train, val, train_labels, val_labels = train_test_split(train_val, train_val_labels, test_size=0.2, stratify=train_val_labels)

    times = []
    trainscores = []
    valscores = []

    for epoch in range(10):
        print(f"Epoch: {epoch}")
        random.shuffle(train)
        

        for batch in range(int(len(train) / 10)):  # Batch size is 10
            batch_edges = train[10 * batch:10 * (batch + 1)]
             
            # Print the number of batches
            print(f"Training data size: {len(train)}")
            print(f"Number of batches: {len(train) // 10}")

            # Prepare graph data (initialize with random node features)
            graph_data = prepare_graph_data([dgl.rand_graph(100, 200) for _ in batch_edges], num_node_features=9) 

            # Convert batch_edges to text
            batch_text = model.generate_text(batch_edges, labels, max_new_tokens=10)

            optimizer.zero_grad()
            # Forward pass
            logits = model(batch_text, graph_data)
            
            # Ensure batch_labels are converted to LongTensor
            batch_labels = torch.tensor(train_labels[10 * batch:10 * (batch + 1)], device=device, dtype=torch.long)  # Convert to LongTensor

            # Print shapes to debug
#             print(f"logits shape: {logits.shape}")   # Check the logits shape
#             print(f"batch_labels shape: {batch_labels.shape}")  # Check the labels shape

            # Adjust logits size to match batch_labels if necessary
            if logits.size(0) > batch_labels.size(0):
                logits = logits[:batch_labels.size(0)]
            print(logits)
            loss = loss_fn(logits, batch_labels)
            loss.backward()

            # Update parameters
            optimizer.step()

            predicted_labels = torch.argmax(logits, dim=-1)
            acc_train = f1_score(batch_labels.cpu().numpy(), predicted_labels.cpu().numpy(), average="weighted")

            print(f'Batch {batch + 1}/{len(train) // 10}, Loss: {loss.item()}, Accuracy: {acc_train}')

    print("Training completed.")


# Prediction function
def predict(model, data_idx):
    model.eval()
    predict_output = []
    labels = ['Normal', 'Audio-Streaming', 'Browsing', 'Chat', 'File-Transfer', 'Email', 'P2P', 'Video-Streaming', 'VOIP']

    with torch.no_grad():
        for batch in range(int(len(data_idx) / 10)):
            batch_edges = data_idx[10 * batch:10 * (batch + 1)]
            graph_data = prepare_graph_data([dgl.rand_graph(100, 200) for _ in batch_edges], num_node_features=9, device=device)

            # Convert batch_edges to text
            batch_text = model.generate_text(batch_edges, labels, max_new_tokens=10)

            # Forward pass
            logits = model(batch_text, graph_data)
            predicted_labels = torch.argmax(logits, dim=-1)
            predict_output.extend(predicted_labels.cpu().numpy())

    return predict_output


if __name__ == "__main__":
    fit({"dataset": "Darknet"})


Epoch: 0
Training data size: 172
Number of batches: 17
Batch 1/17, Loss: 13328.412109375, Accuracy: 0.0
Training data size: 172
Number of batches: 17
Batch 2/17, Loss: 3447.844482421875, Accuracy: 0.0
Training data size: 172
Number of batches: 17
Batch 3/17, Loss: 12018.7255859375, Accuracy: 0.12000000000000002
Training data size: 172
Number of batches: 17
Batch 4/17, Loss: 4587.74169921875, Accuracy: 0.02857142857142857
Training data size: 172
Number of batches: 17
Batch 5/17, Loss: 3511.83740234375, Accuracy: 0.03333333333333334
Training data size: 172
Number of batches: 17
Batch 6/17, Loss: 11470.8984375, Accuracy: 0.0
Training data size: 172
Number of batches: 17
Batch 7/17, Loss: 4445.24169921875, Accuracy: 0.0
Training data size: 172
Number of batches: 17
Batch 8/17, Loss: 12191.1875, Accuracy: 0.01818181818181818
Training data size: 172
Number of batches: 17
Batch 9/17, Loss: 6497.1630859375, Accuracy: 0.06666666666666668
Training data size: 172
Number of batches: 17
Batch 10/17

Epoch: 0
Training data size: 172
Number of batches: 17
inputs generated
outputs generated


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1000x84 and 576x9)

In [36]:
import dgl
import torch

# Check if PyTorch can use the GPU
print(torch.cuda.is_available())  # Should print: True

# Create a DGL graph and move it to the GPU
g = dgl.rand_graph(100, 200)
g = g.to('cuda' if torch.cuda.is_available() else 'cpu')

# Check if the graph is on the GPU
print(g.device)

True


DGLError: [17:39:04] /opt/dgl/src/runtime/c_runtime_api.cc:88: Check failed: allow_missing: Device API gpu is not enabled. Please install the cuda version of dgl.
Stack trace:
  [bt] (0) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4f) [0x7f1c4806203f]
  [bt] (1) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(dgl::runtime::DeviceAPIManager::GetAPI(std::string, bool)+0x374) [0x7f1c487565d4]
  [bt] (2) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(dgl::runtime::DeviceAPI::Get(DLContext, bool)+0x1f4) [0x7f1c487502b4]
  [bt] (3) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(dgl::runtime::NDArray::Empty(std::vector<long, std::allocator<long> >, DLDataType, DLContext)+0x334) [0x7f1c48771e64]
  [bt] (4) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyTo(DLContext const&) const+0xc0) [0x7f1c487a8cd0]
  [bt] (5) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(dgl::aten::COOMatrix::CopyTo(DLContext const&) const+0x7d) [0x7f1c48898c4d]
  [bt] (6) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(dgl::UnitGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DLContext const&)+0x292) [0x7f1c488893d2]
  [bt] (7) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(dgl::HeteroGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DLContext const&)+0xf5) [0x7f1c487b9ef5]
  [bt] (8) /usr/local/lib/python3.8/dist-packages/dgl/libdgl.so(+0x97d3bb) [0x7f1c487c73bb]



In [14]:
import torch
print(torch.cuda.is_available())


True


In [10]:
!pip uninstall -y dgl


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: dgl 0.6.1
Uninstalling dgl-0.6.1:
  Successfully uninstalled dgl-0.6.1
[0m

In [21]:
!pip install dgl-cu102==0.6.1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting dgl-cu102==0.6.1
  Downloading dgl_cu102-0.6.1-cp38-cp38-manylinux1_x86_64.whl (36.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.8/36.8 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: dgl-cu102
Successfully installed dgl-cu102-0.6.1
[0m