In [22]:
import networkx as nx
import numpy as np

# Export the cleaned graph `G_clean` as plain Python / NumPy objects.
# NOTE: `G_clean` is not created in this cell; it must already exist in the
# kernel (it is built by the node-name cleaning cell) before this cell is run.

# Nodes, in a fixed order that also defines the adjacency row/column order
nodes = list(G_clean.nodes())

# Edges as (source, target) pairs
edges = list(G_clean.edges())

# Node attributes: maps each node to the attribute dict stored on the graph.
node_attributes = {node: G_clean.nodes[node] for node in G_clean.nodes()}

# Adjacency matrix.
# `nx.to_numpy_matrix` was removed in NetworkX 3.0; `to_numpy_array` returns the
# same values as a regular 2-D ndarray (so indexing a row yields a 1-D array).
adj_matrix = nx.to_numpy_array(G_clean, nodelist=nodes)

# Bundle everything as (nodes, edges, node_attributes, adj_matrix)
graph_tuple = (nodes, edges, node_attributes, adj_matrix)

# Now, graph_tuple contains the graph's representation.


In [37]:
def check_if_all_zeros(adj_matrix):
    """Return True when every entry of ``adj_matrix`` equals zero.

    Args:
        adj_matrix: Array-like of numbers (ndarray, np.matrix, nested list,
            tuple, or scalar).

    Returns:
        A NumPy boolean; True if all entries are zero. An empty input yields
        True, following ``np.all`` semantics.
    """
    # Coerce first: for a plain list/tuple, ``adj_matrix == 0`` is a single
    # Python ``False`` rather than an element-wise comparison.
    return np.all(np.asarray(adj_matrix) == 0)

# Sanity check: is the first row of the adjacency matrix entirely zero?
is_all_zero_example = check_if_all_zeros(graph_tuple[3][0])
print(is_all_zero_example)

# Peek at the first five node labels.
for position in range(5):
    print(graph_tuple[0][position])

# First edge in the edge list.
print(graph_tuple[1][0])

# Dimensions of the adjacency matrix.
print(graph_tuple[3].shape)

False
'CWE-319'
'CWE-6'
'CWE-756'
'CWE-668'
'CWE-266'
("'CWE-319'", "'CWE-311'")
(1497, 1497)


# Build the graph G (without description)

In [24]:
import networkx as nx
import matplotlib.pyplot as plt

def build_graph_from_file(file_path):
    """Read ``source, relation, target`` triples from a text file into a DiGraph.

    Each line is stripped and split on ``', '``. Lines that do not yield
    exactly three fields are skipped. Every accepted triple adds a directed
    edge ``source -> target`` whose ``relation`` attribute holds the middle
    field.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The populated ``nx.DiGraph``.
    """
    graph = nx.DiGraph()

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(', ')
            if len(fields) != 3:
                continue
            head, label, tail = fields
            graph.add_nodes_from((head, tail))
            graph.add_edge(head, tail, relation=label)

    return graph

def draw_graph(G):
    """Plot ``G`` with node labels and ``relation`` edge labels.

    Uses a spring layout with a fixed seed (42) and shows the figure with
    ``plt.show()``.

    Args:
        G: The networkx graph to draw.
    """
    layout = nx.spring_layout(G, seed=42)
    relation_by_edge = nx.get_edge_attributes(G, 'relation')

    nx.draw_networkx_nodes(G, layout)
    nx.draw_networkx_edges(G, layout)
    nx.draw_networkx_labels(G, layout)
    nx.draw_networkx_edge_labels(G, layout, edge_labels=relation_by_edge)

    # Render the figure
    plt.show()

# Parse the triples in 'kb.txt' (path relative to the working directory) into
# the directed graph G.
file_path = 'kb.txt'
G = build_graph_from_file(file_path)
# Optional visualisation, currently disabled:
# draw_graph(G)


In [25]:
import networkx as nx

# Assuming G is your existing networkx graph with nodes needing cleanup

def clean_node_name(node):
    """Return ``node`` with every ``[`` and ``]`` character removed.

    Args:
        node: Node label as a string.

    Returns:
        The label without square brackets; all other characters are kept.
    """
    bracket_remover = str.maketrans('', '', '[]')
    return node.translate(bracket_remover)

# Fresh directed graph that will receive the bracket-free copies
G_clean = nx.DiGraph()

# Nodes first: same attributes, cleaned label
for original_name, node_attrs in G.nodes(data=True):
    G_clean.add_node(clean_node_name(original_name), **node_attrs)

# Then edges: both endpoints cleaned, edge attributes carried over
for src, dst, edge_attrs in G.edges(data=True):
    G_clean.add_edge(clean_node_name(src), clean_node_name(dst), **edge_attrs)

# G_clean now mirrors G with cleaned node names

# GNN

In [4]:
import networkx as nx
import matplotlib.pyplot as plt
import torch
from torch_geometric.data import Data
import networkx as nx
from transformers import AutoTokenizer, AutoModel
from torch_geometric.nn import GCNConv  # Change this to your preferred GNN layer
import torch.nn.functional as F

# Toy undirected graph: three nodes in a chain, with text on nodes and edges.
G = nx.Graph()
G.add_edges_from([
    (0, 1, {"text": "Hello from node 0 to node 1"}),
    (1, 2, {"text": "Message from node 1 to node 2"}),
])
for node_id, description in enumerate(["Node 0's des.", "Node 1's des", "Node 2's des."]):
    G.add_node(node_id, text=description)

# Pretrained text encoder and its tokenizer, used to embed node texts.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')



def encode_text(text):
    """Embed text with the pretrained encoder using masked mean pooling.

    Relies on the module-level ``tokenizer`` and ``model`` (the encoder), so
    ``model`` must still refer to the encoder whenever this is called.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.

    Returns:
        One embedding row per input text: the average of the encoder's last
        hidden states over real (non-padding) tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight each token by its attention mask so padded positions (present when
    # several texts of different length are batched) do not dilute the mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden * mask).sum(dim=1) / token_counts


def convert_nx_to_pyg(G):
    """Convert a networkx graph with ``text`` node attributes to a PyG ``Data``.

    Args:
        G: networkx graph whose nodes each carry a ``'text'`` attribute.

    Returns:
        ``Data`` with ``x`` holding one ``encode_text`` embedding per node (in
        ``G.nodes()`` order) and ``edge_index`` of shape ``(2, num_edges)``.
    """
    node_list = list(G.nodes())
    # Map arbitrary node labels to row positions in ``x`` so that edge_index
    # always lines up with the feature matrix, whatever the labels are.
    index_of = {node: position for position, node in enumerate(node_list)}

    texts = [G.nodes[node]['text'] for node in node_list]
    x = torch.cat([encode_text(text) for text in texts], dim=0)

    directed = G.is_directed()
    pairs = []
    for u, v in G.edges():
        pairs.append((index_of[u], index_of[v]))
        # PyG stores an undirected edge as two directed ones; self-loops are
        # already symmetric and must not be duplicated.
        if not directed and u != v:
            pairs.append((index_of[v], index_of[u]))

    if pairs:
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
    else:
        # Keep the (2, 0) shape for edgeless graphs.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    data = Data(x=x, edge_index=edge_index)
    return data

# Encode the toy graph's node texts and bundle them with its edges as `data`.
data = convert_nx_to_pyg(G)


class GNN(torch.nn.Module):
    """Two-layer GCN producing per-node class scores.

    Args:
        hidden_channels: Width of the hidden GCN layer.
        in_channels: Input feature width. When omitted, falls back to the
            module-level ``data.num_features`` (the original behaviour).
        out_channels: Number of output scores per node (default 2).
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=2):
        super().__init__()
        if in_channels is None:
            # Backward-compatible default: read the width from the global `data`.
            in_channels = data.num_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> dropout (training mode only) -> conv2."""
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# NOTE: this rebinds `model` from the text encoder to the GNN. `encode_text`
# reads the global `model`, so it must not be called after this line.
model = GNN(hidden_channels=16)

# Toy supervision: one label per node, and every node is used for training.
labels = torch.tensor([0, 1, 0], dtype=torch.long)
data.y = labels
data.train_mask = torch.ones(3, dtype=torch.bool)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training mode enables dropout; nothing in the loop switches it off again.
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    train_logits = out[data.train_mask]
    train_targets = data.y[data.train_mask]
    loss = criterion(train_logits, train_targets)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}: Loss: {loss.item()}')
    

Epoch 1: Loss: 0.6103078722953796
Epoch 2: Loss: 1.298105001449585
Epoch 3: Loss: 1.0220133066177368
Epoch 4: Loss: 0.8595145344734192
Epoch 5: Loss: 0.910021960735321
Epoch 6: Loss: 0.67376708984375
Epoch 7: Loss: 0.617800235748291
Epoch 8: Loss: 0.6802899837493896
Epoch 9: Loss: 0.6780117154121399
Epoch 10: Loss: 0.6075509190559387
Epoch 11: Loss: 0.6708061099052429
Epoch 12: Loss: 0.6125102639198303
Epoch 13: Loss: 0.685776948928833
Epoch 14: Loss: 0.6087173223495483
Epoch 15: Loss: 0.6294495463371277
Epoch 16: Loss: 0.6278523802757263
Epoch 17: Loss: 0.6665735840797424
Epoch 18: Loss: 0.6652699708938599
Epoch 19: Loss: 0.5956849455833435
Epoch 20: Loss: 0.6626076698303223
Epoch 21: Loss: 0.6830970644950867
Epoch 22: Loss: 0.5846443772315979
Epoch 23: Loss: 0.6842755675315857
Epoch 24: Loss: 0.6803540587425232
Epoch 25: Loss: 0.6117826104164124
Epoch 26: Loss: 0.6555581092834473
Epoch 27: Loss: 0.601810872554779
Epoch 28: Loss: 0.6535786986351013
Epoch 29: Loss: 0.6526003479957581
E

# GNN + Transformer layer

In [23]:
import torch
from torch_geometric.nn import GCNConv
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.nn import Linear, Dropout
import torch.nn.functional as F

class CustomGNN(torch.nn.Module):
    """Two stacked GCN layers that map node features to ``hidden_dim`` values.

    Args:
        input_dim: Width of the incoming node features.
        hidden_dim: Output width of both GCN layers.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self, x, edge_index):
        """Return ``conv2(relu(conv1(x)))`` evaluated on ``edge_index``."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)

class GNNTransformerModel(torch.nn.Module):
    """Classifier over a text embedding concatenated with a graph embedding.

    The text branch takes the first-token vector of the transformer's last
    hidden layer. The graph branch runs ``CustomGNN`` and averages its node
    outputs into one vector. Both are concatenated, passed through dropout,
    and fed to a linear classifier.

    Args:
        gnn_hidden_dim: Output width of the GNN branch.
        transformer_model_name: Hugging Face model name or path.
        num_labels: Number of output classes.
        gnn_input_dim: Width of the node features given to the GNN
            (default 768, the previously hard-coded value).
    """

    def __init__(self, gnn_hidden_dim, transformer_model_name, num_labels, gnn_input_dim=768):
        super().__init__()
        self.gnn = CustomGNN(input_dim=gnn_input_dim, hidden_dim=gnn_hidden_dim)
        # Only the hidden states of this model are used in forward(); its own
        # classification head output is ignored.
        self.transformer = AutoModelForSequenceClassification.from_pretrained(transformer_model_name, num_labels=num_labels, output_hidden_states=True)
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
        self.dropout = Dropout(0.1)
        self.classifier = Linear(gnn_hidden_dim + self.transformer.config.hidden_size, num_labels)

    def forward(self, text, gnn_data):
        """Compute class logits for ``text`` given one graph.

        Args:
            text: A string or list of strings to classify.
            gnn_data: Mapping (or object supporting ``[...]``) with ``"x"``
                node features and ``"edge_index"`` connectivity.

        Returns:
            Logits with one row per input text and ``num_labels`` columns.
        """
        # Run everything on whatever device the module's parameters live on.
        device = next(self.parameters()).device

        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        transformer_hidden_states = transformer_outputs.hidden_states[-1]
        pooled_output = transformer_hidden_states[:, 0]

        gnn_output = self.gnn(gnn_data["x"].to(device), gnn_data["edge_index"].to(device))
        gnn_output = gnn_output.mean(dim=0, keepdim=True)
        # The single graph vector is shared by every text in the batch; without
        # this expand, concatenation fails when more than one text is passed.
        gnn_output = gnn_output.expand(pooled_output.size(0), -1)

        combined_output = torch.cat((pooled_output, gnn_output), dim=1)
        combined_output = self.dropout(combined_output)
        logits = self.classifier(combined_output)
        return logits

# Smoke test: instantiate the combined model and run one forward pass.
model_name = "bert-base-uncased"
gnn_transformer_model = GNNTransformerModel(
    gnn_hidden_dim=128,
    transformer_model_name=model_name,
    num_labels=2,
)

text = "Example text"

# Random features for 10 nodes (768 values each) and a 2 x 2 index tensor.
demo_node_features = torch.randn(10, 768)
demo_edge_index = torch.tensor([[0, 1], [1, 2]], dtype=torch.long)
gnn_data = {"x": demo_node_features, "edge_index": demo_edge_index}

logits = gnn_transformer_model(text, gnn_data)
print(logits)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[-0.0857,  0.1219]], grad_fn=<AddmmBackward0>)
