In [1]:
import os
import pickle
import json
import random
import numpy as np
!pip install torch
import torch
import torch.nn as nn
import torch.nn.functional as F



In [2]:
# set seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

In [3]:
import warnings
warnings.filterwarnings("ignore")


In [4]:
!pip install torch_geometric --user



In [5]:
def get_files_in_folder(folder_path):
    file_list = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            file_list.append(file_path)
    return file_list

# Replace 'folder_path' with the path to the folder you want to read files from
verilog_files = get_files_in_folder("encoded")
print(verilog_files)
print(len(verilog_files))


['encoded\\adder11.txt', 'encoded\\adder12.txt', 'encoded\\adder13.txt', 'encoded\\adder14.txt', 'encoded\\adder15.txt', 'encoded\\adder16.txt', 'encoded\\adder17.txt', 'encoded\\adder18.txt', 'encoded\\adder19.txt', 'encoded\\adder2.txt', 'encoded\\adder20.txt', 'encoded\\adder5.txt', 'encoded\\adder6.txt', 'encoded\\adder8.txt', 'encoded\\ALU10.txt', 'encoded\\ALU13.txt', 'encoded\\ALU14.txt', 'encoded\\ALU15.txt', 'encoded\\ALU2.txt', 'encoded\\ALU6.txt', 'encoded\\ALU7.txt', 'encoded\\ALU8.txt', 'encoded\\ALU9.txt', 'encoded\\and1.txt', 'encoded\\and10.txt', 'encoded\\and12.txt', 'encoded\\and13.txt', 'encoded\\and14.txt', 'encoded\\and15.txt', 'encoded\\and16.txt', 'encoded\\and17.txt', 'encoded\\and18.txt', 'encoded\\and19.txt', 'encoded\\and2.txt', 'encoded\\and20.txt', 'encoded\\and21.txt', 'encoded\\and23.txt', 'encoded\\and25.txt', 'encoded\\and26.txt', 'encoded\\and27.txt', 'encoded\\and28.txt', 'encoded\\and29.txt', 'encoded\\and3.txt', 'encoded\\and30.txt', 'encoded\\and6.

In [6]:
from torch_geometric.data import Data


def extracting_attributes(verilog_file):
    try:
        if os.path.isfile(verilog_file):
            with open(verilog_file, "r") as file:
                loaded_data = json.load(file)
                nodes = loaded_data[0]
                edges = loaded_data[1]
                label = loaded_data[3]
                x = torch.tensor(nodes, dtype=torch.float)
                edge_index = torch.tensor(edges, dtype=torch.long)
                y = torch.tensor(label, dtype=torch.float)
                num_nodes = x.size(0)
                
                # Create batch assignment vector (assuming one graph per file)
                batch = torch.zeros(num_nodes, dtype=torch.long)
                data = Data(x=x, edge_index=edge_index, y = y, batch = batch)
                return data
    except Exception as e:
        return None
    

In [7]:
import os
import json
import torch
from torch.utils.data import DataLoader, Dataset

class VerilogDataset(Dataset):  # Using Dataset from torch_geometric
    def __init__(self, verilog_files):
        print(f"Loaded {len(verilog_files)} Verilog files.")
        self.verilog_files = verilog_files

    def __len__(self):
        return len(self.verilog_files)

    def __getitem__(self, idx):
        verilog_file = self.verilog_files[idx]
        data = extracting_attributes(verilog_file)
        return data

dataset = VerilogDataset(verilog_files)
print(len(dataset))

Loaded 314 Verilog files.
314


In [8]:
temp = dataset[0]
temp.y

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [9]:
def graph_stat(dataset):
    """
    TODO: calculate the statistics of the ENZYMES dataset.
    
    Outputs:
        min_num_nodes: min number of nodes
        max_num_nodes: max number of nodes
        mean_num_nodes: average number of nodes
        min_num_edges: min number of edges
        max_num_edges: max number of edges
        mean_num_edges: average number of edges
    """
    nodes_edges = [(data.num_nodes, data.num_edges) for data in dataset]
    num_nodes, num_edges = list(list(zip(*nodes_edges))[0]), list(list(zip(*nodes_edges))[1])
    min_num_nodes = min(num_nodes)
    max_num_nodes = max(num_nodes)
    mean_num_nodes = np.mean(num_nodes)
    min_num_edges = min(num_edges)
    max_num_edges = max(num_edges)
    mean_num_edges = np.mean(num_edges)
    
    print(f"min_num_nodes: {min_num_nodes}")
    print(f"max_num_nodes: {max_num_nodes}")
    print(f"mean_num_nodes: {mean_num_nodes}")
    print(f"min_num_edges: {min_num_edges}")
    print(f"max_num_edges: {max_num_edges}")
    print(f"mean_num_edges: {mean_num_edges}")

graph_stat(dataset)

min_num_nodes: 3
max_num_nodes: 165
mean_num_nodes: 15.006369426751592
min_num_edges: 2
max_num_edges: 208
mean_num_edges: 18.29299363057325


In [10]:
from torch_geometric.data import Data
from torch.utils.data.dataloader import default_collate

def custom_collate(batch):
    if isinstance(batch[0], Data):
        return batch
    else:
        return default_collate(batch)

In [11]:
# from torch.utils.data import random_split

# # Define the sizes of training, validation, and test sets
# train_size = int(0.7 * len(dataset))  # 70% of the data for training
# val_size = int(0.15 * len(dataset))   # 15% of the data for validation
# test_size = len(dataset) - train_size - val_size  # Remaining data for testing

# # Split the dataset into training, validation, and test sets
# train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# # Create DataLoader for each set
# train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=32)
# test_dataloader = DataLoader(test_dataset, batch_size=32)


from torch.utils.data import random_split

# Define the size of the training set (e.g., 70% of the data)
train_size = int(0.7 * len(dataset))

# Calculate the size of the testing set
test_size = len(dataset) - train_size

# Split the dataset into training and testing sets
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoader for each set
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=custom_collate)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=custom_collate)


In [12]:
len(train_loader.dataset)

219

In [13]:
loader_iter = iter(train_loader)
batch = next(loader_iter)
print(batch)
# print(batch.num_graphs)

[Data(x=[9, 2], edge_index=[2, 12], y=[1, 16], batch=[9]), Data(x=[34, 2], edge_index=[2, 41], y=[1, 16], batch=[34]), Data(x=[10, 2], edge_index=[2, 9], y=[1, 16], batch=[10]), Data(x=[48, 2], edge_index=[2, 75], y=[1, 16], batch=[48]), Data(x=[5, 2], edge_index=[2, 4], y=[1, 16], batch=[5]), Data(x=[5, 2], edge_index=[2, 4], y=[1, 16], batch=[5]), Data(x=[4, 2], edge_index=[2, 3], y=[1, 16], batch=[4]), Data(x=[4, 2], edge_index=[2, 3], y=[1, 16], batch=[4]), Data(x=[6, 2], edge_index=[2, 6], y=[1, 16], batch=[6]), Data(x=[30, 2], edge_index=[2, 39], y=[1, 16], batch=[30]), Data(x=[9, 2], edge_index=[2, 8], y=[1, 16], batch=[9]), Data(x=[4, 2], edge_index=[2, 3], y=[1, 16], batch=[4]), Data(x=[13, 2], edge_index=[2, 15], y=[1, 16], batch=[13]), Data(x=[47, 2], edge_index=[2, 80], y=[1, 16], batch=[47]), Data(x=[4, 2], edge_index=[2, 3], y=[1, 16], batch=[4]), Data(x=[11, 2], edge_index=[2, 12], y=[1, 16], batch=[11]), Data(x=[38, 2], edge_index=[2, 37], y=[1, 16], batch=[38]), Data(x

In [14]:
for data in train_loader:
    print(len(data))
    # print(data.num_graphs)
    break

32


In [15]:
import math
from torch_geometric.utils import to_dense_adj, add_self_loops

class GCNConv(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNConv, self).__init__()
        self.theta = nn.Parameter(torch.FloatTensor(in_channels, out_channels))
        # Initialize the parameters.
        stdv = 1. / math.sqrt(out_channels)
        self.theta.data.uniform_(-stdv, stdv)
    
    def forward(self, x, edge_index):
        """
        TODO:
            1. Generate the adjacency matrix with self-loop \hat{A} using edge_index.
            2. Calculate the diagonal degree matrix \hat{D}.
            3. Calculate the output X' with torch.mm using the equation above.
        """

        num_nodes = x.shape[0]
        A = torch.sparse_coo_tensor(edge_index, torch.ones(edge_index.shape[1]), (num_nodes, num_nodes))
        A = A.to_dense()
        A_hat = A + torch.eye(num_nodes)
        
        A_sum = torch.sum(A_hat, dim=1)
        D = torch.pow(A_sum, -0.5)
        D[D == float('inf')] = 0.0
        D_hat_sqrt = torch.diag(D)
        
        first = torch.mm(torch.mm(D_hat_sqrt, A_hat), D_hat_sqrt)
        second = torch.mm(x, self.theta)
        
        ret = torch.mm(first, second)
        
        return ret

In [16]:
# from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

class GCN(torch.nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        """
        TODO:
            1. Define the first convolution layer using `GCNConv()`. Set `out_channels` to 64;
            2. Define the first activation layer using `nn.ReLU()`;
            3. Define the second convolution layer using `GCNConv()`. Set `out_channels` to 64;
            4. Define the second activation layer using `nn.ReLU()`;
            5. Define the third convolution layer using `GCNConv()`. Set `out_channels` to 64;
            6. Define the dropout layer using `nn.Dropout()`;
            7. Define the linear layer using `nn.Linear()`. Set `output_size` to 2.

        Note that for MUTAG dataset, the number of node features is 7, and the number of classes is 2.

        """
        
        self.gcn1 = GCNConv(in_channels=2, out_channels=64)
        self.a1 = nn.ReLU()
        self.gcn2 = GCNConv(in_channels=64, out_channels=64)
        self.a2 = nn.ReLU()
        self.gcn3 = GCNConv(in_channels=64, out_channels=64)
        self.dropout = nn.Dropout(p=0.4)
        self.linear = nn.Linear(in_features=64, out_features=16)

    def forward(self, x, edge_index, batch):
        """
        TODO:
            1. Pass the data through the frst convolution layer;
            2. Pass the data through the activation layer;
            3. Pass the data through the second convolution layer;
            4. Obtain the graph embeddings using the readout layer with `global_mean_pool()`;
            5. Pass the graph embeddgins through the dropout layer;
            6. Pass the graph embeddings through the linear layer.
            
        Arguments:
            x: [num_nodes, 3], node features
            edge_index: [2, num_edges], edges
            batch: [num_nodes], batch assignment vector which maps each node to its 
                   respective graph in the batch

        Outputs:
            probs: probabilities of shape (batch_size, 16)
        """
        x = self.gcn1(x, edge_index)
        x = self.a1(x)
        x = self.gcn2(x, edge_index)
        x = self.a2(x)
        x = self.gcn3(x, edge_index)
        x = global_mean_pool(x, batch)
        x = self.dropout(x)
        x = self.linear(x)
        
        probs = torch.nn.functional.softmax(x, dim=-1)
        
        return probs
        
        
        
GCN()

GCN(
  (gcn1): GCNConv()
  (a1): ReLU()
  (gcn2): GCNConv()
  (a2): ReLU()
  (gcn3): GCNConv()
  (dropout): Dropout(p=0.4, inplace=False)
  (linear): Linear(in_features=64, out_features=16, bias=True)
)

In [17]:
gcn = GCN()

In [92]:
gcn = GCN()

# optimizer
optimizer = torch.optim.Adam(gcn.parameters(), lr=0.001)
# loss
criterion = torch.nn.CrossEntropyLoss()

def train(train_loader):
    gcn.train()
    for batch_data in train_loader:  # Iterate in batches over the training dataset.
        for data in batch_data:
            # print(data.x, data.edge_index, data.batch, data.y)
            out = gcn(data.x, data.edge_index, data.batch)
            # print(out)
            # print(data.y)
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        
        
def test(loader):
    gcn.eval()
    correct = 0
    for batch_data in loader:  # Iterate in batches over the training/test dataset.
        for data in batch_data:
            # print(data.x, data.edge_index, data.batch, data.y)
            out = gcn(data.x, data.edge_index, data.batch)  
            pred = out.argmax(dim=1)  # Use the class with highest probability.
            y_label = (data.y.tolist())
            y_label = y_label[0].index(1.0)
            pred_label = (pred.tolist())[0]
            print(pred_label)
            print(y_label)
            if y_label == pred_label:
                correct += 1              # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.

import time

# Start the timer
start_time = time.time()

# Your training code here
for epoch in range(200):
    train(train_loader)
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch + 1:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
    



# End the timer
end_time = time.time()

# Calculate the duration
duration = end_time - start_time
print("Training duration:", duration, "seconds")

10
4
10
13
10
4
10
13
10
7
10
9
10
2
10
11
10
11
10
6
10
6
10
3
10
3
10
2
10
3
10
5
10
13
10
7
10
6
10
10
10
13
10
2
10
12
10
14
10
11
10
4
10
5
10
2
10
1
10
13
10
6
10
4
10
3
10
8
10
8
10
10
10
13
10
6
10
3
10
7
10
8
10
12
10
12
10
3
10
5
10
7
10
4
10
7
10
14
10
7
10
13
10
2
10
11
10
2
10
10
10
0
10
3
10
11
10
4
10
7
10
12
10
11
10
13
10
7
10
7
10
14
10
7
10
13
10
11
10
10
10
6
10
14
10
3
10
9
10
10
10
4
10
6
10
5
10
10
10
10
10
8
10
2
10
10
10
14
10
14
10
3
10
8
10
13
10
12
10
9
10
10
10
3
10
6
10
11
10
8
10
8
10
5
10
15
10
0
10
10
10
11
10
7
10
2
10
8
10
1
10
11
10
11
10
8
10
2
10
13
10
8
10
14
10
0
10
12
10
12
10
3
10
6
10
14
10
6
10
9
10
2
10
11
10
1
10
9
10
10
10
15
10
5
10
6
10
10
10
7
10
2
10
2
10
10
10
11
10
6
10
4
10
0
10
14
10
9
10
2
10
0
10
9
10
13
10
11
10
2
10
10
10
14
10
6
10
8
10
15
10
10
10
14
10
13
10
8
10
11
10
7
10
12
10
6
10
14
10
4
10
10
10
0
10
2
10
0
10
0
10
7
10
14
10
5
10
3
10
8
10
10
10
14
10
14
10
7
10
10
10
10
10
12
10
8
10
4
10
0
10
11
10
1
10
3
10
3
10
2


In [93]:
test_acc

0.21052631578947367

In [94]:
torch.save(gcn.state_dict(), 'gcn_model3.pth')

In [18]:
gcn.load_state_dict(torch.load('gcn_model3.pth'))

<All keys matched successfully>

In [None]:
def predict(data):
    gcn.eval()
    out = gcn(data.x, data.edge_index, data.batch)
    pred = out.argmax(dim=1)
    return pred

In [None]:
x = torch.tensor([[1, 1], [0, 1], [0, 1], [0, 1], [2, 2], [3, 9], [3, 8], [4, 3], [4, 10], [4, 3], [4, 2], [4, 3], [4, 2], [4, 3], [4, 2], [4, 3], [4, 3], [4, 3]], dtype=torch.float)
y = torch.tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.long)
edge_index = torch.tensor([[1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], 
                           [5, 6, 8, 0, 7, 9, 10, 11, 13, 15, 16, 17, 7, 9, 11, 13, 15, 16, 17, 8, 4, 8, 8, 12, 8, 14, 8, 8, 8, 8]], dtype=torch.long)
gcn.eval()
batch = torch.tensor([0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0 ], dtype=torch.long)
data_trial = Data(x=x, edge_index=edge_index, y=y, batch = batch)

out = gcn(data_trial.x, data_trial.edge_index, data_trial.batch)
print(out)
pred = out.argmax(dim=1)
print(pred)

tensor([[0.0000e+00, 0.0000e+00, 1.2009e-42, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
         6.7527e-40, 0.0000e+00, 2.1214e-25, 0.0000e+00]],
       grad_fn=<SoftmaxBackward0>)
tensor([10])


In [None]:
gcn = GCN()
gcn.load_state_dict(torch.load('gcn_model3.pth'))
x = torch.tensor([[1, 1], [0, 1], [0, 1], [0, 1], [2, 2], [3, 9], [3, 8], [4, 3], [4, 10], [4, 3], [4, 2], [4, 3], [4, 2], [4, 3], [4, 2], [4, 3], [4, 3], [4, 3]], dtype=torch.float)
y = torch.tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.long)
edge_index = torch.tensor([[1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], 
                           [5, 6, 8, 0, 7, 9, 10, 11, 13, 15, 16, 17, 7, 9, 11, 13, 15, 16, 17, 8, 4, 8, 8, 12, 8, 14, 8, 8, 8, 8]], dtype=torch.long)
gcn.eval()
batch = torch.tensor([0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0 ], dtype=torch.long)
data_trial = Data(x=x, edge_index=edge_index, y=y, batch = batch)

out = gcn(data_trial.x, data_trial.edge_index, data_trial.batch)
print(out)
pred = out.argmax(dim=1)
print(pred)


In [None]:
for data in test_dataset:
    out = gcn(data_trial.x, data_trial.edge_index, data_trial.batch)
    pred = out.argmax(dim=1)

# Experiment 1


### Mode of Operation

* 321 verilog files 
* only 3 features             [type, operation_type, num_of_connections]
* no edge attribute
* 18 classes 
* 200 epochs 
* learning rate = 0.01
* Dropoout = 0.4
* Adam Optimizer
* train 70, test 30 (on whole dataset, not each class)
* time of training = seconds
* conv relu conv relu conv dropout linear 


### Results

* Train acc:  0.2902
* Test Acc: 0.1959


### Suggested Modifications for upcoming experiments

1) Clean dataset (by removing unnecessay, uninformative or wrong code files)
2) remove reduntant parsing (different files but same parsing)
3) include more informative features
4) improve encoding format
5) try using less classes (most important ones, so that less classes but more balanced dataset)
6) adding more files
7) adjusting hyperparameters such as learning rate, dropout, ...etc
8) splitting train, val, test
9) using equal percentages of each class (adjusting splitting)

# Experiment 3



2 node features (node type, num_connections)
no edge attrib

train= 25, test = 21

lr = 0.001

16 classes

314 file only  (removed bcd and seg)

arch => conv relu conv relu conv dropout linear 

200 epochs