In [1]:
!nvidia-smi

Wed Mar 15 13:18:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   33C    P0    43W / 300W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!module load python/3.9.6

In [3]:
!pip3 install dgl pytorch_lightning torchmetrics pandas numpy

Ignoring pip: markers 'python_version < "3"' don't match your environment
Looking in links: /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo/avx2, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/gentoo/generic, /cvmfs/soft.computecanada.ca/custom/python/wheelhouse/generic


## Data Module

In [4]:
from data_loader.dataset_ import PlaseDectDataset
import pytorch_lightning as pl
from utils.data import static_splitter
import dgl
import multiprocessing

class DataModule(pl.LightningDataModule):
    def __init__(self, data_dir, args, num_workers = multiprocessing.cpu_count() , batch_size = 32):
        super().__init__()
        train_split, test_split, val_split =  static_splitter(data_dir)

        self.train_split = PlaseDectDataset(train_split, args)
        self.test_split = PlaseDectDataset(test_split, args)
        self.val_split = PlaseDectDataset(val_split, args)

        self.batch_size = batch_size
        self.num_workers = num_workers

    def train_dataloader(self):
        return dgl.dataloading.GraphDataLoader(
        self.train_split,
        batch_size=self.batch_size,
        num_workers=self.num_workers,
        shuffle=True
    )

    def val_dataloader(self):
        return dgl.dataloading.GraphDataLoader(
        self.val_split,
        batch_size=self.batch_size,
        num_workers=self.num_workers,
        shuffle=False
    )

    def test_dataloader(self):
        return dgl.dataloading.GraphDataLoader(
        self.test_split,
        batch_size=self.batch_size,
        num_workers=self.num_workers,
        shuffle=False
    )

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/mootez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Model Module

In [31]:
import torch
import pytorch_lightning as pl
from torch.optim import Adam
import torchmetrics

class PlastDectClassifier(pl.LightningModule):
    def __init__(self, graph_model, loss_func, lr=1e-3, weight_decay=1e-2):
        super().__init__()
        self.save_hyperparameters()
        #### Graph encoder (GGNN, GATv2, GCN)
        self.graph_model = graph_model
        #### loss function
        self.loss_func = loss_func
        #### Metrics
        self.acc = torchmetrics.Accuracy(task='binary')
        self.f1 = torchmetrics.F1Score(task='binary')
        self.mcc = torchmetrics.MatthewsCorrCoef(task='binary')
        #### Optimizer params
        self.lr=lr
        self.weight_decay=weight_decay
    

    def forward(self, x):
        x = self.graph_model(x)

        return x

    def training_step(self, batch, batch_idx):
        g, y = batch
        logits = self.graph_model(g)
        y = y.float().unsqueeze(1)
        loss = self.loss_func(logits, y)
        preds = torch.argmax(logits, dim=1)
        preds = preds.float().unsqueeze(1)
        acc = self.acc(preds, y)
        f1 = self.f1(preds, y)
        mcc = self.mcc(preds, y)
        self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True)
        self.log('train_f1', f1, on_step=True, on_epoch=True, logger=True)
        self.log('train_mcc', mcc, on_step=True, on_epoch=True, logger=True)

        return loss
    
    def validation_step(self, batch, batch_idx):
        g, y = batch
        logits = self.graph_model(g)
        y = y.float().unsqueeze(1)
        loss = self.loss_func(logits, y)
        preds = torch.argmax(logits, dim=1)
        preds = preds.float().unsqueeze(1)
        acc = self.acc(preds, y)
        f1 = self.f1(preds, y)
        mcc = self.mcc(preds, y)
        self.log('val_loss', loss, on_step=True, on_epoch=True, logger=True)
        self.log('val_acc', acc, on_step=True, on_epoch=True, logger=True)
        self.log('val_f1', f1, on_step=True, on_epoch=True, logger=True)
        self.log('val_mcc', mcc, on_step=True, on_epoch=True, logger=True)

        return loss
    
    def test_step(self, batch, batch_idx):
        g, y = batch
        logits = self.graph_model(g)
        y = y.float().unsqueeze(1)
        loss = self.loss_func(logits, y)
        preds = torch.argmax(logits, dim=1)
        preds = preds.float().unsqueeze(1)
        acc = self.acc(preds, y)
        f1 = self.f1(preds, y)
        mcc = self.mcc(preds, y)
        self.log('test_loss', loss, on_step=True, on_epoch=True, logger=True)
        self.log('test_acc', acc, on_step=True, on_epoch=True, logger=True)
        self.log('test_f1', f1, on_step=True, on_epoch=True, logger=True)
        self.log('test_mcc', mcc, on_step=True, on_epoch=True, logger=True)

        return loss
        

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=1e-3, weight_decay=1e-2)
        return optimizer

## Main

In [32]:
from modules.model import GGNN
class args:
    dataset="java_cc_utc"
    data_src="/home/mootez/scratch/code_smells_dataset/ComplexConditional.csv"
    graph_embed_size=256 
    feature_size=256 
    emb_type="w2v" 
    w2v="/home/mootez/projects/def-tusharma/mootez/pl-agnostic-smell-detection/wv_models/word2vec_ir"
    tok="nltk" 
    build_method="utc" 
    window_size=3
    read_out="sum"
    max_etypes=1
    num_steps=6

In [33]:
encoder = GGNN(input_dim=args.feature_size, output_dim=args.graph_embed_size,
                        num_steps=args.num_steps, max_edge_types=args.max_etypes, read_out=args.read_out)

In [34]:
data_module = DataModule(args.data_src, args)

[03/15/2023 - 13:32:12] File "/project/6067998/mootez/BaseGNN/data_loader/dataset_.py", line 41  	Number of val instances: 2000	
[03/15/2023 - 13:32:23] File "/project/6067998/mootez/BaseGNN/data_loader/dataset_.py", line 41  	Number of train instances: 8000	
[03/15/2023 - 13:32:33] File "/project/6067998/mootez/BaseGNN/data_loader/dataset_.py", line 41  	Number of eval instances: 145953	


In [35]:
loss_function = torch.nn.BCELoss(reduction='sum')

In [36]:
model = PlastDectClassifier(encoder, loss_function)

  rank_zero_warn(
  rank_zero_warn(


In [37]:
trainer = pl.Trainer(
    accelerator='gpu',
    devices=[0],
    max_epochs=3,
    log_every_n_steps=100,
)

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [38]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [39]:
trainer.fit(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                   | Params
-------------------------------------------------------
0 | graph_model | GGNN                   | 460 K 
1 | loss_func   | BCELoss                | 0     
2 | acc         | BinaryAccuracy         | 0     
3 | f1          | BinaryF1Score          | 0     
4 | mcc         | BinaryMatthewsCorrCoef | 0     
-------------------------------------------------------
460 K     Trainable params
0         Non-trainable params
460 K     Total params
1.843     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]torch.Size([32, 1])
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.69it/s]torch.Size([32, 1])
                                                                           

  rank_zero_warn(


Epoch 0:   0%|          | 0/63 [00:00<?, ?it/s] torch.Size([32, 1])




Epoch 0:   2%|▏         | 1/63 [00:09<10:01,  9.69s/it, v_num=5]torch.Size([32, 1])
Epoch 0:   3%|▎         | 2/63 [00:09<04:59,  4.91s/it, v_num=5]torch.Size([32, 1])
Epoch 0:   5%|▍         | 3/63 [00:09<03:17,  3.29s/it, v_num=5]torch.Size([32, 1])
Epoch 0:   6%|▋         | 4/63 [00:09<02:25,  2.47s/it, v_num=5]torch.Size([32, 1])
Epoch 0:   8%|▊         | 5/63 [00:09<01:54,  1.98s/it, v_num=5]torch.Size([32, 1])
Epoch 0:  10%|▉         | 6/63 [00:09<01:34,  1.66s/it, v_num=5]torch.Size([32, 1])
Epoch 0:  11%|█         | 7/63 [00:09<01:19,  1.42s/it, v_num=5]torch.Size([32, 1])
Epoch 0:  13%|█▎        | 8/63 [00:09<01:08,  1.25s/it, v_num=5]torch.Size([32, 1])
Epoch 0:  14%|█▍        | 9/63 [00:10<01:00,  1.11s/it, v_num=5]torch.Size([32, 1])
Epoch 0:  16%|█▌        | 10/63 [00:10<00:53,  1.01s/it, v_num=5]torch.Size([32, 1])
Epoch 0:  17%|█▋        | 11/63 [00:10<00:47,  1.09it/s, v_num=5]torch.Size([32, 1])
Epoch 0:  19%|█▉        | 12/63 [00:10<00:42,  1.19it/s, v_num=5]torch.Siz

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
