In [1]:
import torch
import torch.nn as nn
import numpy as np
from statistics import mean
from create_datasets import *
from sklearn.metrics import accuracy_score

In [2]:
# Load the serialized object stored in the dump file (presumably the edge list the
# rest of the notebook is built from — confirm against the dataset docs).
# NOTE(review): allow_pickle=True makes np.load unpickle Python objects, and unpickling
# can execute code embedded in the file. Only load dumps that come from a trusted source.
final_edges = np.load('../datasets/final_edges.dump', allow_pickle=True)

In [None]:
# Build the samples from the loaded edges. `generate_fingerprints` is not defined in
# this notebook (presumably it arrives via `from create_datasets import *` — verify);
# its return structure is not visible here, it is only passed on to `LinkDataset` below.
data = generate_fingerprints(final_edges)

  0%|          | 0/87153 [00:00<?, ?it/s]

In [None]:
class CNN_net(nn.Module):
    """1-D convolutional encoder followed by a two-layer fully connected head.

    Three Conv1d -> BatchNorm1d -> ReLU -> Dropout stages widen the channel
    dimension (in_c -> 64 -> 128 -> 256). Every convolution uses stride 1 and a
    padding of kernel_size // 2, so the sequence length is left unchanged. The
    resulting feature map is flattened and fed through Linear -> Sigmoid -> Linear.

    Args:
        inp_len: length of the input sequence; it fixes the input width of the
            first Linear layer at 256 * inp_len.
        out_len: width of the output vector.
        in_c: number of input channels.
    """

    def __init__(self, inp_len=1024, out_len = 256, in_c=1):
        super().__init__()

        def conv_stage(c_in, c_out, kernel, drop_p):
            # Odd kernel with padding kernel // 2 keeps the length equal to inp_len.
            return nn.Sequential(
                nn.Conv1d(c_in, c_out, kernel_size=kernel, stride=1, padding=kernel // 2),
                nn.BatchNorm1d(c_out),
                nn.ReLU(),
                nn.Dropout(drop_p),
            )

        # Kernel size and dropout probability both grow with depth.
        self.conv_block1 = conv_stage(in_c, 64, 3, 0.1)
        self.conv_block2 = conv_stage(64, 128, 5, 0.2)
        self.conv_block3 = conv_stage(128, 256, 7, 0.3)

        # Head operating on the flattened (256 channels x inp_len positions) features.
        self.decoder = nn.Sequential(
            nn.Linear(256 * inp_len, 512),
            nn.Sigmoid(),
            nn.Linear(512, out_len),
        )

    def forward(self, x):
        """Map a (batch, in_c, inp_len) tensor to a (batch, out_len) tensor."""
        for stage in (self.conv_block1, self.conv_block2, self.conv_block3):
            x = stage(x)
        flat = x.view(x.size(0), -1)  # (batch, 256 * inp_len)
        return self.decoder(flat)

In [None]:
class FullNet(nn.Module):
    """Two-branch model whose branch outputs are concatenated and fed to a third model.

    Args:
        finger_print_model: module applied to the `fp` input.
        graph_embedding_model: module applied to the `ge` input.
        combined_model: module applied to the concatenated branch outputs.
    """

    def __init__(self, finger_print_model, graph_embedding_model, combined_model):
        super().__init__()
        self.FP_model = finger_print_model
        self.GE_model = graph_embedding_model
        self.CB_model = combined_model

    def forward(self, fp, ge):
        """Run both branches, join them along dim 1 and classify the result."""
        branch_outputs = (self.FP_model(fp), self.GE_model(ge))
        # Concatenate along dim 1, then insert a singleton dim at position 1
        # (the channel axis when the combined model is a CNN_net).
        merged = torch.cat(branch_outputs, 1).unsqueeze(1)
        return self.CB_model(merged)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

In [None]:
# Wrap the generated samples in the dataset class and hold out 20% of them for testing.
dataset = LinkDataset(data)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Seed the split explicitly so the same samples end up in train/test on every run;
# otherwise reported accuracies are not comparable between kernel restarts.
train, test = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
BATCH_SIZE = 256
# Reshuffle the training samples every epoch (the loader default is a fixed order);
# the test loader keeps a deterministic order.
trainloader = DataLoader(train, num_workers=16, batch_size=BATCH_SIZE, shuffle=True)
testloader = DataLoader(test, num_workers=16, batch_size=BATCH_SIZE)

In [None]:
# Fingerprint branch (length 1024) and graph-embedding branch (length 256) each emit a
# 256-wide vector; the combined network consumes their 512-wide concatenation and
# outputs 2 class scores.
model = FullNet(
    finger_print_model=CNN_net(inp_len=1024, out_len=256),
    graph_embedding_model=CNN_net(inp_len=256, out_len=256),
    combined_model=CNN_net(inp_len=512, out_len=2),
)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

In [None]:
# Smoke-test the forward pass with random inputs shaped like one batch per branch.
# It runs in eval mode under no_grad so the random data neither updates the BatchNorm
# running statistics nor builds an autograd graph; training mode is restored afterwards.
model.eval()
with torch.no_grad():
    out = model(
        torch.rand(32, 1, 1024).float().to(device),
        torch.rand(32, 1, 256).float().to(device),
    )
model.train()

In [None]:
!nvidia-smi

In [None]:
from prettytable import PrettyTable

def count_parameters(model):
    """Print a per-tensor table of trainable parameter counts and return their total.

    Parameters with `requires_grad == False` are left out of both the table
    and the total.
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if parameter.requires_grad:
            n_values = parameter.numel()
            table.add_row([name, n_values])
            total_params = total_params + n_values
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params
    
count_parameters(model)

In [None]:
def eval(model,testloader):
    """Evaluate `model` on `testloader` and return `(accuracy, mean_batch_loss)`.

    NOTE: this function shadows the builtin `eval`; the name is kept because the
    other cells of the notebook call it.

    Reads the notebook-level globals `device` and `criterion`. Each batch must
    yield `(fp, ge, label)`; the true class is taken as the argmax of `label`
    and the predicted class as the argmax of the model output.

    The model is switched to eval mode for the pass and put back into train mode
    before returning. The accuracy is also printed.
    """
    model.eval()
    test_loss = 0.0
    preds = []
    trues = []
    # Inference only: without no_grad every forward pass would keep its autograd
    # graph alive, wasting memory and time during evaluation.
    with torch.no_grad():
        for fp, ge, label in testloader:
            output = model(fp.float().to(device), ge.float().to(device))
            loss = criterion(output, label.float().to(device))
            test_loss += loss.item()
            # One argmax per batch instead of a Python loop with a device sync per
            # sample; flattening per sample mirrors `tensor[i].argmax()`.
            preds.extend(output.reshape(len(output), -1).argmax(dim=1).tolist())
            trues.extend(label.reshape(len(label), -1).argmax(dim=1).tolist())
    model.train()
    # scikit-learn's convention is (y_true, y_pred).
    acc = accuracy_score(trues, preds)
    print("Accuracy", acc)
    return acc, test_loss / len(testloader)

In [None]:
# Train for `num_epochs` epochs, evaluate after each one, and checkpoint the model
# whenever the test accuracy improves.
train_losses = []
test_losses = []
num_epochs= 50
best_acc = 0.0
acc_list = []
# Epochs are numbered from 1; the upper bound is num_epochs + 1 so that exactly
# `num_epochs` epochs run (range(1, num_epochs) stopped one epoch short).
for epoch in tqdm(range(1, num_epochs + 1)):
    train_loss = 0.0
    model.train()
    batch_id = 0
    for fp, ge, label in trainloader:
        batch_id +=1
        
        optimizer.zero_grad()
        output = model(fp.float().to(device),ge.float().to(device))
        loss = criterion(output, label.float().to(device))
        loss.backward()
        optimizer.step()
        train_loss += loss.item() 
    
        # '\r' keeps the per-batch progress on a single console line.
        print(f'Epoch:{epoch} batch {batch_id}/{len(trainloader)} loss:{loss.item()}', end='\r')
    
    # eval() switches to eval mode for the pass and restores train mode afterwards.
    acc, test_loss = eval(model, testloader)
    acc_list.append(acc)

    if acc > best_acc:
        best_acc = acc
        print("Improved Accuracy is", acc )
        torch.save(model, 'SAVED_MODELS/CNN-bestmodel_4.pt')
        with open('SAVED_MODELS/CNN-bestmodel_4.txt', 'w') as f:
            # Print the module itself: calling model.eval() here only to obtain the
            # repr also flipped the model into eval mode as a side effect.
            print(model, "Accuracy" , acc, file=f)

    mean_train_loss = train_loss / len(trainloader)
    print()
    print("Train loss: ", mean_train_loss)
    print("Test  loss: ",test_loss)
    
    train_losses.append(mean_train_loss)
    test_losses.append(test_loss)

In [None]:
import matplotlib.pyplot as plt
# One point per epoch for each curve; label them so the figure can be read on its own.
plt.plot(train_losses, label='train')
plt.plot(test_losses, label='test')
plt.xlabel('epoch')
plt.ylabel('mean batch loss')
plt.title('Training vs. test loss')
plt.legend()
plt.show()

In [None]:
# model = torch.load('SAVED_MODELS/CNN-bestmodel_4.pt')
# eval(model, testloader)

In [None]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay
def get_performance(model, testloader):
    """Print classification metrics for `model` on `testloader` and plot the confusion matrix.

    Reads the notebook-level global `device`. Each batch must yield
    `(fp, ge, label)`; the true class is taken as the argmax of `label` and the
    predicted class as the argmax of the model output. Prints accuracy, F1 score
    and the classification report for labels 0 and 1, then shows the confusion
    matrix. The model is put back into train mode before the metrics are computed.
    """
    model.eval()
    preds = []
    trues = []
    # Inference only: no autograd graph is needed while collecting predictions.
    with torch.no_grad():
        for fp, ge, label in testloader:
            output = model(fp.float().to(device), ge.float().to(device))
            # Batch-wise argmax; flattening per sample mirrors `tensor[i].argmax()`.
            preds.extend(output.reshape(len(output), -1).argmax(dim=1).tolist())
            trues.extend(label.reshape(len(label), -1).argmax(dim=1).tolist())
    model.train()
    # scikit-learn metrics take (y_true, y_pred) in that order.
    print("Accuracy", accuracy_score(trues, preds))
    print("f1 score", f1_score(trues, preds))
    print(classification_report(trues, preds, labels=[0,1]))
    print()
    cm = confusion_matrix(trues, preds, labels=[0,1])
    # display_labels must be passed by keyword: it is keyword-only in current
    # scikit-learn releases, so the positional form raises a TypeError there.
    disp = ConfusionMatrixDisplay(cm, display_labels=np.array([0,1]))
    disp.plot()
    plt.show()

In [None]:
# Print accuracy, F1 and the classification report, and show the confusion matrix,
# for the current `model` on `testloader`.
get_performance(model, testloader)

In [None]:
# torch.save(model, 'CNN-model-acc-0.888.pt')
# with open('CNN-model-acc-0.888.txt', 'w') as f:
#     print(model.eval(), file=f)

In [None]:
# Run a Python garbage-collection pass, then ask PyTorch to release the CUDA memory
# held by its caching allocator.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# NOTE(review): Module.parameters() returns an iterator, so this most likely prints the
# iterator's repr rather than any parameter values — confirm what was meant to be shown
# (e.g. iterate over it, or use count_parameters(model)).
print(model.parameters())

In [None]:
from sklearn.model_selection import KFold

In [None]:
def reset_weights(m):
    """Re-initialise every direct child of `m` that exposes `reset_parameters`.

    Only `m.children()` is inspected, so nested modules are reached only when
    this function is applied recursively (e.g. through `Module.apply`). A line
    is printed for each child that gets reset.
    """
    for layer in m.children():
        if not hasattr(layer, 'reset_parameters'):
            continue
        print(f'Reset trainable parameters of layer = {layer}')
        layer.reset_parameters()

In [None]:
# Configuration for the K-fold cross-validation run.
k_folds = 5
num_epochs = 1
loss_function = nn.CrossEntropyLoss()

# For fold results
results = {}

# Set fixed random number seed
torch.manual_seed(42)

# Define the K-fold Cross Validator.
# torch.manual_seed does not seed scikit-learn, so the shuffled fold assignment is
# only reproducible when KFold receives its own random_state.
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

In [None]:
# K-fold cross-validation: for every fold, train a freshly initialised model on the
# training indices, save its weights, and record the accuracy on the held-out indices.
for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
    
    print(f'FOLD {fold}')
    print('--------------------------------')
    
    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    
    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
                      dataset, 
                      batch_size=128, sampler=train_subsampler)
    testloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=128, sampler=test_subsampler)
    
    # A new model per fold so that no weights leak from one fold into the next.
    model = FullNet(CNN_net(inp_len=1024, out_len=256), 
                CNN_net(inp_len=256, out_len =256), 
                CNN_net(inp_len=512, out_len = 2))
    
    model.apply(reset_weights)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    
    for epoch in range(0, num_epochs):
        
        print(f'Starting epoch {epoch+1}')
        train_loss = 0.0
        model.train()
        batch_id = 0
        for fp, ge, label in trainloader:
            batch_id +=1
        
            optimizer.zero_grad()
            output = model(fp.float().to(device),ge.float().to(device))
            # Use the loss defined for the cross-validation run.
            loss = loss_function(output, label.float().to(device))
            loss.backward()
            optimizer.step()
            train_loss += loss.item() 
    
            # '\r' keeps the per-batch progress on a single console line.
            print(f'Epoch:{epoch} batch {batch_id}/{len(trainloader)} loss:{loss.item()}', end='\r')
        
    print('Training process has finished. Saving trained model.')
    print('Starting testing')
    
    # Saving the model. The object trained above is `model`; the previous reference
    # to an undefined `network` raised a NameError at this point.
    save_path = f'./model-fold-{fold}.pth'
    torch.save(model.state_dict(), save_path)

    # Evaluation for this fold
    acc, test_loss = eval(model, testloader)
    results[fold] = acc * 100

# Print fold results
print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
print('--------------------------------')
# Accumulate in a dedicated name: assigning to `sum` would shadow the builtin for
# every cell executed afterwards.
total_acc = 0.0
for key, value in results.items():
    print(f'Fold {key}: {value} %')
    total_acc += value
print(f'Average: {total_acc/len(results)} %')