# PRE-PROCESSING

In [27]:
import os
import sys
import cv2
import json
import torch
import random
import warnings
import subprocess
import numpy as np
import pandas as pd
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler, BatchSampler, SequentialSampler

from evaluation_metrics import prec_rec_f1_acc_mcc, get_list_of_scores

project_file_path = "{}CS271-DTI".format(os.getcwd().split("CS271-DTI")[0])
training_files_path = "{}/training_files".format(project_file_path)

# print(project_file_path, training_files_path)

In [28]:
# PRE-PROCESSING

from torch.utils.data import Dataset

class DataSet(Dataset):
    def __init__(self, target_id, train_val_test):
        print("Fetching {} dataset for Target: {}".format(train_val_test, target_id))
        self.target_id = target_id
        self.train_val_test = train_val_test
        self.training_dataset_path = "{}/{}".format(training_files_path, target_id)

        #         print(self.training_dataset_path)

        self.train_val_test_folds = json.load(open(os.path.join(self.training_dataset_path, "train_val_test_dict.json")))
        self.compid_list = [compid_label[0] for compid_label in self.train_val_test_folds[train_val_test]]
        self.label_list = [compid_label[1] for compid_label in self.train_val_test_folds[train_val_test]]
        
    def __len__(self):
        return len(self.compid_list)

    def __getitem__(self, index):
        comp_id = self.compid_list[index]
        img_path = os.path.join(self.training_dataset_path, "imgs", "{}.png".format(comp_id))
        img_arr = cv2.imread(img_path)
        if random.random()>=0.50:
            angle = random.randint(0,359)
            rows, cols, channel = img_arr.shape
            rotation_matrix = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
            img_arr = cv2.warpAffine(img_arr, rotation_matrix, (cols, rows), cv2.INTER_LINEAR,
                                                 borderValue=(255, 255, 255))
        img_arr = np.array(img_arr) / 255.0
        img_arr = img_arr.transpose((2, 0, 1))
        label = self.label_list[index]
        return img_arr, label, comp_id


def get_dataLoader(target_id, batch_size=32):
    training_dataset = DataSet(target_id, "training")
    validation_dataset = DataSet(target_id, "validation")
    test_dataset = DataSet(target_id, "test")
    train_sampler = SubsetRandomSampler(range(len(training_dataset)))
    train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=batch_size,
                                              sampler=train_sampler)

    validation_sampler = SubsetRandomSampler(range(len(validation_dataset)))
    validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size,
                                               sampler=validation_sampler)

    test_sampler = SubsetRandomSampler(range(len(test_dataset)))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,
                                               sampler=test_sampler)

    return train_loader, validation_loader, test_loader


# MODEL

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from operator import itemgetter

class CNNModel1(nn.Module):
    def __init__(self, fully_layer_1, fully_layer_2, drop_rate):
        super(CNNModel1, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 32, 2)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 2)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 64, 2)
        self.bn4 = nn.BatchNorm2d(64)
        self.conv5 = nn.Conv2d(64, 32, 2)
        self.bn5 = nn.BatchNorm2d(32)

        self.pool = nn.MaxPool2d(2, 2)
        self.drop_rate = drop_rate
        self.fc1 = nn.Linear(32*5*5, fully_layer_1)
        self.fc2 = nn.Linear(fully_layer_1, fully_layer_2)
        self.fc3 = nn.Linear(fully_layer_2, 2)

    def forward(self, x):
#         print(x.shape)
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
#          print(x.shape)
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
#         print(x.shape)
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
#         print(x.shape)
        x = self.pool(F.relu(self.bn4(self.conv4(x))))
#         print(x.shape)
        x = self.pool(F.relu(self.bn5(self.conv5(x))))
#         print(x.shape)

        x = x.view(-1, 32*5*5)
        x = F.dropout(F.relu(self.fc1(x)), self.drop_rate)
        x = F.dropout(F.relu(self.fc2(x)), self.drop_rate)
        x = self.fc3(x)

        return x

In [30]:
def get_device():
    device = "cpu"
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        print("GPU is available on this device!")
        device = "cuda"
    else:
        print("CPU is available on this device!")
    return device


def get_loss(model, criterion, data_loader, device):
    total_count = 0
    total_loss = 0.0
    all_comp_ids = []
    all_labels = []
    all_predictions = []
    for i, data in enumerate(data_loader):
        img_arrs, labels, comp_ids = data
        img_arrs, labels = torch.tensor(img_arrs).type(torch.FloatTensor).to(device), torch.tensor(labels).to(device)
        total_count += len(comp_ids)
        y_pred = model(img_arrs).to(device)
        loss = criterion(y_pred.squeeze(), labels)
        total_loss += float(loss.item())
        all_comp_ids.extend(list(comp_ids))
        _, preds = torch.max(y_pred, 1)
        all_labels.extend(list(labels))
        all_predictions.extend(list(preds))


    return total_loss, total_count, all_comp_ids, all_labels, all_predictions


In [31]:
def train_validation_test_training(target_id, fully_layer_1, fully_layer_2, learning_rate, batch_size, drop_rate, n_epoch, experiment_name):
    arguments = [str(argm) for argm in
                 [target_id, fully_layer_1, fully_layer_2, learning_rate, batch_size, drop_rate, n_epoch, experiment_name]]

    str_arguments = "-".join(arguments)
    print("Arguments:", str_arguments)

    device = get_device()
    
#     exp_path = os.path.join(result_files_path, "experiments", experiment_name)
#     if not os.path.exists(exp_path): os.makedirs(exp_path)

    train_loader, valid_loader, test_loader = get_dataLoader(target_id, batch_size)
        
    model = CNNModel1(fully_layer_1, fully_layer_2, drop_rate).to(device)
        
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    optimizer.zero_grad()

#     best_val_mcc_score, best_test_mcc_score = 0.0, 0.0
#     best_val_test_performance_dict = dict()
#     best_val_test_performance_dict["MCC"] = 0.0

    for epoch in range(n_epoch):
        total_training_count = 0
        total_training_loss = 0.0
        print("\n----------\nEpoch : {}".format(epoch))
        
        model.train()
        batch_number = 0
        
        all_training_labels = []
        all_training_preds = []
        
        print("Training mode")
        
        for i, data in enumerate(train_loader):
            batch_number += 1

            optimizer.zero_grad()
            
            img_arrs, labels, comp_ids = data
            img_arrs, labels = torch.tensor(img_arrs).type(torch.FloatTensor).to(device), torch.tensor(labels).to(device)

            total_training_count += len(comp_ids)
            
            y_pred = model(img_arrs).to(device)
            
            _, preds = torch.max(y_pred, 1)
            all_training_labels.extend(list(labels))
            all_training_preds.extend(list(preds))

            loss = criterion(y_pred.squeeze(), labels)
            total_training_loss += float(loss.item())
            loss.backward()
            optimizer.step()
            
        print("Epoch : {} | Loss: {}".format(epoch,total_training_loss))
        
        training_perf_dict = dict()
        
        try: training_perf_dict = prec_rec_f1_acc_mcc(all_training_labels, all_training_preds)
        except: print("There was a problem during training performance calculation!")
            
        model.eval()
        
        with torch.no_grad():  
            
            print("Validation mode")

            total_val_loss, total_val_count, all_val_comp_ids, all_val_labels, val_predictions = get_loss (model, criterion, valid_loader, device)
            
            val_perf_dict = dict()
            val_perf_dict["MCC"] = 0.0
            
            try: val_perf_dict = prec_rec_f1_acc_mcc(all_val_labels, val_predictions)
            except: print("There was a problem during validation performance calculation!")
            
            total_test_loss, total_test_count, all_test_comp_ids, all_test_labels, test_predictions = get_loss (
                model, criterion, test_loader, device)
            
            test_perf_dict = dict()
            test_perf_dict["MCC"] = 0.0
            
            try: test_perf_dict = prec_rec_f1_acc_mcc(all_test_labels, test_predictions)
            except: print("There was a problem during test performance calculation!")


        if epoch == n_epoch - 1:
            score_list = get_list_of_scores()
            print("Training scores: {}\n\nValidation scores: {}\n".format(training_perf_dict, val_perf_dict))

In [1]:
train_validation_test_training("CHEMBL286",  512, 256, 0.001, 32,
                               0.25, 10, "my_experiment")

    



OUTPUT IN VM INSTANCE:
    
    
Arguments: CHEMBL286-512-256-0.001-32-0.25-10-my_experiment
CPU is available on this device!
Fetching training dataset for Target: CHEMBL286
Fetching validation dataset for Target: CHEMBL286
Fetching test dataset for Target: CHEMBL286

----------
Epoch : 0
Training mode
Epoch : 0 | Loss: 24.623343855142593
Validation mode


----------
Epoch : 1
Training mode
Epoch : 1 | Loss: 21.871842563152313
Validation mode

----------
Epoch : 2
Training mode
Epoch : 2 | Loss: 18.412213176488876
Validation mode

----------
Epoch : 3
Training mode
Epoch : 3 | Loss: 17.25254960358143
Validation mode

----------
Epoch : 4
Training mode
Epoch : 4 | Loss: 16.09722039103508
Validation mode

----------
Epoch : 5
Training mode
Epoch : 5 | Loss: 14.411342665553093
Validation mode

----------
Epoch : 6
Training mode
Epoch : 6 | Loss: 13.260539785027504
Validation mode

----------
Epoch : 7
Training mode
Epoch : 7 | Loss: 13.358637064695358
Validation mode

----------
Epoch : 8
Training mode
Epoch : 8 | Loss: 12.603277996182442
Validation mode

----------
Epoch : 9
Training mode
Epoch : 9 | Loss: 12.466190099716187
Validation mode




Training scores: {'Precision': 0.8565840938722294, 'Recall': 0.9253521126760563, 'F1-Score': 0.8896411645226812, 'Accuracy': 0.8622147083685545, 'MCC': 0.7107543042098888, 'TP': 657, 'FP': 110, 'TN': 363, 'FN': 53}




Validation scores: {'Precision': 0.9161290322580645, 'Recall': 0.797752808988764, 'F1-Score': 0.8528528528528528, 'Accuracy': 0.835016835016835, 'MCC': 0.6754343641638585, 'TP': 142, 'FP': 13, 'TN': 106, 'FN': 36}
    
    

In [8]:
# TF LEARN MODEL -> CONVERT TO KERAS

# from tflearn.activations import relu
# from tflearn.layers.conv import avg_pool_2d, conv_2d, max_pool_2d

# def CNNModel(outnode, model_name,  target, opt, learn_r, epch, n_of_h1, dropout_keep_rate, save_model=False):
#     convnet = input_data(shape=[None, IMG_SIZE, IMG_SIZE, 1], name='input')

#     convnet = conv_2d(convnet, 32, 5, activation='relu')
#     convnet = max_pool_2d(convnet, 5)

#     convnet = conv_2d(convnet, 64, 5, activation='relu')
#     convnet = max_pool_2d(convnet, 5)

#     convnet = conv_2d(convnet, 128, 5, activation='relu')
#     convnet = max_pool_2d(convnet, 5)

#     convnet = conv_2d(convnet, 64, 5, activation='relu')
#     convnet = max_pool_2d(convnet, 5)

#     convnet = conv_2d(convnet, 32, 5, activation='relu')
#     convnet = max_pool_2d(convnet, 5)

#     convnet = fully_connected(convnet, 1024, activation='relu')
#     convnet = dropout(convnet, 0.8)

#     convnet = fully_connected(convnet, outnode, activation='softmax')
#     convnet = regression(convnet, optimizer=opt, learning_rate=learn_r, loss='categorical_crossentropy', name='targets')

#     str_model_name = "{}_{}_{}_{}_{}_{}_{}_{}".format(model_name,  target, opt, learn_r, epch, n_of_h1, dropout_keep_rate, save_model)

#     model = None

#     if save_model:
#         print("Model will be saved!")
#         model = tflearn.DNN(convnet, checkpoint_path='../tflearnModels/{}'.format(str_model_name), best_checkpoint_path='../tflearnModels/bestModels/best_{}'.format(str_model_name),
#                         max_checkpoints=1, tensorboard_verbose=0, tensorboard_dir="../tflearnLogs/{}/".format(str_model_name))
#     else:
#         model = tflearn.DNN(convnet)

#     return model