In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp /content/drive/MyDrive/extendable_test.hdf5 extendable_test.hdf5

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [22]:
import tables as tb, numpy as np
hdf5_epath = 'extendable_test.hdf5'
h5f = tb.open_file(hdf5_epath, mode='r')

X = np.array([_ for _ in h5f.root.MyData.X[:]])
Y = np.array([_ for _ in h5f.root.MyData.Y[:]])
ID = np.array([_ for _ in h5f.root.MyData.id[:]])
CUI = np.array([_ for _ in h5f.root.MyData.cui[:]])
h5f.close()

In [23]:
X = np.resize(X, (-1,786))[1:]
y = np.resize(Y, (-1,1))[1:]
ID = np.resize(ID, (-1,1))[1:]
CUI = np.resize(CUI, (-1,1))[1:]
X.shape, y.shape, ID.shape

((147760, 786), (147760, 1), (147760, 1))

In [24]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [7]:
BATCH_SIZE = 32
MODEL_PATH = "NSP_classification.bin"

In [8]:
# train data
class data_loader(Dataset):
    def __init__(self, X, y):
        self.X_data = X
        self.y_data = y
    def __getitem__(self, index):
        return torch.FloatTensor(self.X_data[index]), torch.FloatTensor(self.y_data[index])
    def __len__ (self):
        return len(self.X_data)

dataloader = DataLoader(data_loader(X, y), batch_size=BATCH_SIZE)

In [9]:
!cp /content/drive/MyDrive/NSP_classification_model.bin {MODEL_PATH}

class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        self.layer_1 = nn.Linear(786, 256)
        self.layer_2 = nn.Linear(256, 128)
        self.layer_out = nn.Linear(128, 1)
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(256)
        self.batchnorm2 = nn.BatchNorm1d(128)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x


In [10]:
model = torch.load(MODEL_PATH, map_location='cpu')
model.to(device)

binaryClassification(
  (layer_1): Linear(in_features=786, out_features=256, bias=True)
  (layer_2): Linear(in_features=256, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [11]:
y_pred_list, y_score_list = [], []
model.eval()
with torch.no_grad():
    for x in X:
        x = torch.FloatTensor([x])
        x = x.to(device)
        y_test_pred = model(x)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy()[0][0])
        y_score_list.append(y_test_pred.cpu().numpy()[0][0])

In [12]:
print(accuracy_score(y_pred_list, [_[0] for _ in y]))
confusion_matrix(y_pred_list, [_[0] for _ in y])

0.9071873308067135


array([[122117,   8062],
       [  5652,  11929]])

In [25]:
len(ID), len(CUI), len(y), len(y_pred_list), len(y_score_list)

(147760, 147760, 147760, 147760, 147760)

In [27]:
result = pd.DataFrame({'id': ID.flatten(), 'CUI': CUI.flatten(), 'y': y.flatten(), 'score': y_score_list, 'prediction': y_pred_list })
result = result.set_index('id')
result.head()

Unnamed: 0_level_0,CUI,y,score,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,b'C0019699',1,9.049319e-05,0.0
0,b'C0019699',1,7.057533e-08,0.0
0,b'C0019699',1,6.002809e-10,0.0
0,b'C0019699',1,4.885939e-10,0.0
0,b'C0019699',1,0.02229213,0.0


In [44]:
new_y, new_y_pred = [], []
indices = list(set(result.index))
for index in indices:
  df = result.xs(index)
  if isinstance(df, pd.Series):
    # print(df[0], df[1], df[2], df[3])
    new_y.append(df[1])
    new_y_pred.append(df[3])
  else:
    df = df.groupby('CUI').max()
    temp = df[df['score']==max(df['score'])]
    new_y.append(temp['y'].values[0])
    new_y_pred.append(temp['prediction'].values[0])

In [46]:
accuracy_score(new_y, new_y_pred)

0.7236716815844602

In [17]:
count = 0
for y, py in zip(new_y, new_y_pred):
  if y == py:
    count += 1
print(count/len(new_y))

0.724052561416873
