<a href="https://colab.research.google.com/github/HuyenNguyenHelen/CORD-19-KG/blob/master/Evaluation/BERT_based_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT-based Evaluation: correctness and relatedness
Use the saved BERT model to automatically label correctness for entities and triples, and relatedness for entities, in the ten KGs.

In [1]:
# Mount Google Drive at /content/drive (Colab only); the model and data paths
# used at the bottom of this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pytorch_pretrained_bert pytorch-nlp

Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[?25l[K     |██▋                             | 10 kB 31.2 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 8.1 MB/s eta 0:00:01[K     |████████                        | 30 kB 7.2 MB/s eta 0:00:01[K     |██████████▋                     | 40 kB 3.3 MB/s eta 0:00:01[K     |█████████████▎                  | 51 kB 3.4 MB/s eta 0:00:01[K     |███████████████▉                | 61 kB 4.0 MB/s eta 0:00:01[K     |██████████████████▌             | 71 kB 4.3 MB/s eta 0:00:01[K     |█████████████████████▏          | 81 kB 4.3 MB/s eta 0:00:01[K     |███████████████████████▉        | 92 kB 4.8 MB/s eta 0:00:01[K     |██████████████████████████▌     | 102 kB 3.9 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112 kB 3.9 MB/s eta 0:00:01[K     |███████████████████████████████▊| 122 kB 3.9 MB/s eta 0:00:01[K     |████████████████████████████████| 1

In [3]:
import pandas as pd
import torch
from IPython.display import clear_output
from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertModel
from pytorch_pretrained_bert import BertTokenizer
from torch import nn
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Define BERT model
class BertBinaryClassifier(nn.Module):
    """Pre-trained BERT encoder topped with a one-unit sigmoid classifier.

    The pooled BERT output goes through dropout and a 768 -> 1 linear layer,
    and the result is squashed by a sigmoid, so ``forward`` returns one value
    in (0, 1) per input sequence.
    """

    def __init__(self, dropout=0.1):
        """Build the layers.

        Args:
            dropout: Dropout probability applied to the pooled BERT output.
        """
        super().__init__()

        # The attribute names below become the keys of the state dict, so they
        # must stay stable for previously saved weights to load.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the sigmoid output for a batch of token ids.

        Args:
            tokens: Token ids passed to BERT as its first argument.
            masks: Optional attention mask forwarded to BERT.

        Returns:
            The sigmoid of the linear layer applied to the pooled BERT output.
        """
        # Only the pooled output is used; the per-token encoding is discarded.
        _encoded, pooled = self.bert(
            tokens,
            attention_mask=masks,
            output_all_encoded_layers=False,
        )
        return self.sigmoid(self.linear(self.dropout(pooled)))

# Create main function
def main(saved_model_path, data_path, correct_compreh='correct_ent',
         batch_size=16, threshold=0.5):
    """Label every row of a KG CSV file with the saved BERT binary classifier.

    Args:
        saved_model_path: Path to the state dict saved for
            ``BertBinaryClassifier``. It is read with ``torch.load``, which
            unpickles the file, so only load checkpoints you trust.
        data_path: Path to the CSV file to label. In ``'correct_ent'`` mode it
            must have ``subject`` and ``object`` columns.
        correct_compreh: Evaluation mode, one of ``'correct_ent'``,
            ``'correct_trip'`` or ``'compreh'``. Only ``'correct_ent'`` is
            implemented in this function.
        batch_size: Number of sequences per inference batch.
        threshold: A row is labelled ``True`` when the predicted probability
            is strictly greater than this value.

    Returns:
        A list with one ``bool`` per CSV row, in file order. The list is also
        printed.

    Raises:
        NotImplementedError: If ``correct_compreh`` is ``'correct_trip'`` or
            ``'compreh'`` (no input text is defined for these modes yet).
        ValueError: If ``correct_compreh`` is not a known mode.
    """
    # Validate the mode up front instead of failing later on an undefined X_test.
    if correct_compreh in ('correct_trip', 'compreh'):
        raise NotImplementedError(
            "mode %r is not implemented yet; only 'correct_ent' is supported" % (correct_compreh,))
    if correct_compreh != 'correct_ent':
        raise ValueError(
            "correct_compreh must be 'correct_ent', 'correct_trip' or 'compreh', got %r"
            % (correct_compreh,))

    # Loading data
    data_test = pd.read_csv(data_path)

    # Missing cells become empty strings so that tokenization does not choke on NaN.
    # NOTE(review): subject and object are joined with no separator, as before;
    # confirm this matches how the training text was built.
    X_test = (data_test['subject'].fillna('').astype(str)
              + data_test['object'].fillna('').astype(str))

    print('\nNumber of Testing Sequences:', len(X_test))
    if X_test.empty:
        # Nothing to label; avoid max() on an empty sequence below.
        print([])
        return []

    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    tokenized = [tokenizer.tokenize(text) for text in X_test.to_list()]

    # Sequence length is measured in tokens (not characters) and includes the
    # two special tokens [CLS] and [SEP].
    bert_limit = 512    # maximum sequence length of BERT_BASE
    fallback_len = 225  # used when the data exceeds the BERT limit, to save GPU memory
    longest = max(map(len, tokenized)) + 2
    print('MAX LEN of testing sequence (tokens) is:', longest,
          '\nMAX LEN > 512 is ', longest > bert_limit)
    MAX_LEN = fallback_len if longest > bert_limit else longest

    # Reserve two positions so that [SEP] is never cut off by truncation.
    test_tokens = [['[CLS]'] + toks[:MAX_LEN - 2] + ['[SEP]'] for toks in tokenized]

    # Convert tokens to vocabulary ids and right-pad with 0 up to MAX_LEN.
    test_tokens_ids = pad_sequences(
        [tokenizer.convert_tokens_to_ids(toks) for toks in test_tokens],
        maxlen=MAX_LEN, truncating="post", padding="post", dtype="int")

    # Token ids must be int64 for the embedding lookup.
    test_tokens_tensor = torch.tensor(test_tokens_ids, dtype=torch.long)
    # Mask the paddings with 0 and real tokens with 1.
    test_masks_tensor = (test_tokens_tensor > 0).float()

    # Sequential sampling keeps predictions aligned with the CSV row order.
    test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=SequentialSampler(test_dataset),
                                 batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert_clf = BertBinaryClassifier()
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    state_dict = torch.load(saved_model_path, map_location=device)
    load_result = bert_clf.load_state_dict(state_dict, strict=False)
    # strict=False hides mismatches; surface missing weights, which would
    # otherwise stay randomly initialised and give meaningless predictions.
    missing_keys = list(getattr(load_result, 'missing_keys', None) or [])
    if missing_keys:
        print('WARNING: weights missing from the checkpoint:', missing_keys)
    bert_clf.to(device)  # model and inputs must live on the same device
    bert_clf.eval()

    bert_predicted = []  # one bool per input row
    with torch.no_grad():
        for token_ids, masks in test_dataloader:
            # The classifier already applies a sigmoid, so these are probabilities.
            probs = bert_clf(token_ids.to(device), masks.to(device))
            bert_predicted += (probs[:, 0] > threshold).cpu().tolist()

    print(bert_predicted)
    return bert_predicted

# Entry point: run the evaluation on one data subset with the saved weights.
if __name__ == '__main__':
    main(
        saved_model_path='/content/drive/MyDrive/KG/KG_EVAL_SAVE_MODEL/all.h5',
        data_path='/content/drive/MyDrive/KG/KG_10fold_data/subset_9.csv',
    )
