In [1]:
!pip install fasttext
!pip install fastparquet
!gdown --id '1cAthveg1d3MjrKJtMKGzfX3eH8HJ-dQp'
!unzip MedNLI_dataset.zip

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.1-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.1-py3-none-any.whl (238 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4246764 sha256=8954a4e952cd0749aa782bc5e37839abaec4af57edeb5b4604fea5278334f899
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader, random_split
from sklearn.metrics import accuracy_score

In [3]:
test_data = pd.read_parquet('MedNLI_dataset/test-00000-of-00001-47685aa42db61e77.parquet', engine='fastparquet')
train_data = pd.read_parquet('MedNLI_dataset/train-00000-of-00001-210cfe9263b99806.parquet', engine='fastparquet')
valid_data = pd.read_parquet('MedNLI_dataset/valid-00000-of-00001-cc552de6d1a6fa4b.parquet', engine='fastparquet')

In [4]:
def find_pre_and_hyp(query):
    start_pre = query.find("[PRE]") + len("[PRE]")
    end_pre = query.find("[HYP]")
    start_hyp = query.find("[HYP]") + len("[HYP]")
    end_hyp = query.find("OUTPUT:")
    premise = query[start_pre:end_pre].strip()
    hypothesis = query[start_hyp:end_hyp].strip()

    return premise,hypothesis

In [5]:
class CustomDataset(Dataset):
    def __init__(self, x_list, y_list):
        self.samples = []
        for x,y in zip(x_list,y_list):
            #x_tensor = torch.tensor(x,dtype = torch.float32)
            y_tensor = torch.tensor(y,dtype = torch.float32)
            self.samples.append((x[0],x[1],y_tensor))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
bert_model = AutoModel.from_pretrained("gsarti/biobert-nli").to(device)
src_pad_idx = 0
src_vocab_size = bert_model.config.vocab_size

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [7]:
def get_lists(data):
    x_list = []
    y_list = []
    i =0
    for query,answer in zip(data['query'],data['answer']):
        i = i + 1
        if answer == 'entailment':
           y = [1,0,0]
        elif answer == 'neutral':
           y = [0,1,0]
        elif answer == 'contradiction':
           y = [0,0,1]
        else:
           print('should not get here')

        premise,hypothesis = find_pre_and_hyp(query)
        x_list.append((premise,hypothesis))
        y_list.append(y)
    return x_list,y_list

In [8]:
train_x_list,train_y_list = get_lists(train_data)
test_x_list,test_y_list = get_lists(test_data)
val_x_list,val_y_list = get_lists(valid_data)

In [9]:
print(len(train_x_list))
print(len(train_y_list))
print(len(test_x_list))
print(len(test_y_list))
print(len(val_x_list))
print(len(val_y_list))

11232
11232
1422
1422
1395
1395


In [10]:
train_dataset = CustomDataset(train_x_list,train_y_list)
test_dataset = CustomDataset(test_x_list,test_y_list)
val_dataset = CustomDataset(val_x_list,val_y_list)

In [11]:
class embedding_layer(nn.Module):
  def __init__(self,bert_model,tokenizer):
    super(embedding_layer, self).__init__()
    self.bert_model = bert_model
    self.tokenizer = tokenizer

  def forward(self, x):
    with torch.no_grad():
         s = tokenizer(x,return_tensors="pt",padding=True).to(device)
         vec = bert_model(**s)['last_hidden_state'].to(device)
    return vec

In [12]:
class PositionalEncoding(nn.Module):
    def __init__(self, seq_len,embedding_size):
        super().__init__()
        self.dropout = nn.Dropout(0.1)
        self.embedding_size = embedding_size
        self.seq_len = seq_len

    def forward(self, x):
        pe = torch.zeros(x.size(0), x.size(1), self.embedding_size).to(device)
        div_term = torch.zeros(x.size(0), 1, self.embedding_size).to(device)
        ks = torch.arange(self.embedding_size).float().to(device)
        values = torch.exp(-torch.log(torch.tensor(1000.0)) * 2 * ks / self.embedding_size).to(device)
        values = values.view(1, 1, -1).to(device)
        div_term = div_term + values
        x = x.reshape([x.shape[0],x.shape[1],1]).to(device)
        pe[:, :, ::2] = torch.sin(x * div_term)[:, :, ::2].to(device)
        pe[:, :, 1::2] = torch.cos(x * div_term)[:, :, 1::2].to(device)
        return self.dropout(pe)

In [13]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_size, heads):
        super().__init__()
        self.embedding_size = embedding_size
        self.heads = heads
        self.head_dim = embedding_size // heads
        assert(self.heads * self.head_dim == self.embedding_size), "Invalid number of heads"
        self.fc_values = nn.Linear(self.head_dim, self.head_dim, bias=False).to(device)
        self.fc_keys = nn.Linear(self.head_dim, self.head_dim, bias=False).to(device)
        self.fc_queries = nn.Linear(self.head_dim, self.head_dim, bias=False).to(device)
        self.fc_out = nn.Linear(heads * self.head_dim, embedding_size).to(device)

    def forward(self, values, keys, queries, mask):
        N = queries.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)
        values = self.fc_values(values).to(device)
        keys = self.fc_keys(keys).to(device)
        queries = self.fc_queries(queries).to(device)
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys]).to(device)
        if mask is not None:
            energy = energy.masked_fill(mask.to(device) == 0, float("-1e20")).to(device)
        energy = torch.softmax(energy / (self.embedding_size ** 0.5), dim=3).to(device)
        attention = torch.einsum("nhql,nlhd->nqhd", [energy, values]).to(device)
        attention = attention.reshape(N, query_len, self.heads * self.head_dim).to(device)
        out = self.fc_out(attention)
        return out

In [14]:
class TransformerBlock(nn.Module):
    def __init__(self, embedding_size, heads, forward_expansion, p):
        super().__init__()
        self.attention = MultiHeadAttention(embedding_size, heads)
        self.norm1 = nn.LayerNorm(embedding_size)
        self.feed_forward = nn.Sequential(nn.Linear(embedding_size, forward_expansion * embedding_size),
                                          nn.ReLU(),
                                          nn.Linear(forward_expansion * embedding_size, embedding_size))
        self.norm2 = nn.LayerNorm(embedding_size)
        self.dropout = nn.Dropout(p)
    def forward(self, values, keys, queries, mask):
        attention_out = self.attention(values, keys, queries, mask)
        x = self.norm1(attention_out + queries)
        x = self.dropout(x)
        ff_out = self.feed_forward(x)
        out = self.norm2(ff_out + x)
        out = self.dropout(out)
        return out

In [15]:
class Encoder(nn.Module):
    def __init__(self, src_vocab_size, embedding_size, num_layers, heads,
                 forward_expansion, max_length, p, device):
        super().__init__()
        self.device = device
        self.word_embedding = embedding_layer(bert_model,tokenizer)
        self.positional_embedding = PositionalEncoding(max_length, embedding_size)
        self.layers = nn.ModuleList([TransformerBlock(embedding_size, heads, forward_expansion, p) for _ in range(num_layers)])
        self.dropout = nn.Dropout(p)

    def forward(self, x, mask):
        mask = None
        pe = self.word_embedding(x)
        N = pe.size(0)
        seq_len = pe.size(1)
        positions = torch.arange(0, seq_len).expand(N, seq_len).to(self.device)
        out = self.dropout((pe + self.positional_embedding(positions)))
        for layer in self.layers:
            out = layer(out, out, out ,mask)
        return out

In [16]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, src_pad_idx, embedding_size=768,
                 num_layers=1, forward_expansion=8, heads=8, max_length=100, p=0.1):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.encoder = Encoder(src_vocab_size, embedding_size, num_layers, heads,
                               forward_expansion, max_length, p, device)

    def get_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2).to(device)
        return src_mask

    def forward(self, src):
        src_mask = None
        enc_out = self.encoder(src, src_mask).to(device)
        return enc_out

In [27]:
class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.transformer1 = Transformer(src_vocab_size, src_pad_idx).to(device)
        self.transformer2 = Transformer(src_vocab_size, src_pad_idx).to(device)
        self.lstm1 = nn.LSTM(768, 20, 1,batch_first = True).to(device)
        self.lstm2 = nn.LSTM(768, 20, 1,batch_first = True).to(device)
        self.fc1 = nn.Linear(20, 3).to(device)

    def forward(self, x_pre,x_hyp):
        x_pre = self.transformer1(x_pre)
        x_hyp = self.transformer2(x_hyp)
        output, (hn, cn) = self.lstm1(x_pre)
        xhyp_h0 = hn
        xhyp_c0 = cn
        output, (hn, cn) = self.lstm2(x_hyp, (xhyp_h0, xhyp_c0))
        a = self.fc1(hn)
        x = a.reshape([a.shape[1],a.shape[2]])
        x = nn.functional.softmax(x,dim=1)
        return x

In [28]:
bsize = 32
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=bsize, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=bsize, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=bsize, shuffle=False)

In [35]:
model = NN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
#optimizer = optim.Adam(model.parameters(), lr=0.001)

In [36]:
def get_model_acc(model,data_loader):
    model.eval()
    predictions = []
    true_labels = []
    for x_pre, x_hyp, y in data_loader:
        outputs = model(x_pre,x_hyp)
        predicted_labels = torch.argmax(outputs, dim=1).cpu().numpy()
        predictions.extend(predicted_labels)
        l1 = torch.argmax(y, dim=1).cpu().numpy()
        true_labels.extend(l1)
    model.train()
    return accuracy_score(true_labels, predictions)

In [39]:
model.train()
num_epochs = 10
for epoch in range(num_epochs):
    for x_pre,x_hyp,y in train_loader:
        optimizer.zero_grad()
        y = y.to(device)
        outputs = model(x_pre,x_hyp)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
    #print(get_model_acc(model,train_loader))
    print(get_model_acc(model,test_loader))
    print(loss.item())
    print('========================================')
print('Training completed.')

0.6420534458509142
0.8776193261146545
0.6420534458509142
0.8146007657051086
0.6385372714486639
0.9916337728500366
0.6371308016877637
0.9229754209518433
0.6371308016877637
0.900733470916748
0.6385372714486639
0.8907442688941956
0.6392405063291139
0.8992757797241211
0.6497890295358649
0.8326966762542725
0.6336146272855133
0.9114311933517456
0.6462728551336147
0.8283160924911499
Training completed.


In [None]:
get_model_acc(model,test_loader)

0.7215189873417721

In [None]:
get_model_acc(model,val_loader)

0.7469534050179212

In [None]:
get_model_acc(model,train_loader)

0.8506054131054132

In [None]:
torch.save(model,'modelx.pth')