In [1]:
from typing import List
from smart_open import open


def read_tsv_as_lists(path : str) -> List[List[str]]:
    # assumes first line is header 
    d = [line.strip().split('\t') for line in open(path)][1:]

    targets = []
    premise = []
    hypothesis = []
    premise_pos = []
    hypothesis_pos = []

    for ls in d:

        targets.append(ls[0])
        premise.append(ls[1])
        hypothesis.append(ls[2])
        premise_pos.append(ls[3])
        hypothesis_pos.append(ls[4])
    # returns five lists with labels and input
    return targets, premise, hypothesis, premise_pos, hypothesis_pos

targets, premise, hypothesis, premise_pos, hypothesis_pos = parse_data("/content/drive/MyDrive/mnli_train.tsv")

In [2]:
from torch import nn 


class ANN(nn.Module):
    def __init__(
        self,
        sequential,
    ):
        super().__init__()
        self.sequential = sequential
      
    def forward(self, x):
      x = self.sequential(x)
      return x

In [3]:
import torch

class Embedding:
  def __init__(
      self,
      embedding_weights,
      train_embedding = False
  ) -> None:
      self.embedding = nn.Embedding.from_pretrained(embedding_weights)
      self.embedding.requires_grad = train_embedding

  def mean_concat(self, premise, hypothesis, mean_dim = 0, concat_dim=1):
    premise_emb = self.embedding(premise).mean(dim=mean_dim)
    hypothesis_emb = self.embedding(hypothesis).mean(dim=mean_dim)
    inputs = torch.concat((premise_emb, hypothesis_emb), dim=concat_dim)
    return inputs


In [4]:
# https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
# import requests, zipfile, io

# r = requests.get("http://vectors.nlpl.eu/repository/20/40.zip")
# z = zipfile.ZipFile(io.BytesIO(r.content))
# z.extractall("/content/drive/MyDrive/embedding_word2vec")

In [5]:
import gensim

emb_model = gensim.models.KeyedVectors.load_word2vec_format(
    "/content/drive/MyDrive/embedding_word2vec/model.bin",
    binary=True, 
    unicode_errors="replace"
)

In [6]:
# from os import path
# 
# model_path = "/content/drive/MyDrive/embedding_word2vec/"
# metadata_file = path.join(model_path, "meta.json")
# 
# with open(metadata_file, "r") as meta:
#         print(meta.read())

In [7]:
def get_emb_idx(w, emb_model = emb_model):
    return emb_model.vocab[w].index

In [8]:
import string
from typing import List
PUNCTS = string.punctuation.replace("'", "")


def data_prep(txt : str) -> List[str]:
  txt = txt.lower().translate(str.maketrans(PUNCTS, ' '*len(PUNCTS)))
  # remove ' from string with punctuations marks, other punctuation marks will
  # be removed from string. Remaining punctuations are !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
  
  # text is casefolded, chars in PUNCTS are removed and double white spaces are reduced
  # to single white space
  #txt = [t.lower().translate(str.maketrans('', '', PUNCTS)).strip() for t in txt.split()]
  txt = [t.strip() for t in txt.split()]
  return txt

In [9]:
# import torch
# 
# torch_emb = nn.Embedding.from_pretrained(torch.FloatTensor(emb_model.vectors))
# test_str = data_prep(premise[0])
# emb_test = torch_emb(torch.LongTensor([get_emb_idx(w) for w in test_str if w in emb_model.vocab]))
# emb_test.mean(dim=0)

In [10]:
import torch

target_stoi = {'contradiction' : 1, 'entailment' : 2, 'neutral' : 0}


class NLIDataset(torch.utils.data.Dataset):
    def __init__(
        self, 
        premise, 
        hypothesis, 
        targets,
        target_stoi
    ) -> None:
        self.premise = premise
        self.hypothesis = hypothesis
        self.target_stoi = target_stoi
        self.targets = [self.target_stoi[t] for t in targets]

    def __getitem__(self, idx : int):
        premise = torch.LongTensor(
            [get_emb_idx(w) for w in self.premise[idx] if w in emb_model.vocab]
        )
        
        hypothesis = torch.LongTensor(
            [get_emb_idx(w) for w in self.hypothesis[idx] if w in emb_model.vocab]
        )

        targets = self.targets[idx]
        return premise, hypothesis, targets

    def __len__(self):
        return len(self.targets)


In [11]:
class NLIDatasetConcat(torch.utils.data.Dataset):
    def __init__(
        self, 
        premise, 
        hypothesis, 
        targets,
        target_stoi,
        embedding
    ) -> None:
        self.premise = premise
        self.hypothesis = hypothesis
        self.target_stoi = target_stoi
        self.targets = [self.target_stoi[t] for t in targets]
        self.embedding = embedding

    def __getitem__(self, idx : int):
        premise = torch.LongTensor(
            [get_emb_idx(w) for w in data_prep(self.premise[idx]) if w in emb_model.vocab]
        )
        
        hypothesis = torch.LongTensor(
            [get_emb_idx(w) for w in data_prep(self.hypothesis[idx]) if w in emb_model.vocab]
        )
        inputs = self.embedding.mean_concat(premise, hypothesis, concat_dim=0)
        targets = self.targets[idx]
        return inputs, targets

    def __len__(self):
        return len(self.targets)


In [12]:
test_dataset = NLIDataset(
    premise=premise[:100_000],
    hypothesis=hypothesis[:100_000],
    targets=targets[:100_000],
    target_stoi=target_stoi
)

In [13]:
emb = Embedding(torch.FloatTensor(emb_model.vectors))

test_dataset_concat = NLIDatasetConcat(
    premise=premise,
    hypothesis=hypothesis,
    targets=targets,
    target_stoi=target_stoi,
    embedding=emb
)

In [14]:
test_dataset_concat[0][0].shape

torch.Size([200])

In [None]:
from torch.nn.utils.rnn import pad_sequence


def collate_mean_emb(data):
  premise, hypothesis, label = zip(*data)
  label = torch.LongTensor(label)
  premise = pad_sequence(premise)
  hypothesis = pad_sequence(hypothesis)
  return premise, hypothesis, label

In [None]:
train_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    collate_fn=collate_mean_emb
)

In [15]:
train_dataloader_concat = torch.utils.data.DataLoader(
    test_dataset_concat,
    batch_size=128,
    shuffle=False,
    num_workers=2
)

In [None]:
for _premise, _hypothesis, _label in train_dataloader:
  print(_premise.shape)
  print(_hypothesis.shape)
  print(_label.shape)
  break

torch.Size([1086, 128])
torch.Size([118, 128])
torch.Size([128])


In [None]:
for inputs, _label in train_dataloader_concat:
  print(inputs.shape)
  print(_label.shape)
  break

torch.Size([128, 200])
torch.Size([128])


In [None]:
model_seq = nn.Sequential(
  nn.Linear(200, 512),
  nn.Tanh(),
  nn.Linear(512,3),
)

model = ANN(model_seq)

In [None]:
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.1,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0.05
)
# use ExponetialLR as learning rate scheduler
scheduler = torch.optim.lr_scheduler.ExponentialLR(
    optimizer, gamma=0.9)

In [None]:
from torch.nn import functional as F

n_epochs = 5
loss_fn = F.cross_entropy
accum_loss = []

for epoch in range(n_epochs):
    # put model in train mode, if drop out is included in forward this will be activated
    model.train()
    # get data and targets from the dataloader, these are put to the correct device
    for _premise, _hypothesis, _labels in train_dataloader:
        epoch_train_acc = []
        # classes = classes.to(device)
        # zero out gradients
        optimizer.zero_grad()
        # make prediction 
        inputs = emb.mean_concat(
            _premise, _hypothesis, mean_dim=0, concat_dim=1
        )
        y_pred = model(inputs)
        # calcualte loss
        loss = loss_fn(
            y_pred, 
            _labels
        ).mean()
        accum_loss.append(loss.item())
        # calculate grads 
        loss.backward()
        # update weights w.r.t. grads 
        optimizer.step()
        epoch_train_acc.append((y_pred.argmax(dim=1) == _labels).float().mean().item())

    print(epoch,": " ,accum_loss[-1], sep="")
    print(round(sum(epoch_train_acc)/len(epoch_train_acc), 3))

    # adjust learning rate
    scheduler.step()

In [26]:
model_seq = nn.Sequential(
  nn.Linear(200, 512),
  nn.ReLU(),
  nn.Linear(512,3),
)

model = ANN(model_seq)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.01,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0.05
)
# use ExponetialLR as learning rate scheduler
scheduler = torch.optim.lr_scheduler.ExponentialLR(
    optimizer, gamma=0.9)

In [27]:
from torch.nn import functional as F

n_epochs = 5
loss_fn = F.cross_entropy
accum_loss = []
expl = False 

for epoch in range(n_epochs):
    # put model in train mode, if drop out is included in forward this will be activated
    model.train()
    # get data and targets from the dataloader, these are put to the correct device
    i = 0
    for inputs, _labels in train_dataloader_concat:
        
        epoch_train_acc = []
        # classes = classes.to(device)
        # zero out gradients
        optimizer.zero_grad()
        y_pred = model(torch.nan_to_num(inputs))
        
        #if not y_pred.isfinite().all().item():
        #  expl = True
        #  break
        
        i = i + 1
        # calcualte loss
        loss = loss_fn(
            y_pred, 
            _labels
        ).mean()
        accum_loss.append(loss.item())
        # calculate grads 
        loss.backward()
        # update weights w.r.t. grads 
        loss.register_hook(lambda grad: print(grad))
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25, error_if_nonfinite=True)
        optimizer.step()
        epoch_train_acc.append((y_pred.argmax(dim=1) == _labels).float().mean().item())
  

    # if expl:
    #   print()
    #   break
    print(epoch,": " ,accum_loss[-1], sep="")
    print(round(sum(epoch_train_acc)/len(epoch_train_acc), 3))
    

    # adjust learning rate
    scheduler.step()

0: 0.8983548283576965
0.565
1: 0.9121233224868774
0.652
2: 0.9223904609680176
0.609
3: 0.9215961694717407
0.652
4: 0.9078409075737
0.609


In [None]:
for p in model.parameters():
  print(p.grad)

In [None]:
ids = torch.LongTensor([get_emb_idx(w) for w in data_prep(hypothesis[30462]) if w in emb_model.vocab])
emb.embedding(ids).mean(dim=0)

In [None]:
ids = torch.LongTensor([get_emb_idx(w) for w in data_prep(premise[30462]) if w in emb_model.vocab])

In [None]:
emb.embedding(ids).mean(dim=0)

In [16]:
hypothesis[7779]

'The beverage is full of citrus extracts and pieces of citrus.'

In [17]:
premise[7779]

'Becauseitsafullyloadedcitrusbeverageright?'

In [18]:
data_prep(premise[7779])

['becauseitsafullyloadedcitrusbeverageright']

In [None]:
torch.nan_to_num(test_dataset_concat[7779][0])

In [None]:
for i in range(len(test_dataset_concat)):
  if not test_dataset_concat[i][0].isfinite().all().item():
    print(i)
    break

7779
