In [1]:
# !pip install transformers

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer


import torchtext
from torchtext.legacy import data
from torchtext.legacy import datasets

from collections import defaultdict, Counter

from transformers import BertTokenizer, BertModel

import math
from typing import Tuple
import numpy as np

import time
import random
import functools

import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
plt.style.use('seaborn')

In [3]:
# To ensure that the code is reproducible, set random seeds
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [4]:
import pandas as pd
pwngc_path = "../data/test_transformer/pwngc4torchtext.csv"
pwngc_df = pd.read_csv(pwngc_path, delimiter="\t", header=None)
pwngc_df.columns = ["token", "stemm", "pos", "annotation", "synset", "tag" ]
pwngc_df.head(20)

Unnamed: 0,token,stemm,pos,annotation,synset,tag
0,having,have,v,no-annotation,no-synset,O
1,the,the,,no-annotation,no-synset,O
2,necessary,necessary,a,01580050-a,necessary.a.01,[135144.0 25.01 64176.03 90.0 0.5]
3,means,means,n,00172710-n,means.n.01,[111736.0 98.31 98012.7 0.0 18.5]
4,or,or,,no-annotation,no-synset,O
5,skill,skill,n,no-annotation,no-synset,O
6,or,or,,no-annotation,no-synset,O
7,know-how,know-how,n,05616786-n,know-how.n.01,[142676.0 107.17 71890.08 0.0 106.5]
8,or,or,,no-annotation,no-synset,O
9,authority,authority,n,05196582-n,authority.n.01,[37587.0 104.34 194973.98 0.0 7.5]


In [6]:
import numpy as np
len(np.where(pwngc_df["tag"] != 'O')[0])
# 532.821 annotated tokens

532821

In [5]:
import ast
tag2 = pwngc_df['tag'][2]
print(tag2)
print(type(tag2))
def removeBra(string_list):
    if string_list[0] == "[" and string_list[-1] == "]":
        return string_list[1:-1]
    else:
        return string_list

tag2list = torch.tensor(list(map(float, removeBra(tag2).split(' '))), dtype=torch.float32)
# tag2list = ast.literal_eval(tag2)
print(tag2list)
print(type(tag2list))
# type(eval("tensor({}, device='{}')".format(tag2, "cpu")))

[135144.0 25.01 64176.03 90.0 0.5]
<class 'str'>
tensor([1.3514e+05, 2.5010e+01, 6.4176e+04, 9.0000e+01, 5.0000e-01])
<class 'torch.Tensor'>


In [8]:
# split the dataset into training, validation and testing
train_path = "train.csv"
validate_path = "validate.csv"
test_path = "test.csv"


In [10]:
#   train_examples = read_data("../data/test_transformer/train.csv", self.fields) #'data/eng.train.iob', self.fields)

In [41]:
TEXT = data.Field(use_vocab=True,
                  lower=True)

LABEL = data.Field(is_target=True,
                   use_vocab=False,
                   unk_token=None,
                   preprocessing=data.Pipeline(lambda x: torch.tensor(list(map(float, removeBra(x).split(' '))), dtype=torch.double)),
                   dtype=data.Pipeline(lambda x: torch.tensor(x, dtype=torch.double)))

train, valid, test = datasets.SequenceTaggingDataset.splits(path='../data/test_transformer/',
                                   train = train_path,
                                   validation = validate_path,
                                   test = test_path,
                                   fields=[("text",TEXT),("lemmatized_text",TEXT), (None, None), (None,None), (None, None), ("label",LABEL)]) #,

type(train)

torchtext.legacy.datasets.sequence_tagging.SequenceTaggingDataset

In [12]:
print(train)

<torchtext.legacy.datasets.sequence_tagging.SequenceTaggingDataset object at 0x00000164570411C8>


In [42]:
for t, lt, l in zip(train.text, train.lemmatized_text, train.label):
    print(type(l), type(lt), type(l))
    print(t, lt, l)
    print(len(t))

<class 'list'> <class 'list'> <class 'list'>
['having', 'the', 'necessary', 'means', 'or', 'skill', 'or', 'know-how', 'or', 'authority', 'to', 'do', 'something'] ['have', 'the', 'necessary', 'means', 'or', 'skill', 'or', 'know-how', 'or', 'authority', 'to', 'do', 'something'] [tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([1.3514e+05, 2.5010e+01, 6.4176e+04, 9.0000e+01, 5.0000e-01],
       dtype=torch.float64), tensor([1.1174e+05, 9.8310e+01, 9.8013e+04, 0.0000e+00, 1.8500e+01],
       dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([1.4268e+05, 1.0717e+02, 7.1890e+04, 0.0000e+00, 1.0650e+02],
       dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([3.7587e+04, 1.0434e+02, 1.9497e+05, 0.0000e+00, 7.5000e+00],
       dtype=torch.float64), tensor([0., 0., 0., 0., 

In [43]:
for ex in train:
    print(len(ex.text))

13
9
9
10
4
4
5
6
10
9
9
14
16
5
10
4
4
4
2
2
14


In [44]:
# Load the pre-trained embeddings that come with the torchtext library.
use_pretrained = True
if use_pretrained:
    print('We are using pre-trained word embeddings.')
    TEXT.build_vocab(train, vectors="glove.840B.300d")
else:
    print('We are training word embeddings from scratch.')
    TEXT.build_vocab(train, max_size=5000)

We are using pre-trained word embeddings.


In [45]:
# for key, val in zip(TEXT.vocab.stoi, TEXT.vocab.vectors):
    # print(key, val)

print(list(train))
# print(TEXT.vocab.stoi)

[<torchtext.legacy.data.example.Example object at 0x000001F436560848>, <torchtext.legacy.data.example.Example object at 0x000001F4365507C8>, <torchtext.legacy.data.example.Example object at 0x000001F436550BC8>, <torchtext.legacy.data.example.Example object at 0x000001F436550F88>, <torchtext.legacy.data.example.Example object at 0x000001F436550408>, <torchtext.legacy.data.example.Example object at 0x000001F436550308>, <torchtext.legacy.data.example.Example object at 0x000001F436550908>, <torchtext.legacy.data.example.Example object at 0x000001F436798808>, <torchtext.legacy.data.example.Example object at 0x000001F436550188>, <torchtext.legacy.data.example.Example object at 0x000001F436550AC8>, <torchtext.legacy.data.example.Example object at 0x000001F436550688>, <torchtext.legacy.data.example.Example object at 0x000001F43653ACC8>, <torchtext.legacy.data.example.Example object at 0x000001F4365348C8>, <torchtext.legacy.data.example.Example object at 0x000001F436798248>, <torchtext.legacy.d

In [18]:
# b= data.example.Example.fromlist([train.text, train.label], fields=[("text",TEXT),("lemmatized_text",TEXT), (None, None), (None,None), (None, None), ("label",LABEL)])
# TEXT.build_vocab(b, vectors="glove.840B.300d")
# print(b.vocab)
# for tt in b:
#     print(tt.vocab)

In [46]:
BATCH_SIZE = 5

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    # sort=False, # to skip sorting validation and testing data
    sort_key=lambda x: len(x.text),
    repeat=False)

In [20]:
# print(train_iterator.data())

train_batch = train_iterator.data()
print(TEXT.vocab.freqs.most_common(10))
print("-"*40)
print(TEXT.vocab.itos[:10])
print(TEXT.vocab.stoi)
print(TEXT.vocab.vectors)

[('or', 36), ('necessary', 34), ('the', 12), ('for', 10), ('to', 8), ('not', 8), ('a', 8), ('having', 6), ('of', 6), ('(', 6)]
----------------------------------------
['<unk>', '<pad>', 'or', 'necessary', 'the', 'for', 'a', 'not', 'to', '(']
defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x0000016457039FC8>>, {'<unk>': 0, '<pad>': 1, 'or': 2, 'necessary': 3, 'the': 4, 'for': 5, 'a': 6, 'not': 7, 'to': 8, '(': 9, ')': 10, 'have': 11, 'having': 12, 'of': 13, 'all': 14, 'and': 15, 'by': 16, 'completeness': 17, 'effectiveness': 18, 'fact': 19, 'know-how': 20, 'means': 21, 'normal': 22, 'skill': 23, 'with': 24, 'lack': 25, 'lacking': 26, ':': 27, 'absolutely': 28, 'as': 29, 'authority': 30, 'boat': 31, 'but': 32, 'characteristic': 33, 'characteristics': 34, 'choice': 35, 'component': 36, 'country': 37, 'deductive_reasoning': 38, 'display': 39, 'displaying': 40, 'do': 41, 'e.g.': 42, 'effect': 43, 'etc': 44, 'etc.': 45, 'every': 46, 'force': 47

In [21]:
for bat in train_iterator:
    print(bat)
# for batch in train_iterator:
#     print(type(batch))#, type(batch.text), type(batch.label))
    # print(batch.text[0], type(batch.text[0]))
    # print(batch.text, batch.lemmatized_text, batch.label)



# for batch in train_iterator.data():
#     print(type(batch), type(batch.text), type(batch.label))
#     print(batch.text[0], type(batch.text[0]))
#     print(batch.text, batch.lemmatized_text, batch.label)

ValueError: Specified Field dtype <torchtext.legacy.data.pipeline.Pipeline object at 0x0000016457041248> can not be used with use_vocab=False because we do not know how to numericalize it. Please raise an issue at https://github.com/pytorch/text/issues

In [22]:
list(train_iterator.data())

[<torchtext.legacy.data.example.Example at 0x16456f6f8c8>,
 <torchtext.legacy.data.example.Example at 0x16457042e48>,
 <torchtext.legacy.data.example.Example at 0x1645701d488>,
 <torchtext.legacy.data.example.Example at 0x1645701d848>,
 <torchtext.legacy.data.example.Example at 0x16457035788>,
 <torchtext.legacy.data.example.Example at 0x1645701d288>,
 <torchtext.legacy.data.example.Example at 0x1645703b448>,
 <torchtext.legacy.data.example.Example at 0x1645701d788>,
 <torchtext.legacy.data.example.Example at 0x16457042c48>,
 <torchtext.legacy.data.example.Example at 0x16457042e08>,
 <torchtext.legacy.data.example.Example at 0x1645701d5c8>,
 <torchtext.legacy.data.example.Example at 0x1645701d9c8>,
 <torchtext.legacy.data.example.Example at 0x164570425c8>,
 <torchtext.legacy.data.example.Example at 0x1645701dac8>,
 <torchtext.legacy.data.example.Example at 0x1645701d808>,
 <torchtext.legacy.data.example.Example at 0x164570424c8>,
 <torchtext.legacy.data.example.Example at 0x16457042508

In [23]:
train_batches = list(train_iterator.data())
valid_batches = list(valid_iterator.data())

In [24]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [30]:
class TransformerEncoderModel(nn.Module):

    def __init__(self, text_field, label_field, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        # Multi-head attention mechanism is included in TransformerEncoderLayer
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        # self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()
        # -------------------------------------

        voc_size = len(text_field.vocab)
        print("voc_size: ", voc_size )

        # Embedding layer. If we're using pre-trained embeddings, copy them
        # into our embedding module.
        self.embedding = nn.Embedding(voc_size, 300)
        print("Embedding", self.embedding)
        if text_field.vocab.vectors is not None:
            self.embedding.weight = torch.nn.Parameter(TEXT.vocab.vectors)


    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        # self.decoder.bias.data.zero_()
        # self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src = torch.tensor(src, device="cpu")
        src = self.embedding(src)
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src) #, src_mask)
        # output = self.decoder(output)
        return output


# def generate_square_subsequent_mask(sz: int) -> Tensor:
#     """Generates an upper-triangular matrix of -inf, with zeros on diag."""
#     return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [26]:
# train_batches = train_iterator.data()
# for batch in train_batches:
#     print(batch)
#     # batch.text is of shape: (input_data_length, batch_size)
#     print(batch.text)
#     # print(batch.text.vocab)
#     # print(batch.text.vocab.vectors)

In [42]:
class Tagger:

    def __init__(self, lower):
        self.TEXT = data.Field(use_vocab=True,
                  lower=True)

        self.LABEL = data.Field(is_target=True,
                           use_vocab=False,
                           unk_token=None,
                           preprocessing=data.Pipeline(
                               lambda x: torch.tensor(list(map(float, removeBra(x).split(' '))),
                                                      dtype=torch.double)),
                           dtype=torch.DoubleTensor)
                                #data.Pipeline(lambda x: torch.tensor(x, dtype=torch.double)))

        self.fields = [("text",self.TEXT),("lemmatized_text",self.TEXT), (None, None), (None,None), (None, None), ("label",LABEL)]


        # self.TEXT = data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, lower=lower)
        # I changed sequential = True to false, because my data is not sequential
        # self.LABEL = data.Field(init_token='<bos>', eos_token='<eos>', sequential=True, unk_token=None)
        # self.LABEL = data.Field(is_target=True, sequential=False, unk_token=None, dtype=list)
        # data.Field(init_token='<bos>', eos_token='<eos>',
        #     sequential=False, use_vocab=False)
        # self.fields = [('text', self.TEXT), ('label', self.LABEL)]
        # self.device = 'cuda'
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


    def tag(self, sentences):
        # This method applies the trained model to a list of sentences.

        # First, create a torchtext Dataset containing the sentences to tag.
        examples = []
        for sen in sentences:
            labels = ['?']*len(sen) # placeholder
            examples.append(data.Example.fromlist([sen, labels], self.fields))
        dataset = data.Dataset(examples, self.fields)

        iterator = data.Iterator(
            dataset,
            device=self.device,
            batch_size= 5, #64,
            repeat=False,
            train=False,
            sort=False)

        # Apply the trained model to all batches.
        out = []
        self.model.eval()
        with torch.no_grad():
            for batch in iterator:
                # Call the model's predict method. This returns a list of NumPy matrix
                # containing the integer-encoded tags for each sentence.
                predicted = self.model.predict(batch.text)

                # # Convert the integer-encoded tags to tag strings.
                # for tokens, pred_sen in zip(sentences, predicted):
                #     out.append([self.LABEL.vocab.itos[pred_id] for _, pred_id in zip(tokens, pred_sen[1:])])
        return predicted #out

    def train(self):
        # Read training and validation data according to the predefined split.
        # train_examples = read_data("../data/test_transformer/train.csv", self.fields) #'data/eng.train.iob', self.fields)
        # valid_examples = read_data("../data/test_transformer/validate.csv", self.fields) #'data/eng.valid.iob', self.fields)

        train_examples, valid_examples, test_examples = datasets.SequenceTaggingDataset.splits(path='../data/test_transformer/',
                                           train = train_path,
                                           validation = validate_path,
                                           test = test_path,
                                           fields=[("text",TEXT),("lemmatized_text",TEXT), (None, None), (None,None), (None, None), ("label",LABEL)]) #,


        # Count the number of words and sentences.
        n_tokens_train = 0
        n_sentences_train = 0
        for ex in train_examples:
            n_tokens_train += len(ex.text) #+ 2
            n_sentences_train += 1
        n_tokens_valid = 0
        for ex in valid_examples:
            n_tokens_valid += len(ex.text)

        # Load the pre-trained embeddings that come with the torchtext library.
        use_pretrained = True
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            self.TEXT.build_vocab(train_examples, vectors="glove.840B.300d")
        else:
            print('We are training word embeddings from scratch.')
            self.TEXT.build_vocab(train_examples, max_size=5000)
        # self.LABEL.build_vocab(train_examples)

        # Create one of the models defined above.
        # self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
        # self.model = RNNCRFTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)

        self.model = TransformerEncoderModel(text_field=self.TEXT,
                                             label_field=self.LABEL,
                                             ntoken=300,
                                             d_model=300,
                                             d_hid=200,
                                             nlayers=2,
                                             nhead=2,
                                             dropout=0.2)
        self.model.to(self.device)

        # -----------------------------------------------------------
        #                       BucketIterator
        # -----------------------------------------------------------

        batch_size = 5 #1024
        n_batches = np.ceil(n_sentences_train / batch_size)

        mean_n_tokens = n_tokens_train / n_batches

        train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
                                                        (train_examples, valid_examples, test_examples),
                                                        device=self.device,
                                                        batch_size=batch_size,
                                                        sort_key=lambda x: len(x.text),
                                                        repeat=False,
                                                        sort=True)

        # train_iterator = data.BucketIterator(
        #     train_examples,
        #     device=self.device,
        #     batch_size=batch_size,
        #     sort_key=lambda x: len(x.text),
        #     repeat=False,
        #     train=True,
        #     sort=True)
        #
        # valid_iterator = data.BucketIterator(
        #     valid_examples,
        #     device=self.device,
        #     batch_size= 2, #64,
        #     sort_key=lambda x: len(x.text),
        #     repeat=False,
        #     train=False,
        #     sort=True)

        train_batches = train_iterator
        valid_batches = valid_iterator
        test_batches = valid_iterator


        # ---------------------------------------------------------------------
        #                       Optimizer
        # ---------------------------------------------------------------------
        criterion = nn.CrossEntropyLoss()
        lr = 5.0  # learning rate
        optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
        # -------


        # optimizer = torch.optim.Adam(self.model.parameters(), lr=0.01, weight_decay=1e-5)

        # n_labels = len(self.LABEL.vocab)

        # ----------------------------------------------------------
        #                       Epoch Training
        # ----------------------------------------------------------

        history = defaultdict(list)

        n_epochs = 3 #25

        # For each epoch
        for i in range(1, n_epochs + 1):

            t0 = time.time()

            loss_sum = 0

            self.model.train()

            # for transformer
            scheduler.step()

            # For each batch
            for batch in train_batches:

                # Compute the output and loss.
                # loss = self.model(batch.text, batch.label) / mean_n_tokens

                out = self.model(batch.text)
                ntokens = 300
                loss = criterion(out.view(-1, ntokens), batch.label)

                optimizer.zero_grad()
                loss.backward()
                # I added this
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
                # ---
                optimizer.step()
                loss_sum += loss.item()


            train_loss = loss_sum / n_batches
            history['train_loss'].append(train_loss)

            # Evaluate on the validation set.
            if i % 1 == 0:

                stats = defaultdict(Counter)

                # from transformers
                # lr = scheduler.get_last_lr()[0]
                # ms_per_batch = (time.time() - start_time) * 1000 / log_interval
                # cur_loss = total_loss / log_interval
                # ppl = math.exp(cur_loss)
                # print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                #       f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                #       f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
                # total_loss = 0

                self.model.eval()
                with torch.no_grad():
                    for batch in valid_batches:
                        # Predict the model's output on a batch.
                        predicted = self.model.predict(batch.text)
                        print("predicted := ", predicted)
                        # Update the evaluation statistics.
                        # evaluate_iob(predicted, batch.label, self.LABEL, stats)

                # # Compute the overall F-score for the validation set.
                # _, _, val_f1 = prf(stats['total'])
                #
                # history['val_f1'].append(val_f1)
                #
                # t1 = time.time()
                # print(f'Epoch {i}: train loss = {train_loss:.4f}, val f1: {val_f1:.4f}, time = {t1-t0:.4f}')

        # # After the final evaluation, we print more detailed evaluation statistics, including
        # # precision, recall, and F-scores for the different types of named entities.
        # print()
        # print('Final evaluation on the validation set:')
        # p, r, f1 = prf(stats['total'])
        # print(f'Overall: P = {p:.4f}, R = {r:.4f}, F1 = {f1:.4f}')
        # for label in stats:
        #     if label != 'total':
        #         p, r, f1 = prf(stats[label])
        #         print(f'{label:4s}: P = {p:.4f}, R = {r:.4f}, F1 = {f1:.4f}')
        #
        # plt.plot(history['train_loss'])
        # plt.plot(history['val_f1'])
        # plt.legend(['training loss', 'validation F-score'])

tagger = Tagger(lower=False)
tagger.train()

We are using pre-trained word embeddings.
voc_size:  2
Embedding Embedding(2, 300)




ValueError: Specified Field dtype <torchtext.legacy.data.pipeline.Pipeline object at 0x0000016457041248> can not be used with use_vocab=False because we do not know how to numericalize it. Please raise an issue at https://github.com/pytorch/text/issues

In [47]:
def decode_params(spatial_params):
    l0 = spatial_params[0]
    alpha = spatial_params[1]
    alpha_rad = alpha * np.pi / 180
    l_i = spatial_params[2]
    beta_i = spatial_params[3]
    beta_i_rad = beta_i * np.pi / 180
    r = spatial_params[4]
    return l0, alpha, alpha_rad, l_i, beta_i, beta_i_rad, r


def point_in_space(spatial_params):
    l0, alpha, alpha_rad, l_i, beta_i, beta_i_rad, r = decode_params(spatial_params)
    # np.cos() and np.sin() take angles in radian as params
    center_pt = np.array([l0*np.cos(alpha_rad), l0 * np.sin(alpha_rad)])
    sense_pt = center_pt + np.array([l_i * np.cos(alpha_rad + beta_i_rad),
                                     l_i * np.sin(alpha_rad + beta_i_rad)])
    return sense_pt, center_pt


def inside_sphere(point, sphere_coo):

    pt = point_in_space(point)
    sphere_sense, sphere_center = point_in_space(sphere_coo)

    sphere_rad = sphere_coo[-1] # in angles

    contained = (pt[0] - sphere_sense[0])**2 + (pt[1] - sphere_sense[1])**2 <= sphere_rad**2

    if contained:
        return True
    else:
        return False

def distance(pred_pt, original_pt):
    """
    Calculates the distance between two sense points.
    :param pred_pt:
    :param original_pt:
    :return:
    """
    pred_sense, pred_center = point_in_space(pred_pt)
    orig_sense, orig_center = point_in_space(original_pt)

    return np.linalg.norm(pred_sense - orig_sense)



def sphere_dist(pred_pt, original_pt):
    """
    Calculates the distance between two 2D spheres.
    :param pred_pt:
    :param original_pt:
    :return:
    """
    pred_sense, pred_center = point_in_space(pred_pt)
    pred_radius = pred_pt[-1]
    orig_sense, orig_center = point_in_space(original_pt)
    orig_radius = original_pt[-1]

    return (pred_radius + orig_radius -
            np.linalg.norm(pred_sense - orig_sense))

def decode_prediction(spatial_params, df="SPATIAL_WORDNET.pickle") -> [str]:
    """
    Projects the predicted spatial parameters into the embedding space.
    Returns the synsets in the vacinity of the projected point.
    :param spatial_params:
    :return:
    """
    synsets = [] # sort from specific to most general

    sense_pt, center_pt = point_in_space(spatial_params)

    spatial_df = pd.read_pickle(df)
    # get the spheres, where the point/point+radius is contained/overlaping/near

    # 1. check if the predicted point is contained in some sense
    spatial_df["contained"] = spatial_df.apply(lambda row:
                                               inside_sphere(spatial_params,
                                                             row[['l0', 'alpha', 'l_i', 'beta_i', 'radius']]))

    # 2. For those synsets, which is the nearest synset point
    #use distance() to calculate distance between centers

    # 3. If None of the synsets apply to that word sense
    # use sphere_dist to find the nearest sphere (most general synset), and assign it to that synset
    # (this maybe good for rare senses)


    return synsets

def train_loss(tmp_pred, synset_params):
    # Loss is the distance between the two spheres/containment of the word within that sphere
    # radius acts as tolerance!
    return

We are using pre-trained word embeddings.


ValueError: Specified Field dtype <torchtext.legacy.data.pipeline.Pipeline object at 0x0000016457041248> can not be used with use_vocab=False because we do not know how to numericalize it. Please raise an issue at https://github.com/pytorch/text/issues

In [38]:
TEXT = data.Field(use_vocab=True,
                  lower=True)

LABEL = data.Field(is_target=True,
                   use_vocab=False,
                   unk_token=None,
                   sequential=False,
                   postprocessing=data.Pipeline(
                       lambda x: torch.tensor(list(map(float, removeBra(x).split(' '))),
                                              dtype=torch.double)),
                   dtype=torch.DoubleTensor)
                        #data.Pipeline(lambda x: torch.tensor(x, dtype=torch.double)))

# LABEL.numericalize()
fields = [("text",TEXT),("lemmatized_text",TEXT), (None, None), (None,None), (None, None), ("label",LABEL)]

In [25]:
train, valid, test = datasets.SequenceTaggingDataset.splits(path='../data/test_transformer/',
                                   train = train_path,
                                   validation = validate_path,
                                   test = test_path,
                                   fields=[("text",TEXT),("lemmatized_text",TEXT), (None, None), (None,None), (None, None), ("label",LABEL)]) #,

In [26]:
BATCH_SIZE = 5

use_pretrained = True
if use_pretrained:
    print('We are using pre-trained word embeddings.')
    TEXT.build_vocab(train, vectors="glove.840B.300d")
else:
    print('We are training word embeddings from scratch.')
    TEXT.build_vocab(train, max_size=5000)

We are using pre-trained word embeddings.


In [40]:
# LABEL.build_vocab(train.label)
LABEL.numericalize(train)

ValueError: Specified Field dtype <class 'torch.DoubleTensor'> can not be used with use_vocab=False because we do not know how to numericalize it. Please raise an issue at https://github.com/pytorch/text/issues

In [19]:
train_iterator, valid_iterator = data.BucketIterator.splits(
                                                        (train, valid),
                                                        device=DEVICE,
                                                        batch_size=BATCH_SIZE,
                                                        sort_key=lambda x: len(x.text),
                                                        repeat=False,
                                                        sort=True)

ValueError: Specified Field dtype <class 'torch.DoubleTensor'> can not be used with use_vocab=False because we do not know how to numericalize it. Please raise an issue at https://github.com/pytorch/text/issues

In [23]:
batch = next(iter(train_iterator))
print("Numericalize premises:\n", batch.text)
print("Entailment labels:\n", batch.label)

ValueError: Specified Field dtype <class 'torch.DoubleTensor'> can not be used with use_vocab=False because we do not know how to numericalize it. Please raise an issue at https://github.com/pytorch/text/issues