In [1]:
# !pip install transformers

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer


import torchtext
from torchtext.legacy import data
from torchtext.legacy import datasets

from collections import defaultdict, Counter

from transformers import BertTokenizer, BertModel

import math
from typing import Tuple
import numpy as np

import time
import random
import functools

import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
plt.style.use('seaborn')

In [3]:
# Reproducibility: pin one shared seed for Python's `random`, NumPy and PyTorch,
# and ask cuDNN to use deterministic algorithms.
SEED = 1234

torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [4]:
import pandas as pd

# Read the tab-separated token table. header=None means no line of the file is
# consumed as a header, so the six column names are attached afterwards.
pwngc_path = "../data/test_transformer/pwngc4torchtext.csv"
pwngc_df = pd.read_csv(pwngc_path, sep="\t", header=None)
pwngc_df.columns = ["token", "stemm", "pos", "annotation", "synset", "tag"]

# Preview the first rows.
pwngc_df.head(20)

Unnamed: 0,token,stemm,pos,annotation,synset,tag
0,having,have,v,no-annotation,no-synset,O
1,the,the,,no-annotation,no-synset,O
2,necessary,necessary,a,01580050-a,necessary.a.01,[135144.0 25.01 64176.03 90.0 0.5]
3,means,means,n,00172710-n,means.n.01,[111736.0 98.31 98012.7 0.0 18.5]
4,or,or,,no-annotation,no-synset,O
5,skill,skill,n,no-annotation,no-synset,O
6,or,or,,no-annotation,no-synset,O
7,know-how,know-how,n,05616786-n,know-how.n.01,[142676.0 107.17 71890.08 0.0 106.5]
8,or,or,,no-annotation,no-synset,O
9,authority,authority,n,05196582-n,authority.n.01,[37587.0 104.34 194973.98 0.0 7.5]


In [None]:
import numpy as np

# Number of rows whose tag differs from the 'O' marker.
int(np.count_nonzero(pwngc_df["tag"] != 'O'))
# 532.821 annotated tokens

In [58]:
import ast

# Pick the tag of row 2 and show both its raw value and its Python type.
tag2 = pwngc_df.loc[2, 'tag']
print(tag2, type(tag2), sep="\n")
def removeBra(string_list):
    """Strip one pair of enclosing square brackets from ``string_list``.

    The value is returned without its first and last element when it starts
    with "[" and ends with "]"; in every other case (including an empty value)
    it is returned unchanged.

    :param string_list: the text to unwrap, e.g. "[1.0 2.0]"
    :return: the content between the brackets, or the input itself
    """
    # Guard the empty case first: indexing [0] / [-1] on it raises IndexError.
    if len(string_list) == 0:
        return string_list
    if string_list[0] == "[" and string_list[-1] == "]":
        return string_list[1:-1]
    return string_list

# Turn the bracketed, space-separated tag string into a float32 tensor.
# (ast.literal_eval is not usable here: the values are separated by spaces,
# not commas, so the string is not a valid Python list literal.)
tag2list = torch.tensor(
    [float(piece) for piece in removeBra(tag2).split(' ')],
    dtype=torch.float32,
)
print(tag2list, type(tag2list), sep="\n")

[135144.0 25.01 64176.03 90.0 0.5]
<class 'str'>
tensor([1.3514e+05, 2.5010e+01, 6.4176e+04, 9.0000e+01, 5.0000e-01])
<class 'torch.Tensor'>


In [10]:
# File names of the training, validation and test splits.
# This cell only names the files; it does not perform the split itself.
train_path, validate_path, test_path = "train.csv", "validate.csv", "test.csv"

In [6]:
def read_data(corpus_file, datafields):
    """
    Read a token-per-line corpus file into a torchtext ``Dataset``.

    Every non-blank line describes one token and blank lines separate
    sentences. For each token the second column (the stem) is collected as the
    word and the last column (the spatial tag) as its label.

    :param corpus_file: path of the UTF-8 encoded corpus file
    :param datafields: field specification, passed unchanged to
        ``data.Example.fromlist`` and ``data.Dataset``
    :return: a ``data.Dataset`` holding one example per sentence
    """
    examples = []
    words = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Sentence boundary. Runs of blank lines must not produce
                # examples without tokens, so only flush a non-empty sentence.
                if words:
                    examples.append(data.Example.fromlist([words, labels], datafields))
                    words = []
                    labels = []
                continue
            # Tags such as "[135144.0 25.01 64176.03 90.0 0.5]" contain spaces.
            # Split on tabs when the line has any so the tag stays in one piece;
            # purely whitespace-separated lines are split as before.
            columns = line.split("\t") if "\t" in line else line.split()
            words.append(columns[1])
            labels.append(columns[-1])
    # A file that does not end with a blank line still has a pending sentence.
    if words:
        examples.append(data.Example.fromlist([words, labels], datafields))
    return data.Dataset(examples, datafields)

In [7]:
#   train_examples = read_data("../data/test_transformer/train.csv", self.fields) #'data/eng.train.iob', self.fields)

In [59]:
# Token field (lower-cased). The vocabulary has to stay enabled (the default):
# a later cell calls TEXT.build_vocab(...), and with use_vocab=False torchtext
# never consults that vocabulary, so the string tokens cannot be converted
# into index tensors when batches are built.
TEXT = data.Field(lower=True)

# Label field: each token's tag is a bracketed, space-separated list of floats
# that the preprocessing pipeline parses into a float64 tensor.
LABEL = data.Field(is_target=True,
                   use_vocab=False,
                   unk_token=None,
                   preprocessing=data.Pipeline(lambda x: torch.tensor(list(map(float, removeBra(x).split(' '))), dtype=torch.double)),
                   # Field expects a torch.dtype here, not a Pipeline; with
                   # use_vocab=False an unknown dtype makes numericalize() raise.
                   dtype=torch.double)
# NOTE(review): pad_token keeps its default value; padding label sequences of
# different lengths probably needs a numeric pad value - confirm before batching.

# Load the three splits with torchtext's sequence-tagging reader. The field
# list is positional: the first two columns go through TEXT, the last one
# through LABEL, and the (None, None) entries skip the three columns between.
train, valid, test = datasets.SequenceTaggingDataset.splits(
    path='../data/test_transformer/',
    train=train_path,
    validation=validate_path,
    test=test_path,
    fields=[
        ("text", TEXT),
        ("lemmatized_text", TEXT),
        (None, None),
        (None, None),
        (None, None),
        ("label", LABEL),
    ],
)

In [60]:
from itertools import islice

# Sanity-check the parsed fields on the first three training sentences only;
# printing every example floods the notebook output.
for t, lt, l in islice(zip(train.text, train.lemmatized_text, train.label), 3):
    print(t, lt, l)

['having', 'the', 'necessary', 'means', 'or', 'skill', 'or', 'know-how', 'or', 'authority', 'to', 'do', 'something'] ['have', 'the', 'necessary', 'means', 'or', 'skill', 'or', 'know-how', 'or', 'authority', 'to', 'do', 'something'] [tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([1.3514e+05, 2.5010e+01, 6.4176e+04, 9.0000e+01, 5.0000e-01],
       dtype=torch.float64), tensor([1.1174e+05, 9.8310e+01, 9.8013e+04, 0.0000e+00, 1.8500e+01],
       dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([1.4268e+05, 1.0717e+02, 7.1890e+04, 0.0000e+00, 1.0650e+02],
       dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([3.7587e+04, 1.0434e+02, 1.9497e+05, 0.0000e+00, 7.5000e+00],
       dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([0., 0., 0.

In [61]:
# Build the TEXT vocabulary from the training split, either attaching the
# pre-trained GloVe vectors or keeping a capped vocabulary for embeddings that
# are learned from scratch.
use_pretrained = True
if not use_pretrained:
    print('We are training word embeddings from scratch.')
    TEXT.build_vocab(train, max_size=5000)
else:
    print('We are using pre-trained word embeddings.')
    TEXT.build_vocab(train, vectors="glove.840B.300d")

We are using pre-trained word embeddings.


In [33]:
# Scratch check: a 0-d tensor built from a Python int with an explicit
# single-precision dtype (torch.float is an alias of torch.float32).
tens = torch.tensor(1, dtype=torch.float32)
tens

tensor(1., dtype=torch.float64)

In [34]:
# Scratch check: the same conversion for a list of Python floats.
x = [13.4, 34.4, 2.0]
tens = torch.tensor(x, dtype=torch.float32)

In [35]:
# Display the current value of `tens` (relies on an earlier cell having assigned it).
tens

tensor([13.4000, 34.4000,  2.0000], dtype=torch.float64)

In [36]:
# Scratch check: the same list converted with double precision
# (torch.double is an alias of torch.float64).
tend = torch.tensor(x, dtype=torch.float64)
tend

tensor([13.4000, 34.4000,  2.0000], dtype=torch.float64)

In [62]:
# One BucketIterator per split, all sharing the same batch size. The sort key
# ranks examples by the number of tokens in their `text` field, and
# repeat=False makes each iterator stop after a single pass.
BATCH_SIZE = 5
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    repeat=False,
    sort_key=lambda example: len(example.text),
    batch_size=BATCH_SIZE,
)

In [71]:
from itertools import islice

# Peek at the first three items only; printing all of them floods the output.
# NOTE: despite the loop variable's name, the items printed here are single
# sentences (plain token lists plus per-token label tensors), not padded batches.
for batch in islice(train_iterator.data(), 3):
    print(batch.text, batch.lemmatized_text, batch.label)

['lacking', 'necessary', 'force', 'for', 'effectiveness'] ['lack', 'necessary', 'force', 'for', 'effectiveness'] [tensor([6.0597e+04, 1.2401e+02, 1.2927e+05, 1.8000e+02, 2.5000e+00],
       dtype=torch.float64), tensor([1.3514e+05, 2.5010e+01, 6.4176e+04, 9.0000e+01, 5.0000e-01],
       dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([0., 0., 0., 0., 0.], dtype=torch.float64), tensor([1.3004e+05, 1.1761e+02, 1.0131e+05, 0.0000e+00, 2.5000e+00],
       dtype=torch.float64)]
['lacking', 'necessary', 'documents', '(', 'as', 'for', 'e.g.', 'permission', 'to', 'live', 'or', 'work', 'in', 'a', 'country', ')'] ['lack', 'necessary', 'document', '(', 'as', 'for', 'e.g.', 'permission', 'to', 'live', 'or', 'work', 'in', 'a', 'country', ')'] [tensor([6.0597e+04, 1.2401e+02, 1.2927e+05, 1.8000e+02, 2.5000e+00],
       dtype=torch.float64), tensor([1.3514e+05, 2.5010e+01, 6.4176e+04, 9.0000e+01, 5.0000e-01],
       dtype=torch.float64), tensor([6.4798e+04, 1.0888e+02,