In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import math
import regex as re
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from allennlp.modules.conditional_random_field import ConditionalRandomField
from allennlp.modules.conditional_random_field import allowed_transitions
from torch.utils.data import Dataset, DataLoader, random_split
from torchcrf import CRF

from RULE import RULEs

In [2]:
def get_dictionary(dictionary_dir: '(str) directory of dictionary (fasttext format)')\
-> '(dict) dict[word: vector]':
        dictionary = {}
        with open(dictionary_dir, 'r', encoding = 'utf8') as f:
                for line in f:
                        tmp_line = line.strip()
                        tmp_list = [word.strip() for word in tmp_line.split()]
                        if tmp_line != '' and len(tmp_list) == 301:
                                dictionary[tmp_list[0]] = np.array([float(number) for \
                                number in tmp_list[1:]])
        return dictionary

In [6]:
my_dict = get_dictionary('fasttext.th.vec')

In [13]:
total_len_list = []
total_oov_list = []
cnt_o1 = 0
cnt_o2 = 0
with open('clean_384.txt', 'r', encoding='utf8') as f:
    with open('clean_384_oov_less_10.txt', 'w', encoding='utf8') as o1:
        with open('clean_384_oov_more_10.txt', 'w', encoding='utf8') as o2:
            for line_ind, line in enumerate(f):
                if line_ind % 100==0:
                    print(line_ind)
                tmp_line = line.strip()
                if tmp_line != '':
                    tmp_line = (word.strip() for word in tmp_line.split('||'))
                    #print(tmp_line)
                    cnt_oov = 0
                    for ind, word in enumerate(tmp_line):
                        if word not in my_dict:
                            cnt_oov += 1
                    if cnt_oov/(ind+1) < 0.1:
                        o1.write(line.strip() + '\n')
                        cnt_o1 += 1
                    else:
                        o2.write(line.strip() + '\n')
                        cnt_o2 += 1
                    total_oov_list.append(cnt_oov)
                    total_len_list.append(ind+1)
with open('report_oov_test.txt', 'w', encoding = 'utf8') as f:
    f.write(f'less10_cnt: {cnt_o1}, upper10_cnt: {cnt_o2}')
percent_oov_list = [100*total_oov_list[i]/total_len_list[i] for i in \
range(len(total_len_list))]

plt.hist(percent_oov_list,100)
plt.show()

In [11]:
print(len([i for i in percent_oov_list if i >= 10]))
print(len([i for i in percent_oov_list if i < 10]))
plt.hist(total_oov_list,100)
plt.show()
plt.hist(total_len_list,100)
plt.show()

In [121]:
def check_len(text_dir: 'path to text dir', delimeter: 'delimeter used for split()'):
    Max_len = 0
    Min_len = 1000000
    with open(text_dir, 'r', encoding='utf8') as f:
        for line in f:
            tmp_len = len(line.split(delimeter))
            Max_len = max(tmp_len, Max_len)
            Min_len = min(tmp_len, Min_len)
    return Max_len, Min_len

In [122]:
check_len('label_384.txt', '||')

In [3]:
def gen_char_dicitonary(char_dict_dir: 'all unique chars', out_dic_vec_dir: 'dir of dictionary vectors',\
 num_unique_char):
    with open(char_dict_dir, 'r', encoding='utf8') as f:
        with open(out_dic_vec_dir, 'w', encoding='utf8') as o1:
            for line in f:
                for ind, Char in enumerate(line.strip()):
                    o1.write(Char + ' ')
                    for i in range(num_unique_char + 1):
                        if i != ind:
                            o1.write('0 ')
                        else:
                            o1.write('1 ')
                    o1.write('\n')
                    if 'a' <= Char <= 'z':
                        o1.write(Char.upper() + ' ')
                        for i in range(num_unique_char):
                            if i != ind:
                                o1.write('0 ')
                            else:
                                o1.write('1 ')
                        o1.write('1 ')
                        o1.write('\n')

In [184]:
gen_char_dicitonary('./char-word-level-LSTM-CRF/char_dictionary.txt', './char-word-level-LSTM-CRF/char_vec_dictionary.txt', 134)

In [179]:
with open('./char-word-level-LSTM-CRF/char_dictionary.txt', 'r', encoding='utf8') as f:
    cnt = 0
    for i in f:
        for char in i.strip():
            cnt = cnt + 1
print(cnt)

In [2]:
class MyDataloader(Dataset):
    def __init__(self, TextDir: '.txt extension of samples', LabelDir: '.txt extension of labels',rules:\
    'the rules to be replaced => see in RULE.py', Len_word_vec: 'size of word vector') -> None:
        super().__init__()
        self.DF = pd.read_csv(TextDir, names=['text'])
        self.Label_DF = pd.read_csv(LabelDir, names=['text'])
        self.rules = rules
        self.Len_word_vec = Len_word_vec
    def __len__(self):
        return len(self.DF)
    def __getitem__(self, Index) -> '(sample: (torch.tensor), label: (torch.tensor))':
        all_words = [word.strip() for word in self.DF['text'][Index].strip().split('||')]
        for i in range(len(all_words)):
            for rule in self.rules:
                all_words[i] = re.sub(*rule, all_words[i])
        Label = [float(word.strip()) for word in self.Label_DF['text'][Index].strip().split('||')]
        mask = [1.0]*len(all_words)
        if len(all_words) < self.Len_word_vec:
            Label = Label + [2.0]*(self.Len_word_vec - len(all_words))
            mask = mask + [0.0]*(self.Len_word_vec - len(all_words))
            all_words = all_words + ['<pad>']*(self.Len_word_vec - len(all_words))
        # print(len(all_words))
        # print(len(Label))
        # print(len(mask))
        # print('----------')
        return (all_words, torch.tensor(Label), torch.tensor(mask))

In [10]:
cnt = 0
with open('../clean_384.txt', 'r', encoding = 'utf8') as f:
    for line in f:
        if line.strip() != '':
            cnt = cnt + 1
print(cnt)

cnt = 0
with open('../label_384.txt', 'r', encoding = 'utf8') as f:
    for line in f:
        if line.strip() != '':
            cnt = cnt + 1
print(cnt)

# Separating data

In [6]:
tags = {0:'I', 1:'B', 2:'O', 3:'<PAD>'}
all=allowed_transitions('IOB1', tags)

In [7]:
num_tags = 4
mt_crf = ConditionalRandomField(num_tags=num_tags, constraints =all, include_start_end_transitions= False)

In [8]:
seq_length=4
batch_size=3

data = torch.randn(seq_length, batch_size, num_tags)#shape(seq_length, batch_size, num_tags)

In [9]:
target = torch.tensor([[0,0,0],[0,0,0],[2,2,2],[0,3,3]])#shape = (seq_length, batch_size)
print(target.size())

In [12]:
mask = torch.tensor([[1,1,1],[1,1,1], [1,1,1],[0,1,0]])
#(seq_length, batch_size)
#mask = torch.tensor([[1,0,0],[0,1,0],[0,0,1]])
print(mask.size())

In [13]:
mt_crf(data, target,mask = mask)

In [2]:
def get_index(len_row, len_col):
    for i in range(len_row):
        for j in range(len_col):
            yield(i,j)

In [22]:
x=torch.tensor([[1,0,0,0],[1,1,1,0]])
print(x[0,0].item())

In [23]:
r,c = x.size()
max_len = 0
prev_col = 1
for row, col in get_index(r,c):
    if prev_col == 1 and x[row,col].item() == 0:
        max_len = max(max_len, col)
    prev_col = x[row,col].item()
#print(max_len)
x = x[:,:max_len]
print(x)

# Defining layers

In [3]:
# class TimeDistributed(nn.Module):
#     def __init__(self, layer: '(nn.Module) layer to be processed', time_steps: '(int)'):
#         super().__init__()
#         self.layers = nn.ModuleList([layer for i in range(time_steps)])

#     def forward(self, x) -> '(torch.tensor) shape=(1, embedding_size)':
#         batch_size, time_steps, C, H, W = x.size()
#         output = torch.tensor([])
#         for i in range(time_steps):
#           output_t = self.layers[i](x[:, i, :, :, :])
#           output_t  = torch.flatten(output_t)
#           output = torch.cat((output, output_t ), 1)
#         return output

# class Convs(nn.Module):
#     def __init__(self, List_of_kernel_sizes: 'example: [(3,100),(5,100),(7,100)]', List_num_filter: 'example: \
#     [64,64,128] ***len(List_num_filter) must equal to len(List_of_kernel_sizes)***',\
#     use_BN: 'see My2DConv', activation_func: 'see My2DConv', input_channel: 'see My2DConv', \
#     same_padding: 'see My2DConv', time_steps: 'see TimeDistributed'):
#         tmp_List_layers = []
#         for ind, kernel_size in enumerate(List_of_kernel_sizes):
#             tmp_List_layers.append(TimeDistributed(My2DConv(List_num_filter[ind], use_BN, \
#             activation_func, input_channel, kernel_size, same_padding), time_steps))
#         self.Layer_list = nn.ModuleList(tmp_List_layers)

def get_index(len_row, len_col)->'(iterator of all ((int)row, (int)col))':
    for i in range(len_row):
        for j in range(len_col):
            yield(i,j)

def get_longest_seq_len(MASK: '(torch.tensor: shape=(batch_size, num_words)) \
    of mask 1 for non padding, 0 for otherwise')->'(int) col index of first zero in\
    of the longest sequence example: x=torch.tensor([[1,1,0],[1,0,0]]) -> return 2':
    tmp_mask = np.sum(MASK.numpy(),0)
    col = 0
    for i in range(tmp_mask.shape[0]):
        if tmp_mask[i]==0:
            col = i
            break
    if col == 0:
        col = tmp_mask.shape[0]
    return col

class overall_char_embedding(nn.Module):
    def __init__(self, output_size: '(tuple of ints): (batch_size, \
    embedding_size_per_word)',
    dir_char_dictionary: 'see in CharEmbedding',
    max_len_char: 'see in CharEmbedding',
    nums_filter: '(list) list of number of filters according to each \
    kernel_sizes (respectively)',
    use_BN: 'see in My2DConv',
    activation_func: 'see in My2DConv',
    input_channel: 'see in My2DConv',
    kernel_sizes: '(list) list of size of kernels used, and they will be \
    computed concurrently',
    same_padding: 'see in My2DConv',
    num_words: 'number of words used in 1 sample',
    num_char_encoding_size: 'size of encoding for each char'):
        super().__init__()
        self.batch_size, self.embedding_size_per_word = output_size
        self.Char_embedder = CharEmbedding(dir_char_dictionary,\
        max_len_char,  self.batch_size)
        tmp_cnn_models = []
        for ind_cnn, kernel_size in enumerate(kernel_sizes):
            tmp_cnn_models.append(\
            My2DConv(nums_filter[ind_cnn], use_BN, activation_func, input_channel,\
            kernel_size, same_padding)
            )
        self.num_words = num_words
        self.CNNs = nn.ModuleList(tmp_cnn_models)
        self.MyMaxPool = nn.MaxPool2d((1, num_char_encoding_size), stride= (1,1))
        self.MyFCN = nn.Linear(sum(nums_filter)*max_len_char, output_size[1])
    def forward(self, x):
        tmp_compute = self.Char_embedder(x)
        batch_size, num_word, num_char, embedding_size = tmp_compute.size()
        tmp_compute = tmp_compute.view(batch_size, num_word, 1, num_char, \
        embedding_size)
        all_output_list = []
        for num_word in range(self.num_words):
            tmp_output_cnn = []
            for tmp_cnn in self.CNNs:
                tmp_output_cnn.append(self.MyMaxPool(tmp_cnn(tmp_compute[:,\
                num_word,:,:,:])).view((self.batch_size, -1)))
            all_output_list.append(nn.ReLU()(self.MyFCN(torch.cat(tmp_output_cnn, 1))))
        print(all_output_list[0].size())
        print(len(all_output_list))
        all_output_list = torch.stack(all_output_list, dim=1)
        return all_output_list
                
class gru_crf(nn.Module):
    def __init__(self, num_input_features: '(int) number of input features', hidden_size: '(int) number of\
    hidden features the outputs will also have hidden_size features', num_layers: '(int) number of \
    recursion', dropout_gru, bidirectional: '(bool) if True, use bidirectional GRU',\
    tags: "(dict[int: str])example: {0:'I', 1:'B', 2:'O', 3:'<PAD>'}"):
        super().__init__()
        self.gru = nn.GRU(input_size=num_input_features, hidden_size=hidden_size, num_layers=num_layers,\
        batch_first = True, dropout=dropout_gru, bidirectional=bidirectional)
        all_transition=allowed_transitions('IOB1', tags)
        #self.crf = CRF(num_tags=len(tags), batch_first= True)
        self.crf = ConditionalRandomField(4, all_transition, include_start_end_transitions= False)
    def forward(self, samples, target: '(torch.tensor) shape=(...............,)the target tags to be used', mask: 'True for non-pad elements'):
        batch_size, words, _ = samples.size()
        tmp_compute = self.gru(samples)[0].view(batch_size, words, -1)
        index_to_cut = get_longest_seq_len(mask)
        print()
        ##############################################
        ###cut padding some parts out#################
        tmp_compute = tmp_compute[:, :index_to_cut,:]
        target = target[:, :index_to_cut]
        mask = mask[:, :index_to_cut]
        
        #self.crf(tmp_compute.float(),target.float(),mask=mask.int(),reduction='sum')
        print('tmp_compute')
        print(tmp_compute.size())
        print('target')
        print(target.size())
        print('mask')
        print(mask.size())
        return self.crf(tmp_compute,target.long(),mask)

class My2DConv(nn.Module):
    def __init__(self, num_filter: '(int) number of filters', use_BN: '(bool) if True, use 2d-batchnorm after linear conv',\
    activation_func: '(bool) if True, use RELU after BN', input_channel: '(int) number of input channels', \
    kernel_size: '(tuple): (width, height) size of the kernels', same_padding: '(bool) if True, input_w,input_h=output_w,output_h'):
        super().__init__()
        if same_padding:
            #assume that dialation = 1 and stride = 1
            self.padding = (math.floor((kernel_size[0] - 1)/2), math.floor((kernel_size[1] -1)/2))
        else:
            self.padding = 0
        self.Conv = nn.Conv2d(input_channel, num_filter, kernel_size, padding= self.padding)
        self.use_BN = use_BN
        self.activation_func = activation_func
        if self.use_BN:
            self.BN = nn.BatchNorm2d(num_filter)

    def forward(self, input_data: '(torch.tensor) dimension= (batch_size, num_channel_in, in_height, in_width)') \
    -> '(torch.tensor) shape= (batch_size, num_filter, in_height, in_width)':
        tmp_compute = self.Conv(input_data.float())
        if self.use_BN:
            tmp_compute = self.BN(tmp_compute)
        if self.activation_func:
            tmp_compute = nn.ReLU()(tmp_compute)
        return tmp_compute
        

class CharEmbedding(nn.Module):
    def __init__(self,\
    dir_char_dictionary: '(str) .txt',\
    max_len_char: '(int) max size of char representation, for example: given max_len_char=3 and word= "abcde" => only "abc" is used', batch_size):
    #Example: given embed_capital=True and 'a' is embedded as array([1.,0.,0.,0.,0]). 'A' is then embedded as array([1.,0.,0.,0.,1.])
        super().__init__()
        self.dictionary = {}
        self.max_len_char = max_len_char
        self.batch_size = batch_size
        with open(dir_char_dictionary, 'r', encoding='utf8') as f:
            for line in f:
                tmp_data = line.strip().split()
                self.dictionary[tmp_data[0]] = np.array([float(Char) for Char in tmp_data[1:]])
    def forward(self, list_of_tuples: '(List) for \
    example: [("w1_article1","w1_article2",...,"w1_articlen"),\
            ("w2_article1","w2_article2",...,"w2_articlen"),\
            ....\
            ("wm_article1","wm_article2",...,"wm_articlen"),\
            ]') -> '(torch.tensor) \
    shape:(max_len_char, len(dictionary)(+1))':
        #Note: 1 outer list is for 1 word.
        output = []
        for tmp_tuple in list_of_tuples:
            for word in tmp_tuple:
                embedded_word = []
                tmp_word = word
                if len(word) > self.max_len_char:
                    tmp_word = tmp_word[:self.max_len_char]
                for Char in tmp_word:
                    if Char in self.dictionary:
                        tmp_vector = self.dictionary[Char]
                    else:
                        tmp_vector = np.zeros(self.dictionary['a'].shape)
                    embedded_word.append(tmp_vector)
                if len(embedded_word) < self.max_len_char:
                    for i in range(self.max_len_char - len(embedded_word)):
                        embedded_word.append(np.zeros(self.dictionary['a'].shape))
                output.append(torch.tensor(embedded_word))
        tensor_out = []
        for i in range(self.batch_size):
            tensor_out.append([])
        for word_ind, word in enumerate(output):
            tensor_out[word_ind%self.batch_size].append(word)
        #print(len(tensor_out))
        #print(tensor_out)
        for ind in range(len(tensor_out)):
            # for j in tensor_out[ind]:
            #     print(j.size())
            # print('-------------')
            tensor_out[ind] = torch.stack(tensor_out[ind])
        return torch.stack(tensor_out)

class WordEmbedding(nn.Module):
    #use fasttext embedding ==> read from a file
    def __init__(self, fasttext_dictionary_dir: '(str) .vec extension of words and embedded_vectors',\
     Len_embedded_vector: '(int) size of embedded each vector (300 for fasttext) **Count only numbers not words'\
     , batch_size) -> None:
        #example of format in fasttext_dictionary_dir
        #กิน 1.0 -2.666 -3 22.5 .... \n
        #นอน 1.5 -5.666 3 9.5 .... \n
        #...
        #...
        super().__init__()
        self.dictionary = {}
        self.Len_embedded_vector = Len_embedded_vector
        self.batch_size = batch_size
        with open(fasttext_dictionary_dir, 'r', encoding = 'utf8') as f:
            for line in f:
                tmp_line = line.strip()
                tmp_words = tmp_line.split()
                if tmp_line != '' and len(tmp_words) == self.Len_embedded_vector + 1:
                    self.dictionary[tmp_words[0]] = np.array([float(element) for element in tmp_words[1:]])
                else:
                    continue
    def forward(self, list_of_tuples: '(List) for \
    example: [("w1_article1","w1_article2",...,"w1_articlen"),\
            ("w2_article1","w2_article2",...,"w2_articlen"),\
            ....\
            ("wm_article1","wm_article2",...,"wm_articlen"),\
            ]') -> '(torch.tensor) \
    shape:(max_len_char, len(dictionary)(+1))':
        tmp_list = []
        for tmp_tuple in list_of_tuples:
            for word in tmp_tuple:
                if word in self.dictionary:
                    tmp_list.append(self.dictionary[word])
                else:
                    #in case of OOV: Zero-vector is used.
                    tmp_list.append(np.zeros(self.Len_embedded_vector))
        tensor_out = []
        for i in range(self.batch_size):
            tensor_out.append([])
        for i in range(len(tmp_list)):
            tensor_out[i%self.batch_size].append(tmp_list[i])
        for i in range(self.batch_size):
            # print(len(tensor_out[i]))
            # print(tensor_out[i][0])
            tensor_out[i] = torch.tensor(tensor_out[i])
        #print(torch.stack(tensor_out))
        return torch.stack(tensor_out)

class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):

        attn = torch.bmm(q, k.transpose(1, 2))
        attn = attn / self.temperature

        if mask is not None:
            attn = attn.masked_fill(mask, -np.inf)

        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)

        return output, attn

class AttentionBetweenWordsAndChars(nn.Module):
    def __init__(self, hidden_size: '(int) size of key, query and value vectors',\
    input_vec_size: '(int) incase of fasttext input_vec_size=300'):
        super().__init__()
        self.K_FCN = nn.Linear(input_vec_size, hidden_size)
        self.Q_FCN = nn.Linear(input_vec_size, hidden_size)
        self.V_FCN = nn.Linear(input_vec_size, hidden_size)
        self.AttLayer = ScaledDotProductAttention(math.sqrt(hidden_size), 0.1)
    def forward(self, char_vectors, word_vectors):
        batch_size, word_size, _ = word_vectors.size()
        word_vectors = word_vectors.float()
        char_vectors = char_vectors.float()
        K = torch.stack([self.K_FCN(word_vectors),self.K_FCN(char_vectors)],dim = 2)
        Q = torch.stack([self.Q_FCN(word_vectors),self.Q_FCN(char_vectors)],dim = 2)
        V = torch.stack([self.V_FCN(word_vectors),self.V_FCN(char_vectors)],dim = 2)
        all_output_list = []
        for word_ind in range(word_size):
            all_output_list.append(self.AttLayer(Q[:,word_ind,:,:], \
            K[:,word_ind,:,:], V[:,word_ind,:,:])[0].view(batch_size,-1))

        return torch.stack(all_output_list,dim = 1)

# Training

In [4]:
BS = 16
dataloader = DataLoader(MyDataloader('../clean_384.txt', '../label_384.txt', RULEs, 544), batch_size=BS, shuffle=False)

In [5]:
t1 = time()
word_embed = WordEmbedding('../fasttext.th.vec', 300, BS)
print(f'word_embed: {time() - t1}')
t1=time()
char_embed = CharEmbedding('../LSTM-CRF-NER/char_vec_dictionary.txt',5, BS)
print(f'char_embed: {time() - t1}')

word_embed: 14.280364036560059
char_embed: 0.012252092361450195


In [6]:
data = []
word_en = []
for ind, i in enumerate(dataloader):
    if ind > 10:
        break
    data.append(i)
print('ok')
for i in data:
    t1 = time()
    char_embed(i[0])
    print(f'embedding: {time() - t1}')
    t1 = time()
    word_en.append(word_embed(i[0]))
    print(f'embedding: {time() - t1}')

ok
embedding: 0.9911289215087891
embedding: 0.35086679458618164
embedding: 0.8712151050567627
embedding: 0.3417980670928955
embedding: 0.8621640205383301
embedding: 0.340421199798584
embedding: 0.8987948894500732
embedding: 0.3425710201263428
embedding: 0.8711140155792236
embedding: 0.3415639400482178
embedding: 0.8454639911651611
embedding: 0.3358640670776367
embedding: 0.9129228591918945
embedding: 0.3381960391998291
embedding: 0.849724292755127
embedding: 0.337630033493042
embedding: 0.8453831672668457
embedding: 0.34116411209106445
embedding: 0.8487789630889893
embedding: 0.33668017387390137
embedding: 0.8521537780761719
embedding: 0.33583903312683105


In [None]:
tmp_all = overall_char_embedding((BS,300),'../LSTM-CRF-NER/char_vec_dictionary.txt',5,[4],True,True,1,[(3,135)],True,544,135)
output_char_en = []
for i in data:
    t1 = time()
    output_char_en.append(tmp_all(i[0]))
    print(f'embedding: {time() - t1}')

torch.Size([16, 300])
544
embedding: 10.82581877708435
torch.Size([16, 300])
544
embedding: 13.51282787322998
torch.Size([16, 300])
544
embedding: 13.09506106376648
torch.Size([16, 300])
544
embedding: 12.933187246322632
torch.Size([16, 300])
544
embedding: 12.992132186889648
torch.Size([16, 300])
544
embedding: 13.033794641494751


In [None]:
my_attention = AttentionBetweenWordsAndChars(50,300)

In [None]:
att_out = []
for i in range(len(data)):
    t1 = time()
    att_out.append(my_attention(output_char_en[i], word_en[i]))
    print(f'att_layer: {time() - t1}')

In [None]:
gru_crf_layer = gru_crf(100, 2, 544, 0.1, True, {0:'I', 1:'B', 2:'O', 3:'<PAD>'})

In [None]:
for i in range(len(att_out)):
    t1 = time()
    print(gru_crf_layer(att_out[i], data[i][1], data[i][2]))
    print(f'gru_crf: {time() - t1}')
    print('-------------------')

In [42]:
tags = {0:'I', 1:'B', 2:'O', 3:'<PAD>'}
all_transition=allowed_transitions('IOB1', tags)
CRF = ConditionalRandomField(4, all_transition, include_start_end_transitions= False)

In [110]:
x2 = torch.transpose(x,0,1)

In [111]:
print(x2.size())

torch.Size([16, 384, 4])


In [124]:
CRF(x2,y.long(),z)#torch.transpose(z,1,0).long()

tensor(-8493.3369, grad_fn=<SumBackward0>)

In [114]:
print(x2.size())
print(y.size())
print(z.size())

torch.Size([16, 384, 4])
torch.Size([16, 384])
torch.Size([16, 384])


In [58]:
for i in range(16):
    if z[i,500] == 1:
        print('no pad found')

In [98]:
def get_longest_seq_len(MARK: '(torch.tensor: shape=(batch_size, num_words)) \
    of mask 1 for non padding, 0 for otherwise')->'(int) col index of first zero in\
    of the longest sequence example: x=torch.tensor([[1,1,0],[1,0,0]]) -> return 2':
    r,c = MARK.size()
    max_len = 0
    prev_col = 1
    for row, col in get_index(r,c):
        if prev_col == 1 and MARK[row,col].item() == 2:
            max_len = max(max_len, col)
        prev_col = MARK[row,col].item()
    return col

In [48]:
get_longest_seq_len(z)

542

In [59]:
print(z)

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])


In [60]:
www=z.numpy()

In [63]:
a = np.sum(www,0)
print(a.shape)

(543,)


In [69]:
for i in range(a.shape[0]):
    if a[i] == 0:
        print(i)
        break

384


In [67]:
print(a)

[16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16. 16

In [102]:
def get_longest_seq_len2(MASK: '(torch.tensor: shape=(batch_size, num_words)) \
    of mask 1 for non padding, 0 for otherwise')->'(int) col index of first zero in\
    of the longest sequence example: x=torch.tensor([[1,1,0],[1,0,0]]) -> return 2':
    tmp_mask = np.sum(MASK.numpy(),0)
    col = 0
    for i in range(tmp_mask.shape[0]):
        if tmp_mask[i]==0:
            col = i
            break
    if col == 0:
        col = tmp_mask.shape[0]
    return col, tmp_mask

In [104]:
col,v =get_longest_seq_len2(z)

In [105]:
v[-1]

16.0