In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import math
import regex as re
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from allennlp.modules.conditional_random_field import ConditionalRandomField
from allennlp.modules.conditional_random_field import allowed_transitions
from allennlp.modules.lstm_cell_with_projection import LstmCellWithProjection
from allennlp.modules.input_variational_dropout import InputVariationalDropout
from torch.utils.data import Dataset, DataLoader, random_split
#from torchcrf import CRF
from torch.utils.data.sampler import SubsetRandomSampler
import random
from torchnlp.nn import WeightDropLSTM


from torch.nn.utils.rnn import PackedSequence
from typing import *

from sklearn.metrics import confusion_matrix
#from sklearn_crfsuite import metrics
from torchnlp.nn import WeightDropGRU
from RULE import RULEs
from POSMap import POSMAP

# Defining dataloader

# Defining Layers

In [7]:
class MyDataloader(Dataset):
    def __init__(self, TextDir: '.txt extension of samples', LabelDir: '.txt extension of labels',rules:\
                 'the rules to be replaced => see in RULE.py', Len_word_vec: 'size of word vector', \
                delimiter: '(str) delimiter used to separate data', dir_char_dictionary: \
                '(str) see in CharEmbedding', max_len_char: '(int) see in CharEmbedding', \
                fasttext_dictionary_dir: '(str) see in WordEmbedding',\
                Len_embedded_vector: '(int) see in WordEmbedding', device, POSDir: '(str) .txt extension of POS',\
                POSMapping: 'see in POSMap.py') -> None:
        super().__init__()
        self.DF = pd.read_csv(TextDir, names=['text'])
        self.Label_DF = pd.read_csv(LabelDir, names=['text'])
        self.pos_DF = pd.read_csv(POSDir, names=['text'])
        self.rules = rules
        self.Len_word_vec = Len_word_vec
        self.delimiter = delimiter
        self.char_embedder = CharEmbedding(dir_char_dictionary, max_len_char)
        self.word_embedder = WordEmbedding(fasttext_dictionary_dir, Len_embedded_vector)
        self.device = device
        self.pos_embedder = POSEmbedding(POSMapping)
    def __len__(self):
        return len(self.DF)
    def __getitem__(self, Index) -> '(sample: (torch.tensor), label: (torch.tensor))':
        all_words = [word.strip() for word in self.DF['text'][Index].strip().split(self.delimiter)]
        for i in range(len(all_words)):
            for rule in self.rules:
                all_words[i] = re.sub(*rule, all_words[i])
        Label = [float(word.strip()) for word in self.Label_DF['text'][Index].strip().split(self.delimiter)]
        mask = [1.0]*len(all_words)
        POS = [pos.strip() for pos in self.pos_DF['text'][Index].strip().split(self.delimiter)]
        tmp_length = len(all_words)
        if len(all_words) < self.Len_word_vec:
            Label = Label + [3.0]*(self.Len_word_vec - len(all_words))
            mask = mask + [0.0]*(self.Len_word_vec - len(all_words))
            POS = POS + ['<pad>']*(self.Len_word_vec - len(all_words))
            all_words = all_words + ['<pad>']*(self.Len_word_vec - len(all_words))
        char_embed = self.char_embedder.embed(all_words)
        word_embed = self.word_embedder.embed(all_words)
        pos_embed = self.pos_embedder.embed(POS)
        # print(len(all_words))
        # print(len(Label))
        # print(len(mask))
        # print('----------')
        return (char_embed.to(self.device), word_embed.to(self.device), \
                torch.tensor(Label).to(self.device), torch.tensor(mask).to(self.device), \
                tmp_length, pos_embed.float().to(self.device))
    

class CharEmbedding():
    def __init__(self,\
    dir_char_dictionary: '(str) .txt',\
    max_len_char: '(int) max size of char representation, for example: given max_len_char=3 and word= "abcde" => only "abc" is used'):
    #Example: given embed_capital=True and 'a' is embedded as array([1.,0.,0.,0.,0]). 'A' is then embedded as array([1.,0.,0.,0.,1.])
        self.dictionary = {}
        self.max_len_char = max_len_char
        with open(dir_char_dictionary, 'r', encoding='utf8') as f:
            for line in f:
                tmp_data = line.strip().split()
                self.dictionary[tmp_data[0]] = np.array([float(Char) for Char in tmp_data[1:]])
    def embed(self, list_of_words: '(list[str]) example: ["ฉัน","กิน","ข้าว"]'):
        #Note: 1 outer list is for 1 word.
        output = []
        for word in list_of_words:
            embedded_word = []
            tmp_word = word
            if len(word) > self.max_len_char:
                tmp_word = tmp_word[:self.max_len_char]
            for Char in tmp_word:
                if Char in self.dictionary:
                    tmp_vector = self.dictionary[Char]
                else:
                    tmp_vector = np.zeros(self.dictionary['a'].shape)
                embedded_word.append(tmp_vector)
            if len(embedded_word) < self.max_len_char:
                for i in range(self.max_len_char - len(embedded_word)):
                    embedded_word.append(np.zeros(self.dictionary['a'].shape))
            output.append(torch.tensor(embedded_word))
        return torch.stack(output)

class WordEmbedding():
    #use fasttext embedding ==> read from a file
    def __init__(self, fasttext_dictionary_dir: '(str) .vec extension of words and embedded_vectors',\
     Len_embedded_vector: '(int) size of embedded each vector (300 for fasttext) **Count only numbers not words'\
     ) -> None:
        #example of format in fasttext_dictionary_dir
        #กิน 1.0 -2.666 -3 22.5 .... \n
        #นอน 1.5 -5.666 3 9.5 .... \n
        #...
        #...
        self.dictionary = {}
        self.Len_embedded_vector = Len_embedded_vector
        with open(fasttext_dictionary_dir, 'r', encoding = 'utf8') as f:
            for line in f:
                tmp_line = line.strip()
                tmp_words = tmp_line.split()
                if tmp_line != '' and len(tmp_words) == self.Len_embedded_vector + 1:
                    self.dictionary[tmp_words[0]] = np.array([float(element) for element in tmp_words[1:]])
                else:
                    continue
    def embed(self, list_of_words: '(List[str]) for example: ["ฉัน","กิน","ข้าว"]'):
        tmp_list = []
        for word in list_of_words:
            if word in self.dictionary:
                tmp_list.append(self.dictionary[word])
            else:
                #in case of OOV: Zero-vector is used.
                tmp_list.append(np.zeros(self.Len_embedded_vector))
        return torch.tensor(tmp_list)

class POSEmbedding():
    def __init__(self, POSMapping: 'see in POSMap.py'):
        self.dictionary = POSMapping
        self.size = len(self.dictionary)
    def embed(self, list_of_POSs:'(list[str]) example: ["NOUN","VERB","NOUN"]'):
        tmp_list = []
        for POS in list_of_POSs:
            POS = POS.strip()
            if POS == '<pad>':
                tmp_list.append(np.zeros(self.size))
            else:
                tmp_data = np.zeros(self.size)
                tmp_data[self.dictionary[POS]] = 1
                tmp_list.append(tmp_data)
        return torch.tensor(tmp_list)

#new
############### RNN encoding ######################
# class CNN_char(nn.Module):
#     def __init__(self, num_filter: '()'):

#         class My2DConv(nn.Module):
#     def __init__(self, num_filter: '(int) number of filters', use_BN: '(bool) if True, use 2d-batchnorm after linear conv',\
#                  activation_func: '(bool) if True, use RELU after BN', input_channel: '(int) number of input channels', \
#                  kernel_size: '(tuple): (width, height) size of the kernels', same_padding: '(bool) if True, input_w,input_h=output_w,output_h'):
#         super().__init__()

class RNN_char(nn.Module):
    def __init__(self, num_char_vec_features, hidden_size, num_layers, dropout_gru, bidirectional, \
                output_size, dropout_FCN, num_word):
        super().__init__()
        self.gru = nn.GRU(input_size=num_char_vec_features, hidden_size=hidden_size, num_layers=num_layers,\
                          batch_first = True, dropout=dropout_gru, bidirectional=bidirectional)
        self.linear = nn.Linear(hidden_size*2*num_layers, output_size)
        self.BN = nn.BatchNorm1d(num_word)
        self.dropout = nn.Dropout(dropout_FCN)
        self.num_layers = num_layers
    def forward(self, x):
        batch_size, word_seq, char_seq, char_vec = x.size()
        tmp_list = []
        for i in range(word_seq):
            tmp_compute , _ = self.gru(x[:,i,:,:].float())
            tmp_list.append(tmp_compute.contiguous().view(batch_size,-1))
        tmp_compute = torch.stack(tmp_list,1)
        #print(tmp_compute.size())
        tmp_compute = self.dropout(tmp_compute)
        tmp_compute = self.linear(tmp_compute)
        #print(tmp_compute.size())
        tmp_compute = F.relu(self.BN(tmp_compute))#>>linear >> BachNorm >> relu
        return tmp_compute
    
class over_all_NER2(nn.Module):
    def __init__(self, Batch_size: '(int)',\
                 num_char_vec_features: '(int)',\
                 hidden_size: '(int)',\
                 max_num_char: '(int)',\
                 dropout_gru_char: '(double)',\
                 bidirectional_char: '(bool)',\
                 output_char_embed_size: '(int)',\
                 size_of_embedding: '(int) size of each word embedding vector',\
                 num_words: '(int) see in overall_char_embedding', \
                 gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', \
                 bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf', DO_FCN_GRUCRF: '(double)', DOchar_FCN: '(double)',\
                 pos_size: '(int) size of pos embedding',
                 DO_GRU_out):
        super().__init__()
        self.gru_char = RNN_char(num_char_vec_features, hidden_size, max_num_char, dropout_gru_char, \
                                 bidirectional_char, output_char_embed_size, DOchar_FCN, num_words)
        self.gru_crf_layer = gru_crf(size_of_embedding + output_char_embed_size + pos_size, \
                                     gru_hidden_size, num_words, dropout_gru, bidirectional, tags, DO_FCN_GRUCRF, DO_GRU_out)
    def forward(self, x):
        tmp_compute = self.gru_char(x[0])
        #print(tmp_compute.size())
        #print(x[1].size())
        tmp_compute = torch.cat([tmp_compute, x[1].float(), x[5]], 2)
        #print(tmp_compute.size())
        tmp_gru_crf = self.gru_crf_layer((tmp_compute, x[4]), x[2], x[3].long())
        return tmp_gru_crf
    def predict(self, x):
        tmp_compute = self.gru_char(x[0])
        tmp_compute = torch.cat([tmp_compute, x[1].float(), x[5]], 2)
        tmp_gru_crf = self.gru_crf_layer.predict((tmp_compute, x[4]), x[3].long())
        return tmp_gru_crf

class CNN_GRU_CRF(nn.Module):
    def __init__(self, Batch_size: '(int)',\
                 max_num_char: '(int)',\
                 nums_filter: '(list[int] see in overall_char_embedding)',\
                 use_BN: '(bool) only for CNNchar',\
                 activation_func: '(bool) only for CNNchar',\
                 input_channel: '(int) see in My2DConv',\
                 kernel_sizes: '(list[int]) list of size of kernels used, and they will be computed concurrently',\
                 same_padding: '(bool) same padding for CNNchar',\
                 num_char_encoding_size: '(int) size of each char embedding vector',\
                 output_size: '(int) output dimension of CNNchar',\
                 size_of_embedding: '(int) size of each word embedding vector',\
                 num_words: '(int) see in overall_char_embedding', \
                 gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', \
                 bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf', DO_FCN_GRUCRF: '(double)',\
                 pos_size: '(int) size of pos embedding',\
                 FCN: '(bool) see overall_char_embedding',\
                 drop_weight):
        super().__init__()
        if not FCN:
            output_size = num_char_encoding_size
        #print(f'output_size: {output_size}')
        self.overall_char_embedding = overall_char_embedding((Batch_size, output_size), max_num_char, \
                                                             nums_filter, use_BN, activation_func, \
                                                             input_channel, kernel_sizes, same_padding, \
                                                             num_words, num_char_encoding_size, FCN)

        self.gru_crf_layer = gru_crf(size_of_embedding + output_size + pos_size, \
                                     gru_hidden_size, num_words, dropout_gru, bidirectional, tags, \
                                     DO_FCN_GRUCRF, drop_weight)
    def forward(self, x):
        tmp_compute = self.overall_char_embedding(x[0])
        #print(tmp_compute.size())
        #print(x[1].size())
        tmp_compute = torch.cat([tmp_compute, x[1].float(), x[5]], 2)
        #print(tmp_compute.size())
        tmp_gru_crf = self.gru_crf_layer((tmp_compute, x[4]), x[2], x[3].long())
        return tmp_gru_crf
    def predict(self, x):
        tmp_compute = self.overall_char_embedding(x[0])
        tmp_compute = torch.cat([tmp_compute, x[1].float(), x[5]], 2)
        tmp_gru_crf = self.gru_crf_layer.predict((tmp_compute, x[4]), x[3].long())
        return tmp_gru_crf

class CNN_GRU_word_pos(nn.Module):
    def __init__(self, Batch_size: '(int)',\
                 size_of_embedding: '(int) size of each word embedding vector',\
                 num_words: '(int) see in overall_char_embedding', \
                 gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', \
                 bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf', DO_FCN_GRUCRF: '(double)',\
                 pos_size: '(int) size of pos embedding',\
                 drop_GRU_out):
        super().__init__()
        #print(f'output_size: {output_size}')
        self.gru_crf_layer = gru_crf(size_of_embedding + pos_size, \
                                     gru_hidden_size, num_words, dropout_gru, bidirectional, tags, \
                                     DO_FCN_GRUCRF, drop_GRU_out)
    def forward(self, x):
        tmp_compute = torch.cat([x[1].float(), x[5]], 2)
        #print(tmp_compute.size())
        tmp_gru_crf = self.gru_crf_layer((tmp_compute, x[4]), x[2], x[3].long())
        return tmp_gru_crf
    def predict(self, x):
        tmp_compute = torch.cat([x[1].float(), x[5]], 2)
        tmp_gru_crf = self.gru_crf_layer.predict((tmp_compute, x[4]), x[3].long())
        return tmp_gru_crf
    
class GRU_CRF_word(nn.Module):
    def __init__(self, Batch_size: '(int)',\
                 size_of_embedding: '(int) size of each word embedding vector',\
                 num_words: '(int) see in overall_char_embedding', \
                 gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', \
                 bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf', DO_FCN_GRUCRF: '(double)'):
        super().__init__()
        self.gru_crf_layer = gru_crf(size_of_embedding , gru_hidden_size, num_words, \
                                     dropout_gru, bidirectional, tags, DO_FCN_GRUCRF)
    def forward(self, x):
        #print(tmp_compute.size())
        tmp_gru_crf = self.gru_crf_layer((x[1].float(), x[4]), x[2], x[3].long())
        return tmp_gru_crf
    def predict(self, x):
        tmp_gru_crf = self.gru_crf_layer.predict((x[1].float(), x[4]), x[3].long())
        return tmp_gru_crf

class CNN_GRU_char(nn.Module):
    def __init__(self, Batch_size: '(int)',\
                 max_num_char: '(int)',\
                 nums_filter: '(list[int] see in overall_char_embedding)',
                 use_BN: '(bool) only for CNNchar',
                 activation_func: '(bool) only for CNNchar',
                 input_channel: '(int) see in My2DConv',
                 kernel_sizes: '(list[int]) list of size of kernels used, and they will be computed concurrently',
                 same_padding: '(bool) same padding for CNNchar',
                 num_char_encoding_size: '(int) size of each char embedding vector',\
                 output_size: '(int) output dimension of CNNchar',\
                 num_words: '(int) see in overall_char_embedding', \
                 gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', \
                 bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf', DO_FCN_GRUCRF: '(double)',\
                 FCN: '(bool) see overall_char_embedding',\
                 DO_weight_gru: '(float) weight dropout'):
        super().__init__()
        if not FCN:
            output_size = num_char_encoding_size
        #print(f'output_size: {output_size}')
        self.overall_char_embedding = overall_char_embedding((Batch_size, output_size), max_num_char, \
                                                             nums_filter, use_BN, activation_func, \
                                                             input_channel, kernel_sizes, same_padding, \
                                                             num_words, num_char_encoding_size, FCN)

        self.gru_crf_layer = gru_crf(output_size, gru_hidden_size, num_words, dropout_gru, bidirectional, tags, \
                                     DO_FCN_GRUCRF, DO_weight_gru)
    def forward(self, x):
        tmp_compute = self.overall_char_embedding(x[0])
        tmp_gru_crf = self.gru_crf_layer((tmp_compute, x[4]), x[2], x[3].long())
        return tmp_gru_crf
    def predict(self, x):
        tmp_compute = self.overall_char_embedding(x[0])
        tmp_gru_crf = self.gru_crf_layer.predict((tmp_compute, x[4]), x[3].long())
        return tmp_gru_crf

class CNN_GRU_char_pos(nn.Module):
    def __init__(self, Batch_size: '(int)',\
                 max_num_char: '(int)',\
                 nums_filter: '(list[int] see in overall_char_embedding)',\
                 use_BN: '(bool) only for CNNchar',\
                 activation_func: '(bool) only for CNNchar',\
                 input_channel: '(int) see in My2DConv',\
                 kernel_sizes: '(list[int]) list of size of kernels used, and they will be computed concurrently',\
                 same_padding: '(bool) same padding for CNNchar',\
                 num_char_encoding_size: '(int) size of each char embedding vector',\
                 output_size: '(int) output dimension of CNNchar',\
                 num_words: '(int) see in overall_char_embedding', \
                 gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', \
                 bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf', \
                 DO_FCN_GRUCRF: '(double)', \
                 pos_size: '(int) size of pos embedding', \
                 FCN: '(bool) see overall_char_embedding',\
                 drop_weight):
        super().__init__()
        if not FCN:
            output_size = num_char_encoding_size
        #print(f'output_size: {output_size}')
        self.overall_char_embedding = overall_char_embedding((Batch_size, output_size), max_num_char, \
                                                             nums_filter, use_BN, activation_func, \
                                                             input_channel, kernel_sizes, same_padding, \
                                                             num_words, num_char_encoding_size, FCN)

        self.gru_crf_layer = gru_crf(output_size + pos_size, \
                                     gru_hidden_size, num_words, dropout_gru, bidirectional, tags, \
                                     DO_FCN_GRUCRF, drop_weight)
    def forward(self, x):
        tmp_compute = self.overall_char_embedding(x[0])
        #print(tmp_compute.size())
        #print(x[1].size())
        tmp_compute = torch.cat([tmp_compute, x[5]], 2)
        #print(tmp_compute.size())
        tmp_gru_crf = self.gru_crf_layer((tmp_compute, x[4]), x[2], x[3].long())
        return tmp_gru_crf
    def predict(self, x):
        tmp_compute = self.overall_char_embedding(x[0])
        tmp_compute = torch.cat([tmp_compute, x[5]], 2)
        tmp_gru_crf = self.gru_crf_layer.predict((tmp_compute, x[4]), x[3].long())
        return tmp_gru_crf


#new
def get_index(len_row, len_col)->'(iterator of all ((int)row, (int)col))':
    for i in range(len_row):
        for j in range(len_col):
            yield(i,j)

def get_longest_seq_len(MASK: '(torch.tensor: shape=(batch_size, num_words)) \
    of mask 1 for non padding, 0 for otherwise')->'(int) col index of first zero in\
    of the longest sequence example: x=torch.tensor([[1,1,0],[1,0,0]]) -> return 2':
    tmp_mask = MASK.numpy()
    if len(tmp_mask.shape) != 1:
        tmp_mask = np.sum(tmp_mask,0)
    col = 0
    for i in range(tmp_mask.shape[0]):
        if tmp_mask[i]==0:
            col = i
            break
    if col == 0:
        col = tmp_mask.shape[0]
    return col

class overall_char_embedding(nn.Module):
    def __init__(self, output_size: '(tuple of ints): (batch_size, embedding_size_per_word)',\
    max_len_char: '(int) see in CharEmbedding',\
    nums_filter: '(list) list of number of filters according to each kernel_sizes (respectively)',\
    use_BN: 'see in My2DConv',\
    activation_func: 'see in My2DConv',\
    input_channel: 'see in My2DConv',\
    kernel_sizes: '(list[int]) list of size of kernels used, and they will be computed concurrently',\
    same_padding: 'see in My2DConv',\
    num_words: 'number of words used in 1 sample',\
    num_char_encoding_size: 'size of encoding for each char',\
    FCN: '(bool) use FCN after CNN or not'):
        super().__init__()
        self.batch_size, self.embedding_size_per_word = output_size
        tmp_cnn_models = []
        for ind_cnn, kernel_size in enumerate(kernel_sizes):
            tmp_cnn_models.append(\
            My2DConvChar(nums_filter[ind_cnn], use_BN, activation_func, input_channel,\
            (kernel_size, 1), same_padding)
            )
        self.num_words = num_words
        self.CNNs = nn.ModuleList(tmp_cnn_models)
        self.MyMaxPool = nn.MaxPool2d((max_len_char, 1), stride= (1,1))
        self.FCN = FCN
        if self.FCN:
            self.MyFCN = nn.Linear(sum(nums_filter)*num_char_encoding_size, output_size[1])
            self.BN = nn.BatchNorm1d(output_size[1])
    def forward(self, x):
        batch_size, num_word, num_char, embedding_size = x.size()
        #print(x.size())
        tmp_compute = x.view(batch_size, num_word, 1, num_char, \
        embedding_size)
        all_output_list = []
        for num_word in range(self.num_words):
            tmp_output_cnn = []
            for tmp_cnn in self.CNNs:
                tmp_output_cnn.append(self.MyMaxPool(tmp_cnn(tmp_compute[:,\
                num_word,:,:,:])).view((batch_size, -1)))
            tmp = torch.cat(tmp_output_cnn, 1)
            #print(tmp.size())
            if self.FCN:
                all_output_list.append(F.relu(self.BN(self.MyFCN(tmp))))
            else:
                all_output_list.append(tmp)
        #print(all_output_list[0].size())
        #print(len(all_output_list))
        all_output_list = torch.stack(all_output_list, dim=1)
        #print(all_output_list.size())
        return all_output_list
                
class gru_crf(nn.Module):
    def __init__(self, num_input_features: '(int) number of input features', hidden_size: '(int) number of\
    hidden features the outputs will also have hidden_size features', num_layers: '(int) number of \
    recursion', dropout_gru, bidirectional: '(bool) if True, use bidirectional GRU',\
    tags: "(dict[int: str])example: {0:'I', 1:'B', 2:'O', 3:'<PAD>'}", dropout_FCN: '(double)', drop_GRU_out):
        super().__init__()
        self.gru = nn.GRU(input_size = num_input_features, hidden_size = hidden_size, \
                                  num_layers = num_layers, batch_first = True, dropout = dropout_gru, \
                                  bidirectional = bidirectional)
        #self.gru = WeightDropGRU(input_size = num_input_features, hidden_size = hidden_size, \
        #                         num_layers = num_layers, batch_first = True, dropout = dropout_gru, \
        #                         bidirectional = bidirectional, weight_dropout=drop_weight)
        all_transition=allowed_transitions('BIO', tags)
        #self.crf = CRF(num_tags=len(tags), batch_first= True)
        self.linear = nn.Linear(hidden_size*2, hidden_size)
        self.BN = nn.BatchNorm1d(num_layers)
        self.linear2 = nn.Linear(hidden_size, len(tags))
        self.BN2 = nn.BatchNorm1d(num_layers)
        self.crf = ConditionalRandomField(len(tags), all_transition)
        self.dropout = dropout_FCN
        self.drop_GRU_out = drop_GRU_out
        
    def forward(self, samples, target: '(torch.tensor) shape=(...............,)the target tags to be used',\
                mask: 'True for non-pad elements'):
        length = samples[1]
        samples = samples[0]
        batch_size, words, _ = samples.size()
        tmp_t = time()
        #print(samples.size())
        tmp_compute = F.dropout(self.gru(samples)[0], p=self.dropout)
        #print('pass inference gru')
        tmp_compute = tmp_compute.view(batch_size, words, -1)
        #print('pass reshape gru')
#         print(f'total GRU time: {time() - tmp_t}')
        index_to_cut = max(length).item()#get_longest_seq_len(mask)
        #length = torch.mean(length.float()).item()
        ##############################################
        ###cut padding some parts out#################
        #print(tmp_compute.size())
        #tmp_compute = self.dropout(tmp_compute)
        tmp_compute = F.dropout(F.relu(self.BN(self.linear(tmp_compute))), p=self.drop_GRU_out)
        tmp_compute = F.relu(self.BN2(self.linear2(tmp_compute)))
        tmp_compute = F.dropout(tmp_compute[:, :index_to_cut,:],  p=self.dropout)
        target = target[:, :index_to_cut]
        mask = mask[:, :index_to_cut]
        #print(tmp_compute.size())
        nll_loss = self.crf(tmp_compute,target.long(),mask)
#         print(f'total CRF time: {time() - tmp_t}')
        return nll_loss#/length
    def predict(self, samples, mask):
        length = samples[1]
        samples = samples[0]
        batch_size, words, _ = samples.size()
        tmp_t = time()
        tmp_compute = self.gru(samples)[0].view(batch_size, words, -1)
#         print(f'total GRU time: {time() - tmp_t}')
        index_to_cut = max(length).item()#get_longest_seq_len(mask)
        ##############################################
        ###cut padding some parts out#################
        #print(tmp_compute.size())
        
        tmp_compute = F.relu(self.BN(self.linear(tmp_compute)))
        tmp_compute = F.relu(self.BN2(self.linear2(tmp_compute)))
        tmp_compute = tmp_compute[:, :index_to_cut,:]
        mask = mask[:, :index_to_cut]
        #print(tmp_compute.size())
        tmp_t = time()
        tmp_tags = self.crf.viterbi_tags(tmp_compute,mask)
#         print(f'total CRF prediction time: {time() - tmp_t}')
        return tmp_tags
    
class My2DConv(nn.Module):
    def __init__(self, num_filter: '(int) number of filters', use_BN: '(bool) if True, use 2d-batchnorm after linear conv',\
    activation_func: '(bool) if True, use RELU after BN', input_channel: '(int) number of input channels', \
    kernel_size: '(tuple): (width, height) size of the kernels', same_padding: '(bool) if True, input_w,input_h=output_w,output_h'):
        super().__init__()
        if same_padding:
            #assume that dialation = 1 and stride = 1
            self.padding = (math.floor((kernel_size[0] - 1)/2), math.floor((kernel_size[1] -1)/2))
        else:
            self.padding = 0
        self.Conv = nn.Conv2d(input_channel, num_filter, kernel_size, padding= self.padding)
        self.use_BN = use_BN
        self.activation_func = activation_func
        if self.use_BN:
            self.BN = nn.BatchNorm2d(num_filter)

    def forward(self, input_data: '(torch.tensor) dimension= (batch_size, num_channel_in, in_height, in_width)') \
    -> '(torch.tensor) shape= (batch_size, num_filter, in_height, in_width)':
        tmp_compute = self.Conv(input_data.float())
        if self.use_BN:
            tmp_compute = self.BN(tmp_compute)
        if self.activation_func:
            tmp_compute = nn.ReLU()(tmp_compute)
        return tmp_compute
        
class My2DConvChar(nn.Module):
    def __init__(self, num_filter: '(int) number of filters', use_BN: '(bool) if True, \
                 use 2d-batchnorm after linear conv', activation_func: '(bool) if True, use RELU\
                 after BN', input_channel: '(int) number of input channels', \
                 kernel_size: '(tuple): (width, height) size of the kernels', \
                 same_padding: '(bool) if True, input_w,input_h=output_w,output_h'):
        super().__init__()
        if same_padding:
            #assume that dialation = 1 and stride = 1
            self.padding = (math.floor((kernel_size[0] - 1)/2), 0)
        else:
            self.padding = 0
        self.Conv = nn.Conv2d(input_channel, num_filter, kernel_size, padding= self.padding)
        self.use_BN = use_BN
        self.activation_func = activation_func
        if self.use_BN:
            self.BN = nn.BatchNorm2d(num_filter)

    def forward(self, input_data: '(torch.tensor) dimension= (batch_size, num_channel_in, in_height, in_width)') \
    -> '(torch.tensor) shape= (batch_size, num_filter, in_height, in_width)':
        tmp_compute = self.Conv(input_data.float())
        if self.use_BN:
            tmp_compute = self.BN(tmp_compute)
        if self.activation_func:
            tmp_compute = F.relu(tmp_compute)
        return tmp_compute


class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):

        attn = torch.bmm(q, k.transpose(1, 2))
        attn = attn / self.temperature

        if mask is not None:
            attn = attn.masked_fill(mask, -np.inf)

        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)

        return output, attn

class AttentionBetweenWordsAndChars(nn.Module):
    def __init__(self, hidden_size: '(int) size of key, query and value vectors',\
    input_vec_size: '(int) incase of fasttext input_vec_size=300'):
        super().__init__()
        self.K_FCN = nn.Linear(input_vec_size, hidden_size)
        self.Q_FCN = nn.Linear(input_vec_size, hidden_size)
        self.V_FCN = nn.Linear(input_vec_size, hidden_size)
        self.AttLayer = ScaledDotProductAttention(math.sqrt(hidden_size), 0.1)
    def forward(self, char_vectors, word_vectors):
        batch_size, word_size, _ = word_vectors.size()
        word_vectors = word_vectors.float()
        char_vectors = char_vectors.float()
#         print(word_vectors.size())
#         print(char_vectors.size())
        K = torch.stack([self.K_FCN(word_vectors),self.K_FCN(char_vectors)],dim = 2)
        Q = torch.stack([self.Q_FCN(word_vectors),self.Q_FCN(char_vectors)],dim = 2)
        V = torch.stack([self.V_FCN(word_vectors),self.V_FCN(char_vectors)],dim = 2)
        all_output_list = []
        for word_ind in range(word_size):
            all_output_list.append(self.AttLayer(Q[:,word_ind,:,:], \
            K[:,word_ind,:,:], V[:,word_ind,:,:])[0].view(batch_size,-1))

        return torch.stack(all_output_list,dim = 1)
    
class over_all_NER(nn.Module):
    def __init__(self, Batch_size: '(int)',\
                 size_of_embedding: '(int) size of each word embedding vector',\
                 max_len_char: '(int) see overall_char_embedding',\
                 num_conv_filters: '(list[int]) see in overall_char_embedding', \
                 use_BN: '(bool) see in overall_char_embedding', \
                 use_activation: '(bool) see in overall_char_embedding', \
                 num_conv_input_channel: '(int) see in overall_char_embedding', \
                 kernel_sizes: '(list[tuple[int, int]]) see in overall_char_embedding', \
                 use_same_padding: '(bool) see in overall_char_embedding', \
                 num_words: '(int) see in overall_char_embedding', \
                 num_char_encoding_size: '(int) see in overall_char_embedding', \
                 att_hidden_size: '(int) see in AttentionBetweenWordsAndChars', \
                 num_input_features: '(int) see in gru_crf', gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf'):
        super().__init__()
        self.char_embed = overall_char_embedding((Batch_size,size_of_embedding), max_len_char, num_conv_filters, \
                                                 use_BN, use_activation, num_conv_input_channel, kernel_sizes, \
                                                 use_same_padding, num_words, num_char_encoding_size)
        self.my_attention = AttentionBetweenWordsAndChars(att_hidden_size, size_of_embedding)
        self.gru_crf_layer = gru_crf(num_input_features, gru_hidden_size, num_words, dropout_gru, \
                                bidirectional, tags)
        self.Batch_size = Batch_size
    def forward(self, x):
        tmp_compute = self.char_embed(x[0])
        tmp_att = self.my_attention(tmp_compute, x[1])
        tmp_gru_crf = self.gru_crf_layer(tmp_att, x[2], x[3].long())
        return tmp_gru_crf#/self.Batch_size
    def predict(self, x):
        tmp_compute = self.char_embed(x[0])
        tmp_att = self.my_attention(tmp_compute, x[1])
        tmp_tags = self.gru_crf_layer.predict(tmp_att, x[3].long())
        return tmp_tags

def get_indices_random_train_test_split(dataset_size:'(int) number of rows', random_seed: '(int)',\
                                        validation_split: '(double)', shuffle_dataset: '(bool)'):
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    if shuffle_dataset :
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    return train_sampler, valid_sampler
  
def get_indices_random_val_test_split(dataset_size:'(int) number of rows', random_seed: '(int)',\
                                        validation_split: '(double)', shuffle_dataset: '(bool)'):
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    if shuffle_dataset :
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    test_indices, val_indices = indices[split: 2*split], indices[:split]
    # Creating PT data samplers and loaders:
    test_sampler = SubsetRandomSampler(test_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    return test_sampler, valid_sampler

def eval_score(tags: '(dict[int: str])', pred: '(list[(list, float)])', label: 'torch.tensor'):
    pred = np.array([np.array(i[0]) for i in pred])
    label = label.cpu().numpy().astype('int8')
    label = [label[i][:len(pred[i])] for i in range(len(pred))]
    conf_mat = np.zeros((len(tags), len(tags)))
#     print(len(label))
#     print(len(pred))
#     print('---------------')
    for i in range(len(label)):
#         print(len(label[i]))
#         print(len(pred[i]))
        conf_mat += confusion_matrix(label[i],pred[i],range(len(tags)))
    performance_mat = np.zeros((len(tags), 3))#recall, precision, f1-score
    for i in range(len(tags)):
        if np.sum(conf_mat[i]) == 0:
            performance_mat[i][0] = 0
        else:
            performance_mat[i][0] = conf_mat[i][i]/np.sum(conf_mat[i])
        if np.sum(conf_mat[:,i]) == 0:
            performance_mat[i][1] = 0
        else:
            performance_mat[i][1] = conf_mat[i][i]/np.sum(conf_mat[:,i])
        if performance_mat[i][1]+performance_mat[i][0] == 0:
            performance_mat[i][2] = 0
        else:
            performance_mat[i][2] = (2*performance_mat[i][0]*performance_mat[i][1])/(performance_mat[i][1]+performance_mat[i][0])
    return performance_mat, conf_mat[:,:-1]

class CNN_GRU_char_pos(nn.Module):
    def __init__(self, Batch_size: '(int)',\
                 max_num_char: '(int)',\
                 nums_filter: '(list[int] see in overall_char_embedding)',\
                 use_BN: '(bool) only for CNNchar',\
                 activation_func: '(bool) only for CNNchar',\
                 input_channel: '(int) see in My2DConv',\
                 kernel_sizes: '(list[int]) list of size of kernels used, and they will be computed concurrently',\
                 same_padding: '(bool) same padding for CNNchar',\
                 num_char_encoding_size: '(int) size of each char embedding vector',\
                 output_size: '(int) output dimension of CNNchar',\
                 num_words: '(int) see in overall_char_embedding', \
                 gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', \
                 bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf', \
                 DO_FCN_GRUCRF: '(double)', \
                 pos_size: '(int) size of pos embedding', \
                 FCN: '(bool) see overall_char_embedding', \
                 drop_weight
                 ):
        super().__init__()
        if not FCN:
            output_size = num_char_encoding_size
        #print(f'output_size: {output_size}')
        self.overall_char_embedding = overall_char_embedding((Batch_size, output_size), max_num_char, \
                                                             nums_filter, use_BN, activation_func, \
                                                             input_channel, kernel_sizes, same_padding, \
                                                             num_words, num_char_encoding_size, FCN)

        self.gru_crf_layer = gru_crf(output_size + pos_size, \
                                     gru_hidden_size, num_words, dropout_gru, bidirectional, tags, \
                                     DO_FCN_GRUCRF, drop_weight)
    def forward(self, x):
        tmp_compute = self.overall_char_embedding(x[0])
        #print(tmp_compute.size())
        #print(x[1].size())
        tmp_compute = torch.cat([tmp_compute, x[5]], 2)
        #print(tmp_compute.size())
        tmp_gru_crf = self.gru_crf_layer((tmp_compute, x[4]), x[2], x[3].long())
        return tmp_gru_crf
    def predict(self, x):
        tmp_compute = self.overall_char_embedding(x[0])
        tmp_compute = torch.cat([tmp_compute, x[5]], 2)
        tmp_gru_crf = self.gru_crf_layer.predict((tmp_compute, x[4]), x[3].long())
        return tmp_gru_crf

def plot_grad_flow(named_parameters):
    ave_grads = []
    layers = []
    for ind, tmp in enumerate(named_parameters):
        n, p= tmp
        if(p.requires_grad) and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(xmin=0, xmax=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.show()

# Training

In [None]:
BS = 4
tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
scheduler_n = 2000
word_length = 2000
early_stop_n = 100
max_size_char = 6
num_search = 1000
filename = 'aaaaaaaaa'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

data = MyDataloader('../clean84withpos.txt', '../label84withpos.txt', RULEs, \
                    word_length, '|', 'char_vec_dictionary.txt',max_size_char, \
                    '../fasttext.th.vec', 300, device, '../pos_tag84withpos.txt',POSMAP)

tr, te = get_indices_random_val_test_split(len(data), 1, 0.0005, True)
train_loader = DataLoader(data, batch_size=BS, sampler=tr)
test_loader = DataLoader(data, batch_size=BS, sampler=te)

# NER = over_all_NER(BS,300, max_size_char, num_kernels,True,True,1,kernel_sizes,\
#                    True,word_length,135,attention_in, attention_out, gru_hidden_size, \
#                    gru_dropout, True, tags)
#####
# Batch_size: '(int)',\
#                  num_char_vec_features: '(int)',\
#                  hidden_size: '(int)',\
#                  max_num_char: '(int)',\
#                  dropout_gru_char: '(double)',\
#                  bidirectional_char: '(bool)',\
#                  output_char_embed_size: '(int)',\
#                  size_of_embedding: '(int) size of each word embedding vector',\
#                  num_words: '(int) see in overall_char_embedding', \
#                  gru_hidden_size: '(int) see in gru_crf', \
#                  dropout_gru: '(double) see in gru_crf', \
#                  bidirectional: '(bool)', \
#                  tags: '(dict[int: str]) see in gru_crf', DO_FCN_GRUCRF: '(double)', DOchar_FCN: '(double)')
#####
for cur_ind in range(num_search):
    torch.cuda.empty_cache()
    grucrf_dropout = random.uniform(0.2,0.5)#0.5
    DO_FCN_GRUCRF = random.uniform(0.2,0.5)#0.5
    grucrf_hidden_size = 128#random.choice([128, 256])#5
    gru_weight_dropout = random.uniform(0.1,0.4)
    LR = 5*10**random.uniform(-3,-5)#0.001
    
    print(f'cur_ind: {cur_ind}. gru_weight_dropout: {gru_weight_dropout}')
    print(f'grucrf_dropout: {grucrf_dropout}, DO_FCN_GRUCRF: {DO_FCN_GRUCRF}')
    print(f'grucrf_hidden_size: {grucrf_hidden_size}, LR: {LR}')

    NER = CNN_GRU_CRF(BS, max_size_char, [1], True, True, 1, \
                      [3], True, 135, 135,\
                      300, word_length, grucrf_hidden_size, grucrf_dropout, \
                      True, tags, DO_FCN_GRUCRF, len(POSMAP), False)

    optimizer = optim.Adam(NER.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=True)
    my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

    print(device)
    NER.to(device)
    best_score = 0
    best_mat = np.zeros((len(tags)-1,3))
    cnt_idle = 0
    for epoch in range(10):
        print(f'epoch {epoch}')
        all_loss = []
        for ind, batch_x in enumerate(train_loader):
            if ind%5 == 0:
                print(ind)
            t2 = time()
            NER = NER.train()
            print(time() - t2)
            NER.zero_grad()
            t1 = time()
            loss = NER(batch_x)
            loss = loss*(-1)
            print(f'time per batch: {time() - t1}')
            print(loss)
            all_loss.append(loss)
            loss.backward()
            nn.utils.clip_grad_value_(NER.parameters(), 10)
            optimizer.step()
        total_loss = sum(all_loss)/(ind + 1)
        my_scheduler.step(total_loss)
        print(f'total loss of epoch: {total_loss.item()}')
        print('testing')
        per_mat = np.zeros((len(tags), 3))
        for ind, batch_test in enumerate(test_loader):
            NER = NER.eval()
            output = NER.predict(batch_test)
            per_mat += eval_score(tags, output, batch_test[2])
        per_mat = per_mat/(ind+1)
        per_mat = per_mat[:len(tags),:]
        print(per_mat)
        score = sum(per_mat[:,2])/(len(tags)-1)
        if best_score < score:
            best_mat=per_mat
            best_score = score
            cnt_idle = 0
        else:
            cnt_idle += 1
        print(f'overall score: {score}')
        print('--------------------')
        if early_stop_n == cnt_idle:
            break
    break

    print(f'best_score: {best_score}\n')
    #print(f'best_mat\n')
    print(f'I => recall: {best_mat[0,0]}, precision: {best_mat[0,1]}, , f1: {best_mat[0,2]}\n')
    print(f'B => recall: {best_mat[1,0]}, precision: {best_mat[1,1]}, , f1: {best_mat[1,2]}\n')
    print(f'O => recall: {best_mat[2,0]}, precision: {best_mat[2,1]}, , f1: {best_mat[2,2]}\n')
    #print(f'best_mat\n')
    print(f'----------------------------------\n')

cpu
cur_ind: 0. gru_weight_dropout: 0.20635630823301318
grucrf_dropout: 0.24840664118118352, DO_FCN_GRUCRF: 0.3266306490218036
grucrf_hidden_size: 128, LR: 0.0002994937337779445
cpu
epoch 0
0
0.00018978118896484375


In [9]:
BS = 2
tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
scheduler_n = 50
word_length = 84
early_stop_n = 5
max_size_char = 20
num_search = 100
nums_filter = [1]
use_BN = True
activation_func = True
input_channel = 1
kernel_sizes = [3]
same_padding = True
num_char_encoding_size = 135
output_size = 64
size_of_embedding = 300
pos_size = len(POSMAP)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

data = MyDataloader('../clean84withpos.txt', '../label84withpos.txt', RULEs, \
                    word_length, '|', 'char_vec_dictionary.txt',max_size_char, \
                    '../fasttext.th.vec', 300, device, '../pos_tag84withpos.txt',POSMAP)


tr, te = get_indices_random_val_test_split(len(data), 1, 0.00005, True)
train_loader = DataLoader(data, batch_size=BS, sampler=tr)
test_loader = DataLoader(data, batch_size=BS, sampler=te)

# NER = over_all_NER(BS,300, max_size_char, num_kernels,True,True,1,kernel_sizes,\
#                    True,word_length,135,attention_in, attention_out, gru_hidden_size, \
#                    gru_dropout, True, tags)
#####
# Batch_size: '(int)',\
#                  num_char_vec_features: '(int)',\
#                  hidden_size: '(int)',\
#                  max_num_char: '(int)',\
#                  dropout_gru_char: '(double)',\
#                  bidirectional_char: '(bool)',\
#                  output_char_embed_size: '(int)',\
#                  size_of_embedding: '(int) size of each word embedding vector',\
#                  num_words: '(int) see in overall_char_embedding', \
#                  gru_hidden_size: '(int) see in gru_crf', \
#                  dropout_gru: '(double) see in gru_crf', \
#                  bidirectional: '(bool)', \
#                  tags: '(dict[int: str]) see in gru_crf', DO_FCN_GRUCRF: '(double)', DOchar_FCN: '(double)')
#####
for cur_ind in range(num_search):
    torch.cuda.empty_cache()
    #grucrf_dropout = random.uniform(0.3,0.7)#0.5
    DO_FCN_LSTMCRF = random.uniform(0.3,0.7)#0.5
    
    dropouti = 0#random.uniform(0.1,0.7)
    dropouto = 0#random.uniform(0.1,0.7)
    dropoutw = random.uniform(0.1,0.7)
    
    lstmcrf_hidden_size = random.choice([128])#5
    LR = 5*10**random.uniform(-3,-5)#0.001
    print(f'LE: {LR}')
    print(f'dropouti: {dropouti}, DO_FCN_LSTMCRF: {DO_FCN_LSTMCRF}')
    print(f'lstmcrf_hidden_size: {lstmcrf_hidden_size}, dropouti: {dropouti}')
    print(f'dropouto: {dropouto}, dropoutw: {dropoutw}')

    
    NER = CNN_LSTM_CRF(BS, max_size_char, nums_filter, use_BN, activation_func, input_channel, \
                       kernel_sizes, same_padding, num_char_encoding_size, output_size,\
                       size_of_embedding, word_length, lstmcrf_hidden_size, \
                       True, tags, DO_FCN_LSTMCRF, pos_size, False, dropouti=dropouti, \
                       dropouto=dropouto, dropoutw=dropoutw)
#self, Batch_size, max_num_char, nums_filter, use_BN, activation_func, input_channel, 
#kernel_sizes, same_padding, num_char_encoding_size, output_size, size_of_embedding, 
#num_words, gru_hidden_size, dropout_gru, bidirectional, tags, DO_FCN_GRUCRF, pos_size, FCN, dropouti, 
#dropoutw, dropouto

    optimizer = optim.Adam(NER.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=True)
    my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

    print(device)
    NER.to(device)
    best_score = 0
    best_mat = np.zeros((len(tags)-1,3))
    cnt_idle = 0
    for epoch in range(6):
        print(f'epoch {epoch}')
        all_loss = []
        for ind, batch_x in enumerate(train_loader):
            if ind%5 == 0:
                print(ind)
            t2 = time()
            print('------------train--------------------')
            NER = NER.train()
            print(time() - t2)
            NER.zero_grad()
            t1 = time()
            loss = NER(batch_x)
            output = NER.predict(batch_x)
#             for i in range(len(output)):
#                 print(batch_x[2][i])
#                 print(output[i])
            loss = loss*(-1)
            print(f'time per batch: {time() - t1}')
            print(loss)
            all_loss.append(loss)
            loss.backward()
            nn.utils.clip_grad_value_(NER.parameters(), 1)
            plot_grad_flow(NER.named_parameters())
            optimizer.step()
        total_loss = sum(all_loss)/(ind + 1)
        my_scheduler.step(total_loss)
        print(f'total loss of epoch: {total_loss.item()}')
        print('testing')
        per_mat = np.zeros((len(tags), 3))
        for ind, batch_test in enumerate(test_loader):
            NER = NER.eval()
            output = NER.predict(batch_test)
            for i in range(len(output)):
                print(batch_test[2][i])
                print(output[i])
            per_mat += eval_score(tags, output, batch_test[2])
        per_mat = per_mat/(ind+1)
        per_mat = per_mat[:len(tags),:]
        print(per_mat)
        score = sum(per_mat[:,2])/(len(tags)-1)
        if best_score < score:
            best_mat=per_mat
            best_score = score
            cnt_idle = 0
        else:
            cnt_idle += 1
        print(f'overall score: {score}')
        print('--------------------')
        if early_stop_n == cnt_idle:
            break

cpu
LE: 0.0019529859275205712
dropouti: 0, DO_FCN_LSTMCRF: 0.41411433791047636
lstmcrf_hidden_size: 128, dropouti: 0
dropouto: 0, dropoutw: 0.3811934675043389


NameError: name 'CNN_LSTM_CRF' is not defined

# Experiment1 word with POS only

In [9]:
stop_sign = False
time_list1 = []
BS = 32
tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
scheduler_n = 1000
word_length = 84
early_stop_n = 1000
size_char = 6
num_search = 1000
num_epoch = 30
pos_size = len(POSMAP)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
data_tr = MyDataloader('../Data/clean84withpos_ne_tr0.txt', '../Data/label84withpos_ne_tr0.txt',\
                               RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
                               '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_tr0.txt',POSMAP)
data_te = MyDataloader('../Data/clean84withpos_ne_te0.txt', '../Data/label84withpos_ne_te0.txt', \
                       RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
                       '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_te0.txt',POSMAP)

#         train_loader = DataLoader(data_tr, batch_size=BS, shuffle= True)
#         test_loader = DataLoader(data_te, batch_size=BS, shuffle= True)
tr, te = get_indices_random_val_test_split(len(data_tr), 1, 0.0015, True)
train_loader = DataLoader(data_tr, batch_size=BS, sampler=tr)
test_loader = DataLoader(data_tr, batch_size=BS, sampler=te)
BS = 8
for IND in range(2):
    #BS = 8
    tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
    scheduler_n = 1000
    word_length = 84
    early_stop_n = 1000
    max_size_char = 6
    num_search = 1000
    num_epoch = 30
    pos_size = len(POSMAP)
    
    
    
#     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#     print(device)

#     data_tr = MyDataloader('../Data/clean84withpos_ne_tr'+ str(IND) +'.txt', '../Data/label84withpos_ne_tr'+ str(IND) +'.txt',\
#                            RULEs, word_length, '|', 'char_vec_dictionary.txt',max_size_char, \
#                            '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_tr'+ str(IND) +'.txt',POSMAP)
#     data_te = MyDataloader('../Data/clean84withpos_ne_te'+ str(IND) +'.txt', '../Data/label84withpos_ne_te'+ str(IND) +'.txt', \
#                            RULEs, word_length, '|', 'char_vec_dictionary.txt',max_size_char, \
#                            '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_te'+ str(IND) +'.txt',POSMAP)

#     train_loader = DataLoader(data_tr, batch_size=BS, shuffle= True)
#     test_loader = DataLoader(data_te, batch_size=BS, shuffle= True)
    torch.cuda.empty_cache()
    
    grucrf_dropout = [0, 0.15, 0.30, 0.45, 0.60]#random.uniform(0.2,0.5)#0.5#random.uniform(0.2,0.5)#0.5
    total_search = len(grucrf_dropout)*2
    for i in grucrf_dropout:
        grucrf_hidden_size = 128#random.choice([128])#5
        LR = 10**(-4)#**random.uniform(-4,-5)#0.001
        print(f'lstmcrf_dropout = DO_FCN_LSTMCRF: {i}')
        print(f'lstmcrf_hidden_size: {grucrf_hidden_size}, LR: {LR}')

#         NER = CNN_GRU_word_pos(BS, 300, word_length, grucrf_hidden_size, i, True, tags, \
#                                i, pos_size, 0.5)
        NER = CNN_GRU_word_pos(BS, 300, word_length, grucrf_hidden_size, 0, True, tags, \
                               0, pos_size, 0)

        optimizer = optim.Adam(NER.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=True)
        my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

        print(device)
        NER.to(device)
        best_score = 0
        best_mat = np.zeros((len(tags)-1,3))
        cnt_idle = 0
        for epoch in range(num_epoch):
            ttt = time()
            print(f'epoch {epoch}')
            all_loss = []
            for ind, batch_x in enumerate(train_loader):
                ttt2 = time()
                print(f'progress: {(100*(grucrf_dropout.index(i)+1)*(IND+1))/total_search}')
                if ind%5 == 0:
                    print(ind)
                t2 = time()
                NER = NER.train()
                print(time() - t2)
                NER.zero_grad()
                t1 = time()
                loss = NER(batch_x)
                loss = loss*(-1)
                print(f'time per batch: {time() - t1}')
                print(loss)
                all_loss.append(loss)
                loss.backward()
                nn.utils.clip_grad_norm_(NER.parameters(), 5, norm_type=2)
                optimizer.step()
                time_list1.append(time()-ttt2)
            total_loss = sum(all_loss)/(ind + 1)
            my_scheduler.step(total_loss)
            print(f'total loss of epoch: {total_loss.item()}')
            print('testing')
            per_mat = np.zeros((len(tags), 3))
            cnt_mat = np.zeros((len(tags), 3))
            for ind, batch_test in enumerate(test_loader):
                NER = NER.eval()
                output = NER.predict(batch_test)
                a, b= eval_score(tags, output, batch_test[2])
                per_mat += a
                cnt_mat += b
            per_mat = per_mat/(ind+1)
            per_mat = per_mat[:len(tags),:]
            cnt_mat = cnt_mat[:len(tags),:]
            print(cnt_mat)
            print(per_mat)
            score = sum(per_mat[:,2])/(len(tags)-1)
            if best_score < score:
                best_mat=per_mat
                best_score = score
                cnt_idle = 0
            else:
                cnt_idle += 1
            print(f'overall score: {score}')
            print('--------------------')
            if early_stop_n == cnt_idle:
                break
            print(f'total epoch time: {ttt-time()}')
        print(f'best_score: {best_score}\n')
        print(f'I => recall: {best_mat[0,0]}, precision: {best_mat[0,1]}, , f1: {best_mat[0,2]}\n')
        print(f'B => recall: {best_mat[1,0]}, precision: {best_mat[1,1]}, , f1: {best_mat[1,2]}\n')
        print(f'O => recall: {best_mat[2,0]}, precision: {best_mat[2,1]}, , f1: {best_mat[2,2]}\n')
        print(f'----------------------------------\n')
        break
    break
print('end!!!')

cpu
lstmcrf_dropout = DO_FCN_LSTMCRF: 0
lstmcrf_hidden_size: 128, LR: 0.0001
cpu
epoch 0
progress: 10.0
0
8.416175842285156e-05
time per batch: 11.097868204116821
tensor(961.2801, grad_fn=<MulBackward>)
progress: 10.0
0.0001049041748046875
time per batch: 9.962333917617798
tensor(953.5808, grad_fn=<MulBackward>)
progress: 10.0
0.00015616416931152344
time per batch: 5.370298147201538
tensor(379.1391, grad_fn=<MulBackward>)
total loss of epoch: 764.6666870117188
testing
[[0.000e+00 1.000e+00 0.000e+00]
 [1.230e+02 7.400e+01 0.000e+00]
 [1.164e+03 2.310e+02 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00]]
[[0.         0.         0.        ]
 [0.39222793 0.28005128 0.32374882]
 [0.         0.         0.        ]
 [0.         0.         0.        ]]
overall score: 0.10791627233758282
--------------------
total epoch time: -145.08972597122192
epoch 1
progress: 10.0
0
9.322166442871094e-05
time per batch: 7.407656192779541
tensor(574.2061, grad_fn=<MulBackward>)
progress: 10.0
9.703636169433594e-

progress: 10.0
0
0.00010180473327636719
time per batch: 6.081189870834351
tensor(573.8969, grad_fn=<MulBackward>)
progress: 10.0
8.296966552734375e-05
time per batch: 6.310005187988281
tensor(620.9992, grad_fn=<MulBackward>)
progress: 10.0
9.012222290039062e-05
time per batch: 3.8401989936828613
tensor(301.7004, grad_fn=<MulBackward>)
total loss of epoch: 498.865478515625
testing
[[0.000e+00 0.000e+00 1.000e+00]
 [0.000e+00 6.800e+01 1.290e+02]
 [0.000e+00 3.230e+02 1.072e+03]
 [0.000e+00 0.000e+00 0.000e+00]]
[[0.         0.         0.        ]
 [0.33433433 0.17268224 0.22663641]
 [0.7647567  0.88254162 0.8193834 ]
 [0.         0.         0.        ]]
overall score: 0.3486732691214918
--------------------
total epoch time: -121.03786206245422
epoch 12
progress: 10.0
0
0.00010275840759277344
time per batch: 6.103687047958374
tensor(528.2590, grad_fn=<MulBackward>)
progress: 10.0
9.298324584960938e-05
time per batch: 7.309694766998291
tensor(638.6710, grad_fn=<MulBackward>)
progress: 10

progress: 10.0
0
9.202957153320312e-05
time per batch: 6.074416875839233
tensor(575.2055, grad_fn=<MulBackward>)
progress: 10.0
7.82012939453125e-05
time per batch: 6.412566900253296
tensor(622.0676, grad_fn=<MulBackward>)
progress: 10.0
9.608268737792969e-05
time per batch: 3.559398889541626
tensor(275.6323, grad_fn=<MulBackward>)
total loss of epoch: 490.9684753417969
testing
[[0.000e+00 0.000e+00 1.000e+00]
 [0.000e+00 4.100e+01 1.560e+02]
 [0.000e+00 1.810e+02 1.214e+03]
 [0.000e+00 0.000e+00 0.000e+00]]
[[0.         0.         0.        ]
 [0.202905   0.17827807 0.18896552]
 [0.86933397 0.8853206  0.87717994]
 [0.         0.         0.        ]]
overall score: 0.35538181750455117
--------------------
total epoch time: -120.38819813728333
epoch 23
progress: 10.0
0
7.677078247070312e-05
time per batch: 6.097383737564087
tensor(643.4900, grad_fn=<MulBackward>)
progress: 10.0
9.775161743164062e-05
time per batch: 5.997034072875977
tensor(597.4818, grad_fn=<MulBackward>)
progress: 10.0

In [10]:
print(b)
print(cnt_mat)

[[  0.   0.   0.]
 [  0.   6.  29.]
 [  0.  37. 253.]
 [  0.   0.   0.]]
[[0.000e+00 0.000e+00 1.000e+00]
 [0.000e+00 4.100e+01 1.560e+02]
 [0.000e+00 1.780e+02 1.217e+03]
 [0.000e+00 0.000e+00 0.000e+00]]


# Experiment2 char with POS

In [11]:
num_epoch = 30
time_list2 = []

for IND in range(2):
    
    #BS = 8
    tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
    scheduler_n = 10002
    word_length = 84
    early_stop_n = 10003
    max_size_char = [6]#[5, 10, 20]
    nums_filter = [1]
    use_BN = True
    activation_func = True
    input_channel = 1
    kernel_sizes = [3]
    same_padding = True
    num_char_encoding_size = 135
    output_size = 64
    size_of_embedding = 300
    pos_size = len(POSMAP)
    FCN = False
    grucrf_dropout = [0.6]#[0, 0.15, 0.30, 0.45, 0.60]
    total_search = len(max_size_char)*len(grucrf_dropout)*2
    for size_char in max_size_char:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(device)

#         data_tr = MyDataloader('../Data/clean84withpos_ne_tr'+ str(IND) +'.txt', '../Data/label84withpos_ne_tr'+ str(IND) +'.txt',\
#                                RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
#                                '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_tr'+ str(IND) +'.txt',POSMAP)
#         data_te = MyDataloader('../Data/clean84withpos_ne_te'+ str(IND) +'.txt', '../Data/label84withpos_ne_te'+ str(IND) +'.txt', \
#                                RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
#                                '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_te'+ str(IND) +'.txt',POSMAP)

# #         train_loader = DataLoader(data_tr, batch_size=BS, shuffle= True)
# #         test_loader = DataLoader(data_te, batch_size=BS, shuffle= True)
#         tr, te = get_indices_random_val_test_split(len(data_tr), 1, 0.0005, True)
#         train_loader = DataLoader(data_tr, batch_size=BS, sampler=tr)
#         test_loader = DataLoader(data_tr, batch_size=BS, sampler=te)
        
        torch.cuda.empty_cache()
        for i in grucrf_dropout:
            
            grucrf_hidden_size = 128
            LR = 10**(-3)
            print(f'lstmcrf_dropout = DO_FCN_LSTMCRF: {i}')
            print(f'lstmcrf_hidden_size: {grucrf_hidden_size}, LR: {LR}')
            NER = CNN_GRU_char_pos(BS, size_char, nums_filter, use_BN, activation_func, input_channel, \
                 kernel_sizes, same_padding, num_char_encoding_size, output_size, word_length, grucrf_hidden_size, \
                 0, True, tags, 0, pos_size, FCN, 0)
            
            class over_all_NER2( BS, 135, word_length, size_char, 0, True, 5, size_of_embedding:, num_words: '(int) see in overall_char_embedding', \
                 gru_hidden_size: '(int) see in gru_crf', \
                 dropout_gru: '(double) see in gru_crf', \
                 bidirectional: '(bool)', \
                 tags: '(dict[int: str]) see in gru_crf', DO_FCN_GRUCRF: '(double)', DOchar_FCN: '(double)',\
                 pos_size: '(int) size of pos embedding'):
#             NER = CNN_GRU_char_pos(BS, size_char, nums_filter, use_BN, activation_func, input_channel, \
#                  kernel_sizes, same_padding, num_char_encoding_size, output_size, word_length, grucrf_hidden_size, \
#                  i, True, tags, i, pos_size, FCN, 0.5)

            optimizer = optim.Adam(NER.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=True)
            my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

            print(device)
            NER.to(device)
            best_score = 0
            best_mat = np.zeros((len(tags)-1,3))
            cnt_idle = 0
            for epoch in range(num_epoch):
                ttt = time()
                print(f'epoch {epoch}')
                all_loss = []
                for ind, batch_x in enumerate(train_loader):
                    ttt2 = time()
                    print(f'progress: {(100*(grucrf_dropout.index(i)+1)*(max_size_char.index(size_char)+1)*(IND+1))/total_search}')
                    if ind%5 == 0:
                        print(ind)
                    t2 = time()
                    NER = NER.train()
                    print(time() - t2)
                    NER.zero_grad()
                    t1 = time()
                    loss = NER(batch_x)
                    loss = loss*(-1)
                    print(f'time per batch: {time() - t1}')
                    print(loss)
                    all_loss.append(loss)
                    loss.backward()
                    #nn.utils.clip_grad_norm_(NER.parameters(), 5, norm_type=2)
                    optimizer.step()
                    time_list2.append(time()-ttt2)
                total_loss = sum(all_loss)/(ind + 1)
                my_scheduler.step(total_loss)
                
                print(f'total loss of epoch: {total_loss.item()}')
                print('testing')
                per_mat = np.zeros((len(tags), 3))
                cnt_mat = np.zeros((len(tags), 3))
                for ind, batch_test in enumerate(test_loader):
                    NER = NER.eval()
                    output = NER.predict(batch_test)
                    a, b = eval_score(tags, output, batch_test[2])
                    per_mat += a
                    cnt_mat += b
                per_mat = per_mat/(ind+1)
                per_mat = per_mat[:len(tags),:]
                cnt_mat = cnt_mat[:len(tags),:]
                print(cnt_mat)
                print(per_mat)
                score = sum(per_mat[:,2])/(len(tags)-1)
                if best_score < score:
                    best_mat=per_mat
                    best_score = score
                    cnt_idle = 0
                else:
                    cnt_idle += 1
                print(f'overall score: {score}')
                print('--------------------')
                if early_stop_n == cnt_idle:
                    break
                print(f'total epoch time: {ttt-time()}')
            break
            print(f'best_score: {best_score}\n')
            print(f'I => recall: {best_mat[0,0]}, precision: {best_mat[0,1]}, , f1: {best_mat[0,2]}\n')
            print(f'B => recall: {best_mat[1,0]}, precision: {best_mat[1,1]}, , f1: {best_mat[1,2]}\n')
            print(f'O => recall: {best_mat[2,0]}, precision: {best_mat[2,1]}, , f1: {best_mat[2,2]}\n')
            print(f'----------------------------------\n')
        break
    break

cpu
lstmcrf_dropout = DO_FCN_LSTMCRF: 0.6
lstmcrf_hidden_size: 128, LR: 0.001
cpu
epoch 0
progress: 50.0
0
0.00010704994201660156
time per batch: 8.43209195137024
tensor(1379.6573, grad_fn=<MulBackward>)
progress: 50.0
0.0001819133758544922
time per batch: 8.654768943786621
tensor(876.6379, grad_fn=<MulBackward>)
progress: 50.0
0.00040411949157714844
time per batch: 4.156712055206299
tensor(291.5123, grad_fn=<MulBackward>)
total loss of epoch: 849.2692260742188
testing
[[0.00e+00 1.00e+00 0.00e+00]
 [0.00e+00 1.95e+02 2.00e+00]
 [0.00e+00 1.39e+03 5.00e+00]
 [0.00e+00 0.00e+00 0.00e+00]]
[[0.         0.         0.        ]
 [0.98780488 0.12186748 0.2165372 ]
 [0.00353813 0.72222222 0.00703907]
 [0.         0.         0.        ]]
overall score: 0.07452542321587348
--------------------
total epoch time: -132.42664098739624
epoch 1
progress: 50.0
0
0.00011801719665527344
time per batch: 6.686136960983276
tensor(907.5428, grad_fn=<MulBackward>)
progress: 50.0
0.0001342296600341797
time pe

progress: 50.0
0
0.00013589859008789062
time per batch: 7.830029010772705
tensor(748.0881, grad_fn=<MulBackward>)
progress: 50.0
0.00014710426330566406
time per batch: 8.668892860412598
tensor(956.8408, grad_fn=<MulBackward>)
progress: 50.0
0.00014090538024902344
time per batch: 4.933787822723389
tensor(384.9534, grad_fn=<MulBackward>)
total loss of epoch: 696.62744140625
testing
[[  0.   0.   1.]
 [  0.  84. 113.]
 [  0. 790. 605.]
 [  0.   0.   0.]]
[[0.         0.         0.        ]
 [0.4251715  0.10351797 0.16550619]
 [0.43470905 0.83069014 0.57048898]
 [0.         0.         0.        ]]
overall score: 0.24533172438511205
--------------------
total epoch time: -146.51898646354675
epoch 12
progress: 50.0
0
0.0001430511474609375
time per batch: 8.059916019439697
tensor(930.5657, grad_fn=<MulBackward>)
progress: 50.0
0.00013899803161621094
time per batch: 9.062623023986816
tensor(821.2032, grad_fn=<MulBackward>)
progress: 50.0
0.00017833709716796875
time per batch: 4.737653970718384

progress: 50.0
0
0.00012421607971191406
time per batch: 6.147907018661499
tensor(741.3986, grad_fn=<MulBackward>)
progress: 50.0
0.00043892860412597656
time per batch: 7.257084131240845
tensor(939.1605, grad_fn=<MulBackward>)
progress: 50.0
0.00012993812561035156
time per batch: 3.4747719764709473
tensor(280.5001, grad_fn=<MulBackward>)
total loss of epoch: 653.6864013671875
testing
[[  0.   0.   1.]
 [  0.  67. 130.]
 [  0. 565. 830.]
 [  0.   0.   0.]]
[[0.         0.         0.        ]
 [0.33374486 0.10537539 0.15996578]
 [0.59287982 0.86049757 0.70182819]
 [0.         0.         0.        ]]
overall score: 0.28726465583729754
--------------------
total epoch time: -126.87371873855591
epoch 24
progress: 50.0
0
0.000141143798828125
time per batch: 6.237536191940308
tensor(793.9743, grad_fn=<MulBackward>)
progress: 50.0
0.00017690658569335938
time per batch: 7.7300169467926025
tensor(845.2818, grad_fn=<MulBackward>)
progress: 50.0
0.00011992454528808594
time per batch: 3.592409133911

# Experiment3 char, word, POS

In [12]:
num_epoch = 30
time_list3 = []
for IND in range(2):
    #BS = 8
    tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
    scheduler_n = 100002
    word_length = 84
    early_stop_n = 100003
    max_size_char = [6]#[5, 10, 20]
    nums_filter = [1]
    use_BN = True
    activation_func = True
    input_channel = 1
    kernel_sizes = [3]
    same_padding = True
    num_char_encoding_size = 135
    output_size = 64
    size_of_embedding = 300
    pos_size = len(POSMAP)
    FCN = False
    grucrf_dropout = [0, 0.15, 0.30, 0.45, 0.60]
    total_search = len(max_size_char)*len(grucrf_dropout)*2
    for size_char in max_size_char:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(device)

#         data_tr = MyDataloader('../Data/clean84withpos_ne_tr'+ str(IND) +'.txt', '../Data/label84withpos_ne_tr'+ str(IND) +'.txt',\
#                                RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
#                                '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_tr'+ str(IND) +'.txt',POSMAP)
#         data_te = MyDataloader('../Data/clean84withpos_ne_te'+ str(IND) +'.txt', '../Data/label84withpos_ne_te'+ str(IND) +'.txt', \
#                                RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
#                                '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_te'+ str(IND) +'.txt',POSMAP)

#         train_loader = DataLoader(data_tr, batch_size=BS, shuffle= True)
#         test_loader = DataLoader(data_te, batch_size=BS, shuffle= True)

        torch.cuda.empty_cache()
        for i in grucrf_dropout:
            grucrf_hidden_size = 128
            LR = 10**(-4)
            print(f'lstmcrf_dropout = DO_FCN_LSTMCRF: {i}')
            print(f'lstmcrf_hidden_size: {grucrf_hidden_size}, LR: {LR}')

#             NER = CNN_GRU_CRF(BS, size_char, nums_filter, use_BN, activation_func, \
#                               input_channel, kernel_sizes, same_padding, num_char_encoding_size, \
#                               output_size, size_of_embedding, word_length, grucrf_hidden_size, i, \
#                               True, tags, i, pos_size, FCN)
            NER = CNN_GRU_CRF(BS, size_char, nums_filter, use_BN, activation_func, \
                              input_channel, kernel_sizes, same_padding, num_char_encoding_size, \
                              output_size, size_of_embedding, word_length, grucrf_hidden_size, 0, \
                              True, tags, 0, pos_size, FCN, 0)

            optimizer = optim.Adam(NER.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=True)
            my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

            print(device)
            NER.to(device)
            best_score = 0
            best_mat = np.zeros((len(tags)-1,3))
            cnt_idle = 0
            for epoch in range(num_epoch):
                ttt = time()
                print(f'epoch {epoch}')
                all_loss = []
                for ind, batch_x in enumerate(train_loader):
                    ttt2 = time()
                    print(f'progress: {(100*(grucrf_dropout.index(i)+1)*(max_size_char.index(size_char)+1)*(IND+1))/total_search}')
                    if ind%5 == 0:
                        print(ind)
                    t2 = time()
                    NER = NER.train()
                    print(time() - t2)
                    NER.zero_grad()
                    t1 = time()
                    loss = NER(batch_x)
                    loss = loss*(-1)
                    print(f'time per batch: {time() - t1}')
                    print(loss)
                    all_loss.append(loss)
                    loss.backward()
                    nn.utils.clip_grad_norm_(NER.parameters(), 5, norm_type=2)
                    optimizer.step()
                    time_list3.append(time()-ttt2)
                total_loss = sum(all_loss)/(ind + 1)
                my_scheduler.step(total_loss)
                
                print(f'total loss of epoch: {total_loss.item()}')
                print('testing')
                per_mat = np.zeros((len(tags), 3))
                cnt_mat = np.zeros((len(tags), 3))
                for ind, batch_test in enumerate(test_loader):
                    NER = NER.eval()
                    output = NER.predict(batch_test)
                    a, b = eval_score(tags, output, batch_test[2])
                    per_mat += a
                    cnt_mat += b
                per_mat = per_mat/(ind+1)
                per_mat = per_mat[:len(tags),:]
                cnt_mat = cnt_mat[:len(tags),:]
                print(cnt_mat)
                print(per_mat)
                score = sum(per_mat[:,2])/(len(tags)-1)
                if best_score < score:
                    best_mat=per_mat
                    best_score = score
                    cnt_idle = 0
                else:
                    cnt_idle += 1
                print(f'overall score: {score}')
                print('--------------------')
                if early_stop_n == cnt_idle:
                    break
                print(f'total epoch time: {ttt-time()}')
            print(f'best_score: {best_score}\n')
            print(f'I => recall: {best_mat[0,0]}, precision: {best_mat[0,1]}, , f1: {best_mat[0,2]}\n')
            print(f'B => recall: {best_mat[1,0]}, precision: {best_mat[1,1]}, , f1: {best_mat[1,2]}\n')
            print(f'O => recall: {best_mat[2,0]}, precision: {best_mat[2,1]}, , f1: {best_mat[2,2]}\n')
            print(f'----------------------------------\n')
            break
        break
    break

cpu
lstmcrf_dropout = DO_FCN_LSTMCRF: 0
lstmcrf_hidden_size: 128, LR: 0.0001
cpu
epoch 0
progress: 10.0
0
0.00011801719665527344
time per batch: 7.582920074462891
tensor(514.3008, grad_fn=<MulBackward>)
progress: 10.0
0.00015115737915039062
time per batch: 9.895946025848389
tensor(517.1383, grad_fn=<MulBackward>)
progress: 10.0
0.00015020370483398438
time per batch: 3.740856170654297
tensor(210.5461, grad_fn=<MulBackward>)
total loss of epoch: 413.9951171875
testing
[[0.00e+00 1.00e+00 0.00e+00]
 [0.00e+00 1.74e+02 2.30e+01]
 [0.00e+00 1.34e+03 5.50e+01]
 [0.00e+00 0.00e+00 0.00e+00]]
[[0.         0.         0.        ]
 [0.86017316 0.10804996 0.19160554]
 [0.03809868 0.68005952 0.07214797]
 [0.         0.         0.        ]]
overall score: 0.08791783595881804
--------------------
total epoch time: -153.87249088287354
epoch 1
progress: 10.0
0
0.00012803077697753906
time per batch: 5.742196798324585
tensor(436.2852, grad_fn=<MulBackward>)
progress: 10.0
0.00013303756713867188
time per 

progress: 10.0
0
0.0001239776611328125
time per batch: 5.758953094482422
tensor(500.7930, grad_fn=<MulBackward>)
progress: 10.0
0.00013399124145507812
time per batch: 5.790640115737915
tensor(462.2381, grad_fn=<MulBackward>)
progress: 10.0
0.00013303756713867188
time per batch: 3.342494010925293
tensor(197.4832, grad_fn=<MulBackward>)
total loss of epoch: 386.8381042480469
testing
[[0.00e+00 1.00e+00 0.00e+00]
 [0.00e+00 1.74e+02 2.30e+01]
 [0.00e+00 1.34e+03 5.50e+01]
 [0.00e+00 0.00e+00 0.00e+00]]
[[0.         0.         0.        ]
 [0.88590073 0.12260971 0.21467757]
 [0.04130592 0.70684524 0.07794573]
 [0.         0.         0.        ]]
overall score: 0.09754109782677522
--------------------
total epoch time: -111.84771919250488
epoch 12
progress: 10.0
0
0.00012373924255371094
time per batch: 5.820000886917114
tensor(397.1657, grad_fn=<MulBackward>)
progress: 10.0
0.00013375282287597656
time per batch: 5.876884937286377
tensor(583.8167, grad_fn=<MulBackward>)
progress: 10.0
0.0001

progress: 10.0
0
0.0001232624053955078
time per batch: 5.766570806503296
tensor(519.6766, grad_fn=<MulBackward>)
progress: 10.0
0.00013113021850585938
time per batch: 5.800692081451416
tensor(416.8517, grad_fn=<MulBackward>)
progress: 10.0
0.00012183189392089844
time per batch: 3.354806900024414
tensor(209.0154, grad_fn=<MulBackward>)
total loss of epoch: 381.847900390625
testing
[[0.000e+00 0.000e+00 1.000e+00]
 [0.000e+00 2.500e+01 1.720e+02]
 [0.000e+00 1.320e+02 1.263e+03]
 [0.000e+00 0.000e+00 0.000e+00]]
[[0.         0.         0.        ]
 [0.12261905 0.15346586 0.13598157]
 [0.9027639  0.87727548 0.88979355]
 [0.         0.         0.        ]]
overall score: 0.34192504116954053
--------------------
total epoch time: -112.88307189941406
epoch 23
progress: 10.0
0
0.000125885009765625
time per batch: 5.784952878952026
tensor(517.9050, grad_fn=<MulBackward>)
progress: 10.0
0.0001239776611328125
time per batch: 5.809537172317505
tensor(404.7404, grad_fn=<MulBackward>)
progress: 10.

In [13]:
print(len(time_list1))
print(len(time_list2))
print(len(time_list3))

90
90
90


In [14]:
os.getcwd()

'/Users/abc/Downloads/min/Codes/LSTM-CRF-NER'

In [15]:
with open('time_exper_BS' + str(BS) +'.txt', 'w', encoding='utf8') as f:
    f.write('exper1, exper2, exper3\n')
    for i in range(len(time_list1)):
        f.write(f'{time_list1[i]}, {time_list2[i]}, {time_list3[i]}\n')

In [43]:
BS = 2
tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
scheduler_n = 1003
word_length = 84
early_stop_n = 10005
max_size_char = 6
same_padding = True
use_BN = True
activation_func = True
input_channel = 1
nums_filter = [1]
output_size = 64
size_of_embedding = 300
pos_size = len(POSMAP)
num_char_encoding_size = 135
FCN = False
file_name = input('enter ur logname: ')
num_epoch = 30
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
data = MyDataloader('../clean84withpos.txt', '../label84withpos.txt', RULEs, \
                           word_length, '|', 'char_vec_dictionary.txt',max_size_char, \
                           '../fasttext.th.vec', 300, device, '../pos_tag84withpos.txt',POSMAP)
tr, te = get_indices_random_val_test_split(len(data), 1, 0.00015, True)

def objective(trial):
    kernel_sizes = [trial.suggest_categorical('kernel_sizes', [3,5])]
    gru_fcn_dropout = trial.suggest_uniform('gru_fcn_dropout', 0, 0.7)
    gru_dropout = trial.suggest_uniform('gru_dropout', 0, 0.7)
    gru_out_dropout = trial.suggest_uniform('gru_out_dropout', 0, 0.7)
    LR = trial.suggest_uniform('LR', 5, 10)*10**(-5)
    grucrf_hidden_size = trial.suggest_categorical('grucrf_hidden_size', [64, 128])
    w_decay = trial.suggest_categorical('w_decay', [-3,-4,-5])
    
    train_loader = DataLoader(data, batch_size=BS, sampler=tr)
    test_loader = DataLoader(data, batch_size=BS, sampler=te)
    
    with open(file_name + '.txt', 'a', encoding='utf8') as f:
        f.write(f'kernel_sizes: {kernel_sizes}, gru_fcn_dropout: {gru_fcn_dropout}\n')
        f.write(f'gru_dropout: {gru_dropout}, gru_out_dropout: {gru_out_dropout}\n')
        f.write(f'LR: {LR}, grucrf_hidden_size: {grucrf_hidden_size}\n')
        f.write(f'w_decay: {w_decay}\n')

    NER = CNN_GRU_char_pos(BS, max_size_char, nums_filter, use_BN, activation_func, input_channel, \
                           kernel_sizes, same_padding, num_char_encoding_size, output_size, word_length, \
                           grucrf_hidden_size, gru_dropout, True, tags, gru_fcn_dropout, pos_size, FCN, \
                           gru_out_dropout)
    optimizer = optim.Adam(NER.parameters(), lr=LR, eps=1e-08, weight_decay=10**w_decay,amsgrad=True)
    my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', verbose=True)
    
    best_score = 0
    
    for epoch in range(num_epoch):
        ttt = time()
        print(f'epoch {epoch}')
        all_loss = []
        for ind, batch_x in enumerate(train_loader):
            if ind%5 == 0:
                print(ind)
            t2 = time()
            NER = NER.train()
            print(time() - t2)
            NER.zero_grad()
            t1 = time()
            loss = NER(batch_x)
            loss = loss*(-1)
            print(f'time per batch: {time() - t1}')
            print(loss)
            all_loss.append(loss)
            loss.backward()
            nn.utils.clip_grad_norm_(NER.parameters(), 5, norm_type=2)
            optimizer.step()
        total_loss = sum(all_loss)/(ind + 1)
        my_scheduler.step(total_loss)
        print(f'total loss of epoch: {total_loss.item()}')
        print('testing')
        per_mat = np.zeros((len(tags), 3))
        cnt_mat = np.zeros((len(tags), 3))
        for ind, batch_test in enumerate(test_loader):
            NER = NER.eval()
            output = NER.predict(batch_test)
            a, b = eval_score(tags, output, batch_test[2])
            per_mat += a
            cnt_mat += b
        per_mat = per_mat/(ind+1)
        per_mat = per_mat[:len(tags),:]
        cnt_mat = cnt_mat[:len(tags),:]
        print(cnt_mat)
        print(per_mat)
        score = sum(per_mat[:,2])/(len(tags)-1)

        with open(file_name + '.txt', 'a', encoding='utf8') as f:
            f.write(f'epoch: {epoch}, score: {score}\n')
        
        if score - best_score >= 0.005:
            best_mat=per_mat
            best_score = score
            cnt_idle = 0
        else:
            cnt_idle += 1
        print(f'overall score: {score}')
        print('--------------------')
        if early_stop_n == cnt_idle:
            break
        print(f'total epoch time: {ttt-time()}')
    with open(file_name + '.txt', 'a', encoding='utf8') as f:
        f.write(f'cnt_mat\n')
        f.write(f'I => : {cnt_mat[0,0]}, : {cnt_mat[0,1]}, : {cnt_mat[0,2]}\n')
        f.write(f'B => : {cnt_mat[1,0]}, : {cnt_mat[1,1]}, : {cnt_mat[1,2]}\n')
        f.write(f'O => : {cnt_mat[2,0]}, : {cnt_mat[2,1]}, : {cnt_mat[2,2]}\n')
        f.write(f'best_score: {best_score}\n')
        f.write(f'I => recall: {best_mat[0,0]}, precision: {best_mat[0,1]}, , f1: {best_mat[0,2]}\n')
        f.write(f'B => recall: {best_mat[1,0]}, precision: {best_mat[1,1]}, , f1: {best_mat[1,2]}\n')
        f.write(f'O => recall: {best_mat[2,0]}, precision: {best_mat[2,1]}, , f1: {best_mat[2,2]}\n')
        f.write(f'----------------------------------\n')
    return best_score

enter ur logname: optuna_test


In [44]:
import optuna
study = optuna.study.load_study(storage='sqlite:///test_optuna.db', study_name='test optuna')
study.optimize(objective, n_trials=50)

epoch 0
0
0.00011491775512695312
time per batch: 2.3418049812316895
tensor(34.8441, grad_fn=<MulBackward>)
0.00012922286987304688
time per batch: 2.1915690898895264
tensor(80.6080, grad_fn=<MulBackward>)
0.0001728534698486328
time per batch: 2.0867130756378174
tensor(33.8390, grad_fn=<MulBackward>)
0.00016689300537109375
time per batch: 2.2237839698791504
tensor(50.1646, grad_fn=<MulBackward>)
0.00018095970153808594
time per batch: 2.2291951179504395
tensor(26.9115, grad_fn=<MulBackward>)
5
0.000164031982421875
time per batch: 2.351447820663452
tensor(11.8826, grad_fn=<MulBackward>)
0.00019788742065429688
time per batch: 2.503523826599121
tensor(17.7553, grad_fn=<MulBackward>)
0.00016617774963378906
time per batch: 2.5306408405303955
tensor(24.1425, grad_fn=<MulBackward>)
0.0001227855682373047
time per batch: 4.957162857055664
tensor(20.1968, grad_fn=<MulBackward>)
0.000164031982421875
time per batch: 2.9302730560302734
tensor(18.5895, grad_fn=<MulBackward>)
10
0.00016498565673828125
t

0.00011992454528808594
time per batch: 1.9130549430847168
tensor(20.9848, grad_fn=<MulBackward>)
0.00017595291137695312
time per batch: 2.2734529972076416
tensor(14.5507, grad_fn=<MulBackward>)
0.0001709461212158203
time per batch: 2.3001480102539062
tensor(46.7139, grad_fn=<MulBackward>)
0.00016999244689941406
time per batch: 2.0149738788604736
tensor(32.0082, grad_fn=<MulBackward>)
10
0.0001690387725830078
time per batch: 1.7409262657165527
tensor(4.3721, grad_fn=<MulBackward>)
total loss of epoch: 25.759477615356445
testing
[[  0.   0.   0.]
 [  0.   6.  22.]
 [  0.  18. 245.]
 [  0.   0.   0.]]
[[0.         0.         0.        ]
 [0.23863636 0.24242424 0.22164502]
 [0.90598753 0.90408587 0.90344813]
 [0.         0.         0.        ]]
overall score: 0.37503105010651
--------------------
total epoch time: -114.85900282859802
epoch 12
0
0.0003199577331542969
time per batch: 1.8766100406646729
tensor(13.1292, grad_fn=<MulBackward>)
0.00018095970153808594
time per batch: 1.9215056896

[I 2019-11-06 22:39:52,879] Finished trial#106 resulted in value: 0.3767177645093474. Current best value is 0.3767177645093474 with parameters: {'LR': 5.826992948964408, 'gru_dropout': 0.6023800703615053, 'gru_fcn_dropout': 0.026658703460356214, 'gru_out_dropout': 0.25442982964747174, 'grucrf_hidden_size': 64, 'kernel_sizes': 5, 'w_decay': -3}.


epoch 0
0
0.00019407272338867188
time per batch: 6.435258865356445
tensor(10.3699, grad_fn=<MulBackward>)
0.000125885009765625
time per batch: 3.1884660720825195
tensor(20.6080, grad_fn=<MulBackward>)
0.00011801719665527344
time per batch: 4.82292103767395
tensor(16.3300, grad_fn=<MulBackward>)
0.00011968612670898438
time per batch: 3.0032050609588623
tensor(17.8857, grad_fn=<MulBackward>)
0.00012183189392089844
time per batch: 3.218485116958618
tensor(64.2073, grad_fn=<MulBackward>)
5
0.0001678466796875
time per batch: 3.7899110317230225
tensor(25.4161, grad_fn=<MulBackward>)
0.00012421607971191406
time per batch: 4.840949058532715
tensor(59.6643, grad_fn=<MulBackward>)
0.0001201629638671875
time per batch: 4.619460105895996
tensor(18.3695, grad_fn=<MulBackward>)
0.00011897087097167969
time per batch: 5.243571043014526
tensor(19.8214, grad_fn=<MulBackward>)
0.0004420280456542969
time per batch: 6.41439414024353
tensor(19.2011, grad_fn=<MulBackward>)
10
0.00015807151794433594
time per 

[I 2019-11-06 23:14:50,977] Finished trial#107 resulted in value: 0.31699187810287116. Current best value is 0.3767177645093474 with parameters: {'LR': 5.826992948964408, 'gru_dropout': 0.6023800703615053, 'gru_fcn_dropout': 0.026658703460356214, 'gru_out_dropout': 0.25442982964747174, 'grucrf_hidden_size': 64, 'kernel_sizes': 5, 'w_decay': -3}.


epoch 0
0
0.00019097328186035156
time per batch: 6.1610801219940186
tensor(26.5541, grad_fn=<MulBackward>)
0.0001900196075439453
time per batch: 4.142956018447876
tensor(17.5993, grad_fn=<MulBackward>)
0.0001201629638671875
time per batch: 3.2134928703308105
tensor(24.0803, grad_fn=<MulBackward>)
0.0001850128173828125
time per batch: 2.6568210124969482
tensor(106.2484, grad_fn=<MulBackward>)
0.0001506805419921875
time per batch: 2.6093101501464844
tensor(24.5067, grad_fn=<MulBackward>)
5
0.00017404556274414062
time per batch: 2.4154341220855713
tensor(31.5961, grad_fn=<MulBackward>)
0.00012111663818359375
time per batch: 2.852065086364746
tensor(15.0602, grad_fn=<MulBackward>)
0.00011801719665527344
time per batch: 2.4248390197753906
tensor(20.3710, grad_fn=<MulBackward>)
0.0004951953887939453
time per batch: 2.7142980098724365
tensor(16.4489, grad_fn=<MulBackward>)
0.00018906593322753906
time per batch: 2.513058662414551
tensor(46.1006, grad_fn=<MulBackward>)
10
0.00018405914306640625

KeyboardInterrupt: 

In [56]:
import sqlite3
cnx = sqlite3.connect('test_optuna.db')

df = pd.read_sql_query("SELECT name FROM * WHERE type='table';", cnx)

DatabaseError: Execution failed on sql 'SELECT name FROM * WHERE type='table';': near "*": syntax error

In [57]:
study = optuna.load_study(storage='sqlite:///test_optuna.db', study_name='test optuna')
df = study.trials_dataframe()

In [60]:
df.tail()

Unnamed: 0_level_0,number,state,value,datetime_start,datetime_complete,params,params,params,params,params,params,params,system_attrs,system_attrs
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,LR,gru_dropout,gru_fcn_dropout,gru_out_dropout,grucrf_hidden_size,kernel_sizes,w_decay,_number,fail_reason
104,104,TrialState.RUNNING,,2019-11-06 21:40:01.802894,NaT,8.99895,0.07583,0.044007,0.506712,64.0,3.0,-5.0,104,
105,105,TrialState.RUNNING,,2019-11-06 21:41:37.515459,NaT,7.578753,0.270617,0.432835,0.123188,128.0,3.0,-5.0,105,
106,106,TrialState.COMPLETE,0.376718,2019-11-06 21:42:43.610794,2019-11-06 22:39:52.725988,5.826993,0.60238,0.026659,0.25443,64.0,5.0,-3.0,106,
107,107,TrialState.COMPLETE,0.316992,2019-11-06 22:39:52.884477,2019-11-06 23:14:50.689517,6.558108,0.163159,0.485613,0.526041,128.0,5.0,-5.0,107,
108,108,TrialState.RUNNING,,2019-11-06 23:14:51.032208,NaT,8.818262,0.182233,0.396082,0.471763,64.0,3.0,-4.0,108,


In [29]:
a = torch.randn((1))
b = torch.randn((1))

In [34]:
if a-b<5:
    print('wrd')

wrd


In [39]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import math
import regex as re
from time import time
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from allennlp.modules.conditional_random_field import ConditionalRandomField
from allennlp.modules.conditional_random_field import allowed_transitions
from torch.utils.data import Dataset, DataLoader
#from torchcrf import CRF
from torch.utils.data.sampler import SubsetRandomSampler
import random

from typing import *

from sklearn.metrics import confusion_matrix

from torch.nn import Parameter
from functools import wraps
from RULE import RULEs
from POSMap import POSMAP
from my_stuff_opt import *

num_trial = 50
cur_trial = 1
BS = 2
tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
scheduler_n = 3
word_length = 84
early_stop_n = 5
max_size_char = 6
same_padding = True
use_BN = True
activation_func = True
input_channel = 1
nums_filter = [1]
output_size = 64
size_of_embedding = 300
pos_size = len(POSMAP)
num_char_encoding_size = 135
FCN = False
file_name = input('enter ur logname: ')
num_epoch = 2
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
Train = MyDataloader('../clean84withpos_ne.txt', '../label84withpos_ne.txt', RULEs, \
                           word_length, '|', 'char_vec_dictionary.txt',max_size_char, \
                           '../fasttext.th.vec', 300, device, '../pos_tag84withpos_ne.txt',POSMAP)
# Test = MyDataloader('clean84withpos_ne_te0.txt', 'label84withpos_ne_te0.txt', RULEs, \
#                            word_length, '|', 'char_vec_dictionary.txt',max_size_char, \
#                            'fasttext.th.vec', 300, device, 'pos_tag84withpos_ne_te0.txt',POSMAP)
tr, te = get_indices_random_val_test_split(len(Train), 1, 0.0002, True)
def objective(trial):
    global cur_trial
    cur_trial += 1
    cnt_idle = 0
    kernel_sizes = [trial.suggest_categorical('kernel_sizes', [3,5])]
    gru_fcn_dropout = trial.suggest_uniform('gru_fcn_dropout', 0, 0.6)
    gru_dropout = trial.suggest_uniform('gru_dropout', 0, 0.6)
    gru_out_dropout = trial.suggest_uniform('gru_out_dropout', 0, 0.6)
    LR = trial.suggest_uniform('LR', 5, 10)*10**(-4)
    grucrf_hidden_size = trial.suggest_categorical('grucrf_hidden_size', [64, 128])
    w_decay = trial.suggest_categorical('w_decay', [-4,0])
    
#     train_loader = DataLoader(Train, batch_size=BS, shuffle=True)
#     test_loader = DataLoader(Test, batch_size=BS, shuffle=True)

    train_loader = DataLoader(Train, batch_size=BS, sampler=tr)
    test_loader = DataLoader(Train, batch_size=BS, sampler=te)
    
    with open(file_name + '.txt', 'a', encoding='utf8') as f:
        f.write(f'kernel_sizes: {kernel_sizes}, gru_fcn_dropout: {gru_fcn_dropout}\n')
        f.write(f'gru_dropout: {gru_dropout}, gru_out_dropout: {gru_out_dropout}\n')
        f.write(f'LR: {LR}, grucrf_hidden_size: {grucrf_hidden_size}\n')
        f.write(f'w_decay: {w_decay}\n')

    NER = CNN_GRU_char_pos(BS, max_size_char, nums_filter, use_BN, activation_func, input_channel, \
                           kernel_sizes, same_padding, num_char_encoding_size, output_size, word_length, \
                           grucrf_hidden_size, gru_dropout, True, tags, gru_fcn_dropout, pos_size, FCN, \
                           gru_out_dropout)
    optimizer = optim.Adam(NER.parameters(), lr=LR, eps=1e-08, weight_decay=10**w_decay,amsgrad=True)
    my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', verbose=True)
    print(device)
    NER.to(device)
    best_score = 0
    last_loss = 100000000
    for epoch in range(num_epoch):
        ttt = time()
        print(f'epoch {epoch}')
        all_loss = []
        for ind, batch_x in enumerate(train_loader):
            if ind%5 == 0:
                print(ind)
                print(f'cur_trial: {cur_trial}')
            t2 = time()
            NER = NER.train()
            print(time() - t2)
            NER.zero_grad()
            t1 = time()
            loss = NER(batch_x)
            loss = loss*(-1)
            print(f'time per batch: {time() - t1}')
            print(loss)
            all_loss.append(loss)
            loss.backward()
            nn.utils.clip_grad_norm_(NER.parameters(), 5, norm_type=2)
            optimizer.step()
        total_loss = sum(all_loss)/(ind + 1)
        my_scheduler.step(total_loss)
        print(f'total loss of epoch: {total_loss.item()}')
        print('testing')
        per_mat = np.zeros((len(tags), 3))
        cnt_mat = np.zeros((len(tags), 3))
        for ind, batch_test in enumerate(test_loader):
            NER = NER.eval()
            output = NER.predict(batch_test)
            a, b = eval_score(tags, output, batch_test[2])
            per_mat += a
            cnt_mat += b
        per_mat = per_mat/(ind+1)
        per_mat = per_mat[:len(tags),:]
        cnt_mat = cnt_mat[:len(tags),:]
        print(cnt_mat)
        print(per_mat)
        score = sum(per_mat[:,2])/(len(tags)-1)

        with open(file_name + '.txt', 'a', encoding='utf8') as f:
            f.write(f'epoch: {epoch}, score: {score}\n')
        
        if last_loss - total_loss.item() >= 0.1:
            best_mat=per_mat
            best_score = score
            cnt_idle = 0
        else:
            cnt_idle += 1
        last_loss = total_loss.item()
        print(f'overall score: {score}')
        print('--------------------')
        if early_stop_n == cnt_idle:
            break
        print(f'total epoch time: {ttt-time()}')
    with open(file_name + '.txt', 'a', encoding='utf8') as f:
        f.write(f'cur_trial: {cur_trial}')
        f.write(f'cnt_mat\n')
        f.write(f'I => : {cnt_mat[0,0]}, : {cnt_mat[0,1]}, : {cnt_mat[0,2]}\n')
        f.write(f'B => : {cnt_mat[1,0]}, : {cnt_mat[1,1]}, : {cnt_mat[1,2]}\n')
        f.write(f'O => : {cnt_mat[2,0]}, : {cnt_mat[2,1]}, : {cnt_mat[2,2]}\n')
        f.write(f'best_score: {best_score}\n')
        f.write(f'I => recall: {best_mat[0,0]}, precision: {best_mat[0,1]}, , f1: {best_mat[0,2]}\n')
        f.write(f'B => recall: {best_mat[1,0]}, precision: {best_mat[1,1]}, , f1: {best_mat[1,2]}\n')
        f.write(f'O => recall: {best_mat[2,0]}, precision: {best_mat[2,1]}, , f1: {best_mat[2,2]}\n')
        f.write(f'----------------------------------\n')
    return best_score

study = optuna.study.create_study(storage='sqlite:///'+ file_name +'.db', study_name='test_optuna_' + file_name, direction='maximize')
study.optimize(objective, n_trials=num_trial)

enter ur logname: sss


[I 2019-11-08 14:54:48,542] A new study created with name: test_optuna_sss


cpu
epoch 0
0
cur_trial: 2
0.00024700164794921875
time per batch: 4.190179824829102
tensor(164.4053, grad_fn=<MulBackward>)
0.00010800361633300781
time per batch: 2.489389181137085
tensor(79.4787, grad_fn=<MulBackward>)
0.00011706352233886719
time per batch: 3.622425079345703
tensor(68.4122, grad_fn=<MulBackward>)
0.0001590251922607422
time per batch: 2.1142139434814453
tensor(20.1260, grad_fn=<MulBackward>)
0.00011110305786132812
time per batch: 2.023732900619507
tensor(38.8639, grad_fn=<MulBackward>)
5
cur_trial: 2
0.0001857280731201172
time per batch: 3.5059359073638916
tensor(39.5278, grad_fn=<MulBackward>)
0.00011491775512695312
time per batch: 2.8796801567077637
tensor(24.6600, grad_fn=<MulBackward>)
total loss of epoch: 62.210548400878906
testing
[[  0.   0.   0.]
 [ 38.   7.   3.]
 [268.  19.  22.]
 [  0.   0.   0.]]
[[0.         0.         0.        ]
 [0.17568543 0.32142857 0.20466914]
 [0.07894159 0.39072039 0.12475934]
 [0.         0.         0.        ]]
overall score: 0.1

[I 2019-11-08 14:57:20,621] Finished trial#0 resulted in value: 0.06746031746031746. Current best value is 0.06746031746031746 with parameters: {'LR': 9.311652578438999, 'gru_dropout': 0.5494176585798091, 'gru_fcn_dropout': 0.2272287674142849, 'gru_out_dropout': 0.45003621827429685, 'grucrf_hidden_size': 64, 'kernel_sizes': 3, 'w_decay': -4}.


cpu
epoch 0
0
cur_trial: 3
0.0001239776611328125
time per batch: 4.3107030391693115
tensor(26.4435, grad_fn=<MulBackward>)
0.00019812583923339844
time per batch: 3.8920347690582275
tensor(102.5037, grad_fn=<MulBackward>)


KeyboardInterrupt: 

# char-RNN vs char-CNN

In [8]:
num_epoch = 1
time_RNN = []
time_CNN = []
word_length = 84
size_char = 6
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
data_tr = MyDataloader('../Data/clean84withpos_ne_tr0.txt', '../Data/label84withpos_ne_tr0.txt',\
                               RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
                               '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_tr0.txt',POSMAP)
data_te = MyDataloader('../Data/clean84withpos_ne_te0.txt', '../Data/label84withpos_ne_te0.txt', \
                       RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
                       '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_te0.txt',POSMAP)

#         train_loader = DataLoader(data_tr, batch_size=BS, shuffle= True)
#         test_loader = DataLoader(data_te, batch_size=BS, shuffle= True)
tr, te = get_indices_random_val_test_split(len(data_tr), 1, 0.0005, True)

list_BS = [4,8,16,32,64,128]
for BS in list_BS:
    train_loader = DataLoader(data_tr, batch_size=BS, sampler=tr)
    test_loader = DataLoader(data_tr, batch_size=BS, sampler=te)
    #BS = 8
    tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
    scheduler_n = 10002
    word_length = 84
    early_stop_n = 10003
    max_size_char = [6]#[5, 10, 20]
    nums_filter = [1]
    use_BN = True
    activation_func = True
    input_channel = 1
    kernel_sizes = [3]
    same_padding = True
    num_char_encoding_size = 135
    output_size = 64
    size_of_embedding = 300
    pos_size = len(POSMAP)
    FCN = False
    grucrf_dropout = [0]#[0, 0.15, 0.30, 0.45, 0.60]
    total_search = len(max_size_char)*len(grucrf_dropout)*2
    for size_char in max_size_char:

#         data_tr = MyDataloader('../Data/clean84withpos_ne_tr'+ str(IND) +'.txt', '../Data/label84withpos_ne_tr'+ str(IND) +'.txt',\
#                                RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
#                                '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_tr'+ str(IND) +'.txt',POSMAP)
#         data_te = MyDataloader('../Data/clean84withpos_ne_te'+ str(IND) +'.txt', '../Data/label84withpos_ne_te'+ str(IND) +'.txt', \
#                                RULEs, word_length, '|', 'char_vec_dictionary.txt', size_char, \
#                                '../fasttext.th.vec', 300, device, '../Data/pos_tag84withpos_ne_te'+ str(IND) +'.txt',POSMAP)

# #         train_loader = DataLoader(data_tr, batch_size=BS, shuffle= True)
# #         test_loader = DataLoader(data_te, batch_size=BS, shuffle= True)
#         tr, te = get_indices_random_val_test_split(len(data_tr), 1, 0.0005, True)
#         train_loader = DataLoader(data_tr, batch_size=BS, sampler=tr)
#         test_loader = DataLoader(data_tr, batch_size=BS, sampler=te)
        
        torch.cuda.empty_cache()
        for i in grucrf_dropout:
            
            grucrf_hidden_size = 128
            LR = 10**(-3)
            print(f'lstmcrf_dropout = DO_FCN_LSTMCRF: {i}')
            print(f'lstmcrf_hidden_size: {grucrf_hidden_size}, LR: {LR}')
            NER = CNN_GRU_char_pos(BS, size_char, nums_filter, use_BN, activation_func, input_channel, \
                 kernel_sizes, same_padding, num_char_encoding_size, output_size, word_length, grucrf_hidden_size, \
                 0, True, tags, 0, pos_size, FCN, 0)
#             NER = CNN_GRU_char_pos(BS, size_char, nums_filter, use_BN, activation_func, input_channel, \
#                  kernel_sizes, same_padding, num_char_encoding_size, output_size, word_length, grucrf_hidden_size, \
#                  i, True, tags, i, pos_size, FCN, 0.5)

            optimizer = optim.Adam(NER.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=True)
            my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

            print(device)
            NER.to(device)
            best_score = 0
            best_mat = np.zeros((len(tags)-1,3))
            cnt_idle = 0
            for epoch in range(num_epoch):
                ttt = time()
                print(f'epoch {epoch}')
                all_loss = []
                for ind, batch_x in enumerate(train_loader):
                    print(f'NER1 with BS: {BS}')
                    ttt2 = time()
                    if ind%5 == 0:
                        print(ind)
                    t2 = time()
                    NER = NER.train()
                    print(time() - t2)
                    NER.zero_grad()
                    t1 = time()
                    loss = NER(batch_x)
                    loss = loss*(-1)
                    print(f'time per batch: {time() - t1}')
                    print(loss)
                    all_loss.append(loss)
                    loss.backward()
                    #nn.utils.clip_grad_norm_(NER.parameters(), 5, norm_type=2)
                    optimizer.step()
                    time_CNN.append(time()-ttt2)
                total_loss = sum(all_loss)/(ind + 1)
                my_scheduler.step(total_loss)
                
            NER = over_all_NER2(BS,135,5,size_char,0,True, 5,size_of_embedding,word_length,128,0,True,tags, \
                                0, 0,pos_size, 0)
            optimizer = optim.Adam(NER.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=True)
            my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')
            print(device)
            NER.to(device)
            best_score = 0
            best_mat = np.zeros((len(tags)-1,3))
            cnt_idle = 0
            for epoch in range(num_epoch):
                ttt = time()
                print(f'epoch {epoch}')
                all_loss = []
                for ind, batch_x in enumerate(train_loader):
                    print(f'NER2 with BS: {BS}')
                    ttt2 = time()
                    if ind%5 == 0:
                        print(ind)
                    t2 = time()
                    NER = NER.train()
                    print(time() - t2)
                    NER.zero_grad()
                    t1 = time()
                    loss = NER(batch_x)
                    loss = loss*(-1)
                    print(f'time per batch: {time() - t1}')
                    print(loss)
                    all_loss.append(loss)
                    loss.backward()
                    #nn.utils.clip_grad_norm_(NER.parameters(), 5, norm_type=2)
                    optimizer.step()
                    time_RNN.append(time()-ttt2)
                total_loss = sum(all_loss)/(ind + 1)
                my_scheduler.step(total_loss)
            with open(f'log_time_cnn_rnn_char_BS_{BS}.txt', 'w', encoding = 'utf8') as f:
                f.write('CNN, RNN\n')
                for i in range(len(time_RNN)):
                    f.write(f'{time_CNN[i]}, {time_RNN[i]}\n')
            
#                 print(f'total loss of epoch: {total_loss.item()}')
#                 print('testing')
#                 per_mat = np.zeros((len(tags), 3))
#                 cnt_mat = np.zeros((len(tags), 3))
#                 for ind, batch_test in enumerate(test_loader):
#                     NER = NER.eval()
#                     output = NER.predict(batch_test)
#                     a, b = eval_score(tags, output, batch_test[2])
#                     per_mat += a
#                     cnt_mat += b
#                 per_mat = per_mat/(ind+1)
#                 per_mat = per_mat[:len(tags),:]
#                 cnt_mat = cnt_mat[:len(tags),:]
#                 print(cnt_mat)
#                 print(per_mat)
#                 score = sum(per_mat[:,2])/(len(tags)-1)
#                 if best_score < score:
#                     best_mat=per_mat
#                     best_score = score
#                     cnt_idle = 0
#                 else:
#                     cnt_idle += 1
#                 print(f'overall score: {score}')
#                 print('--------------------')
#                 if early_stop_n == cnt_idle:
#                     break
#                 print(f'total epoch time: {ttt-time()}')


cpu
lstmcrf_dropout = DO_FCN_LSTMCRF: 0
lstmcrf_hidden_size: 128, LR: 0.001
cpu
epoch 0
NER1 with BS: 4
0
0.00011897087097167969
time per batch: 3.700333833694458
tensor(242.6188, grad_fn=<MulBackward>)
NER1 with BS: 4
0.00011897087097167969
time per batch: 4.085191011428833
tensor(122.6828, grad_fn=<MulBackward>)
NER1 with BS: 4
0.00010895729064941406
time per batch: 3.710599184036255
tensor(111.6462, grad_fn=<MulBackward>)
NER1 with BS: 4
0.00011491775512695312
time per batch: 3.8553411960601807
tensor(104.4827, grad_fn=<MulBackward>)
NER1 with BS: 4
0.00011706352233886719
time per batch: 3.7547922134399414
tensor(92.9099, grad_fn=<MulBackward>)
NER1 with BS: 4
5
0.00011873245239257812
time per batch: 3.8132340908050537
tensor(62.8398, grad_fn=<MulBackward>)
NER1 with BS: 4
0.00012087821960449219
time per batch: 3.9524810314178467
tensor(72.6978, grad_fn=<MulBackward>)
cpu
epoch 0
NER2 with BS: 4
0
0.00011920928955078125
time per batch: 5.445732116699219
tensor(124.3198, grad_fn=<Mul

KeyboardInterrupt: 

In [9]:
os.getcwd()

'/Users/abc/Downloads/min/Codes/LSTM-CRF-NER'