In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import math
import regex as re
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from allennlp.modules.conditional_random_field import ConditionalRandomField
from allennlp.modules.conditional_random_field import allowed_transitions
from torch.utils.data import Dataset, DataLoader, random_split
#from torchcrf import CRF
from torch.utils.data.sampler import SubsetRandomSampler
import random

from sklearn.metrics import confusion_matrix
#from sklearn_crfsuite import metrics

from RULE import RULEs
from POSMap import POSMAP

In [7]:
#new
class MyDataloader(Dataset):
    """Dataset that turns delimiter-separated sentences into padded feature tensors.

    Row ``i`` of the text, label and POS files is read as one sentence whose
    items are separated by ``delimiter``.  Sentences shorter than
    ``Len_word_vec`` are padded up to that length; longer sentences are NOT
    truncated.
    """

    def __init__(self, TextDir: str, LabelDir: str, rules, Len_word_vec: int, delimiter: str,
                 dir_char_dictionary: str, max_len_char: int, fasttext_dictionary_dir: str,
                 Len_embedded_vector: int, device, POSDir: str, POSMapping) -> None:
        """
        Args:
            TextDir: path of the .txt file holding one sentence per row.
            LabelDir: path of the .txt file holding the numeric labels per row.
            rules: iterable of argument tuples unpacked into ``re.sub`` before the
                word itself (see RULE.py).
            Len_word_vec: number of words every sample is padded to.
            delimiter: string separating the items inside one row.
            dir_char_dictionary: forwarded to ``CharEmbedding``.
            max_len_char: forwarded to ``CharEmbedding``.
            fasttext_dictionary_dir: forwarded to ``WordEmbedding``.
            Len_embedded_vector: forwarded to ``WordEmbedding``.
            device: torch device every returned tensor is moved to.
            POSDir: path of the .txt file holding the POS tags per row.
            POSMapping: forwarded to ``POSEmbedding`` (see POSMap.py).
        """
        super().__init__()
        self.DF = pd.read_csv(TextDir, names=['text'])
        self.Label_DF = pd.read_csv(LabelDir, names=['text'])
        self.pos_DF = pd.read_csv(POSDir, names=['text'])
        self.rules = rules
        self.Len_word_vec = Len_word_vec
        self.delimiter = delimiter
        self.char_embedder = CharEmbedding(dir_char_dictionary, max_len_char)
        self.word_embedder = WordEmbedding(fasttext_dictionary_dir, Len_embedded_vector)
        self.device = device
        self.pos_embedder = POSEmbedding(POSMapping)

    def __len__(self):
        return len(self.DF)

    def _split_row(self, frame, Index):
        """Return the stripped items of row ``Index`` of ``frame['text']``."""
        return [item.strip() for item in frame['text'][Index].strip().split(self.delimiter)]

    def _apply_rules(self, word):
        """Run every substitution rule over ``word``, in order."""
        for rule in self.rules:
            word = re.sub(*rule, word)
        return word

    def __getitem__(self, Index):
        """Build one sample.

        Returns:
            A 6-tuple ``(char_embed, word_embed, label, mask, length, pos_embed)``.
            All tensors are moved to ``self.device``.  ``mask`` is 1.0 for real
            words and 0.0 for padding; padded label positions hold 3.0.
        """
        all_words = [self._apply_rules(word) for word in self._split_row(self.DF, Index)]
        Label = [float(value) for value in self._split_row(self.Label_DF, Index)]
        POS = self._split_row(self.pos_DF, Index)
        mask = [1.0] * len(all_words)

        missing = self.Len_word_vec - len(all_words)
        if missing > 0:
            Label = Label + [3.0] * missing
            mask = mask + [0.0] * missing
            POS = POS + ['<pad>'] * missing
            all_words = all_words + ['<pad>'] * missing

        char_embed = self.char_embedder.embed(all_words)
        word_embed = self.word_embedder.embed(all_words)
        pos_embed = self.pos_embedder.embed(POS)

        # NOTE(review): the length is measured AFTER padding, so it is never
        # smaller than Len_word_vec -- confirm this is what consumers expect.
        # Fix: use self.device (not the notebook-global `device`) so the dataset
        # works without a module-level variable of that name.
        return (char_embed.to(self.device), word_embed.to(self.device),
                torch.tensor(Label).to(self.device), torch.tensor(mask).to(self.device),
                len(all_words), pos_embed.float().to(self.device))
    

class CharEmbedding():
    """Look up a fixed vector for every character of a word.

    The dictionary file holds one character per line followed by the numbers
    of its vector, separated by whitespace.
    """

    def __init__(self, dir_char_dictionary: str, max_len_char: int):
        """
        Args:
            dir_char_dictionary: path of the UTF-8 .txt dictionary file.
            max_len_char: number of characters kept per word; e.g. with
                ``max_len_char=3`` only "abc" of "abcde" is used.

        Raises:
            ValueError: if the dictionary file contains no entries.
        """
        self.dictionary = {}
        self.max_len_char = max_len_char
        with open(dir_char_dictionary, 'r', encoding='utf8') as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    # Blank lines (e.g. a trailing newline) carry no entry.
                    continue
                self.dictionary[fields[0]] = np.array([float(value) for value in fields[1:]])
        if not self.dictionary:
            raise ValueError(f'no character vectors found in {dir_char_dictionary!r}')
        # Take the vector width from the first entry rather than assuming a
        # particular character (previously 'a') is present in the file.
        self.char_vec_size = next(iter(self.dictionary.values())).shape[0]

    def embed(self, list_of_words):
        """Embed every word as a ``(max_len_char, char_vec_size)`` block.

        Args:
            list_of_words: list of strings, one per word.

        Returns:
            float64 tensor of shape ``(len(list_of_words), max_len_char,
            char_vec_size)``.  Characters missing from the dictionary and
            positions past the end of a word are zero vectors.
        """
        output = np.zeros((len(list_of_words), self.max_len_char, self.char_vec_size))
        for word_idx, word in enumerate(list_of_words):
            for char_idx, char in enumerate(word[:self.max_len_char]):
                vector = self.dictionary.get(char)
                if vector is not None:
                    output[word_idx, char_idx] = vector
        # One ndarray -> tensor conversion instead of one per word.
        return torch.from_numpy(output)

class WordEmbedding():
    """Word vectors read from a fastText-style text file.

    Expected line format: ``<word> <v1> <v2> ... <vN>`` separated by
    whitespace.  Lines that do not contain exactly ``Len_embedded_vector``
    numbers after the word (for instance a header line) are skipped.
    """

    def __init__(self, fasttext_dictionary_dir: str, Len_embedded_vector: int) -> None:
        """
        Args:
            fasttext_dictionary_dir: path of the UTF-8 .vec file.
            Len_embedded_vector: number of values per vector (the word itself
                is not counted).
        """
        self.dictionary = {}
        self.Len_embedded_vector = Len_embedded_vector
        expected_fields = Len_embedded_vector + 1
        with open(fasttext_dictionary_dir, 'r', encoding='utf8') as f:
            for line in f:
                fields = line.strip().split()
                if len(fields) != expected_fields:
                    continue
                self.dictionary[fields[0]] = np.array([float(value) for value in fields[1:]])

    def embed(self, list_of_words):
        """Stack the vectors of ``list_of_words``.

        Args:
            list_of_words: list of strings, one per word.

        Returns:
            float64 tensor of shape ``(len(list_of_words), Len_embedded_vector)``;
            out-of-vocabulary words are zero vectors.  An empty input yields a
            ``(0, Len_embedded_vector)`` tensor.
        """
        vectors = np.zeros((len(list_of_words), self.Len_embedded_vector))
        for row, word in enumerate(list_of_words):
            known = self.dictionary.get(word)
            if known is not None:
                vectors[row] = known
        # Converting one ndarray avoids the slow list-of-ndarrays path.
        return torch.from_numpy(vectors)

class POSEmbedding():
    """One-hot encoder for part-of-speech tags."""

    def __init__(self, POSMapping):
        """
        Args:
            POSMapping: mapping from tag string to the column set to 1 for that
                tag (see POSMap.py).  Assumes the values are valid column
                indices below ``len(POSMapping)`` -- TODO confirm.
        """
        self.dictionary = POSMapping
        self.size = len(self.dictionary)

    def embed(self, list_of_POSs):
        """One-hot encode a list of tags.

        Args:
            list_of_POSs: list of tag strings, e.g. ``["NOUN", "VERB", "NOUN"]``.
                Surrounding whitespace is ignored; ``'<pad>'`` maps to an
                all-zero row.

        Returns:
            float64 tensor of shape ``(len(list_of_POSs), len(POSMapping))``.

        Raises:
            KeyError: for a tag that is neither ``'<pad>'`` nor in the mapping.
        """
        one_hot = np.zeros((len(list_of_POSs), self.size))
        for row, tag in enumerate(list_of_POSs):
            tag = tag.strip()
            if tag != '<pad>':
                one_hot[row, self.dictionary[tag]] = 1
        # Converting one ndarray avoids the slow list-of-ndarrays path.
        return torch.from_numpy(one_hot)

# Separating data

In [8]:
# Toy CRF inputs used to check tensor shapes.
seq_length = 4
batch_size = 3
# Previously undefined, so this cell raised NameError on a fresh kernel.
# 4 covers the tag ids 0-3 that appear in the toy `target` tensor.
num_tags = 4

data = torch.randn(seq_length, batch_size, num_tags)  # shape: (seq_length, batch_size, num_tags)

In [9]:
# Toy gold tags: one row per time step, one column per sequence,
# i.e. shape = (seq_length, batch_size).
target = torch.tensor([
    [0, 0, 0],
    [0, 0, 0],
    [2, 2, 2],
    [0, 3, 3],
])
print(target.size())

In [12]:
# Toy padding mask, shape = (seq_length, batch_size); 1 marks a real token.
mask = torch.tensor([
    [1, 1, 1],
    [1, 1, 1],
    [1, 1, 1],
    [0, 1, 0],
])
print(mask.size())

# Defining layers

In [3]:
#new
############### RNN encoding ######################
class RNN_char(nn.Module):
    """Character-level GRU encoder producing one vector per word.

    The GRU outputs of all character positions of a word are flattened and
    projected by dropout -> linear -> BatchNorm1d -> ReLU.
    """

    def __init__(self, num_char_vec_features, hidden_size, num_layers, dropout_gru, bidirectional, \
                output_size, dropout_FCN, num_word):
        """
        Args:
            num_char_vec_features: size of one character vector.
            hidden_size: GRU hidden size.
            num_layers: number of stacked GRU layers.  The linear layer's input
                width is also derived from it, so ``forward`` only works when
                the number of characters per word equals ``num_layers``.
            dropout_gru: dropout between GRU layers.
            bidirectional: whether the GRU runs in both directions.
            output_size: size of the produced per-word vector.
            dropout_FCN: dropout applied before the linear layer.
            num_word: number of words per sample (BatchNorm1d channel count).
        """
        super().__init__()
        self.gru = nn.GRU(input_size=num_char_vec_features, hidden_size=hidden_size, num_layers=num_layers,\
                          batch_first = True, dropout=dropout_gru, bidirectional=bidirectional)
        # A GRU emits hidden_size features per direction; the old hard-coded
        # factor 2 broke the layer for bidirectional=False.
        num_directions = 2 if bidirectional else 1
        self.linear = nn.Linear(hidden_size*num_directions*num_layers, output_size)
        self.BN = nn.BatchNorm1d(num_word)
        self.dropout = nn.Dropout(dropout_FCN)
        self.num_layers = num_layers

    def forward(self, x):
        """
        Args:
            x: tensor unpacked as ``(batch, words, chars, char_vec)``.

        Returns:
            Tensor of shape ``(batch, words, output_size)``.
        """
        batch_size, word_seq, char_seq, char_vec = x.size()
        # Fold the word axis into the batch axis so the GRU runs once instead
        # of once per word; every word is still encoded independently.
        flat_words = x.float().reshape(batch_size * word_seq, char_seq, char_vec)
        gru_out, _ = self.gru(flat_words)
        tmp_compute = gru_out.reshape(batch_size, word_seq, -1)
        tmp_compute = self.dropout(tmp_compute)
        tmp_compute = self.linear(tmp_compute)
        # linear >> BatchNorm >> relu
        return F.relu(self.BN(tmp_compute))
    
class over_all_NER2(nn.Module):
    """Char-GRU + word embedding + POS one-hot features fed to a GRU-CRF tagger.

    ``forward`` and ``predict`` take one batch ``x`` indexed as: ``x[0]`` char
    embeddings, ``x[1]`` word embeddings, ``x[2]`` labels, ``x[3]`` mask,
    ``x[4]`` sequence lengths, ``x[5]`` POS embeddings.
    """

    def __init__(self, Batch_size, num_char_vec_features, hidden_size, max_num_char,
                 dropout_gru_char, bidirectional_char, output_char_embed_size,
                 size_of_embedding, num_words, gru_hidden_size, dropout_gru, bidirectional,
                 tags, DO_FCN_GRUCRF, DOchar_FCN, pos_size, gru_num_layers=None):
        """
        Args:
            Batch_size: unused by this class.
            num_char_vec_features: size of one character vector.
            hidden_size: hidden size of the character GRU.
            max_num_char: characters per word; also passed to ``RNN_char`` as
                its ``num_layers`` argument.
            dropout_gru_char: dropout inside the character GRU.
            bidirectional_char: whether the character GRU is bidirectional.
            output_char_embed_size: size of the per-word character feature.
            size_of_embedding: size of each word embedding vector.
            num_words: number of words per sample.
            gru_hidden_size: hidden size of the tagger GRU.
            dropout_gru: dropout inside the tagger GRU.
            bidirectional: whether the tagger GRU is bidirectional.
            tags: dict mapping tag id to tag name.
            DO_FCN_GRUCRF: dropout of the tagger's fully connected layers.
            DOchar_FCN: dropout before the character encoder's linear layer.
            pos_size: size of the POS embedding.
            gru_num_layers: number of stacked layers of the tagger GRU.  When
                left at ``None`` the previous behaviour is kept and
                ``num_words`` is used, which makes the GRU ``num_words`` layers
                deep.  NOTE(review): that looks unintended and is likely a
                major cost per batch -- consider passing a small value.
        """
        super().__init__()
        self.gru_char = RNN_char(num_char_vec_features, hidden_size, max_num_char, dropout_gru_char,
                                 bidirectional_char, output_char_embed_size, DOchar_FCN, num_words)
        if gru_num_layers is None:
            gru_num_layers = num_words
        self.gru_crf_layer = gru_crf(size_of_embedding + output_char_embed_size + pos_size,
                                     gru_hidden_size, gru_num_layers, dropout_gru, bidirectional,
                                     tags, DO_FCN_GRUCRF)

    def _word_features(self, x):
        """Concatenate char features, word embeddings and POS embeddings per word."""
        char_features = self.gru_char(x[0])
        return torch.cat([char_features, x[1].float(), x[5]], 2)

    def forward(self, x):
        """Return the tagger's training output for batch ``x``."""
        return self.gru_crf_layer((self._word_features(x), x[4]), x[2], x[3].long())

    def predict(self, x):
        """Return the tagger's decoded tag sequences for batch ``x``."""
        return self.gru_crf_layer.predict((self._word_features(x), x[4]), x[3].long())

In [4]:
#new
def get_index(len_row, len_col)->'(iterator of all ((int)row, (int)col))':
    """Yield every ``(row, col)`` pair of a ``len_row`` x ``len_col`` grid, row by row."""
    yield from ((row, col) for row in range(len_row) for col in range(len_col))

def get_longest_seq_len(MASK) -> int:
    """Return the length of the longest sequence described by a padding mask.

    Args:
        MASK: torch tensor holding 1 for real tokens and 0 for padding, either
            1-D (one sequence) or 2-D with shape ``(batch_size, num_words)``.
            Assumes the ones of every row form a prefix -- TODO confirm.

    Returns:
        The index of the first column that is padding in every row, e.g.
        ``torch.tensor([[1, 1, 0], [1, 0, 0]])`` -> 2.  When no such column
        exists the number of columns is returned; when even column 0 is all
        padding the result is 0.
    """
    # detach/cpu so CUDA or grad-tracking masks can be converted as well.
    column_totals = MASK.detach().cpu().numpy()
    if column_totals.ndim != 1:
        column_totals = column_totals.sum(axis=0)
    empty_columns = np.flatnonzero(column_totals == 0)
    if empty_columns.size:
        # Previously 0 doubled as the "not found" sentinel, so a mask whose
        # first column was empty reported the full width.
        return int(empty_columns[0])
    return int(column_totals.shape[0])

class overall_char_embedding(nn.Module):
    """CNN character encoder: parallel convolutions -> max-pool -> linear -> ReLU, per word."""

    def __init__(self, output_size: '(tuple of ints): (batch_size, embedding_size_per_word)',
    max_len_char: '(int) see in CharEmbedding',\
    nums_filter: '(list) list of number of filters according to each kernel_sizes (respectively)',
    use_BN: 'see in My2DConv',
    activation_func: 'see in My2DConv',
    input_channel: 'see in My2DConv',
    kernel_sizes: '(list[int]) list of size of kernels used, and they will be computed concurrently',
    same_padding: 'see in My2DConv',
    num_words: 'number of words used in 1 sample',
    num_char_encoding_size: 'size of encoding for each char'):
        super().__init__()
        self.batch_size, self.embedding_size_per_word = output_size
        # One convolution branch per kernel size; every kernel spans the full
        # width of a character encoding.
        conv_branches = [
            My2DConv(nums_filter[branch], use_BN, activation_func, input_channel,
                     (kernel_size, num_char_encoding_size), same_padding)
            for branch, kernel_size in enumerate(kernel_sizes)
        ]
        self.num_words = num_words
        self.CNNs = nn.ModuleList(conv_branches)
        self.MyMaxPool = nn.MaxPool2d((1, num_char_encoding_size), stride=(1, 1))
        self.MyFCN = nn.Linear(sum(nums_filter) * max_len_char, output_size[1])

    def forward(self, x):
        """Encode ``x`` of shape ``(batch, words, chars, char_encoding)`` word by word.

        Returns a tensor of shape ``(batch, self.num_words, output_size[1])``.
        """
        batch_size, words_in_input, num_char, embedding_size = x.size()
        # Insert a singleton channel axis for the 2-D convolutions.
        with_channel = x.view(batch_size, words_in_input, 1, num_char, embedding_size)
        per_word = []
        for word_idx in range(self.num_words):
            word_chars = with_channel[:, word_idx, :, :, :]
            pooled = [self.MyMaxPool(branch(word_chars)).view((batch_size, -1))
                      for branch in self.CNNs]
            per_word.append(F.relu(self.MyFCN(torch.cat(pooled, 1))))
        return torch.stack(per_word, dim=1)
                
class gru_crf(nn.Module):
    """GRU encoder followed by two linear layers and a CRF over the tag scores."""

    def __init__(self, num_input_features, hidden_size, num_layers, dropout_gru, bidirectional,
                 tags, dropout_FCN):
        """
        Args:
            num_input_features: number of features per word.
            hidden_size: GRU hidden size; also the width of the first linear layer.
            num_layers: number of stacked GRU layers (not the sequence length).
            dropout_gru: dropout between GRU layers.
            bidirectional: if True, use a bidirectional GRU.
            tags: dict mapping tag id to tag name, e.g.
                ``{0: 'I', 1: 'B', 2: 'O', 3: '<PAD>'}``; only its size is used.
            dropout_FCN: dropout applied in front of each linear layer in ``forward``.
        """
        super().__init__()
        self.gru = nn.GRU(input_size=num_input_features, hidden_size=hidden_size, num_layers=num_layers,
                          batch_first=True, dropout=dropout_gru, bidirectional=bidirectional)
        # The GRU emits hidden_size features per direction; the old hard-coded
        # factor 2 broke the layer for bidirectional=False.
        num_directions = 2 if bidirectional else 1
        self.linear = nn.Linear(hidden_size * num_directions, hidden_size)
        self.linear2 = nn.Linear(hidden_size, len(tags))
        self.crf = ConditionalRandomField(len(tags))
        self.dropout = nn.Dropout(dropout_FCN)

    def _emissions(self, samples, mask, use_dropout):
        """Shared encoder for ``forward`` and ``predict``.

        Returns ``(emissions, trimmed_mask, index_to_cut)`` where every sequence
        axis is cut to ``index_to_cut``, the largest entry of the lengths.
        """
        features, length = samples[0], samples[1]
        encoded = self.gru(features)[0]
        index_to_cut = max(length).item()
        # Drop the positions beyond the longest sequence of the batch.
        encoded = encoded[:, :index_to_cut, :]
        mask = mask[:, :index_to_cut]
        if use_dropout:
            encoded = self.dropout(encoded)
        hidden = F.relu(self.linear(encoded))
        if use_dropout:
            hidden = self.dropout(hidden)
        # NOTE(review): the ReLU clamps the scores handed to the CRF to be
        # non-negative; kept as in the original -- confirm it is intended.
        emissions = F.relu(self.linear2(hidden))
        return emissions, mask, index_to_cut

    def forward(self, samples, target, mask):
        """Score ``target`` under the CRF.

        Args:
            samples: pair ``(features, lengths)``; ``features`` is unpacked as
                ``(batch, words, num_input_features)``.
            target: tag ids, indexed as ``(batch, words)``.
            mask: non-zero for non-pad elements, indexed as ``(batch, words)``.

        Returns:
            The value returned by the CRF module (presumably the
            log-likelihood, since the training cell negates it -- verify
            against the AllenNLP documentation).
        """
        emissions, mask, index_to_cut = self._emissions(samples, mask, use_dropout=True)
        target = target[:, :index_to_cut]
        return self.crf(emissions, target.long(), mask)

    def predict(self, samples, mask):
        """Decode the best tag sequence per sample with ``crf.viterbi_tags``.

        Takes the same ``samples`` and ``mask`` as ``forward``; dropout is
        never applied here.
        """
        emissions, mask, _ = self._emissions(samples, mask, use_dropout=False)
        return self.crf.viterbi_tags(emissions, mask)
    
class My2DConv(nn.Module):
    """2-D convolution with optional batch normalisation and ReLU."""

    def __init__(self, num_filter: '(int) number of filters', use_BN: '(bool) if True, use 2d-batchnorm after linear conv',\
    activation_func: '(bool) if True, use RELU after BN', input_channel: '(int) number of input channels', \
    kernel_size: '(tuple): (width, height) size of the kernels', same_padding: '(bool) if True, input_w,input_h=output_w,output_h'):
        super().__init__()
        # Padding of (k - 1) // 2 per axis, assuming dilation = 1 and stride = 1.
        self.padding = ((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2) if same_padding else 0
        self.Conv = nn.Conv2d(input_channel, num_filter, kernel_size, padding=self.padding)
        self.use_BN = use_BN
        self.activation_func = activation_func
        if use_BN:
            self.BN = nn.BatchNorm2d(num_filter)

    def forward(self, input_data: '(torch.tensor) dimension= (batch_size, num_channel_in, in_height, in_width)') \
    -> '(torch.tensor) shape= (batch_size, num_filter, in_height, in_width)':
        """Apply convolution, then batch norm and ReLU when they are enabled."""
        features = self.Conv(input_data.float())
        if self.use_BN:
            features = self.BN(features)
        return F.relu(features) if self.activation_func else features
        



class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v`` (batched matrices for ``torch.bmm``).

        Entries where ``mask`` is true are set to -inf before the softmax.
        Returns ``(output, attention_weights)``; the weights are returned
        after dropout.
        """
        scores = torch.bmm(q, k.transpose(1, 2)) / self.temperature
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))
        weights = self.dropout(self.softmax(scores))
        return torch.bmm(weights, v), weights

class AttentionBetweenWordsAndChars(nn.Module):
    """Per word, attend between its word vector and its character vector."""

    def __init__(self, hidden_size: '(int) size of key, query and value vectors',\
    input_vec_size: '(int) incase of fasttext input_vec_size=300'):
        super().__init__()
        self.K_FCN = nn.Linear(input_vec_size, hidden_size)
        self.Q_FCN = nn.Linear(input_vec_size, hidden_size)
        self.V_FCN = nn.Linear(input_vec_size, hidden_size)
        self.AttLayer = ScaledDotProductAttention(math.sqrt(hidden_size), 0.1)

    def forward(self, char_vectors, word_vectors):
        """Return one flattened attention output per word.

        Both inputs are projected with the same key/query/value layers and
        paired up as a two-element sequence (word first, character second)
        that the attention layer processes for every word position.
        """
        batch_size, word_size, _ = word_vectors.size()
        words = word_vectors.float()
        chars = char_vectors.float()

        def project(layer):
            # -> (batch, words, 2, hidden): index 0 is the word, 1 the chars.
            return torch.stack([layer(words), layer(chars)], dim=2)

        keys = project(self.K_FCN)
        queries = project(self.Q_FCN)
        values = project(self.V_FCN)

        attended = [
            self.AttLayer(queries[:, position, :, :], keys[:, position, :, :],
                          values[:, position, :, :])[0].view(batch_size, -1)
            for position in range(word_size)
        ]
        return torch.stack(attended, dim=1)
    
class over_all_NER(nn.Module):
    """CNN character embedding -> word/char attention -> GRU-CRF tagger.

    ``forward`` and ``predict`` take one batch ``x`` indexed as: ``x[0]`` char
    embeddings, ``x[1]`` word embeddings, ``x[2]`` labels, ``x[3]`` mask,
    ``x[4]`` sequence lengths.
    """

    def __init__(self, Batch_size, size_of_embedding, max_len_char, num_conv_filters, use_BN,
                 use_activation, num_conv_input_channel, kernel_sizes, use_same_padding,
                 num_words, num_char_encoding_size, att_hidden_size, num_input_features,
                 gru_hidden_size, dropout_gru, bidirectional, tags, DO_FCN_GRUCRF=0.0):
        """
        Args:
            Batch_size: stored on the instance and forwarded to
                ``overall_char_embedding``.
            size_of_embedding: size of each word embedding vector; also the
                output size of the character encoder.
            max_len_char, num_conv_filters, use_BN, use_activation,
            num_conv_input_channel, kernel_sizes, use_same_padding, num_words,
            num_char_encoding_size: see ``overall_char_embedding``.
            att_hidden_size: see ``AttentionBetweenWordsAndChars``.
            num_input_features, gru_hidden_size, dropout_gru, bidirectional,
            tags: see ``gru_crf``.  ``num_words`` is passed as its
                ``num_layers`` argument, as before.
            DO_FCN_GRUCRF: dropout of the tagger's fully connected layers.
                Added with a default because ``gru_crf`` requires this argument
                and the previous call omitted it (TypeError on construction).
        """
        super().__init__()
        self.char_embed = overall_char_embedding((Batch_size, size_of_embedding), max_len_char,
                                                 num_conv_filters, use_BN, use_activation,
                                                 num_conv_input_channel, kernel_sizes,
                                                 use_same_padding, num_words, num_char_encoding_size)
        self.my_attention = AttentionBetweenWordsAndChars(att_hidden_size, size_of_embedding)
        self.gru_crf_layer = gru_crf(num_input_features, gru_hidden_size, num_words, dropout_gru,
                                     bidirectional, tags, DO_FCN_GRUCRF)
        self.Batch_size = Batch_size

    def _attended_features(self, x):
        """Character-encode ``x[0]`` and attend it against the word vectors ``x[1]``."""
        return self.my_attention(self.char_embed(x[0]), x[1])

    def forward(self, x):
        """Return the tagger's training output for batch ``x``."""
        # gru_crf expects a (features, lengths) pair, not a bare tensor.
        return self.gru_crf_layer((self._attended_features(x), x[4]), x[2], x[3].long())

    def predict(self, x):
        """Return the tagger's decoded tag sequences for batch ``x``."""
        return self.gru_crf_layer.predict((self._attended_features(x), x[4]), x[3].long())

def get_indices_random_train_test_split(dataset_size:'(int) number of rows', random_seed: '(int)',
                                        validation_split: '(double)', shuffle_dataset: '(bool)'):
    """Split ``range(dataset_size)`` into train and validation samplers.

    The first ``floor(validation_split * dataset_size)`` indices (after the
    optional shuffle) become the validation set, the rest the training set.
    Shuffling reseeds NumPy's global RNG with ``random_seed``.

    Returns:
        ``(train_sampler, valid_sampler)`` as ``SubsetRandomSampler`` objects.
    """
    n_validation = int(np.floor(validation_split * dataset_size))
    order = list(range(dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(order)
    train_sampler = SubsetRandomSampler(order[n_validation:])
    valid_sampler = SubsetRandomSampler(order[:n_validation])
    return train_sampler, valid_sampler
  
def get_indices_random_val_test_split(dataset_size:'(int) number of rows', random_seed: '(int)',
                                        validation_split: '(double)', shuffle_dataset: '(bool)'):
    """Carve two equally sized subsets out of ``range(dataset_size)``.

    With ``split = floor(validation_split * dataset_size)``, the first
    ``split`` indices (after the optional shuffle) form the validation set and
    the next ``split`` indices the test set; all remaining indices are unused.
    Shuffling reseeds NumPy's global RNG with ``random_seed``.

    Returns:
        ``(test_sampler, valid_sampler)`` as ``SubsetRandomSampler`` objects.
    """
    subset_size = int(np.floor(validation_split * dataset_size))
    order = list(range(dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(order)
    test_sampler = SubsetRandomSampler(order[subset_size:2 * subset_size])
    valid_sampler = SubsetRandomSampler(order[:subset_size])
    return test_sampler, valid_sampler

def eval_score(tags, pred, label):
    """Compute per-tag recall, precision and F1 for one batch.

    Args:
        tags: dict mapping tag id to tag name; only its size is used.
        pred: list with one ``(tag_id_sequence, score)`` pair per sample; the
            sequences may have different lengths.
        label: torch tensor of gold tag ids, one row per sample.  Each row is
            cut to the length of the matching predicted sequence.

    Returns:
        ndarray of shape ``(len(tags), 3)`` whose columns are recall,
        precision and F1; a metric whose denominator is zero is reported as 0.
    """
    num_tags = len(tags)
    tag_ids = list(range(num_tags))
    gold = label.cpu().numpy().astype('int8')

    conf_mat = np.zeros((num_tags, num_tags))
    # Paths are handled one by one: they are ragged, so they cannot be packed
    # into one rectangular ndarray.
    for row, item in enumerate(pred):
        path = np.asarray(item[0])
        if path.size == 0:
            continue
        # `labels` is passed by keyword; newer scikit-learn releases reject it
        # as a positional argument.
        conf_mat += confusion_matrix(gold[row][:len(path)], path, labels=tag_ids)

    true_positive = np.diag(conf_mat)
    gold_counts = conf_mat.sum(axis=1)       # rows are gold tags
    predicted_counts = conf_mat.sum(axis=0)  # columns are predicted tags

    recall = np.divide(true_positive, gold_counts,
                       out=np.zeros(num_tags), where=gold_counts != 0)
    precision = np.divide(true_positive, predicted_counts,
                          out=np.zeros(num_tags), where=predicted_counts != 0)
    total = precision + recall
    f1 = np.divide(2 * recall * precision, total, out=np.zeros(num_tags), where=total != 0)
    return np.stack([recall, precision, f1], axis=1)

# Training

In [None]:
# ---- configuration -------------------------------------------------------
BS = 4
tags = {0:'I', 1:'B', 2:'O', 3:'<pad>'}
scheduler_n = 15
word_length = 84
early_stop_n = 5
max_size_char = 6
num_search = 50

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

data = MyDataloader('../clean84withpos.txt', '../label84withpos.txt', RULEs, \
                    word_length, '|', 'char_vec_dictionary.txt',max_size_char, \
                    '../fasttext.th.vec', 300, device, '../pos_tag84withpos.txt',POSMAP)

# NOTE(review): this helper returns (test_sampler, valid_sampler) and, with a
# split of 0.0005, each subset holds 0.05% of the rows.  `tr` is therefore a
# tiny subset, not the bulk of the data -- confirm this is meant as a smoke test.
tr, te = get_indices_random_val_test_split(len(data), 1, 0.0005, True)
train_loader = DataLoader(data, batch_size=BS, sampler=tr)
test_loader = DataLoader(data, batch_size=BS, sampler=te)

# ---- random hyper-parameter search ---------------------------------------
# The `random` module is not seeded here, so every run samples new settings.
for cur_ind in range(num_search):
    torch.cuda.empty_cache()
    grucrf_dropout = random.uniform(0.1,0.6)#0.5
    gruchar_dropout = random.uniform(0.1,0.6)#0.5
    DO_FCN_GRUCRF = random.uniform(0.1,0.6)#0.5
    DO_FCN_CHAR = random.uniform(0.1,0.6)#0.5
    grucrf_hidden_size = random.choice([8,16,32,64,128])#5
    hidden_size_char_gru = random.choice([8,16,32,64,128])#20
    LR = 5*10**random.uniform(-3,-5)#0.001
    with open('my_logs.txt', 'a', encoding ='utf8') as f:
        f.write(f'cur_ind: {cur_ind}\n')
        f.write(f'grucrf_dropout: {grucrf_dropout}, gruchar_dropout: {gruchar_dropout}\n')
        f.write(f'DO_FCN_GRUCRF: {DO_FCN_GRUCRF}, DO_FCN_CHAR: {DO_FCN_CHAR}\n')
        f.write(f'grucrf_hidden_size: {grucrf_hidden_size}, hidden_size_char_gru: {hidden_size_char_gru}\n')
        f.write(f'LR: {LR}\n')

    # 135 = size of one character vector, 100 = char feature size per word,
    # 300 = word embedding size (same value passed to MyDataloader above).
    NER = over_all_NER2(BS, 135, hidden_size_char_gru, max_size_char, gruchar_dropout,\
        True, 100, 300, word_length, grucrf_hidden_size, grucrf_dropout, True, \
            tags, DO_FCN_GRUCRF, DO_FCN_CHAR, len(POSMAP))

    optimizer = optim.Adam(NER.parameters(), lr=LR, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4, amsgrad=True)
    my_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

    print(device)
    NER.to(device)

    best_score = 0
    best_mat = np.zeros((len(tags)-1,3))
    cnt_idle = 0
    for epoch in range(6):
        print(f'epoch {epoch}')

        # ---- train ----
        NER.train()
        all_loss = []
        for ind, batch_x in enumerate(train_loader):
            if ind%5 == 0:
                print(ind)

            NER.zero_grad()
            t1 = time()
            # The model output is negated to obtain the quantity to minimise.
            loss = NER(batch_x)
            loss = loss*(-1)
            print(f'time per batch: {time() - t1}')
            print(loss)
            loss.backward()
            nn.utils.clip_grad_value_(NER.parameters(), 10)
            optimizer.step()
            # Keep a plain float: storing the tensor would keep every batch's
            # autograd graph alive until the end of the epoch.
            all_loss.append(loss.item())
        total_loss = sum(all_loss)/max(len(all_loss), 1)
        my_scheduler.step(total_loss)
        print(f'total loss of epoch: {total_loss}')

        # ---- evaluate ----
        print('testing')
        NER.eval()
        per_mat = np.zeros((len(tags), 3))
        num_test_batches = 0
        with torch.no_grad():  # no gradients are needed for decoding
            for ind, batch_test in enumerate(test_loader):
                output = NER.predict(batch_test)
                per_mat += eval_score(tags, output, batch_test[2])
                num_test_batches += 1
        # Average of the per-batch metric matrices.
        per_mat = per_mat/max(num_test_batches, 1)
        print(per_mat)
        # F1 summed over all rows but divided by len(tags)-1, i.e. the '<pad>'
        # row is assumed to contribute 0 -- TODO confirm.
        score = sum(per_mat[:,2])/(len(tags)-1)
        if best_score < score:
            best_mat=per_mat
            best_score = score
            cnt_idle = 0
        else:
            cnt_idle += 1
        print(f'overall score: {score}')
        print('--------------------')
        if early_stop_n == cnt_idle:
            break
    with open('my_logs.txt', 'a', encoding ='utf8') as f:
        f.write(f'best_score: {best_score}\n')
        f.write(f'best_mat\n')
        f.write(f'I => recall: {best_mat[0,0]}, precision: {best_mat[0,1]}, , f1: {best_mat[0,2]}\n')
        f.write(f'B => recall: {best_mat[1,0]}, precision: {best_mat[1,1]}, , f1: {best_mat[1,2]}\n')
        f.write(f'O => recall: {best_mat[2,0]}, precision: {best_mat[2,1]}, , f1: {best_mat[2,2]}\n')
        f.write(f'best_mat\n')
        f.write(f'----------------------------------\n')

cpu
cpu
epoch 0
0
time per batch: 4.17745304107666
tensor(42.5975, grad_fn=<MulBackward>)
time per batch: 2.901552200317383
tensor(100.1544, grad_fn=<MulBackward>)
time per batch: 2.372246026992798
tensor(61.3648, grad_fn=<MulBackward>)
time per batch: 2.416790008544922
tensor(73.5869, grad_fn=<MulBackward>)
time per batch: 2.3500452041625977
tensor(81.5008, grad_fn=<MulBackward>)
5
time per batch: 2.4607222080230713
tensor(58.1430, grad_fn=<MulBackward>)
time per batch: 2.4254212379455566
tensor(74.5243, grad_fn=<MulBackward>)
time per batch: 2.3986759185791016
tensor(130.3741, grad_fn=<MulBackward>)
time per batch: 2.4162909984588623
tensor(49.1455, grad_fn=<MulBackward>)
time per batch: 2.427771806716919
tensor(33.7512, grad_fn=<MulBackward>)
10
time per batch: 2.3170104026794434
tensor(45.8107, grad_fn=<MulBackward>)
time per batch: 2.5479819774627686
tensor(46.8537, grad_fn=<MulBackward>)
time per batch: 2.463021755218506
tensor(47.6579, grad_fn=<MulBackward>)
time per batch: 2.46