In [74]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import math
import regex as re
from time import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from allennlp.modules.conditional_random_field import ConditionalRandomField
from torch.utils.data import Dataset, DataLoader

from RULE import RULEs

In [3]:
for i in RULEs:
    print(i)

('๑', '1')
('๒', '2')
('๓', '3')
('๔', '4')
('๕', '5')
('๖', '6')
('๗', '7')
('๘', '8')
('๙', '9')
('๐', '0')


In [121]:
def check_len(text_dir: str, delimeter: str):
    """Return the largest and smallest token count over the lines of a text file.

    Each line is split on ``delimeter`` and the number of resulting pieces is
    counted. A blank line still counts as one (empty) piece, because
    ``''.split(sep)`` returns ``['']``.

    Args:
        text_dir: Path to the UTF-8 text file to scan.
        delimeter: Separator string handed to ``str.split``.

    Returns:
        ``(max_len, min_len)`` over all lines, or ``(0, 0)`` when the file
        contains no lines at all.
    """
    longest = None
    shortest = None
    with open(text_dir, 'r', encoding='utf8') as f:
        for line in f:
            n_pieces = len(line.split(delimeter))
            # Track "nothing seen yet" explicitly instead of using a magic
            # sentinel, so the minimum is right for lines of any length.
            if longest is None or n_pieces > longest:
                longest = n_pieces
            if shortest is None or n_pieces < shortest:
                shortest = n_pieces
    if longest is None:
        return 0, 0
    return longest, shortest

In [122]:
check_len('label_384.txt', '||')

(538, 384)

In [183]:
def gen_char_dicitonary(char_dict_dir: str, out_dic_vec_dir: str, num_unique_char: int):
    """Write a one-hot vector table for every character in a dictionary file.

    Every character found in ``char_dict_dir`` (all lines, surrounding
    whitespace stripped per line) gets one output row of the form
    ``<char> v0 v1 ... v{num_unique_char}``. The first ``num_unique_char``
    slots are the one-hot encoding of the character's position and the final
    slot is a "capital letter" flag. For each ASCII lowercase letter an extra
    row is written for its upper-case form: same one-hot slot, flag set to 1.

    The position index runs across the whole file, so characters on different
    lines never share a slot.

    Args:
        char_dict_dir: Path to the UTF-8 file listing the unique characters.
        out_dic_vec_dir: Path of the UTF-8 file to (over)write with the table.
        num_unique_char: Number of one-hot slots, i.e. the vector width minus
            the capital flag.

    Raises:
        ValueError: If the dictionary holds more characters than
            ``num_unique_char``; nothing is written in that case.
    """
    with open(char_dict_dir, 'r', encoding='utf8') as src:
        symbols = [symbol for line in src for symbol in line.strip()]

    # Validate before opening the output so a bad call never leaves a
    # half-written table behind.
    if len(symbols) > num_unique_char:
        raise ValueError(
            f'dictionary contains {len(symbols)} characters but only '
            f'{num_unique_char} one-hot slots were requested'
        )

    def _format_row(symbol, hot_index, is_capital):
        """Render one table row; every number is followed by a single space."""
        cells = ['0'] * (num_unique_char + 1)
        cells[hot_index] = '1'
        if is_capital:
            cells[-1] = '1'
        return symbol + ' ' + ' '.join(cells) + ' \n'

    with open(out_dic_vec_dir, 'w', encoding='utf8') as out:
        for index, symbol in enumerate(symbols):
            out.write(_format_row(symbol, index, False))
            if 'a' <= symbol <= 'z':
                out.write(_format_row(symbol.upper(), index, True))

In [184]:
# Generate the one-hot character table used by CharEmbedding. The last argument (134) is the
# number of one-hot slots and matches the character count printed by the counting cell (In [179]).
gen_char_dicitonary('./char-word-level-LSTM-CRF/char_dictionary.txt', './char-word-level-LSTM-CRF/char_vec_dictionary.txt', 134)

In [179]:
# Count every character in the dictionary file (all lines, surrounding whitespace stripped per
# line). The printed total is the value passed to gen_char_dicitonary as num_unique_char.
with open('./char-word-level-LSTM-CRF/char_dictionary.txt', 'r', encoding='utf8') as f:
    cnt = 0
    for i in f:
        for char in i.strip():
            cnt = cnt + 1
print(cnt)

134


In [186]:
class MyDataloader(Dataset):
    """Dataset of '||'-delimited token lines paired with per-token numeric labels.

    Despite its name this is a ``torch.utils.data.Dataset``; wrap it in a
    ``DataLoader`` to obtain batches. Every sample is padded (or cut) to exactly
    ``Len_word_vec`` entries so that samples can be collated into a batch.
    """

    # Marker appended right after the last real token of a padded sample.
    # The runtime value is the four characters  <  \  s  >  (same as before,
    # spelled with an explicit backslash to avoid an invalid-escape warning).
    # NOTE(review): fastText vector files normally name their end-of-sentence
    # token '</s>' (forward slash) - confirm which spelling is intended.
    END_TOKEN = '<\\s>'

    def __init__(self, TextDir: str, LabelDir: str, rules, Len_word_vec: int) -> None:
        """Load the sample and label files.

        Args:
            TextDir: Path to the .txt file with one '||'-delimited sample per line.
            LabelDir: Path to the .txt file with the matching '||'-delimited labels.
            rules: Iterable of ``(pattern, replacement)`` pairs applied to every
                token with ``re.sub`` (see RULE.py).
            Len_word_vec: Fixed number of tokens/labels returned per sample.
        """
        super().__init__()
        # NOTE(review): read_csv gives ',' and '"' special meaning, so a raw line
        # containing them may not land in the single 'text' column - confirm the
        # input files are free of those characters.
        self.DF = pd.read_csv(TextDir, names=['text'])
        self.Label_DF = pd.read_csv(LabelDir, names=['text'])
        self.rules = rules
        self.Len_word_vec = Len_word_vec

    def __len__(self):
        """Number of samples (rows of the text file)."""
        return len(self.DF)

    def __getitem__(self, Index):
        """Return ``(tokens, labels)`` for the sample at position ``Index``.

        Returns:
            tokens: list of ``Len_word_vec`` strings - the rule-normalised words,
                then ``END_TOKEN`` and empty-string padding when the sample is
                shorter than ``Len_word_vec``.
            labels: 1-D float ``torch.Tensor`` holding the labels, zero-padded by
                the same amount as the tokens. Samples longer than
                ``Len_word_vec`` are cut to that length.
        """
        raw_tokens = self.DF['text'].iloc[Index].strip().split('||')
        all_words = []
        for token in raw_tokens:
            word = token.strip()
            for rule in self.rules:
                word = re.sub(*rule, word)
            all_words.append(word)

        raw_labels = self.Label_DF['text'].iloc[Index].strip().split('||')
        Label = [float(piece.strip()) for piece in raw_labels]

        n_words = len(all_words)
        if n_words < self.Len_word_vec:
            # '<' rather than '< Len_word_vec - 1': a sample exactly one token
            # short still needs the end marker, otherwise its length differs
            # from every other sample and batching fails.
            Label = Label + [0.0] * (self.Len_word_vec - n_words)
            all_words = all_words + [self.END_TOKEN] + [''] * (self.Len_word_vec - 1 - n_words)
        elif n_words > self.Len_word_vec:
            # Over-long samples are cut so every item has the same length.
            all_words = all_words[:self.Len_word_vec]
            Label = Label[:self.Len_word_vec]
        return (all_words, torch.tensor(Label))

In [208]:
class NameEntityRecognition(nn.Module):
    """Placeholder for the named-entity-recognition model; no layers are defined yet."""

    def __init__(self):
        # nn.Module keeps its parameter/sub-module registries on the instance;
        # they only exist after the base initialiser has run.
        super().__init__()



# class TimeDistributed(nn.Module):
#     def __init__(self, layer: '(nn.Module) layer to be processed', time_steps: '(int)'):
#         super().__init__()
#         self.layers = nn.ModuleList([layer for i in range(time_steps)])

#     def forward(self, x) -> '(torch.tensor) shape=(1, embedding_size)':
#         batch_size, time_steps, C, H, W = x.size()
#         output = torch.tensor([])
#         for i in range(time_steps):
#           output_t = self.layers[i](x[:, i, :, :, :])
#           output_t  = torch.flatten(output_t)
#           output = torch.cat((output, output_t ), 1)
#         return output

# class Convs(nn.Module):
#     def __init__(self, List_of_kernel_sizes: 'example: [(3,100),(5,100),(7,100)]', List_num_filter: 'example: \
#     [64,64,128] ***len(List_num_filter) must equal to len(List_of_kernel_sizes)***',\
#     use_BN: 'see My2DConv', activation_func: 'see My2DConv', input_channel: 'see My2DConv', \
#     same_padding: 'see My2DConv', time_steps: 'see TimeDistributed'):
#         tmp_List_layers = []
#         for ind, kernel_size in enumerate(List_of_kernel_sizes):
#             tmp_List_layers.append(TimeDistributed(My2DConv(List_num_filter[ind], use_BN, \
#             activation_func, input_channel, kernel_size, same_padding), time_steps))
#         self.Layer_list = nn.ModuleList(tmp_List_layers)

class My2DConv(nn.Module):
    """2-D convolution optionally followed by batch normalisation and ReLU."""

    def __init__(self, num_filter, use_BN, activation_func, input_channel, kernel_size, same_padding):
        """Build the layer.

        Args:
            num_filter: (int) number of output channels.
            use_BN: (bool) if True, apply ``BatchNorm2d`` after the convolution.
            activation_func: (bool) if True, apply ReLU after the (optional) BN.
            input_channel: (int) number of input channels.
            kernel_size: (int or tuple) kernel size; an int means a square kernel.
            same_padding: (bool) if True, pad by ``(k - 1) // 2`` per dimension.
                With stride 1, dilation 1 and an odd kernel size this keeps the
                output height/width equal to the input's.
        """
        super().__init__()
        if same_padding:
            # Assumes dilation = 1 and stride = 1.
            # Accept the int form too, which nn.Conv2d itself allows.
            if isinstance(kernel_size, int):
                kernel_hw = (kernel_size, kernel_size)
            else:
                kernel_hw = tuple(kernel_size)
            self.padding = ((kernel_hw[0] - 1) // 2, (kernel_hw[1] - 1) // 2)
        else:
            self.padding = 0
        self.Conv = nn.Conv2d(input_channel, num_filter, kernel_size, padding=self.padding)
        self.use_BN = use_BN
        self.activation_func = activation_func
        if self.use_BN:
            self.BN = nn.BatchNorm2d(num_filter)

    def forward(self, input_data):
        """Apply conv -> (BN) -> (ReLU).

        Args:
            input_data: tensor of shape (batch_size, input_channel, in_height, in_width).

        Returns:
            Tensor of shape (batch_size, num_filter, out_height, out_width).
        """
        tmp_compute = self.Conv(input_data)
        if self.use_BN:
            tmp_compute = self.BN(tmp_compute)
        if self.activation_func:
            # Functional form: no need to build a new nn.ReLU module per call.
            tmp_compute = F.relu(tmp_compute)
        return tmp_compute
        

class CharEmbedding(nn.Module):
    """Look up fixed (non-trainable) per-character vectors for every word of a batch.

    The vectors are read from a text table with one row per character:
    ``<char> v0 v1 ...``. In the table produced by ``gen_char_dicitonary`` the
    last slot is a capital-letter flag, e.g. 'a' -> [1, 0, 0, 0, 0] and
    'A' -> [1, 0, 0, 0, 1].
    """

    def __init__(self, dir_char_dictionary, max_len_char, batch_size):
        """Load the character table.

        Args:
            dir_char_dictionary: (str) path to the UTF-8 character-vector table.
            max_len_char: (int) number of characters kept per word; e.g. with
                max_len_char=3 the word "abcde" is encoded as "abc". Shorter
                words are padded with zero vectors.
            batch_size: (int) nominal batch size. Kept for backward
                compatibility; ``forward`` reads the real batch size from its
                input so a smaller final batch is handled correctly.

        Raises:
            ValueError: If the table is empty or its rows differ in width.
        """
        super().__init__()
        self.dictionary = {}
        self.max_len_char = max_len_char
        self.batch_size = batch_size
        # Width of one character vector, taken from the first row of the table.
        self.vector_size = None
        with open(dir_char_dictionary, 'r', encoding='utf8') as f:
            for line in f:
                tmp_data = line.strip().split()
                if not tmp_data:
                    # Tolerate blank lines instead of failing on tmp_data[0].
                    continue
                vector = np.array([float(value) for value in tmp_data[1:]])
                if self.vector_size is None:
                    self.vector_size = vector.shape[0]
                elif vector.shape[0] != self.vector_size:
                    raise ValueError(
                        f'row for {tmp_data[0]!r} has {vector.shape[0]} values, '
                        f'expected {self.vector_size}'
                    )
                self.dictionary[tmp_data[0]] = vector
        if self.vector_size is None:
            raise ValueError(f'no character vectors found in {dir_char_dictionary!r}')

    def forward(self, list_of_tuples):
        """Encode a batch of words character by character.

        Args:
            list_of_tuples: sequence over word positions; element ``p`` holds the
                word at position ``p`` of every article in the batch, e.g.
                ``[("w1_article1", ..., "w1_articlen"),
                   ("w2_article1", ..., "w2_articlen"), ...]``.

        Returns:
            float64 tensor of shape
            ``(n_articles, n_positions, max_len_char, vector_size)``. Characters
            missing from the table and padding slots are zero vectors.

        Raises:
            ValueError: If the tuples do not all have the same length.
        """
        n_positions = len(list_of_tuples)
        n_articles = len(list_of_tuples[0]) if n_positions else 0
        # Fill one pre-allocated array; converting nested Python lists of numpy
        # arrays with torch.tensor() is very slow for full batches.
        encoded = np.zeros((n_articles, n_positions, self.max_len_char, self.vector_size))
        for position, words in enumerate(list_of_tuples):
            if len(words) != n_articles:
                raise ValueError(
                    f'position {position} holds {len(words)} words, expected {n_articles}'
                )
            for article, word in enumerate(words):
                for char_pos, Char in enumerate(word[:self.max_len_char]):
                    vector = self.dictionary.get(Char)
                    if vector is not None:
                        encoded[article, position, char_pos] = vector
        return torch.from_numpy(encoded)

class WordEmbedding(nn.Module):
    """Look up fixed (non-trainable) word vectors read from a fastText-style text file.

    Expected file format, one word per line followed by its vector:
        กิน 1.0 -2.666 -3 22.5 ...
        นอน 1.5 -5.666 3 9.5 ...
    Lines that do not consist of exactly one word plus ``Len_embedded_vector``
    numbers (for instance a header line or blank lines) are ignored.
    """

    def __init__(self, fasttext_dictionary_dir, Len_embedded_vector, batch_size) -> None:
        """Load the word-vector table.

        Args:
            fasttext_dictionary_dir: (str) path to the UTF-8 .vec file.
            Len_embedded_vector: (int) number of values per vector (300 for
                fastText); counts only the numbers, not the word.
            batch_size: (int) nominal batch size. Kept for backward
                compatibility; ``forward`` reads the real batch size from its
                input so a smaller final batch is handled correctly.
        """
        super().__init__()
        self.dictionary = {}
        self.Len_embedded_vector = Len_embedded_vector
        self.batch_size = batch_size
        with open(fasttext_dictionary_dir, 'r', encoding='utf8') as f:
            for line in f:
                tmp_words = line.split()
                # A blank line yields no pieces and fails this check as well.
                if len(tmp_words) != self.Len_embedded_vector + 1:
                    continue
                self.dictionary[tmp_words[0]] = np.array([float(element) for element in tmp_words[1:]])

    def forward(self, list_of_tuples):
        """Embed a batch of words.

        Args:
            list_of_tuples: sequence over word positions; element ``p`` holds the
                word at position ``p`` of every article in the batch, e.g.
                ``[("w1_article1", ..., "w1_articlen"),
                   ("w2_article1", ..., "w2_articlen"), ...]``.

        Returns:
            float64 tensor of shape
            ``(n_articles, n_positions, Len_embedded_vector)``. Out-of-vocabulary
            words are zero vectors.

        Raises:
            ValueError: If the tuples do not all have the same length.
        """
        n_positions = len(list_of_tuples)
        n_articles = len(list_of_tuples[0]) if n_positions else 0
        # Fill one pre-allocated array instead of converting nested Python lists
        # of numpy arrays with torch.tensor(), which is slow for full batches.
        embedded = np.zeros((n_articles, n_positions, self.Len_embedded_vector))
        for position, words in enumerate(list_of_tuples):
            if len(words) != n_articles:
                raise ValueError(
                    f'position {position} holds {len(words)} words, expected {n_articles}'
                )
            for article, word in enumerate(words):
                vector = self.dictionary.get(word)
                if vector is not None:
                    embedded[article, position] = vector
        return torch.from_numpy(embedded)

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention over batched 3-D tensors.

    Scores are ``q @ k^T`` divided by ``temperature``; they are normalised with
    a softmax over the last (key) dimension, passed through dropout and used to
    weight ``v``.
    """

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):
        """Return ``(output, attention_weights)``.

        Positions where ``mask`` is True receive a score of minus infinity
        before the softmax, i.e. zero attention weight.
        """
        scores = torch.bmm(q, k.transpose(1, 2)) / self.temperature
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))
        weights = self.dropout(self.softmax(scores))
        return torch.bmm(weights, v), weights

In [118]:
x = torch.tensor([[1.0,2,3],[2,3,4]])
y = torch.tensor([[2.0,3,4],[1,2,3]])
z = torch.tensor([[1.0,1,1],[1,1,1]])
w = torch.stack([x,y,z])
Att = ScaledDotProductAttention(math.sqrt(3), 0.)

In [119]:
out, att = Att(w,w,w)
print(out)
print(att)

tensor([[[1.9696, 2.9696, 3.9696],
         [1.9945, 2.9945, 3.9945]],

        [[1.9945, 2.9945, 3.9945],
         [1.9696, 2.9696, 3.9696]],

        [[1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000]]])
tensor([[[0.0304, 0.9696],
         [0.0055, 0.9945]],

        [[0.9945, 0.0055],
         [0.9696, 0.0304]],

        [[0.5000, 0.5000],
         [0.5000, 0.5000]]])


In [209]:
# Shuffled batches of 256 samples. 544 is passed as Len_word_vec (the fixed per-sample length);
# check_len reported at most 538 tokens per line for label_384.txt earlier in this notebook.
dataloader = DataLoader(MyDataloader('clean_384.txt', 'label_384.txt', RULEs, 544), batch_size=256, shuffle=True)

In [210]:
# Smoke test / timing of both embedding layers on a single batch.
# Pull just the first batch: `i` is the (words, labels) pair produced by the DataLoader.
for ind, i in enumerate(dataloader):
    break
print('ok')
# Time building the character-embedding layer (reads the one-hot table from disk).
t1 = time()
char_embed = CharEmbedding('./char-word-level-LSTM-CRF/char_vec_dictionary.txt',5, 256)
print(f'char_embed: {time() - t1}')
# Time embedding one batch of words at character level; `output` is inspected in the next cell.
t1 = time()
output = char_embed(i[0])
print(f'embedding: {time() - t1}')
# Time building the word-embedding layer (parses the whole fastText .vec file).
t1=time()
word_embed = WordEmbedding('./char-word-level-LSTM-CRF/fasttext.th.vec', 300, 256)
print(f'word_embed: {time() - t1}')
# Time embedding the same batch at word level and show the resulting shape.
t1=time()
print(word_embed(i[0]).size())
print(f'embedding: {time() - t1}')

ok
char_embed: 0.011304855346679688
256
embedding: 24.250362873077393
word_embed: 17.82865834236145
torch.Size([256, 544, 300])
embedding: 7.2322680950164795


In [211]:
print(output.size())
print(output[:,1,:,:].size())

torch.Size([256, 544, 5, 135])
torch.Size([256, 5, 135])


In [58]:
a = 'ำ'
print('ห' + a)

หำ


In [11]:
n=0
for i in dataloader:
    if n >= 1:
        break
    print(i)
    n = n + 1

[[('มี', 'สิทธิ', 'มะซัก', 'ดอก'), ('ความ', 'แตกต่าง', 'Sapindus', 'ดก'), ('เห็น', 'กัน', 'rarak', 'ดี'), ('ว่า', 'ผู้', 'ใบ', 'กว่า'), ('เนื่อง', 'ที่', 'เสม็ด', 'ปลูก'), ('จาก', 'ทำ', 'หรือ', 'ใน'), ('ประเทศไทย', 'กรรม', 'เสม็ด', 'ที่'), ('เป็น', 'ดี', 'ขาว', 'ร่ม'), ('ประเทศ', 'มา', 'Melaleuca', 'ขยาย'), ('เกษตรกรรม', 'มาก', 'cajuputi', 'พันธุ์'), ('ประชาชน', 'ที่สุด', 'ต้น', 'โดย'), ('ส่วน', 'ก็', 'ขอบชะนาง', 'ใช้'), ('ใหญ่', 'คือ', 'หรือ', 'กิ่ง'), ('ซึ่ง', 'พระมหากษัตริย์', 'หญ้า', 'ตอน'), ('อยู่', 'เพราะ', 'หนอนตาย', '113'), ('ใน', 'ทรง', 'Pouzolzia', 'สายหยุด'), ('ชนบท', 'เป็น', 'pentandra', 'Desmos'), ('นั้น', 'พระโพธิสัตว์', 'เปลือก', 'chinensis'), ('ประกอบ', 'ทรง', 'ใบ', 'Lour.'), ('อาชีพ', 'มี', 'และ', 'สายหยุด'), ('ทาง', 'ฐานะ', 'ผล', 'เป็น'), ('การ', 'เป็น', 'สะเดา', 'ไม้'), ('เกษตร', 'ธรรมมิกราชาธิราช', 'Azadirachta', 'พุ่ม'), ('และ', 'ซึ่ง', 'indica', 'หรือ'), ('เนื่อง', 'ทำ', 'เปลือก', 'ไม้'), ('จาก', 'หน้าที่', 'กระเจา', 'รอ'), ('พื้นที่', 'จรรโลง', 'หรือ', 'เลื้อย'),

In [46]:
a= ('a','b','c')
print(len(a))

3


In [19]:
print(len(i[0]))
print(len(i[0][2]))

384
4


In [24]:
a['text'][len(a)-1]

'วาน||นี้||27||พ.ค.||นายโสภณ  ซารัมย์||รัฐมนตรี||ว่าการ||กระทรวงคมนาคม||เปิดเผย||ถึง||โครงการ||เช่า||รถ||เมล์||4||000||คัน||วงเงิน||6||.||79||หมื่น||ล้าน||บาท||โดย||ยอม||รับ||ว่า||จนถึง||ขณะ||นี้||ประชาชน||และ||คน||ส่วน||ใหญ่||ยัง||ไม่||เข้าใจ||โครงการ||นี้||เข้าใจ||ว่า||ใช้||เงิน||งบ||ประมาณ||ซึ่ง||จริง||ๆ||แล้ว||ไม่||ใช่||คาด||ว่า||เป็น||เพราะ||ช่วง||ก่อนหน้า||นี้||ไม่||ได้||มี||การ||อธิบาย||เรื่อง||ดัง||กล่าว||อย่าง||ละเอียด||และ||คิด||ว่า||พูด||เพียง||เล็กน้อย||แล้ว||คน||จะ||เข้าใจ||แต่||พอ||ถึง||เวลา||จริง||คน||ก็||ไม่||เข้าใจ||อย่าง||ไร||ก็ตาม||ตน||จะ||อธิบาย||ให้||ประชาชน||และ||คณะ||รัฐมนตรี||ครม.||รับฟัง||ไป||เรื่อย||ๆ||เชื่อ||ว่า||จะ||สามารถ||ตอบ||คำ||ถาม||ได้||ทุก||คำ||ถาม||และ||เมื่อ||ตอบ||ได้||ทุก||คำ||ถาม||แล้ว||หาก||ยัง||ไม่||ยอม||อนุมัติ||ให้||อีก||ก็||คง||จะ||ต้อง||ถาม||ว่า||จะ||เอา||อย่าง||ไร||และ||คน||กทม.||จะ||ว่า||อย่าง||ไร||ค่า||เช่า||ที่||คิด||ใน||โครงการ||ถือ||ได้||ว่า||ถูก||มาก||หาก||เมื่อ||เทียบ||กับ||การ||เช่า||รถ||ทัวร์||ไป||ต่าง||จังหวัด||ที่||ปัจจุบัน||คิด|

In [52]:
with open('char_dictionary.txt', 'r', encoding= 'utf8') as f:
    for i in f:
        for j in i:
            print(j)

a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
ก
ข
ฃ
ค
ต
ฆ
ง
จ
ฉ
ช
ซ
ฌ
ญ
ฎ
ฏ
ฐ
ฑ
ฒ
ณ
ด
ต
ถ
ท
ธ
น
บ
ป
ผ
ฝ
พ
ฟ
ภ
ม
ย
ร
ล
ว
ศ
ษ
ส
ห
ฬ
อ
ฮ
ะ
า
ิ
ี
ึ
ื
ุ
ู
เ
โ
แ
ไ
ใ
ฤ
ๅ
ฦ
ั
่
้
๊
๋
็
์
0
1
2
3
4
5
6
7
8
9
,
;
.
!
?
:
"
/
\
|
_
@
#
%
&
*
+
-
=
<
>
(
)
[
]
{
}
'


In [60]:
a='A'
print(a)
print(a.lower())

A
a


In [58]:
a = np.zeros(5)

In [59]:
a[-1]=1
print(a)

[0. 0. 0. 0. 1.]


In [61]:
a.isupper()

True

In [2]:
a=np.zeros(5)
a[2] = 1
b=np.zeros(5)
b[1] = 1
c=np.zeros(5)
c[4] = 1
d=[a,b,c]
print(d)

[array([0., 0., 1., 0., 0.]), array([0., 1., 0., 0., 0.]), array([0., 0., 0., 0., 1.])]


In [6]:
e=torch.tensor(d)
print(e)
print(e.size())

tensor([[0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.]], dtype=torch.float64)
torch.Size([3, 5])


In [7]:
# Scratch experiment: torch.tensor() cannot build a tensor from a list of multi-element tensors
# and raises the ValueError recorded below. torch.stack([e, e, e]) is the call that works for
# this (it is what the later cells and the embedding classes use).
f = torch.tensor([e,e,e])
print(f)
print(f.size())

ValueError: only one element tensors can be converted to Python scalars

In [51]:
a = [1,2,3]
print(*a)

1 2 3


In [51]:
a=np.array([1,0,0,1])
b=np.array([1,2,0,1])

In [52]:
x=torch.tensor([a,a,a])

In [53]:
y=torch.tensor([b,b,b])

In [56]:
z=torch.stack([x,y])
print(z)

tensor([[[1, 0, 0, 1],
         [1, 0, 0, 1],
         [1, 0, 0, 1]],

        [[1, 2, 0, 1],
         [1, 2, 0, 1],
         [1, 2, 0, 1]]])


In [55]:
tensor_out = []
for i in range(int(64/4)):
    tensor_out.append([])
print(tensor_out)

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]


In [47]:
a = [[]*4]

In [48]:
a

[[]]

In [124]:
w_em = WordEmbedding('./char-word-level-LSTM-CRF/fasttext.th.vec', 300, 1)

In [125]:
# NOTE(review): ('<\s>') is a plain 4-character str, not a 1-tuple, so WordEmbedding.forward
# iterates over its individual characters - which is why the result below has 4 rows rather
# than a single vector for the whole token. Pass [('<\s>',)] to look the token up as one word.
w_em(('<\s>'))

tensor([[[-6.6157e+00, -3.6187e+00, -4.4070e+00,  ..., -5.2342e+00,
          -5.5859e+00, -7.4312e+00],
         [-8.6055e-02,  2.0431e-01,  6.4967e-01,  ..., -7.4839e-02,
           1.9741e-01, -2.8951e-01],
         [-2.9842e-01, -2.3566e-02,  2.8939e-01,  ..., -5.4377e-01,
           1.4998e-01, -3.4089e-01],
         [ 7.5550e-01,  8.5246e-01,  8.0554e-01,  ...,  1.1361e+00,
          -9.5987e-01,  5.9702e-02]]], dtype=torch.float64)