In [1]:
import torch
from transformers import BertTokenizer, BertModel

# Data preprocessing

## How to load the tokenizer

In [2]:
# Load the BERT-style tokenizer from the local './pytorch-ernie' path.
# do_lower_case=True lower-cases the input text before tokenization.
tokenizer = BertTokenizer.from_pretrained('./pytorch-ernie', do_lower_case = True)

### Convert a batch of sentences to token IDs

In [3]:
# Three sentences of different lengths, so the padding behaviour is visible.
batch_sentences = [
    "Hello I'm a single sentence",
    "And another sentence",
    "And the very very last one",
]
# padding=True pads every row to the longest sentence in the batch;
# return_tensors="pt" returns PyTorch tensors instead of Python lists.
encoded_inputs = tokenizer(
    batch_sentences,
    padding=True,
    return_tensors="pt",
    add_special_tokens=True,
)

In [4]:
# Show the encoding: the input_ids, token_type_ids and attention_mask tensors.
encoded_inputs

{'input_ids': tensor([[    1,  6368,  1675, 17963,  1979,  1545,  7512,  8090,  9595, 10483,
             2],
        [    1,  1662,  7076,  8090,  9595, 10483,     2,     0,     0,     0,
             0],
        [    1,  1662,  1499,  6318,  6318,  6975,  3777,     2,     0,     0,
             0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

## Modeling

In [8]:
from Code.model import *

In [9]:
# Build the classifier with its default arguments.
# NOTE(review): BertClassifier is presumably provided by the wildcard import
# of Code.model in the previous cell - confirm.
model = BertClassifier()

In [16]:
batch_sentences = [
    "Hello I'm a single sentence",
    "And another sentence",
    "And the very very last one",
]
encoded_inputs = tokenizer(
    batch_sentences,
    padding=True,
    return_tensors="pt",
    add_special_tokens=True,
)
# Decode every row back to text to inspect the special tokens and the padding.
for decoded in map(tokenizer.decode, encoded_inputs["input_ids"]):
    print(decoded)
# Keep the two tensors used by the following cells under short names.
input_tensor, padding_mask = encoded_inputs["input_ids"], encoded_inputs['attention_mask']

[CLS] hello i [UNK] m a single sentence [SEP]
[CLS] and another sentence [SEP] [PAD] [PAD] [PAD] [PAD]
[CLS] and the very very last one [SEP] [PAD] [PAD] [PAD]


In [17]:
# NOTE(review): a notebook cell only displays its last expression, so the bare
# `padding_mask` below produces no output; only the input_ids tensor is shown.
padding_mask
encoded_inputs["input_ids"]

tensor([[    1,  6368,  1675, 17963,  1979,  1545,  7512,  8090,  9595, 10483,
             2],
        [    1,  1662,  7076,  8090,  9595, 10483,     2,     0,     0,     0,
             0],
        [    1,  1662,  1499,  6318,  6318,  6975,  3777,     2,     0,     0,
             0]])

In [18]:
# Run the token ids through the model and report the shape of the result.
output = model(input_tensor)
print(tuple(output.shape))

(3, 11)


In [19]:
# Smoke test of the loss function: the model output is reused as its own
# target, so the value only shows that mask_BCE_loss runs with the padding
# mask; it is not a meaningful training loss.
y_true = output
mask_BCE_loss(y_true, output, padding_mask)

tensor(0.5048, grad_fn=<MulBackward0>)

## Data loader
pos:   372
neg:   4651
total: 5023


In [359]:
import torch
from transformers import BertTokenizer, BertModel
import glob
import chinese_converter
from string import punctuation
import ast
import numpy as np
import re

class AMLDataset(torch.utils.data.Dataset):
    """Text dataset read from files whose first line lists the target names.

    Each file is expected to hold a Python list literal of names on its first
    line (parsed with ``ast.literal_eval``) followed by the document text on
    the remaining lines.

    Args:
        data_path: Directory containing the ``.txt`` samples.
        model_path: Path handed to ``BertTokenizer.from_pretrained``.
        file_patterns: Glob patterns, relative to ``data_path``, selecting the
            files to load. The default keeps the original three-file subset;
            pass ``('*.txt',)`` to use every file in the directory.
        verbose: When true, print the target names of every sample that is
            read (the original debugging behaviour).
    """

    # Characters deleted from the text in one pass: ASCII punctuation, a set of
    # (mostly full-width) CJK punctuation marks, and the space character.
    _STRIP_TABLE = str.maketrans('', '', punctuation + '。、·「」！，）：（【】' + ' ')

    def __init__(self, data_path='./data/training_set/', model_path='./pytorch-ernie',
                 file_patterns=('108.txt', '209.txt', '1.txt'), verbose=True):
        import os  # local import: the notebook's import cell does not bring in os

        # Patterns are globbed one after another so the resulting order matches
        # the order of `file_patterns`.
        self.paths = []
        for pattern in file_patterns:
            self.paths.extend(glob.glob(os.path.join(data_path, pattern)))
        # Honour `model_path` instead of a hard-coded tokenizer location.
        self.tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
        self.verbose = verbose

    def __len__(self):
        """Denotes the total number of samples."""
        return len(self.paths)

    def __getitem__(self, index):
        """Return ``(input_data, target)`` for the file at position ``index``.

        ``target`` is the space-joined list of names from the first line.
        ``input_data`` is the rest of the file joined into one string with
        ASCII letters, punctuation and spaces removed.
        """
        path = self.paths[index]
        # Read everything first so the file handle is released before processing.
        with open(path, encoding='utf-8') as f:
            lines = f.read().split('\n')

        target = " ".join(ast.literal_eval(lines[0]))
        if self.verbose:
            print(target)

        input_data = "".join(lines[1:])
        # Drop ASCII letters, then punctuation and spaces.
        input_data = re.sub(r'[A-Za-z]', '', input_data)
        input_data = input_data.translate(self._STRIP_TABLE)
        return input_data, target
        

def convert_batch_token_target(batch_input_data, batch_target):
    """Tokenize a batch of texts and build the matching label tensor.

    The texts are converted to simplified Chinese, then tokenized by the
    module-level ``tokenizer`` with padding to the longest text and without
    special tokens. Labels come from ``process_target``.

    Args:
        batch_input_data: Iterable of raw text strings.
        batch_target: Per-text target strings, passed on to ``process_target``.

    Returns:
        Tuple ``(input_token, padding_mask, labels)``: the tokenizer's
        ``input_ids`` and ``attention_mask`` tensors, and the labels wrapped
        in ``torch.tensor``.
    """
    simplified_texts = list(map(chinese_converter.to_simplified, batch_input_data))
    encoding = tokenizer(
        simplified_texts,
        padding=True,
        return_tensors="pt",
        add_special_tokens=False,
    )
    input_token = encoding["input_ids"]
    padding_mask = encoding['attention_mask']
    labels = process_target(input_token, batch_target)
    return input_token, padding_mask, torch.tensor(labels)

def process_target(input_token, batch_target):
    """Build a 0/1 label sequence per sample marking where target names occur.

    Every sequence of token ids is mapped back to its tokens with the
    module-level ``tokenizer`` (one token per id, so each label list has
    exactly the same length as its id sequence). Each name is converted to
    simplified Chinese and every window of ``len(name)`` consecutive tokens
    whose concatenation equals the name is labelled 1.

    Matching assumes one token per character of the name.

    Args:
        input_token: Iterable of token-id sequences (tensor rows or lists).
        batch_target: Iterable of strings, each holding whitespace-separated
            names for the corresponding sequence.

    Returns:
        A list with one list of ints (0 or 1) per input sequence.
    """
    targets = []
    for slice_token, names in zip(input_token, batch_target):
        # Accept tensor rows as well as plain Python sequences.
        token_ids = slice_token.tolist() if hasattr(slice_token, "tolist") else list(slice_token)
        # One token per id keeps the labels aligned with the ids; splitting a
        # decoded string can merge word-pieces and change the length.
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        target = [0] * len(tokens)
        # split() without arguments skips empty names (e.g. an empty target string).
        for name in names.split():
            name = chinese_converter.to_simplified(name)
            width = len(name)
            # "+ 1" so the last possible window (a name that ends the sequence)
            # is also checked.
            for start in range(len(tokens) - width + 1):
                if name == "".join(tokens[start:start + width]):
                    target[start:start + width] = [1] * width
        targets.append(target)
    return targets
                    

In [360]:
# Dataset with its default paths, wrapped in a shuffling loader that yields
# batches of up to four (text, target) pairs using one worker process.
data = AMLDataset()
dataloader = torch.utils.data.DataLoader(
    data,
    batch_size=4,
    shuffle=True,
    num_workers=1,
)

In [361]:
# Pull a single batch to sanity-check the tensor shapes, then stop.
# Inside the loop `d` and `t` are re-bound from the raw batch to the converted
# tensors, with `m` being the padding mask.
# NOTE(review): list(d) / list(t) presumably turn the collated batch of strings
# into plain lists - confirm against the DataLoader collate behaviour.
for d, t in dataloader:
    d, m, t = convert_batch_token_target(list(d), list(t))
    print(d.size(), m.size(), t.size())
    break

彭振源 王澤生 楚瑞芳

陳淳伍
torch.Size([3, 1330]) torch.Size([3, 1330]) torch.Size([3, 1330])
