In [1]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import time
import json
import random
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from util import sequence_cross_entropy_with_logits
import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoConfig

PATH_NAME = "./"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Sections of config

# Defining key variables for dataLoader, Training
MAX_LEN = 200
BATCH_SIZE = 4
LEARNING_RATE = 1e-05

bert_checkpoint = "trueto/medbert-base-wwm-chinese"
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
tokenizer.model_max_len=512
EPOCHS=3
FILE_NAME = "3-5-medical-bert.bin"

gpt_checkpoint = "uer/gpt2-chinese-cluecorpussmall"
#tokenizer = AutoTokenizer.from_pretrained(checkpoint)

warmup_steps = 1e2

In [3]:

f = open('Dataset/validate_data.json')
data = json.load(f)
f.close()
# pandas df works better than a list, so much faster wow
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,188,189,190,191,192,193,194,195,196,197
0,病人：脱发，杨医生你好，我妈妈三个月前开始发现脱发厉害，刚开始时掉头发，现在是连眉毛都开始有...,医生：可能是普秃，属于重症斑秃常，与神经、免疫和内分泌有关，应该查一下T细胞亚群、T3/T4...,,,,,,,,,...,,,,,,,,,,
1,病人：纤维腺瘤，这段时间来月经前就一直左乳比较涨痛。,医生：已诊。,,,,,,,,,...,,,,,,,,,,
2,病人：便秘，便秘灌肠，四五天不大便，大便不干，发黑。,医生：你应该找找原因，吃中药调理。,,,,,,,,,...,,,,,,,,,,
3,病人：最初大三阳现在是小三阳，hbsag420.1hbsab2.1hbeag0hbsab0....,医生：说明感染过乙肝病毒，应该检测肝功能、HBVDNA，同时。,,,,,,,,,...,,,,,,,,,,
4,病人：牙痛，一个多月了，最早是不舒服，最近非常痛，痛起来连着左太阳穴一起痛，位置是左上里面第...,医生：根据症状判断应该是龋齿引起牙髓发炎，需要开髓做根管治疗。,,,,,,,,,...,,,,,,,,,,


In [4]:
class CustomDataset(Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.dataframe = df
        self.patient = df[0]
        self.doc = df[1]
        self.max_len = max_len

    def __len__(self):
        return len(self.patient)

    def __getitem__(self, index):
        input = str(self.patient)
        input = " ".join(input.split())

        inputs = self.tokenizer.encode_plus(
            input,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        input_ids = inputs['input_ids']
        input_mask = inputs['attention_mask']
        input_token_type_ids = inputs["token_type_ids"]

        output = str(self.doc[index][1])
        output = " ".join(input.split())

        outputs = self.tokenizer.encode_plus(
            output,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        output_ids = outputs['input_ids']
        output_mask = outputs['attention_mask']
        output_token_type_ids = outputs["token_type_ids"]


        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'input_mask': torch.tensor(input_mask, dtype=torch.long),
            'input_token_type_ids': torch.tensor(input_token_type_ids, dtype=torch.long),
            'output_ids': torch.tensor(output_ids, dtype=torch.long),
            'output_mask': torch.tensor(output_mask, dtype=torch.long),
            'output_token_type_ids': torch.tensor(output_token_type_ids, dtype=torch.long),
        }

In [5]:


# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)

test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(len(data)))
print("TRAIN Dataset: {}".format(len(train_dataset)))
print("TEST Dataset: {}".format(len(test_dataset)))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: 340749
TRAIN Dataset: 272599
TEST Dataset: 68150


##BERT + GPT

In [6]:
from torch import nn
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig, BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline, GPT2Config, AdamW
from transformers.modeling_outputs import TokenClassifierOutput

class CustomModel(torch.nn.Module):
  def __init__(self,checkpoint,num_labels,temperature=0.5, dropout_rate = 0.1): 
    super(CustomModel,self).__init__() 
    self.num_labels = num_labels 
    #self.projection_dim = 256
    self.temperature = temperature
    self.dropout_rate = dropout_rate

    #Load Model with given checkpoint and extract its body
    myConfig = AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True, return_unused_kwargs=True)
    #myConfig.problem_type = "multi_label_classification"
    #myConfig.temperature = self.temperature

    self.encoder = AutoModel.from_pretrained(checkpoint,config=myConfig).to(device)
    self.decoder = GPT2LMHeadModel.from_pretrained(gpt_checkpoint).to(device)

    self.dropout = torch.nn.Dropout(self.dropout_rate) 
    self.classifier = torch.nn.Linear(self.encoder.config.hidden_size,num_labels) # load and initialize weights
    self.criterion = torch.nn.CrossEntropyLoss() # define loss function

  def forward(self, encoder_input_ids=None, encoder_attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None,labels=None):
    #Extract outputs from the body

    outputs = self.encoder(input_ids=encoder_input_ids, attention_mask=encoder_attention_mask, output_hidden_states=True, use_cache=True)
    
    # but the past_key_values argument in self.decoder arg seems to 
    # take a different shape of (batch_size, num_head, sql_len, head_features)


    # classification head - could move this outside of forward?
    sequence_output = self.dropout(outputs.last_hidden_state)
    #class_logits = self.classifier(sequence_output[:,0,:]) #predict the labels based on the projected output
    #class_loss = self.criterion(class_logits, labels)
    class_logits = 0
    class_loss = 0
    
    # decoder
    mask = torch.cat([encoder_attention_mask, decoder_attention_mask], dim=1)
    seq_logits, _ = self.decoder(decoder_input_ids, mask, past_key_values=hidden_states_tensor)
    return TokenClassifierOutput(loss=class_loss, logits=class_logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions), seq_logits

model=CustomModel(checkpoint=bert_checkpoint,num_labels=10).to(device)
print(model)

Some weights of the model checkpoint at trueto/medbert-base-wwm-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [7]:

def train(optimizer, model, training_loader, testing_loader, device, num_epochs, learning_rate = 0.1):

#------------------------START TRAINING-------------------
    update_count = 0

    start = time.time()
    print('start training....')
    for epoch in range(num_epochs):
        #------------------------training------------------------
        model.train()
        total_losses = 0
        class_losses = 0
        seq_losses = 0
        times = 0
        for _, data in enumerate(training_loader, 0):
            encoder_input = data['input_ids'].to(device, dtype = torch.long)
            mask_encoder_input = data['input_mask'].to(device, dtype = torch.long)
            #encoder_token_type_ids = data['input_token_type_ids'].to(device, dtype = torch.long)
            #targets = 
            decoder_input = data['output_ids'].to(device, dtype = torch.long)
            mask_decoder_input = data['output_mask'].to(device, dtype=torch.long)
            #decoder_token_type_ids = data['output_token_type_ids'].to(device, dtype=torch.long)
            
            

            classifier_outputs, seq_logits = model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input)
            
            ## classifier backprop
            class_loss = 0
            #class_loss = classifier_outputs.loss
            #class_loss.backward()

            ## seq2seq backprop
            out = seq_logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            seq_loss = sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
            seq_loss.backward()

            total_losses += class_loss.item() + seq_loss.item()
            class_losses += class_loss.item()
            seq_losses += seq_loss.item()
            times += 1
            
            update_count += 1

            # TODO: why do we need this line?
            # if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
            optimizer.step()
            optimizer.zero_grad()
        end = time.time()
        print('-'*20 + f'epoch {epoch}' + '-'*20)
        print(f'time: {(end - start)}')
        print(f'total loss: {total_losses / times}')
        print(f'classifier loss: {class_losses / times}')
        print(f'seq loss: {seq_losses / times}')
        start = end

In [8]:
# set up optimizer

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# call train function
train(optimizer, model, training_loader, testing_loader, device, EPOCHS, LEARNING_RATE)

start training....


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


AttributeError: 'CustomModel' object has no attribute 'config'

##GPT2

In [None]:

# text_generator = TextGenerationPipeline(model, tokenizer)
# result = text_generator("这是很久之前的事情了", max_length=100, do_sample=True)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro



In [None]:
# for grabbing key, value pairs 
"""
# get all named parameters in the model
    named_params = dict(self.encoder.named_parameters())

    # filter named parameters to only include the key-value pairs for the BertSelfAttention layers
    attention_params = {k: v for k, v in named_params.items() if 'attention.self.' in k}

    # print the keys and shapes of the filtered parameters
    keys_tensor = torch.empty((2, self.encoder.config.num_hidden_layers, self.encoder.config.num_attention_heads,
                           self.encoder.config.max_position_embeddings, self.encoder.config.hidden_size // self.encoder.config.num_attention_heads))
    values_tensor = torch.empty((2, self.encoder.config.num_hidden_layers, self.encoder.config.num_attention_heads,
                             self.encoder.config.max_position_embeddings, self.encoder.config.hidden_size // self.encoder.config.num_attention_heads))

    # loop through attention_params and fill tensors
    for i, (k, v) in enumerate(attention_params.items()):
        if 'key' in k:
            keys_tensor[0][i] = v.data.reshape(model.config.num_attention_heads, model.config.max_position_embeddings,
                                            model.config.hidden_size // model.config.num_attention_heads).transpose(0, 1)
        elif 'value' in k:
            values_tensor[1][i] = v.data.reshape(model.config.num_attention_heads, model.config.max_position_embeddings,
                                              model.config.hidden_size // model.config.num_attention_heads).transpose(0, 1)
    # print the shape of the tensors
    print(keys_tensor.shape)
    print(values_tensor.shape)
"""