In [1]:

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import time
import json
import random
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from tqdm.auto import tqdm
from util import sequence_cross_entropy_with_logits
import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoConfig

PATH_NAME = "./"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Sections of config

# Defining key variables for dataLoader, Training
MAX_LEN = 200
BATCH_SIZE = 8
LEARNING_RATE = 1e-05

bert_checkpoint = "trueto/medbert-base-wwm-chinese"
encoder_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
encoder_tokenizer.model_max_len=512
EPOCHS=3
FILE_NAME = "3-14-seq-GPT.bin"

gpt_checkpoint = "uer/gpt2-chinese-cluecorpussmall"
decoder_tokenizer = AutoTokenizer.from_pretrained(gpt_checkpoint)

warmup_steps = 1e2

In [3]:

f = open('Dataset/validate_data.json')
data = json.load(f)
f.close()
# pandas df works better than a list, so much faster wow
df = pd.DataFrame(data)
df = df.loc[:, :1]
df.head()

Unnamed: 0,0,1
0,病人：脱发，杨医生你好，我妈妈三个月前开始发现脱发厉害，刚开始时掉头发，现在是连眉毛都开始有...,医生：可能是普秃，属于重症斑秃常，与神经、免疫和内分泌有关，应该查一下T细胞亚群、T3/T4...
1,病人：纤维腺瘤，这段时间来月经前就一直左乳比较涨痛。,医生：已诊。
2,病人：便秘，便秘灌肠，四五天不大便，大便不干，发黑。,医生：你应该找找原因，吃中药调理。
3,病人：最初大三阳现在是小三阳，hbsag420.1hbsab2.1hbeag0hbsab0....,医生：说明感染过乙肝病毒，应该检测肝功能、HBVDNA，同时。
4,病人：牙痛，一个多月了，最早是不舒服，最近非常痛，痛起来连着左太阳穴一起痛，位置是左上里面第...,医生：根据症状判断应该是龋齿引起牙髓发炎，需要开髓做根管治疗。


In [6]:
class CustomDataset(Dataset):

    def __init__(self, df, encoder_tokenizer, decoder_tokenizer, max_len):
        self.encoder_tokenizer = encoder_tokenizer
        self.decoder_tokenizer = decoder_tokenizer
        self.dataframe = df
        self.patient = df[0]
        self.doc = df[1]
        self.max_len = max_len

    def __len__(self):
        return len(self.patient)

    def __getitem__(self, index):
        # grab patient's utterance
        input = str(self.patient[index])
        input = " ".join(input.split())

        inputs = self.encoder_tokenizer.encode_plus(
            input,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        input_ids = inputs['input_ids']
        input_mask = inputs['attention_mask']
        input_token_type_ids = inputs["token_type_ids"]
        
        # grab doc's utterance
        output = str(self.doc[index])
        output = " ".join(output.split())

        outputs = self.decoder_tokenizer.encode_plus(
            output,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        output_ids = outputs['input_ids']
        output_mask = outputs['attention_mask']
        output_token_type_ids = outputs["token_type_ids"]


        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'input_mask': torch.tensor(input_mask, dtype=torch.long),
            'input_token_type_ids': torch.tensor(input_token_type_ids, dtype=torch.long),
            'output_ids': torch.tensor(output_ids, dtype=torch.long),
            'output_mask': torch.tensor(output_mask, dtype=torch.long),
            'output_token_type_ids': torch.tensor(output_token_type_ids, dtype=torch.long),
        }

In [7]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)

test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(len(data)))
print("TRAIN Dataset: {}".format(len(train_dataset)))
print("TEST Dataset: {}".format(len(test_dataset)))

training_set = CustomDataset(train_dataset, encoder_tokenizer, decoder_tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, encoder_tokenizer, decoder_tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)


FULL Dataset: 340749
TRAIN Dataset: 272599
TEST Dataset: 68150


$$BERT + GPT

In [8]:
from torch import nn
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig, BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline, GPT2Config, AdamW
from transformers.modeling_outputs import TokenClassifierOutput

class CustomModel(torch.nn.Module):
  def __init__(self,checkpoint,num_labels,temperature=0.5, dropout_rate = 0.1): 
    super(CustomModel,self).__init__() 
    self.num_labels = num_labels 
    self.temperature = temperature
    self.dropout_rate = dropout_rate

    #Load Model with given checkpoint and extract its body
    myConfig = AutoConfig.from_pretrained(checkpoint,output_hidden_states=True)
    myConfig.problem_type = "multi_label_classification"
    myConfig.temperature = self.temperature
    myConfig.output_attentions = True

    self.encoder = AutoModel.from_pretrained(checkpoint,config=myConfig)
    self.decoder = GPT2LMHeadModel.from_pretrained(gpt_checkpoint, add_cross_attention=True).to(device)

    self.dropout = torch.nn.Dropout(self.dropout_rate) 
    self.classifier = torch.nn.Linear(self.encoder.config.hidden_size,num_labels) # load and initialize weights
    self.criterion = torch.nn.CrossEntropyLoss() # define loss function

  def forward(self, encoder_input_ids=None, encoder_attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None,labels=None):
    #Extract outputs from the body

    outputs = self.encoder(input_ids=encoder_input_ids, attention_mask=encoder_attention_mask, output_hidden_states=True)
    # select the 12th layer 
    hidden_states = outputs.hidden_states[-1]

    # this decoder outputs a dict of loss, logits, past_key_values, hidden_states, attentions, cross_attentions
    seqoutputs = self.decoder(input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, encoder_hidden_states = hidden_states, labels=decoder_input_ids) 
    
    return seqoutputs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=CustomModel(checkpoint=bert_checkpoint,num_labels=10).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

Some weights of the model checkpoint at trueto/medbert-base-wwm-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at uer/gpt2-chinese-cluecorpus

$$Train

In [9]:

def train(optimizer, model, training_loader, testing_loader, device, num_epochs, learning_rate = 0.1):
#------------------------START TRAINING-------------------
    update_count = 0

    start = time.time()
    print('start training....')
    for epoch in range(num_epochs):
        #------------------------training------------------------
        model.train()
        total_losses = 0
        seq_losses = 0
        times = 0
        for _, data in enumerate(training_loader, 0):
            encoder_input = data['input_ids'].to(device, dtype = torch.long)
            mask_encoder_input = data['input_mask'].to(device, dtype = torch.long)
            #encoder_token_type_ids = data['input_token_type_ids'].to(device, dtype = torch.long)
            decoder_input = data['output_ids'].to(device, dtype = torch.long)
            mask_decoder_input = data['output_mask'].to(device, dtype=torch.long)
            #decoder_token_type_ids = data['output_token_type_ids'].to(device, dtype=torch.long)


            outputs= model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input)
            
            seq_loss = outputs.loss
            seq_loss.backward()

            total_losses += seq_loss.item()
            times += 1
            update_count += 1

            optimizer.step()
            optimizer.zero_grad()
        end = time.time()
        print('-'*20 + f'epoch {epoch}' + '-'*20)
        print(f'time: {(end - start)}')
        print(f'total loss: {total_losses / times}')
        start = end
    


In [10]:
#num_training_steps = EPOCHS * len(training_loader)
#progress_bar_train = tqdm(range(num_training_steps))

# call train function
#train(optimizer, model, testing_loader, testing_loader, device, EPOCHS, LEARNING_RATE)

# save the model after training
#print('----- saving model -----')
#torch.save(model.state_dict(), "{PATH_NAME}/bert_gpt_validate_data_testingloader_seq2seq.bin".format(PATH_NAME=PATH_NAME))

$$Evaluation

In [11]:
def calculate_perplexity(
    batch_size=1,
    decoder_path='decoder.pth',
    model = None,
    test_dataloader=None,
    device='cuda'):
    # make sure your model is on GPU
    device = torch.device(device)

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    model = model
    model.load_state_dict(torch.load(decoder_path))
    print(f'load from {decoder_path}')
    model = model.to(device)
    model.eval()
    print('load success')
    #------------------------END LOAD MODEL--------------
    #------------------------END LOAD VAL DATA--------------

    perplexity = 0
    batch_count = 0
    print('start calculate the test perplexity....')

    with torch.no_grad():
        for _, data in enumerate(test_dataloader, 0):
            encoder_input = data['input_ids'].to(device, dtype = torch.long)
            mask_encoder_input = data['input_mask'].to(device, dtype = torch.long)
            decoder_input = data['output_ids'].to(device, dtype = torch.long)
            mask_decoder_input = data['output_mask'].to(device, dtype=torch.long)

            outputs = model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input)

            logits = outputs.logits
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
            perplexity += np.exp(loss.item())
            batch_count += 1


    print(f'test perplexity: {perplexity / batch_count}')

In [12]:
# passing in the testing set, get perplexity values on training 
#calculate_perplexity(batch_size=BATCH_SIZE, decoder_path="{PATH_NAME}/bert_gpt_validate_data_testingloader_seq2seq.bin".format(PATH_NAME=PATH_NAME), model=model, test_dataloader=testing_loader)

In [13]:
!pip install nltk
from collections import defaultdict
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from nltk.translate.nist_score import sentence_nist
from nltk.util import ngrams
import numpy as np


def bleu(predict, target, n):
    return sentence_bleu([target], predict, weights=tuple(1 / n for i in range(n)))


def nist(predict, target, n):
    if len(predict) < n or len(target) < n:
        return 0
    return sentence_nist([target], predict, n)


def cal_entropy(generated):
    etp_score = [0.0, 0.0, 0.0, 0.0]
    div_score = [0.0, 0.0, 0.0, 0.0]
    counter = [defaultdict(int), defaultdict(int),
               defaultdict(int), defaultdict(int)]
    for gg in generated:
        g = gg.rstrip().split()
        for n in range(4):
            for idx in range(len(g)-n):
                ngram = ' '.join(g[idx:idx+n+1])
                counter[n][ngram] += 1
    for n in range(4):
        total = sum(counter[n].values()) + 1e-10
        for v in counter[n].values():
            etp_score[n] += - (v+0.0) / total * (np.log(v+0.0) - np.log(total))
        div_score[n] = (len(counter[n].values())+0.0) / total
    return etp_score, div_score


def cal_length(sentences):
    sen_length = [len(s.split()) for s in sentences]
    return np.mean(sen_length), np.var(sen_length)


def calculate_metrics(predict, reference):
    reference_len = len(reference)
    predict_len = len(predict)

    #-------------------bleu----------
    bleu_2 = bleu(predict, reference, 2)
    bleu_4 = bleu(predict, reference, 4)
    #-------------------nist----------
    nist_2 = nist(predict, reference, 2)
    nist_4 = nist(predict, reference, 4)
    #-------------------meteor----------
    predict = " ".join(predict)
    reference = " ".join(reference)
    meteor_scores = meteor_score([reference], predict)
    return bleu_2, bleu_4, nist_2, nist_4, meteor_scores




In [14]:
#from metrics import cal_entropy, cal_length, calculate_metrics


def validation(file_name):
    with open(file_name, "r", encoding='utf-8') as f:
        json_data = f.read()
        data = json.loads(json_data)
    
    bleu_2scores = 0
    bleu_4scores = 0
    nist_2scores = 0
    nist_4scores = 0
    meteor_scores = 0
    sentences = []

    for d in tqdm(data):
        reference = list(d['reference'])
        predict = list(d['predict'])
        temp_bleu_2, \
        temp_bleu_4, \
        temp_nist_2, \
        temp_nist_4, \
        temp_meteor_scores = calculate_metrics(predict, reference)

        bleu_2scores += temp_bleu_2
        bleu_4scores += temp_bleu_4
        nist_2scores += temp_nist_2
        nist_4scores += temp_nist_4
        meteor_scores += temp_meteor_scores
        sentences.append(" ".join(predict))

    entro, dist = cal_entropy(sentences)
    mean_len, var_len = cal_length(sentences)
    num = len(sentences)
    print(f'avg: {mean_len}, var: {var_len}')
    print(f'entro: {entro}')
    print(f'dist: {dist}')
    print(f'bleu_2scores: {bleu_2scores / num}')
    print(f'bleu_4scores: {bleu_4scores / num}')
    print(f'nist_2scores: {nist_2scores / num}')
    print(f'nist_4scores: {nist_4scores / num}')
    print(f'meteor_scores: {meteor_scores / num}')

In [37]:
import torch.multiprocessing as mp
import torch.nn.functional as F

def top_k_logits(logits, k):
    """Mask logits so that only top-k logits remain
    """
    values, _ = torch.topk(logits, k)
    print('logits.shape', logits.shape)
    print('values.shape', values.shape)
    print('values[:,-1] shape', values[:,-1].shape)
    print('values[:, :, -1].unsqueeze(0)', values[:, -1].unsqueeze(1).shape)
    min_values = values[:, -1].unsqueeze(1).repeat(1, logits.shape[-1], 4)
    min_values = min_values.transpose(1, 2)
    #min_values = values[:, :, -1].unsqueeze(0).repeat(1, 1, logits.shape[-1])
    print('min_values,shape', min_values.shape)
    return torch.where(logits < min_values, torch.ones_like(logits, dtype=logits.dtype) * -1e10, logits)

def get_decoder(decoder_path):
    old_state_dict = torch.load(decoder_path, map_location='cuda')
    print(f'load from {decoder_path}')
    model = CustomModel(checkpoint=bert_checkpoint,num_labels=10).to(device)
    encoder = model.encoder
    decoder = model.decoder

    encoder_state_dict = encoder.state_dict()
    for i in encoder_state_dict.keys():
        encoder_state_dict[i] = old_state_dict['encoder.' + i]
    encoder.load_state_dict(encoder_state_dict)

    decoder_state_dict = decoder.state_dict()
    for i in decoder_state_dict.keys():
        decoder_state_dict[i] = old_state_dict['decoder.' + i]
    decoder.load_state_dict(decoder_state_dict)
    return encoder, decoder

def generate_sentences(test_dataloader, encoder_tokenizer, decoder_tokenizer, decoder_path, top_k, l):
    # make sure your model is on GPU
    device = torch.device('cuda')

    #------------------------LOAD MODEL-----------------
    encoder, decoder = get_decoder(decoder_path)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    encoder.eval()
    decoder.eval()

    #------------------------END LOAD VALIDATE DATA--------------


    #------------------------START SAMPLE GENERETE-------------------
    for _, data in enumerate(test_dataloader, 0):
        with torch.no_grad():
            encoder_input = data['input_ids'].to(device, dtype = torch.long)
            mask_encoder_input = data['input_mask'].to(device, dtype = torch.long)
            decoder_input = data['output_ids'].to(device, dtype = torch.long)
            mask_decoder_input = data['output_mask'].to(device, dtype=torch.long)
            encoder_outputs = encoder(input_ids=encoder_input, attention_mask=mask_encoder_input, output_hidden_states=True)
            hidden_states = encoder_outputs.hidden_states[-1]
            sentence = []

            prev_pred = decoder_input[:, :1]
            sentence.append(prev_pred)

            length = 1
            # decoding loop
            for i in range(100):
                # this decoder outputs a dict of loss, logits, past_key_values, hidden_states, attentions, cross_attentions
                seqoutputs = decoder(input_ids=decoder_input, attention_mask=mask_decoder_input, encoder_hidden_states = hidden_states, labels=decoder_input) 
                
                logits = seqoutputs.logits.squeeze(1)
                logits = top_k_logits(logits, k=top_k)
                probs = F.softmax(logits, dim=-1)
                prev_pred = torch.multinomial(probs, num_samples=1)
                sentence.append(prev_pred)
                if prev_pred[0][0] == 102:
                    break
                length += 1

            sentence = torch.cat(sentence, dim=-1)
            predict = decoder_tokenizer.convert_ids_to_tokens(sentence[0].tolist())

            target = decoder_input.squeeze(dim=0)
            target_num = (target != 0).sum()
            reference = encoder_tokenizer.convert_ids_to_tokens(target[:target_num].tolist())

            encoder_input = encoder_input.squeeze(dim=0)
            encoder_input_num = (encoder_input != 0).sum()
            inputs = decoder_tokenizer.convert_ids_to_tokens(encoder_input[:encoder_input_num].tolist())
            l.append(["".join(inputs[1:-1]), "".join(predict[1:-1]), "".join(reference[1:-1])])


    #------------------------END SAMPLE GENERETE-------------------


def sample_generate(
    generate_filename,
    top_k = 50,
    decoder_path='/home/ubuntu/CS_224N/bert_gpt_validate_data_testingloader_seq2seq.bin',
    process_num=1
    ):
    encoder_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
    decoder_tokenizer = AutoTokenizer.from_pretrained(gpt_checkpoint)
    l=list()
    testing_loader = DataLoader(testing_set, **test_params)
    generate_sentences(test_dataloader=testing_loader, encoder_tokenizer=encoder_tokenizer, decoder_tokenizer=decoder_tokenizer, decoder_path=decoder_path, top_k=top_k, l=l)
    
    """
    mgr = mp.Manager()
    l = mgr.list()
    processes = []
    for rank in range(process_num):
        if rank == process_num - 1:
            ## handles the last batch of data
            data = test_data.iloc[int((rank / process_num) * length):]
        else:
            data = test_data.iloc[int((rank / process_num) * length) : int(((rank + 1) / process_num) * length)]
            print('length of batched_data', len(data))
        batch_data_loader = CustomDataset(data, encoder_tokenizer, decoder_tokenizer, MAX_LEN)
        p = mp.Process(target=generate_sentences, args=(batch_data_loader, tokenizer, decoder_path, rank, top_k, l))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    """

    Dialog_list = []
    with open(generate_filename, 'w', encoding='utf-8') as f:
        for s in l:
            cases = dict()
            cases['input'] = s[0]
            cases['predict'] = s[1]
            cases['reference'] = s[2]
            Dialog_list.append(cases)
        json.dump(Dialog_list, f, ensure_ascii = False, indent = 4)


In [38]:
sample_generate(generate_filename='generate_sentences.txt')
print('finished generating')
validation('generate_sentences.txt')

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 22.20 GiB total capacity; 18.99 GiB already allocated; 18.06 MiB free; 20.20 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

$$Evaluation