In [6]:
!pip install -q transformers datasets

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

import json
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoConfig

# from google.colab import drive
# drive.mount('/content/gdrive')

PATH_NAME = "./"
# %cd {PATH_NAME}

  from .autonotebook import tqdm as notebook_tqdm


## Data Preprocessing

In [8]:

## Sections of config

# Defining key variables for dataLoader, Training
MAX_LEN = 200
BATCH_SIZE = 8
LEARNING_RATE = 1e-05

checkpoint = "uer/gpt2-chinese-cluecorpussmall"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512
EPOCHS=3
FILE_NAME = "3-5-medical-bert.bin"

warmup_steps = 1e2


In [3]:

f = open('Dataset/validate_data.json')
data = json.load(f)
f.close()

[['病人：脱发，杨医生你好，我妈妈三个月前开始发现脱发厉害，刚开始时掉头发，现在是连眉毛都开始有些掉。。',
  '医生：可能是普秃，属于重症斑秃常，与神经、免疫和内分泌有关，应该查一下T细胞亚群、T3/T4/Tsh、微量元素等项目，结果出来再联系。可能是普秃，属于重症斑秃常，与神经、免疫和内分泌有关，应该查一下T细胞亚群、T3/T4/Tsh、微量元素等项目，结果出来再联系。'],
 ['病人：纤维腺瘤，这段时间来月经前就一直左乳比较涨痛。', '医生：已诊。'],
 ['病人：便秘，便秘灌肠，四五天不大便，大便不干，发黑。', '医生：你应该找找原因，吃中药调理。'],
 ['病人：最初大三阳现在是小三阳，hbsag420.1hbsab2.1hbeag0hbsab0.05hbcab20这是第一次hbsag353.1hbsab0.03hbeag0hbsab0.96hbcab7.58。',
  '医生：说明感染过乙肝病毒，应该检测肝功能、HBVDNA，同时。'],
 ['病人：牙痛，一个多月了，最早是不舒服，最近非常痛，痛起来连着左太阳穴一起痛，位置是左上里面第二颗，疼的晚上睡不着。',
  '医生：根据症状判断应该是龋齿引起牙髓发炎，需要开髓做根管治疗。'],
 ['病人：56天的宝宝左眼有眼屎且流眼泪，刚出生十多天的时候是右眼有眼屎，最后查明是肚脐发炎引起的，并伴有低烧，当时左眼很干净，经过南京儿童医院的治疗，都已全愈，可回来没几天左眼又开始有眼屎并流眼泪，（眼屎不是很多），量了体温，都是36度多，没有发烧的可能.右眼现在很干净。',
  '医生：建议尽快带孩子到我门诊就诊检查一下，进行泪道冲洗后，在确定如何治疗.'],
 ['病人：怀孕，', '医生：我院已经预约不上了，对不起。'],
 ['病人：胆囊息肉，8月17日刚做了甲状腺切除，切片是甲状腺右侧乳头癌，现发现胆囊大小59*22，胆总管内经5，胆囊壁未见增厚，胆汁投声未见明显异常，胆囊壁可见数个点状等回声附着，交大着大小8*6，后方未见声尾及声影显示，不随体位改变而移动，肝内胆管及肝外胆管中上段未见扩张，下段显示不清。',
  '医生：你应该尽早手术治疗了，首选微创腹腔镜胆囊切除术，你做过甲状腺手术，对本次手术没有影响。至于要我主刀，你可以与我的助手王维东主任联系，请与王维东医师联系：4006606120

In [10]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.dialogue = dataframe.dialogue
        #self.targets = self.data.one_hot
        self.max_len = max_len

    def __len__(self):
        return len(self.dialogue)

    def __getitem__(self, index):
        dialogue = str(self.dialogue[index])
        dialogue = " ".join(dialogue.split())

        inputs = self.tokenizer.encode_plus(
            dialogue,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

In [11]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)

test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (340749, 1)
TRAIN Dataset: (272599, 1)
TEST Dataset: (68150, 1)


##Training

In [12]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [13]:
# from torch import nn
# from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
# from transformers.modeling_outputs import TokenClassifierOutput

# class CustomModel(torch.nn.Module):
#   def __init__(self,checkpoint,num_labels,temperature=0.5, dropout_rate = 0.1): 
#     super(CustomModel,self).__init__() 
#     self.num_labels = num_labels 
#     self.projection_dim = 256
#     self.temperature = temperature
#     self.dropout_rate = dropout_rate

#     #Load Model with given checkpoint and extract its body
#     myConfig = AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True)
#     myConfig.problem_type = "multi_label_classification"
#     myConfig.temperature = self.temperature

#     self.model = model = AutoModel.from_pretrained(checkpoint,config=myConfig)

#     # Freezing paramaters
#     # for param in self.model.parameters():
#     #         param.requires_grad = False

#     self.dropout = torch.nn.Dropout(self.dropout_rate) 
#     self.classifier = torch.nn.Linear(self.model.config.hidden_size,num_labels) # load and initialize weights
#     self.criterion = torch.nn.CrossEntropyLoss() # define loss function

#   def forward(self, input_ids=None, attention_mask=None,labels=None):
#     #Extract outputs from the body

#     outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
#     sequence_output = self.dropout(outputs[0])
#     logits = self.classifier(sequence_output[:,0,:]) #predict the labels based on the projected output
#     loss = self.criterion(logits, labels)
#     GPTLoss = 1
#     return loss +GPTLoss
#     #seqeuence_output 8 200 768 for decoder 
#     #sequence_output.shape torch.Size([8, 200, 768]) sequence_output[:,0,:].shape torch.Size([8, 768]) sequence_output[:,0,:].view(-1,768).shape torch.Size([8, 768])
#     return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model=CustomModel(checkpoint=checkpoint,num_labels=10).to(device)

In [14]:
# from tqdm.auto import tqdm
# num_training_steps = EPOCHS * len(training_loader)
# from datasets import load_metric
# metric = load_metric("f1")

# def train(optimizer, model, training_loader, testing_loader, device, num_epochs, learning_rate = 0.1):
#     best_accuracy = float('inf')

#     print("Training Started with hyperparameters: batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature))
    
#     for epoch in range(num_epochs):
#         model.train()
#         total_loss_current_epoch = 0
#         for _,data in enumerate(training_loader, 0):
#             ids = data['ids'].to(device, dtype = torch.long)
#             mask = data['mask'].to(device, dtype = torch.long)
#             token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#             targets = data['targets'].to(device, dtype = torch.float)

#             #Calling the model
#             outputs = model(ids, mask, targets)

#             loss = outputs.loss
#             loss.backward()
#             optimizer.step()
#             # lr_scheduler.step()
#             optimizer.zero_grad()
#             progress_bar_train.update(1)
#             if _%5000==0:
#                 print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            
#             #Calculating total loss during this EPOCH
#             total_loss_current_epoch += loss.item()

#         outputs, targets = validation(model, testing_loader,device)
#         outputs = np.array(outputs) >= 0.5
#         accuracy = round(metrics.accuracy_score(targets, outputs),3)

#         if accuracy < best_accuracy:
#             best_accuracy = accuracy

#             torch.save(model.classifier.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = "3_8_accuracy_{ACCURACY}_batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(ACCURACY=accuracy, BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature)))
#             #torch.save(model.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = FILE_NAME))
#         print(f"Epoch {epoch} - Validation Accuracy: {accuracy}")

#         epoch_loss = total_loss_current_epoch / len(training_loader)
#         print(f"Epoch {epoch} - Validation Loss: {epoch_loss}")
#         print("#"*50)

# progress_bar_train = tqdm(range(num_training_steps))
# progress_bar_eval = tqdm(range(EPOCHS * len(testing_loader)))
# optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# train(optimizer, model, training_loader, testing_loader, device, EPOCHS, LEARNING_RATE)

# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'




## GPT Model


In [15]:
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline, GPT2Config, AdamW
epsilon = 1e-8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained(checkpoint).to(device)
configuration = GPT2Config.from_pretrained(checkpoint, output_hidden_states=False)
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# create a bunch of exaples of "question:" and "doctor" answer, and put it into one string. and use "question: XXXX: doctor: " to get the validation 
optimizer = AdamW(model.parameters(),
                  lr = LEARNING_RATE,
                  eps = epsilon
                )

# text_generator = TextGenerationPipeline(model, tokenizer)
# result = text_generator("这是很久之前的事情了", max_length=100, do_sample=True)

print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro



In [16]:
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup
total_steps = len(training_loader) * EPOCHS
lr_scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

import time
import datetime
import random 

total_t0 = time.time()

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

def GPTtrain(optimizer, model, training_loader, testing_loader, device, lr_scheduler, num_epochs, learning_rate = 0.1):
    best_accuracy = float('inf')
    t0 = time.time()
    # print("Training Started with hyperparameters: batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature))
    
    for epoch in range(num_epochs):
        model.train()
        total_loss_current_epoch = 0
        for _,data in enumerate(training_loader, 0):
            model.zero_grad() 
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            # targets = data['targets'].to(device, dtype = torch.float)

            #Calling the model
            # print(model)
            # print("ids[0]", ids[7])
            # print("ids.shape", ids.shape, "targets.shape", targets.shape, "mask.shape", mask.shape, "token_type_ids.shape", token_type_ids.shape)
            
            outputs = model(ids, attention_mask=mask, labels=ids, token_type_ids=token_type_ids)

            loss = outputs[0]
            batch_loss = loss.item()
            total_loss_current_epoch += batch_loss


            # Get sample every x batches.
            if _ % 100 == 0 and not _ == 0:

                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(_, len(training_loader), batch_loss, elapsed))

                model.eval()

                sample_outputs = model.generate(
                                        bos_token_id=random.randint(1,30000),
                                        do_sample=True,   
                                        top_k=50, 
                                        max_length = 200,
                                        top_p=0.95, 
                                        num_return_sequences=1,
                                        # token_type_ids=token_type_ids,
                                        # attention_mask=mask
                                    )
                for i, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
                
                model.train()

            loss.backward()
            optimizer.step()

            lr_scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

            progress_bar_train.update(1)
            if _%5000==0:
                print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            
            #Calculating total loss during this EPOCH
            total_loss_current_epoch += loss.item()

        # outputs, targets = validation(model, testing_loader,device)
        # outputs = np.array(outputs) >= 0.5
        # accuracy = round(metrics.accuracy_score(targets, outputs),3)

        # if accuracy < best_accuracy:
        #     best_accuracy = accuracy

        # torch.save(model.classifier.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = "3_8_accuracy_{ACCURACY}_batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(ACCURACY=accuracy, BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature)))
            #torch.save(model.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = FILE_NAME))
        # print(f"Epoch {epoch} - Validation Accuracy: {accuracy}")

        epoch_loss = total_loss_current_epoch / len(training_loader)
        print(f"Epoch {epoch} - Validation Loss: {epoch_loss}")
        print("#"*50)
num_training_steps = EPOCHS * len(training_loader)
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(EPOCHS * len(testing_loader)))

GPTtrain(optimizer, model, training_loader, testing_loader, device, lr_scheduler, EPOCHS, LEARNING_RATE)

  0%|          | 0/102225 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  0%|          | 2/102225 [00:01<13:38:37,  2.08it/s]

Epoch: 0, Loss:  6.134056568145752


  0%|          | 100/102225 [00:13<3:37:14,  7.83it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of  34,075. Loss: 2.074122905731201.   Elapsed: 0:00:14.


  0%|          | 101/102225 [00:15<19:21:41,  1.47it/s]

0: ##靶 病 人 、 患 病 人 和 家 属 ， 而 不 是 在 家 长 群 里 聊 天 中 进 行 询 问 。 一 般 是 在 微 信 群 里 做 这 类 预 防 治 疗 的 ， 在 微 信 群 里 可 以 和 家 长 一 起 聊 天 ， 还 可 以 给 家 长 发 短 信 。 王 晓 云 表 示 。 针 对 这 些 留 守 儿 童 问 题 ， 国 家 计 生 委 在 今 年 开 始 实 施 孤 独 症 康 复 助 学 金 政 策 ， 帮 助 留 守 儿 童 实 现 心 理 康 复 。 此 外 ， 国 家 对 留 守 儿 童 进 行 了 专 项 减 免 ， 对 其 中 的 学 龄 前 儿 童 给 予 资 助 。 国 家 对 留 守 儿 童 的 特 殊 照 顾 政 策 是 对 留 守 儿 童 的 关 爱 。 王 晓 云 表 示 ， 对 留 守 儿 童 的 治 疗 也 是 一 项 长


  0%|          | 200/102225 [00:28<3:36:31,  7.85it/s] The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   200  of  34,075. Loss: 2.7972519397735596.   Elapsed: 0:00:28.


  0%|          | 201/102225 [00:30<18:39:13,  1.52it/s]

0: 15058 病 房 。 病 人 ： 我 不 要 这 样 。 病 人 ： 我 们 只 是 想 找 一 个 能 让 你 更 好 地 完 成 任 务 的 团 队 ， 而 不 是 每 天 上 班 都 需 要 加 班 ， 而 不 是 去 安 排 任 务 。 如 果 你 只 是 想 安 排 任 务 ， 那 你 就 要 多 想 想 这 个 团 队 是 如 何 进 行 的 。 这 个 团 队 有 很 多 人 ， 当 你 没 有 能 力 提 升 ， 你 就 不 要 去 做 这 件 事 情 。 所 以 在 做 这 件 事 的 过 程 中 我 会 有 两 个 最 重 要 的 问 题 ， 一 是 我 的 时 间 都 浪 费 在 了 吃 喝 玩 乐 上 ， 第 二 个 问 题 是 我 的 时 间 都 浪 费 了 。 这 样 很 多 人 都 会 产 生 一 个 后 遗 症 ： 我 很 忙 ， 而 且 我 觉 得 我 的 工


  0%|          | 300/102225 [00:42<3:38:06,  7.79it/s] The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   300  of  34,075. Loss: 1.698535442352295.   Elapsed: 0:00:43.


## T5 Model


In [None]:
import torch
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration

# load tokenizer and model 
pretrained_model = "IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese"

special_tokens = ["<extra_id_{}>".format(i) for i in range(100)]
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model,
    do_lower_case=True,
    max_length=512,
    truncation=True,
    additional_special_tokens=special_tokens,
)
config = T5Config.from_pretrained(pretrained_model)
model = T5ForConditionalGeneration.from_pretrained(pretrained_model, config=config)
model.resize_token_embeddings(len(tokenizer))
model.eval()

In [None]:
# tokenize
text = "病情诊断任务： 指甲上突起的横纹病情描述： 上周发现拇指指甲上突起一道横纹！不疼不痒，突起来的横纹！希望获得的帮助： 在家用什么药！患病多久： 一月内过敏史： 无（2020-03-27填写）既往病史： 【手术】：17年做过肠癌手术！（2020-03-27填写）既往病史： 无（2020-03-23 告诉我这个病人应该看什么医生"
encode_dict = tokenizer(text, max_length=512, padding='max_length',truncation=True)

inputs = {
  "input_ids": torch.tensor([encode_dict['input_ids']]).long(),
  "attention_mask": torch.tensor([encode_dict['attention_mask']]).long(),
  }

# generate answer
logits = model.generate(
  input_ids = inputs['input_ids'],
  max_length=100, 
  do_sample= True
  # early_stopping=True,
  )

logits=logits[:,3:]
predict_label = [tokenizer.decode(i,skip_special_tokens=True) for i in logits]
print(predict_label)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("shibing624/prompt-t5-base-chinese")
model = T5ForConditionalGeneration.from_pretrained("shibing624/prompt-t5-base-chinese")
def batch_generate(input_texts, max_length=64):
    features = tokenizer(input_texts, return_tensors='pt')
    outputs = model.generate(input_ids=features['input_ids'],
                             attention_mask=features['attention_mask'],
                             max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
r = batch_generate(["推理：指甲上突起的横纹病情描述： 上周发现拇指指甲上突起一道横纹！不疼不痒，突起来的横纹！希望获得的帮助： 在家用什么药！患病多久： 一月内过敏史： 无（2020-03-27填写）既往病史： 【手术】：17年做过肠癌手术！"])
print(r)

In [None]:
import numpy as np
from transformers import T5Tokenizer,MT5ForConditionalGeneration

pretrain_path = 'IDEA-CCNL/Randeng-T5-784M-QA-Chinese'
tokenizer=T5Tokenizer.from_pretrained(pretrain_path)
model=MT5ForConditionalGeneration.from_pretrained(pretrain_path)

sample={"context":"在柏林,胡格诺派教徒创建了两个新的社区:多罗西恩斯塔特和弗里德里希斯塔特。到1700年,这个城市五分之一的人口讲法语。柏林胡格诺派在他们的教堂服务中保留了将近一个世纪的法语。他们最终决定改用德语,以抗议1806-1807年拿破仑占领普鲁士。他们的许多后代都有显赫的地位。成立了几个教会,如弗雷德里夏(丹麦)、柏林、斯德哥尔摩、汉堡、法兰克福、赫尔辛基和埃姆登的教会。","question":"除了多罗西恩斯塔特,柏林还有哪个新的社区?","idx":1}
plain_text='question:'+sample['question']+'knowledge:'+sample['context'][:500]

res_prefix=tokenizer.encode('answer',add_special_token=False)
res_prefix.append(tokenizer.convert_tokens_to_ids('<extra_id_0>'))
EOS_TOKEN_ID = tokenizer.convert_tokens_to_ids('<extra_id_0>')
res_prefix.append(EOS_TOKEN_ID)
l_rp=len(res_prefix)

tokenized=tokenizer.encode(plain_text,add_special_tokens=False,truncation=True,max_length=500)
tokenized+=res_prefix
batch=[tokenized]*2
input_ids=torch.tensor(np.array(batch),dtype=torch.long)

# Generate answer
pred_ids = model.generate(input_ids=input_ids,max_new_token=500,do_sample=True,top_p=0.9)
pred_tokens=tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
res=pred_tokens.replace('<extra_id_0>','').replace('有答案:','')

## Validation 

In [None]:

def validation( model, testing_loader, device, model_weights_path = None):

    if model_weights_path:
      state_dict = torch.load(PATH_NAME+model_weights_path)
      model.load_state_dict(state_dict)
      # model.classifier.load_state_dict(torch.load(PATH_NAME+'classification_head_weights.bin'))

    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, targets)
            
            sigmoids = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()
            idx = np.argmax(sigmoids, axis=-1)
            sigmoids = np.zeros( (len(sigmoids), len(sigmoids[1])) )
            sigmoids[ np.arange(len(sigmoids)), idx] = 1
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(sigmoids)
    return fin_outputs, fin_targets

## Grid Search

In [None]:
param_grid = {'dropout': [0.1, 0.2],
              'learning_rate': [0.0001,0.001, 0.01],
              'batch_size': [32, 64],
              'temperature': [0.9, 0.7]}

from sklearn.metrics import accuracy_score
import torch.optim as optim

for batch_size in param_grid['batch_size']:
    train_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 2
                }

    test_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 2
                }

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)

    for learning_rate in param_grid['learning_rate']:
        for dropout_rate in param_grid['dropout']:
            for temperature in param_grid['temperature']:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model=CustomModel(checkpoint=checkpoint, num_labels=10, temperature=temperature, dropout_rate=dropout_rate).to(device)
                optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

                train(optimizer, model, training_loader, testing_loader, device, EPOCHS, LEARNING_RATE)

                

                


In [None]:
from sklearn import metrics

# Define additional metrics
def precision(outputs, targets):
    return metrics.precision_score(targets, outputs, average = 'weighted')

def recall(outputs, targets):
    return metrics.recall_score(targets, outputs, average = 'weighted')

In [None]:
for epoch in range(1):
    outputs, targets = validation(FILE_NAME)
    print('outputs', outputs)
    print('targets', targets)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    precision_score = precision(outputs, targets)
    recall_score = recall(outputs, targets)
      
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"Precision Score = {precision_score}")
    print(f"Recall Score = {recall_score}")

In [None]:
torch.save(model.state_dict(), "{PATH_NAME}/roberta-base-additional-classification-layer-bert-medical.bin".format(PATH_NAME=PATH_NAME))