In [3]:
!pip install -q transformers datasets

In [4]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

import json
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoConfig

# from google.colab import drive
# drive.mount('/content/gdrive')

PATH_NAME = "./"
# %cd {PATH_NAME}

  from .autonotebook import tqdm as notebook_tqdm


## Data Preprocessing

In [5]:

## Sections of config

# Defining key variables for dataLoader, Training
MAX_LEN = 200
BATCH_SIZE = 8
LEARNING_RATE = 1e-05

checkpoint = "uer/gpt2-chinese-cluecorpussmall"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512
EPOCHS=3
FILE_NAME = "3-5-medical-bert.bin"

warmup_steps = 1e2


In [6]:
# df = pd.read_csv("patient_data.csv")
# #df['list'] = df[df.columns[1:3]].values.tolist()
# new_df = df[['id', 'doctor_faculty', 'description']].copy()
# # new_df.head()

# #map each department to an index 
# departments = new_df['doctor_faculty'].unique()
# d2ind = {departments[i]:i for i in range(len(departments))}
# ind2d = {i:departments[i] for i in range(len(departments))}

# ground_truths = []
# for i, row in new_df.iterrows():
#   dept_name = row['doctor_faculty']
#   dept_ind = d2ind[dept_name]
#   one_hot = np.zeros(len(departments))
#   one_hot[dept_ind] = 1
#   ground_truths.append(one_hot)
# new_df['one_hot'] = list(ground_truths)

# new_df.drop(['id', 'doctor_faculty'], axis='columns', inplace=True)

# new_df.head()
f = open('Dataset/validate_data.json')
data = json.load(f)
f.close()
conversation_list = []
for row in data:
    dialogue = ''
    for d in row:
        dialogue += d.strip()
    conversation_list.append(dialogue)
df = pd.DataFrame(conversation_list, columns=['dialogue'])
df.head()

Unnamed: 0,dialogue
0,病人：脱发，杨医生你好，我妈妈三个月前开始发现脱发厉害，刚开始时掉头发，现在是连眉毛都开始有...
1,病人：纤维腺瘤，这段时间来月经前就一直左乳比较涨痛。医生：已诊。
2,病人：便秘，便秘灌肠，四五天不大便，大便不干，发黑。医生：你应该找找原因，吃中药调理。
3,病人：最初大三阳现在是小三阳，hbsag420.1hbsab2.1hbeag0hbsab0....
4,病人：牙痛，一个多月了，最早是不舒服，最近非常痛，痛起来连着左太阳穴一起痛，位置是左上里面第...


In [7]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.dialogue = dataframe.dialogue
        #self.targets = self.data.one_hot
        self.max_len = max_len

    def __len__(self):
        return len(self.dialogue)

    def __getitem__(self, index):
        dialogue = str(self.dialogue[index])
        dialogue = " ".join(dialogue.split())

        inputs = self.tokenizer.encode_plus(
            dialogue,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

In [8]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)

test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)






FULL Dataset: (340749, 1)
TRAIN Dataset: (272599, 1)
TEST Dataset: (68150, 1)


In [17]:
## GPT2
import re 

def build_text_files(data_json, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for texts in data_json.dialogue[:10000]:
        summary = str(texts).strip()
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)

train_path = "gpt_train_dataset.txt"
build_text_files(training_set, train_path)
print(len(training_set))

test_path = "gpt_test_dataset.txt"
build_text_files(testing_set, test_path)


272599


In [10]:
from transformers import TextDataset
train_path = "gpt_train_dataset.txt"
training_loader = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

Token indices sequence length is longer than the specified maximum sequence length for this model (1915645 > 1024). Running this sequence through the model will result in indexing errors


In [18]:
testing_loader = TextDataset(tokenizer=tokenizer, file_path=test_path, block_size=128)

Creating features from dataset file at 
Saving features into cached file cached_lm_BertTokenizerFast_126_gpt_test_dataset.txt [took 0.023 s]


##Training

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [12]:
# from torch import nn
# from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
# from transformers.modeling_outputs import TokenClassifierOutput

# class CustomModel(torch.nn.Module):
#   def __init__(self,checkpoint,num_labels,temperature=0.5, dropout_rate = 0.1): 
#     super(CustomModel,self).__init__() 
#     self.num_labels = num_labels 
#     self.projection_dim = 256
#     self.temperature = temperature
#     self.dropout_rate = dropout_rate

#     #Load Model with given checkpoint and extract its body
#     myConfig = AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True)
#     myConfig.problem_type = "multi_label_classification"
#     myConfig.temperature = self.temperature

#     self.model = model = AutoModel.from_pretrained(checkpoint,config=myConfig)

#     # Freezing paramaters
#     # for param in self.model.parameters():
#     #         param.requires_grad = False

#     self.dropout = torch.nn.Dropout(self.dropout_rate) 
#     self.classifier = torch.nn.Linear(self.model.config.hidden_size,num_labels) # load and initialize weights
#     self.criterion = torch.nn.CrossEntropyLoss() # define loss function

#   def forward(self, input_ids=None, attention_mask=None,labels=None):
#     #Extract outputs from the body

#     outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
#     sequence_output = self.dropout(outputs[0])
#     logits = self.classifier(sequence_output[:,0,:]) #predict the labels based on the projected output
#     loss = self.criterion(logits, labels)
#     GPTLoss = 1
#     return loss +GPTLoss
#     #seqeuence_output 8 200 768 for decoder 
#     #sequence_output.shape torch.Size([8, 200, 768]) sequence_output[:,0,:].shape torch.Size([8, 768]) sequence_output[:,0,:].view(-1,768).shape torch.Size([8, 768])
#     return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model=CustomModel(checkpoint=checkpoint,num_labels=10).to(device)

In [13]:
# from tqdm.auto import tqdm
# num_training_steps = EPOCHS * len(training_loader)
# from datasets import load_metric
# metric = load_metric("f1")

# def train(optimizer, model, training_loader, testing_loader, device, num_epochs, learning_rate = 0.1):
#     best_accuracy = float('inf')

#     print("Training Started with hyperparameters: batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature))
    
#     for epoch in range(num_epochs):
#         model.train()
#         total_loss_current_epoch = 0
#         for _,data in enumerate(training_loader, 0):
#             ids = data['ids'].to(device, dtype = torch.long)
#             mask = data['mask'].to(device, dtype = torch.long)
#             token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#             targets = data['targets'].to(device, dtype = torch.float)

#             #Calling the model
#             outputs = model(ids, mask, targets)

#             loss = outputs.loss
#             loss.backward()
#             optimizer.step()
#             # lr_scheduler.step()
#             optimizer.zero_grad()
#             progress_bar_train.update(1)
#             if _%5000==0:
#                 print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            
#             #Calculating total loss during this EPOCH
#             total_loss_current_epoch += loss.item()

#         outputs, targets = validation(model, testing_loader,device)
#         outputs = np.array(outputs) >= 0.5
#         accuracy = round(metrics.accuracy_score(targets, outputs),3)

#         if accuracy < best_accuracy:
#             best_accuracy = accuracy

#             torch.save(model.classifier.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = "3_8_accuracy_{ACCURACY}_batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(ACCURACY=accuracy, BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature)))
#             #torch.save(model.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = FILE_NAME))
#         print(f"Epoch {epoch} - Validation Accuracy: {accuracy}")

#         epoch_loss = total_loss_current_epoch / len(training_loader)
#         print(f"Epoch {epoch} - Validation Loss: {epoch_loss}")
#         print("#"*50)

# progress_bar_train = tqdm(range(num_training_steps))
# progress_bar_eval = tqdm(range(EPOCHS * len(testing_loader)))
# optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# train(optimizer, model, training_loader, testing_loader, device, EPOCHS, LEARNING_RATE)

# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'




## GPT Model


In [15]:
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline, GPT2Config, AdamW, TrainingArguments, Trainer, AutoModelWithLMHead, DataCollatorForLanguageModeling
epsilon = 1e-8

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelWithLMHead.from_pretrained(checkpoint).to(device)
configuration = GPT2Config.from_pretrained(checkpoint, output_hidden_states=False)
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# create a bunch of exaples of "question:" and "doctor" answer, and put it into one string. and use "question: XXXX: doctor: " to get the validation 
optimizer = AdamW(model.parameters(),
                  lr = LEARNING_RATE,
                  eps = epsilon
                )

# text_generator = TextGenerationPipeline(model, tokenizer)
# result = text_generator("这是很久之前的事情了", max_length=100, do_sample=True)
training_args = TrainingArguments(
    output_dir="./gpt2-gerchef", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=training_loader,
    eval_dataset=testing_loader,
    # prediction_loss_only=True,
)

trainer.train()


***** Running training *****
  Num examples = 15203
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1428
  Number of trainable parameters = 102068736


Step,Training Loss


Saving model checkpoint to ./gpt2-gerchef/checkpoint-800
Configuration saved in ./gpt2-gerchef/checkpoint-800/config.json
Configuration saved in ./gpt2-gerchef/checkpoint-800/generation_config.json
Configuration saved in ./gpt2-gerchef/checkpoint-800/config.json
Configuration saved in ./gpt2-gerchef/checkpoint-800/generation_config.json
Model weights saved in ./gpt2-gerchef/checkpoint-800/pytorch_model.bin


TrainOutput(global_step=1428, training_loss=2.7532657227930235, metrics={'train_runtime': 377.9806, 'train_samples_per_second': 120.665, 'train_steps_per_second': 3.778, 'total_flos': 2979317071872000.0, 'train_loss': 2.7532657227930235, 'epoch': 3.0})

In [16]:
trainer.save_model()

Saving model checkpoint to ./gpt2-gerchef
Configuration saved in ./gpt2-gerchef/config.json
Configuration saved in ./gpt2-gerchef/generation_config.json
Model weights saved in ./gpt2-gerchef/pytorch_model.bin


In [23]:
from transformers import pipeline
chef = pipeline('text-generation',model='./gpt2-gerchef', tokenizer=checkpoint)
result = chef("这是很久之前的事情了")[0]['generated_text']


loading configuration file ./gpt2-gerchef/config.json
Model config GPT2Config {
  "_name_or_path": "./gpt2-gerchef",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 320
    }
  },
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": 

In [29]:
result = chef("病人：便秘，便秘灌肠，四五天不大便，大便不干，发黑。")[0]['generated_text']
print(result)

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 320,
  "transformers_version": "4.26.1"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


病人：便秘，便秘灌肠，四五天不大便，大便不干，发黑。 吃 后 3 - 4 天 ， 发 现 大 便 不 干 ， 肛 门 处 也 有 。 医 生 ： 您 好 ！ 您 好 ！ 病 人 ： 双 腿 发 冷, 头 痛 。 检 查 及 化 验 ： ： 右 腿 下 方 三 个 小 针 眼 位 置 、 、 右 腿 膝 关 节 附 近 不 能 动 。 医 生 ： 你 好 ！ 病 人 ： 颈 椎 病 ， 男 ， 36 岁 ， 颈 椎 病 ， 患 时 发 病 三 年 ， 左


In [None]:
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup
total_steps = len(training_loader) * EPOCHS
lr_scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

import time
import datetime
import random 

total_t0 = time.time()

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

def GPTtrain(optimizer, model, training_loader, testing_loader, device, lr_scheduler, num_epochs, learning_rate = 0.1):
    best_accuracy = float('inf')
    t0 = time.time()
    # print("Training Started with hyperparameters: batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature))
    
    for epoch in range(num_epochs):
        model.train()
        total_loss_current_epoch = 0
        for _,data in enumerate(training_loader, 0):
            model.zero_grad() 
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            # targets = data['targets'].to(device, dtype = torch.float)

            #Calling the model
            # print(model)
            # print("ids[0]", ids[7])
            # print("ids.shape", ids.shape, "targets.shape", targets.shape, "mask.shape", mask.shape, "token_type_ids.shape", token_type_ids.shape)
            
            outputs = model(ids, attention_mask=mask, labels=ids, token_type_ids=token_type_ids)

            loss = outputs[0]
            batch_loss = loss.item()
            total_loss_current_epoch += batch_loss


            # Get sample every x batches.
            if _ % 100 == 0 and not _ == 0:

                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(_, len(training_loader), batch_loss, elapsed))

                model.eval()

                sample_outputs = model.generate(
                                        bos_token_id=random.randint(1,30000),
                                        do_sample=True,   
                                        top_k=50, 
                                        max_length = 200,
                                        top_p=0.95, 
                                        num_return_sequences=1,
                                        # token_type_ids=token_type_ids,
                                        # attention_mask=mask
                                    )
                for i, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
                
                model.train()

            loss.backward()
            optimizer.step()

            lr_scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

            progress_bar_train.update(1)
            if _%5000==0:
                print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            
            #Calculating total loss during this EPOCH
            total_loss_current_epoch += loss.item()

        # outputs, targets = validation(model, testing_loader,device)
        # outputs = np.array(outputs) >= 0.5
        # accuracy = round(metrics.accuracy_score(targets, outputs),3)

        # if accuracy < best_accuracy:
        #     best_accuracy = accuracy

        # torch.save(model.classifier.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = "3_8_accuracy_{ACCURACY}_batch_{BATCH}_lr_{LEARNING_RATE}_dropout_{DROPOUT}_temp_{TEMPERATURE}_weights.bin".format(ACCURACY=accuracy, BATCH=training_loader.batch_size, LEARNING_RATE=learning_rate, DROPOUT=model.dropout_rate, TEMPERATURE=model.temperature)))
            #torch.save(model.state_dict(), "{PATH_NAME}/{FILE_NAME}".format(PATH_NAME=PATH_NAME, FILE_NAME = FILE_NAME))
        # print(f"Epoch {epoch} - Validation Accuracy: {accuracy}")

        epoch_loss = total_loss_current_epoch / len(training_loader)
        print(f"Epoch {epoch} - Validation Loss: {epoch_loss}")
        print("#"*50)
num_training_steps = EPOCHS * len(training_loader)
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(EPOCHS * len(testing_loader)))

GPTtrain(optimizer, model, training_loader, testing_loader, device, lr_scheduler, EPOCHS, LEARNING_RATE)


  0%|          | 100/102225 [01:21<23:15:03,  1.22it/s]
  0%|          | 0/25557 [01:21<?, ?it/s]
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

[A
[A

Epoch: 0, Loss:  2.2694733142852783



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[AThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   100  of  34,075. Loss: 2.0098118782043457.   Elapsed: 0:00:13.



[A

0: joseph 病 （ tumblr strangension ） ， 在 某 些 情 况 下 ， 它 会 导 致 癌 症 （ 也 可 以 说 ： 恶 性 肿 瘤 ） ， 并 且 很 可 能 是 恶 性 淋 巴 瘤 。 病 人 在 不 同 情 况 下 ， 有 可 能 出 现 不 同 的 后 果 ， 会 进 一 步 增 加 患 者 与 组 织 之 间 的 矛 盾 。 strategy science for the parents who used to drug as a district or phenomenon like in parallax to spiritual treatment in the pale of the cancer. drug in triborating of a large cell treatment to restricted. treatment is appreciatively successful about all the other medicine of pare to deploy discussion of the clini



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[AThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Batch   200  of  34,075. Loss: 1.9783436059951782.   Elapsed: 0:00:27.



[A

0: 争 病 症 及 医 学 研 究 中 ， 目 前 人 口 密 度 还 没 有 发 生 明 显 的 变 化 ， 但 相 关 的 人 群 数 量 增 加 ， 人 均 体 检 人 数 的 持 续 增 长 和 基 层 医 疗 资 源 的 完 善 已 为 临 床 治 疗 提 供 了 有 利 条 件 。 人 口 基 础 数 量 的 增 加 、 医 疗 技 术 能 力 的 增 加 、 基 层 医 疗 卫 生 机 构 能 够 为 群 众 提 供 更 好 的 医 疗 服 务 ， 从 而 提 高 了 基 层 医 疗 卫 生 服 务 的 质 量 。 2. 医 生 在 日 常 医 疗 保 健 工 作 中 ， 应 从 医 生 、 医 生 作 为 中 介 ， 协 助 他 们 提 供 服 务 ， 为 患 者 提 供 专 业 的 医 疗 服 务 ， 医 生 在 日 常 生 活 的 保 健 工 作 中 是 有 着 重



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[AThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: inde

  Batch   300  of  34,075. Loss: 1.3759506940841675.   Elapsed: 0:00:42.


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemmStridedBatched( handle, opa, opb, m, n, k, &alpha, a, lda, stridea, b, ldb, strideb, &beta, c, ldc, stridec, num_batches)`

## T5 Model


In [None]:
import torch
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration

# load tokenizer and model 
pretrained_model = "IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese"

special_tokens = ["<extra_id_{}>".format(i) for i in range(100)]
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model,
    do_lower_case=True,
    max_length=512,
    truncation=True,
    additional_special_tokens=special_tokens,
)
config = T5Config.from_pretrained(pretrained_model)
model = T5ForConditionalGeneration.from_pretrained(pretrained_model, config=config)
model.resize_token_embeddings(len(tokenizer))
model.eval()

In [None]:
# tokenize
text = "病情诊断任务： 指甲上突起的横纹病情描述： 上周发现拇指指甲上突起一道横纹！不疼不痒，突起来的横纹！希望获得的帮助： 在家用什么药！患病多久： 一月内过敏史： 无（2020-03-27填写）既往病史： 【手术】：17年做过肠癌手术！（2020-03-27填写）既往病史： 无（2020-03-23 告诉我这个病人应该看什么医生"
encode_dict = tokenizer(text, max_length=512, padding='max_length',truncation=True)

inputs = {
  "input_ids": torch.tensor([encode_dict['input_ids']]).long(),
  "attention_mask": torch.tensor([encode_dict['attention_mask']]).long(),
  }

# generate answer
logits = model.generate(
  input_ids = inputs['input_ids'],
  max_length=100, 
  do_sample= True
  # early_stopping=True,
  )

logits=logits[:,3:]
predict_label = [tokenizer.decode(i,skip_special_tokens=True) for i in logits]
print(predict_label)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("shibing624/prompt-t5-base-chinese")
model = T5ForConditionalGeneration.from_pretrained("shibing624/prompt-t5-base-chinese")
def batch_generate(input_texts, max_length=64):
    features = tokenizer(input_texts, return_tensors='pt')
    outputs = model.generate(input_ids=features['input_ids'],
                             attention_mask=features['attention_mask'],
                             max_length=max_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
r = batch_generate(["推理：指甲上突起的横纹病情描述： 上周发现拇指指甲上突起一道横纹！不疼不痒，突起来的横纹！希望获得的帮助： 在家用什么药！患病多久： 一月内过敏史： 无（2020-03-27填写）既往病史： 【手术】：17年做过肠癌手术！"])
print(r)

In [None]:
import numpy as np
from transformers import T5Tokenizer,MT5ForConditionalGeneration

pretrain_path = 'IDEA-CCNL/Randeng-T5-784M-QA-Chinese'
tokenizer=T5Tokenizer.from_pretrained(pretrain_path)
model=MT5ForConditionalGeneration.from_pretrained(pretrain_path)

sample={"context":"在柏林,胡格诺派教徒创建了两个新的社区:多罗西恩斯塔特和弗里德里希斯塔特。到1700年,这个城市五分之一的人口讲法语。柏林胡格诺派在他们的教堂服务中保留了将近一个世纪的法语。他们最终决定改用德语,以抗议1806-1807年拿破仑占领普鲁士。他们的许多后代都有显赫的地位。成立了几个教会,如弗雷德里夏(丹麦)、柏林、斯德哥尔摩、汉堡、法兰克福、赫尔辛基和埃姆登的教会。","question":"除了多罗西恩斯塔特,柏林还有哪个新的社区?","idx":1}
plain_text='question:'+sample['question']+'knowledge:'+sample['context'][:500]

res_prefix=tokenizer.encode('answer',add_special_token=False)
res_prefix.append(tokenizer.convert_tokens_to_ids('<extra_id_0>'))
EOS_TOKEN_ID = tokenizer.convert_tokens_to_ids('<extra_id_0>')
res_prefix.append(EOS_TOKEN_ID)
l_rp=len(res_prefix)

tokenized=tokenizer.encode(plain_text,add_special_tokens=False,truncation=True,max_length=500)
tokenized+=res_prefix
batch=[tokenized]*2
input_ids=torch.tensor(np.array(batch),dtype=torch.long)

# Generate answer
pred_ids = model.generate(input_ids=input_ids,max_new_token=500,do_sample=True,top_p=0.9)
pred_tokens=tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
res=pred_tokens.replace('<extra_id_0>','').replace('有答案:','')

## Validation 

In [None]:

def validation( model, testing_loader, device, model_weights_path = None):

    if model_weights_path:
      state_dict = torch.load(PATH_NAME+model_weights_path)
      model.load_state_dict(state_dict)
      # model.classifier.load_state_dict(torch.load(PATH_NAME+'classification_head_weights.bin'))

    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, targets)
            
            sigmoids = torch.sigmoid(outputs.logits).cpu().detach().numpy().tolist()
            idx = np.argmax(sigmoids, axis=-1)
            sigmoids = np.zeros( (len(sigmoids), len(sigmoids[1])) )
            sigmoids[ np.arange(len(sigmoids)), idx] = 1
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(sigmoids)
    return fin_outputs, fin_targets

## Grid Search

In [None]:
param_grid = {'dropout': [0.1, 0.2],
              'learning_rate': [0.0001,0.001, 0.01],
              'batch_size': [32, 64],
              'temperature': [0.9, 0.7]}

from sklearn.metrics import accuracy_score
import torch.optim as optim

for batch_size in param_grid['batch_size']:
    train_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 2
                }

    test_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 2
                }

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)

    for learning_rate in param_grid['learning_rate']:
        for dropout_rate in param_grid['dropout']:
            for temperature in param_grid['temperature']:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model=CustomModel(checkpoint=checkpoint, num_labels=10, temperature=temperature, dropout_rate=dropout_rate).to(device)
                optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

                train(optimizer, model, training_loader, testing_loader, device, EPOCHS, LEARNING_RATE)

                

                


In [None]:
from sklearn import metrics

# Define additional metrics
def precision(outputs, targets):
    return metrics.precision_score(targets, outputs, average = 'weighted')

def recall(outputs, targets):
    return metrics.recall_score(targets, outputs, average = 'weighted')

In [None]:
for epoch in range(1):
    outputs, targets = validation(FILE_NAME)
    print('outputs', outputs)
    print('targets', targets)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    precision_score = precision(outputs, targets)
    recall_score = recall(outputs, targets)
      
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"Precision Score = {precision_score}")
    print(f"Recall Score = {recall_score}")

In [None]:
torch.save(model.state_dict(), "{PATH_NAME}/roberta-base-additional-classification-layer-bert-medical.bin".format(PATH_NAME=PATH_NAME))