In [5]:
import torch as t


In [36]:
# Number of CUDA devices PyTorch can see; the bare `a` below displays it as the cell output.
a = t.cuda.device_count()
a

0

In [None]:
# %pip install datasets transformers rouge rouge_score torchmetrics  # `rouge` provides the `from rouge import Rouge` used below

In [4]:
from datasets import *
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForLanguageModeling, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [21]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset

class MedicalDialogueDataset(Dataset):
    """Torch-style dataset over the ``omi-health/medical-dialogue-to-soap-summary`` corpus.

    On construction the requested split is loaded, the ``messages`` and
    ``prompt`` columns are dropped, ``soap`` is renamed to ``summary``, a string
    ``id`` column is added, and the one-letter SOAP section labels in each
    summary are expanded to full words. Newlines inside the dialogue text are
    kept as-is.

    Args:
        split: Name of the split to load (passed straight to ``load_dataset``).
    """

    # One-letter section labels used by the raw notes -> spelled-out headers.
    _SECTION_HEADERS = {
        'S': 'Subjective',
        'O': 'Objective',
        'A': 'Assessment',
        'P': 'Plan',
    }

    def __init__(self, split):
        # Load the requested split.
        ds = load_dataset("omi-health/medical-dialogue-to-soap-summary", split=split)

        # Drop the columns that are not needed and rename the target column.
        ds = ds.remove_columns(['messages', 'prompt'])
        ds = ds.rename_column('soap', 'summary')

        # Add an id column and expand the section labels in every summary.
        ds = ds.map(self.add_id, with_indices=True)
        ds = ds.map(self.format_summary)

        self.data = ds

    def add_id(self, example, idx):
        """Store the row index, as a string, under ``example['id']``."""
        example['id'] = str(idx)
        return example

    def format_summary(self, example):
        """Expand standalone ``S:``/``O:``/``A:``/``P:`` labels to full section names.

        A plain ``str.replace('P: ', ...)`` also rewrites text that merely ends
        in the same letter (``"BP: 120/80"`` became ``"BPlan: 120/80"``). The
        label is therefore only expanded when it is not preceded by a letter or
        digit.

        Args:
            example: Mapping with a string ``'summary'`` entry; updated in place.

        Returns:
            The same mapping with the rewritten ``'summary'``.
        """
        import re  # local import keeps the class self-contained within its cell

        summary = example['summary']
        for abbrev, header in self._SECTION_HEADERS.items():
            summary = re.sub(rf'(?<![A-Za-z0-9]){abbrev}: ', f'{header}: ', summary)
        example['summary'] = summary
        return example

    def __len__(self):
        """Return the number of examples in the wrapped split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item at ``idx`` as a dict whose first key is ``'id'``."""
        item = self.data[idx]
        # Put 'id' first, then every other field in its existing order.
        return {'id': item['id'], **{k: item[k] for k in item if k != 'id'}}



In [22]:
# Build one dataset wrapper per split, in train / validation / test order.
train_data, valid_data, test_data = (
    MedicalDialogueDataset(split_name)
    for split_name in ('train', 'validation', 'test')
)

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [23]:
# Report the size of each split, one per line, then show the first training item.
print(
    f'train set size: {len(train_data)}',
    f'valid set size: {len(valid_data)}',
    f'test set size: {len(test_data)}',
    sep='\n',
)
print(next(iter(train_data)))

train set size: 9250
valid set size: 500
test set size: 250
{'id': '0', 'dialogue': "Doctor: Hello, how can I help you today?\nPatient: My son has been having some issues with speech and development. He's 13 years old now.\nDoctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?\nPatient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.\nDoctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition. \n(After the tests)\nDoctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated philtrum and thin upper lip?\nPatient: Yes, he has all of those features. His hands are a

In [24]:
# Load the tokenizer of the pretrained checkpoint (only the tokenizer is created in this cell).
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("gauravkoradiya/T5-Finetuned-Summarization-DialogueDataset")



In [29]:
# Hand-written dialogue/summary pair used to inspect what the tokenizer produces.
dialogue = "Doctor: What brings you back into the clinic today, miss? Patient: I came in for a refill of my blood pressure medicine. Doctor: It looks like Doctor Kumar followed up with you last time regarding your hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.  Have you noticed any changes or do you have any concerns regarding these issues? Patient: No. Doctor: Have you had any fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure?Patient: No. Doctor: Great. Also, for our records, how old are you and what race do you identify yourself as?Patient: I am seventy six years old and identify as a white female."
# Encode the dialogue, padded/truncated to exactly 1024 tokens.
inputs = tokenizer(dialogue, return_tensors="pt", max_length=1024, truncation=True, padding="max_length")
summary = "The patient is a 76-year-old white female who presents to the clinic today originally for hypertension and a med check.  She has a history of hypertension, osteoarthritis, osteoporosis, hypothyroidism, allergic rhinitis and kidney stones.  Since her last visit she has been followed by Dr. Kumar.  Those issues are stable.  She has had no fever or chills, cough, congestion, nausea, vomiting, chest pain, chest pressure."
# Encode the target summary the same way (`targets` is not used again in this cell).
targets = tokenizer(summary, return_tensors="pt", max_length=1024, truncation=True, padding="max_length")
# Print the input token IDs and their corresponding token strings.
print('Token IDs:', inputs['input_ids'])
print('Tokens:', tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist()))

Token IDs: tensor([[    0, 41152,    35,  ...,     1,     1,     1]])
Tokens: ['<s>', 'Doctor', ':', 'ĠWhat', 'Ġbrings', 'Ġyou', 'Ġback', 'Ġinto', 'Ġthe', 'Ġclinic', 'Ġtoday', ',', 'Ġmiss', '?', 'ĠPatient', ':', 'ĠI', 'Ġcame', 'Ġin', 'Ġfor', 'Ġa', 'Ġrefill', 'Ġof', 'Ġmy', 'Ġblood', 'Ġpressure', 'Ġmedicine', '.', 'ĠDoctor', ':', 'ĠIt', 'Ġlooks', 'Ġlike', 'ĠDoctor', 'ĠKumar', 'Ġfollowed', 'Ġup', 'Ġwith', 'Ġyou', 'Ġlast', 'Ġtime', 'Ġregarding', 'Ġyour', 'Ġhypertension', ',', 'Ġoste', 'o', 'arth', 'ritis', ',', 'Ġoste', 'op', 'or', 'osis', ',', 'Ġhyp', 'othy', 'roid', 'ism', ',', 'Ġallergic', 'Ġrh', 'in', 'itis', 'Ġand', 'Ġkidney', 'Ġstones', '.', 'Ġ', 'ĠHave', 'Ġyou', 'Ġnoticed', 'Ġany', 'Ġchanges', 'Ġor', 'Ġdo', 'Ġyou', 'Ġhave', 'Ġany', 'Ġconcerns', 'Ġregarding', 'Ġthese', 'Ġissues', '?', 'ĠPatient', ':', 'ĠNo', '.', 'ĠDoctor', ':', 'ĠHave', 'Ġyou', 'Ġhad', 'Ġany', 'Ġfever', 'Ġor', 'Ġch', 'ills', ',', 'Ġcough', ',', 'Ġcongestion', ',', 'Ġnausea', ',', 'Ġvomiting', ',', 'Ġchest', 'Ġpain',

In [None]:
# Prefer CUDA when PyTorch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

In [40]:
from torch.optim import AdamW  # transformers.AdamW is deprecated in favour of the PyTorch optimizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM

# Token budgets for the encoder input (dialogue) and the decoder target (summary).
# NOTE(review): SOAP summaries may well exceed 64 tokens — confirm this cap is intended.
max_input_length = 512
max_target_length = 64

# Keep the weights in full precision. Casting the whole model with `.half()` and
# then optimising those fp16 weights directly with AdamW is numerically unstable,
# and the recorded run of this notebook detected no GPU (device_count() == 0).
# If mixed precision is wanted on a GPU, wrap the forward pass in torch.autocast
# and scale the loss with a GradScaler instead of casting the parameters.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "gauravkoradiya/T5-Finetuned-Summarization-DialogueDataset"
).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)




In [42]:
def collote_fn(batch_samples):
    """Collate raw dataset items into one tokenized, padded training batch.

    Uses the notebook-level ``tokenizer``, ``model``, ``max_input_length`` and
    ``max_target_length``.

    Args:
        batch_samples: Sequence of items, each with string ``'dialogue'`` and
            ``'summary'`` entries.

    Returns:
        The tokenizer output for the dialogues, extended with
        ``'decoder_input_ids'`` (built from the unmasked target ids) and
        ``'labels'`` (target ids with every padding position set to ``-100``).
    """
    batch_inputs = [sample['dialogue'] for sample in batch_samples]
    batch_targets = [sample['summary'] for sample in batch_samples]

    batch_data = tokenizer(
        batch_inputs,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    target_encoding = tokenizer(
        text_target=batch_targets,
        padding=True,
        max_length=max_target_length,
        truncation=True,
        return_tensors="pt",
    )
    labels = target_encoding["input_ids"]

    # Build the decoder inputs from the labels before padding is masked out.
    batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)

    # Mask padding via the attention mask. The previous version paired the k-th
    # EOS hit with row k, which misaligns rows whenever a row contains zero or
    # more than one EOS token.
    batch_data['labels'] = labels.masked_fill(target_encoding["attention_mask"] == 0, -100)
    return batch_data

In [43]:
# Batches of four items, collated by `collote_fn`; only the training loader shuffles.
train_dataloader = DataLoader(
    train_data,
    batch_size=4,
    shuffle=True,
    collate_fn=collote_fn,
)
valid_dataloader = DataLoader(
    valid_data,
    batch_size=4,
    shuffle=False,
    collate_fn=collote_fn,
)

In [44]:
# Pull a single batch from the training loader and inspect its keys, shapes and contents.
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', dict((name, tensor.shape) for name, tensor in batch.items()))
print(batch)

dict_keys(['input_ids', 'attention_mask', 'decoder_input_ids', 'labels'])
batch shape: {'input_ids': torch.Size([4, 512]), 'attention_mask': torch.Size([4, 512]), 'decoder_input_ids': torch.Size([4, 64]), 'labels': torch.Size([4, 64])}
{'input_ids': tensor([[    0, 41152,    35,  ...,     1,     1,     1],
        [    0, 41152,    35,  ...,    38,   192,     2],
        [    0, 41152,    35,  ...,  1302,    14,     2],
        [    0, 41152,    35,  ...,   110,   618,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'decoder_input_ids': tensor([[    2,     0, 47159,  2088,    35,    20,  3186,   690,  2157,    22,
           102,   828,   160,   113,   187,   618, 23655,   183,   290,   511,
            10,   235,  2853, 22362, 36291,     8, 18422,  1988,  6204, 23496,
         37908,  2982, 22580,    13, 10665,  1668,     4,    20,  3186,    34,
            57,  7242, 



In [None]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches; each batch is moved to the notebook-level
            ``device`` and unpacked into ``model`` as keyword arguments.
        model: Model whose output exposes a ``loss`` attribute.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per batch, after the optimizer.
        epoch: 1-based epoch number, used to compute the running mean loss.
        total_loss: Loss accumulated over all previous epochs.

    Returns:
        ``total_loss`` plus the summed loss of every batch in this epoch.
    """
    num_batches = len(dataloader)
    progress_bar = tqdm(range(num_batches))
    progress_bar.set_description(f'loss: {0:>7f}')
    # Batches already processed in earlier epochs, for the running average.
    batches_seen_before = (epoch - 1) * num_batches

    model.train()
    for step, inputs in enumerate(dataloader, start=1):
        inputs = inputs.to(device)
        loss = model(**inputs).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss/(batches_seen_before + step):>7f}')
        progress_bar.update(1)
    return total_loss

In [None]:
import numpy as np
from rouge import Rouge

rouge = Rouge()

def test_loop(dataloader, model):
    """Generate summaries for every batch and score them with ROUGE.

    Uses the notebook-level ``device``, ``tokenizer``, ``rouge`` and
    ``max_target_length``.

    Args:
        dataloader: Iterable of batches containing ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        model: Model exposing ``eval()`` and ``generate()``.

    Returns:
        Dict with the F-scores (scaled by 100) under ``'rouge-1'``,
        ``'rouge-2'`` and ``'rouge-l'``, plus their mean under ``'avg'``.
    """
    preds, labels = [], []

    model.eval()
    for batch_data in tqdm(dataloader):
        batch_data = {k: v.to(device) for k, v in batch_data.items()}
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
                num_beams=4,
                no_repeat_ngram_size=2,
            ).cpu().numpy()

        # -100 marks ignored label positions; swap it for the pad id so decoding works.
        label_tokens = batch_data["labels"].cpu().numpy()
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        # Score whole words. The earlier `' '.join(text)` inserted a space between
        # every character, so ROUGE was computed over characters rather than words.
        preds += [pred.strip() for pred in decoded_preds]
        labels += [label.strip() for label in decoded_labels]

    scores = rouge.get_scores(hyps=preds, refs=labels, avg=True)
    result = {key: value['f'] * 100 for key, value in scores.items()}
    result['avg'] = np.mean(list(result.values()))
    print(f"Rouge1: {result['rouge-1']:>0.2f} Rouge2: {result['rouge-2']:>0.2f} RougeL: {result['rouge-l']:>0.2f}\n")
    return result

In [None]:
from transformers import get_scheduler

learning_rate = 2e-5
epoch_num = 10

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Linear decay with no warm-up, spanning every optimizer step of the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

total_loss = 0.
best_avg_rouge = 0.
# The loop variable is named `epoch` rather than `t`, so it no longer overwrites
# the `import torch as t` alias defined at the top of the notebook.
for epoch in range(1, epoch_num + 1):
    print(f"Epoch {epoch}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, epoch, total_loss)
    valid_rouge = test_loop(valid_dataloader, model)
    print(valid_rouge)
    rouge_avg = valid_rouge['avg']
    # Save a checkpoint only when the average validation ROUGE improves.
    if rouge_avg > best_avg_rouge:
        best_avg_rouge = rouge_avg
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'epoch_{epoch}_valid_rouge_{rouge_avg:0.4f}_model_weights.bin')
print("Done!")

## 推理

In [1]:
import pandas as pd
# Read the CSV (path relative to the working directory) that the following cells convert and split.
test = pd.read_csv('MTS-Dialog-Combined.csv')

In [6]:
# `Dataset` is ambiguous in this notebook: an earlier cell rebinds it to
# torch.utils.data.Dataset, which has no `from_pandas`. Import the Hugging Face
# class under an explicit alias so this cell does not depend on run order.
from datasets import Dataset as HFDataset

datasets = HFDataset.from_pandas(test)

In [9]:
# Hold out 10% of the rows for evaluation. The fixed seed makes the split, and
# therefore every score computed on it, reproducible across runs.
datasets = datasets.train_test_split(test_size=0.1, seed=42)
datasets


DatasetDict({
    train: Dataset({
        features: ['ID', 'dialogue', 'summary'],
        num_rows: 1170
    })
    test: Dataset({
        features: ['ID', 'dialogue', 'summary'],
        num_rows: 131
    })
})

In [10]:
from transformers import pipeline

# Requires `model` and `tokenizer` from the training cells to exist in the current kernel.
# Task names are case-sensitive: the registered name is "summarization".
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
summarizes = summarizer  # keep the original variable name available as well
conversation = datasets['train'][0]["dialogue"]
# Truncate over-long dialogues to the model's input limit instead of failing on them.
predict = summarizer(conversation, truncation=True)
predict

NameError: name 'model' is not defined

In [None]:
# Reference summary of the first training row (the same row index the pipeline cell summarizes).
label = datasets["train"][0]["summary"]

In [None]:
from rouge import Rouge

# `Rouge` is a class and must be instantiated before scoring; the scoring
# method is `get_scores` (plural), not `get_score`.
rouge = Rouge()
rouge.get_scores(predict[0]["summary_text"], label)

In [None]:
# Collect the ROUGE-L F-score of the generated summary for every held-out row.
# Uses the `summarizes` pipeline and the `rouge` scorer created in earlier cells.
score_list = []
for example in datasets["test"]:  # the key is "test", without a trailing space
    # Summarize the current row's dialogue (not the whole dataset column).
    predict = summarizes(example["dialogue"], truncation=True)
    label = example["summary"]
    # get_scores returns one dict per pair; the ROUGE-L entry is keyed "rouge-l".
    score = rouge.get_scores(predict[0]["summary_text"], label)[0]["rouge-l"]["f"]
    score_list.append(score)

In [None]:
import numpy as np
# Mean of the per-example scores collected in `score_list`.
print(np.asarray(score_list).mean())