In [None]:
import torch 
import torch.nn as nn 

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline, set_seed
from transformers import TrainingArguments, Trainer
import matplotlib.pyplot as plt 
from datasets import load_dataset, load_metric

from torch.utils.data import Dataset, DataLoader
import nltk 
from nltk.tokenize import sent_tokenize

from tqdm import tqdm

import pandas as pd 

from rouge import Rouge

In [40]:
# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

device(type='cuda')

In [41]:

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint id,
# so the token ids produced by `tokenizer` match the vocabulary `model` expects.
tokenizer = AutoTokenizer.from_pretrained("Falconsai/medical_summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/medical_summarization")

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [4]:
# Location of the patient/doctor CSV.
# NOTE(review): this is an absolute path on one specific machine; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
path_dataset = "/home/jerlshin/Documents/My_Work/GenAI_Hackathon_16April2024/Patient_Query_Severity/Patient_Doctor_Severity_Dataset.csv"

# Read the whole CSV into a DataFrame.
dataset = pd.read_csv(path_dataset)

In [9]:
# Keep only the two columns used in this notebook: "Patient" is later used as the
# model input and "Description" as the target text.
selected_columns = ["Description", "Patient"]
df_org = dataset[selected_columns]

# Preview the first rows.
df_org.head()

Unnamed: 0,Description,Patient
0,what does abutment of the nerve root mean,hi doctor I am just wondering what is abutting...
1,every time I eat spicy food I poop blood why,hi doctor I am a 26 year old male I am feet an...
2,will nano leo give permanent solution for erec...,hello doctor I am 48 years old I am experienci...
3,will kalarchikai cure multiple ovarian cysts i...,hello doctor I have multiple small cysts in bo...
4,I masturbate only by rubbing the tip of the pe...,hi doctor during masturbation I just rub the t...


In [10]:
SIZE = df_org.shape[0]

# Row ranges of the three partitions, taken in file order (no shuffling):
# first half -> train, next quarter -> validation, remaining rows -> test.
train_rows = slice(None, SIZE // 2)
val_rows = slice(SIZE // 2, (3 * SIZE) // 4)
test_rows = slice((3 * SIZE) // 4, None)

# input to the model
train_texts = list(df_org["Patient"][train_rows])
val_texts = list(df_org["Patient"][val_rows])
test_texts = list(df_org["Patient"][test_rows])

# output from the model
train_des = list(df_org["Description"][train_rows])
val_des = list(df_org["Description"][val_rows])
test_des = list(df_org["Description"][test_rows])

In [13]:
# The three partitions together must account for every row exactly once.
sum(len(part) for part in (train_texts, val_texts, test_texts)) == SIZE

True

In [29]:
# Show the first training input (a raw patient query string).
train_texts[0]

'hi doctor I am just wondering what is abutting and abutment of the nerve root means in a back issue please explain what treatment is required for annular bulging and tear'

In [24]:
def _encode(texts):
    """Tokenize a list of strings with truncation and padding both enabled."""
    return tokenizer(texts, truncation=True, padding=True)


# Model inputs (patient queries), one encoding per split.
train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

# Target texts (descriptions), one encoding per split.
train_description = _encode(train_des)
val_description = _encode(val_des)
test_description = _encode(test_des)


In [32]:
# Inspect the encoding of a single target description; called without the
# truncation/padding arguments used for the full splits.
tokenizer(train_des[0])

{'input_ids': [0, 12196, 473, 4091, 1182, 1757, 9, 5, 10387, 9749, 1266, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [31]:
# Inspect the encoding of the first two training inputs; called without padding,
# so each entry keeps its own length.
tokenizer(train_texts[:2])

{'input_ids': [[0, 3592, 3299, 38, 524, 95, 8020, 99, 16, 4091, 1182, 2577, 8, 4091, 1182, 1757, 9, 5, 10387, 9749, 839, 11, 10, 124, 696, 2540, 3922, 99, 1416, 16, 1552, 13, 9915, 8244, 22382, 3923, 8, 7366, 2], [0, 3592, 3299, 38, 524, 10, 973, 76, 793, 2943, 38, 524, 1730, 8, 4877, 6764, 8, 9832, 28080, 2697, 77, 38, 3529, 24042, 689, 38, 36733, 1925, 2128, 77, 38, 33, 10759, 33412, 25, 157, 38, 36733, 10, 410, 828, 9, 1925, 38, 524, 269, 8265, 14, 38, 33, 17735, 1668, 38, 109, 33, 28657, 747, 38, 109, 45, 33, 10, 284, 750, 9, 17735, 1668, 38, 300, 1925, 3457, 626, 94, 363, 2540, 465, 127, 690, 7391, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [33]:
class SummarizationDataset(Dataset):
    """Pairs tokenized source texts with tokenized target texts for seq2seq training.

    Args:
        encodings_input: Tokenizer output for the source texts. Must provide
            "input_ids" and "attention_mask", each indexable by example position.
        encodings_output: Tokenizer output for the target texts. Only its
            "input_ids" are read; they are served as ``labels``.
    """

    def __init__(self, encodings_input, encodings_output):
        self.encodings_input = encodings_input    # encodings of the model inputs
        self.encodings_output = encodings_output  # encodings of the expected outputs

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Returns:
            dict with keys "input_ids", "attention_mask" and "labels".
        """
        return {
            "input_ids": torch.tensor(self.encodings_input["input_ids"][idx]),
            # Must come from the tokenizer's "attention_mask" entry. Reusing
            # "input_ids" here would feed token ids to the model as its mask,
            # so padded positions would never be masked out.
            "attention_mask": torch.tensor(self.encodings_input["attention_mask"][idx]),
            "labels": torch.tensor(self.encodings_output["input_ids"][idx]),
        }

    def __len__(self):
        """Number of examples, taken from the target encodings."""
        return len(self.encodings_output["input_ids"])


In [34]:
# Wrap each split's (input, target) encodings in a SummarizationDataset.
# NOTE: despite the "_dataloader" suffix, train_dataloader and val_dataloader are
# Dataset objects, not torch DataLoader instances.
train_dataloader = SummarizationDataset(
    encodings_input=train_encodings,
    encodings_output=train_description,
)
val_dataloader = SummarizationDataset(
    encodings_input=val_encodings,
    encodings_output=val_description,
)
test_dataset = SummarizationDataset(
    encodings_input=test_encodings,
    encodings_output=test_description,
)

In [38]:
def compute_rouge(predictions, references):
    """Score generated texts against reference texts with ROUGE.

    Args:
        predictions: Generated texts, handed to ``Rouge.get_scores`` as hypotheses.
        references: Reference texts, in the same order as ``predictions``.

    Returns:
        Whatever ``Rouge.get_scores(..., avg=True)`` returns for these inputs.
    """
    return Rouge().get_scores(predictions, references, avg=True)

def compute_metrics(pred):
    """Decode the evaluation predictions and labels and score them with ROUGE.

    Args:
        pred: Evaluation output exposing ``predictions`` and ``label_ids``.
            ``predictions`` may be token ids, per-token logits, or a tuple whose
            first element is one of those.

    Returns:
        The ROUGE scores produced by ``compute_rouge`` for the decoded texts.
    """
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    predictions = pred.predictions
    # The model may return extra arrays alongside its main output; the first
    # entry is the one that holds the predictions.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # A 3-D array is per-token scores over the vocabulary, not token ids:
    # take the highest-scoring token at every position before decoding.
    # NOTE(review): these are teacher-forced predictions; for true generated
    # summaries consider Seq2SeqTrainer with predict_with_generate=True.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    references = np.asarray(pred.label_ids)

    # -100 is used as an "ignore" filler value and is not a valid token id.
    # Map it to the pad token so skip_special_tokens drops it during decoding.
    pad_id = tokenizer.pad_token_id
    if pad_id is not None:
        predictions = np.where(predictions == -100, pad_id, predictions)
        references = np.where(references == -100, pad_id, references)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_refs = tokenizer.batch_decode(references, skip_special_tokens=True)
    return compute_rouge(decoded_preds, decoded_refs)

In [None]:
# Training configuration: log every 50 steps, evaluate every 100 steps, and
# reload the best checkpoint once training finishes.
training_args = TrainingArguments(
    output_dir='./QUERY_SUMM_OUTPUT',
    do_train=True,
    do_eval=True,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,                # steps used for a linear warmup of the learning rate
    logging_strategy='steps',
    logging_dir='./query_summarization',  # tensorboard log dir
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    # Mixed precision needs a CUDA device. The notebook falls back to the CPU
    # when CUDA is unavailable, so only enable fp16 when a GPU is present.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True
)

# NOTE: train_dataloader / val_dataloader are SummarizationDataset objects
# (the names are historical); Trainer batches them itself.
trainer = Trainer(
    model=model,
    args=training_args,

    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,

    compute_metrics=compute_metrics
)

In [2]:
# Rebind `tokenizer` and `model` to the artifacts stored under "TOK_QUERY" and
# "SUMM_QUERY", replacing the objects loaded earlier in the notebook.
# NOTE(review): presumably these are the fine-tuned tokenizer/model saved after
# training; the cell that writes them is not visible here. Confirm they exist.
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM


tokenizer = AutoTokenizer.from_pretrained("TOK_QUERY")
model = AutoModelForSeq2SeqLM.from_pretrained("SUMM_QUERY")

In [4]:
from transformers import pipeline

# Build a summarization pipeline around the currently bound model and tokenizer.
pipe = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer
)

In [8]:

# Sample passage used as input for the summarization pipeline.
text = """

In this case, the model is processing more tokens (200) than necessary for the short input (47 tokens). To improve efficiency, you can reduce the max_length parameter when calling the pipe function. Here's how you can modify the code:

"""

238

In [10]:
# Summarize the sample passage, passing max_length=33 to the pipeline call.
# NOTE(review): the recorded output for this cell is 'ssss', i.e. not a meaningful
# summary. The reloaded model looks degenerate; check how it was trained.
pipe(text, max_length=33)

[{'summary_text': 'ssss'}]