# Loading Dataset

In [1]:
from datasets import load_dataset, load_metric

# Load the PubMed Summarization dataset
dataset = load_dataset("ccdv/pubmed-summarization")

No config specified, defaulting to: pubmed-summarization/section
Found cached dataset pubmed-summarization (C:/Users/HUZAIFA/.cache/huggingface/datasets/ccdv___pubmed-summarization/section/1.0.0/f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})

# Exploring Dataset

In [3]:
# Accessing different splits of the data
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

In [4]:
print("First example from the train split:")
print(train_dataset[0])

First example from the train split:
{'article': "a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . \n in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively . \n the prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% . \n anthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight . \n snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states \n there are also some reports regarding

In [5]:
print("\nFirst example from the validation split:")
print(validation_dataset[0])


First example from the validation split:
{'article': "venous thromboembolism ( vte ) comprising of deep vein thrombosis ( dvt ) and pulmonary embolism ( pe ) can result in significant mortality , morbidity , and healthcare expenditure . \n approximately , one - third of patients with symptomatic vte manifests pe , whereas two - thirds manifest dvt alone . \n both dvt and pe can be clinically silent ( asymptomatic ) and hence not suspected . \n if undiagnosed , asymptomatic vte can lead to chronic venous disease or recurrent vte and long - term debilitating sequelae such as postthrombotic syndrome and chronic thromboembolic pulmonary hypertension . \n vte is not only disabling but also prolongs hospital stay and increases the cost of treatment . \n along with myocardial infarction and arrhythmia ( due to electrolyte imbalance ) , pe is one of the commonest causes of sudden unexplained deaths in hospitalized patients . \n it is estimated that 20 million cases of lower extremity dvt occu

In [6]:
print("\nFirst example from the test split:")
print(test_dataset[0])


First example from the test split:
{'article': "anxiety affects quality of life in those living with parkinson 's disease ( pd ) more so than overall cognitive status , motor deficits , apathy , and depression [ 13 ] . \n although anxiety and depression are often related and coexist in pd patients , recent research suggests that anxiety rather than depression is the most prominent and prevalent mood disorder in pd [ 5 , 6 ] . yet , \n our current understanding of anxiety and its impact on cognition in pd , as well as its neural basis and best treatment practices , remains meager and lags far behind that of depression . \n overall , neuropsychiatric symptoms in pd have been shown to be negatively associated with cognitive performance . \n for example , higher depression scores have been correlated with lower scores on the mini - mental state exam ( mmse ) [ 8 , 9 ] as well as tests of memory and executive functions ( e.g. , attention ) [ 1014 ] . \n likewise , apathy and anhedonia in p

# Picking Subset of the Dataset

In [7]:
train_dataset = dataset['train'].shuffle(seed=42).select(range(5000))
validation_dataset = dataset['validation'].shuffle(seed=42).select(range(300))
test_dataset = dataset['test'].shuffle(seed=42).select(range(300))

Loading cached shuffled indices for dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-db49f2ccb5ea7bfe.arrow
Loading cached shuffled indices for dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-e8dd4807d86ad8d6.arrow
Loading cached shuffled indices for dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-9961fe22bf737668.arrow


# Preprocessing and Cleaning Text Data

In [11]:
import re

def preprocess(text):
    # Remove any special characters or digits
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower().strip()
    return text

# Apply preprocessing to the dataset
train_dataset = train_dataset.map(lambda x: {'article': preprocess(x['article']), 'abstract': preprocess(x['abstract'])})
validation_dataset = validation_dataset.map(lambda x: {'article': preprocess(x['article']), 'abstract': preprocess(x['abstract'])})
test_dataset = test_dataset.map(lambda x: {'article': preprocess(x['article']), 'abstract': preprocess(x['abstract'])})

Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-1b6e9293a0683460.arrow
Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-121830eb23a1c1d9.arrow
Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-f6a0f63aaf3f0651.arrow


In [12]:
train_dataset[0]

{'article': 'atrial fibrillation af is the most common sustained arrhythmia in western countries with an estimated million patients affected by across united states and europe alone atrial fibrillation has a significant impact on morbidity mainly related to symptoms heart failure and thromboembolic events and is the most frequent arrhythmic cause of hospital admission in the usa in addition af is associated with excess mortality independently of thromboembolic complications to date the most effective treatment for af is radiofrequency catheter ablation and pulmonary vein antrum isolation pvai is the mainstay of such an approach the major drawback of catheter ablation of af consists in its potential risk of periprocedural complications with thromboembolic and hemorrhagic complications being among the most common and insidious ones despite the introduction of novel ablation technologies such as open irrigation catheters and the widespread use of systemic anticoagulation with heparin the 

In [13]:
validation_dataset[0]

{'article': 'a year old male patient with interstitial lung disease presented to a local clinic because of difficulty breathing and a subjective fever he had a fever of the fever went down to with the administration of antibiotics the patient was then given oral corticosteroids predinisolone mg day nine days after admission to a local clinic he developed blurred vision in both eyes however ophthalmic examination was not performed initially three days after this decrease in visual acuity he was referred to our hospital on initial examination anterior segment examination revealed severe conjunctival injection severe anterior chamber reaction without hypopyon and mild complicated cataracts in both eyes fundus examination revealed a vitreal reaction and multiple whitish yellow exudates in both eyes a single exudate was seen in the right macula and multiple exudates were observed in the left macula fig we initially suspected a fungal infection such as candida given his history and fundus fi

In [14]:
test_dataset[0]

{'article': 'the birth of a premature infant has long been documented as a stressful event for parents the premature birth of an infant and the following neonatal intensive care cause psychological distress and can have a traumatizing effect on parents the neonatal intensive care unit nicu environment has the potential to exacerbate stress for parents of infants admitted to the nicu mothers have typically been found to have higher levels of distress than fathers and they experience significant levels of stress and depression in the early postpartum period maternal stress can have deleterious effects on mother infant interaction particularly on mothers abilities to form an attachment to their baby maternal stress diminishes the mother s ability to be sensitive to the infant s cues and be responsive in interacting with the infant participating in infant care influences the maternal feelings in a positive direction finding of a study has shown that a mother who is denied the opportunity c

# Model Selection, Tokenize Dataset and Fine-Tuning

In [15]:
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# Load the model and tokenizer 
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize the dataset
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["abstract"], max_length=150, truncation=True, padding="max_length")
        
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["article", "abstract"])
tokenized_validation_dataset = validation_dataset.map(preprocess_function, batched=True, remove_columns=["article", "abstract"])
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["article", "abstract"])

Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-eef9bbb3c4ae35d2.arrow
Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-76bccf99dab2385e.arrow
Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-d57dbeb660f0fca1.arrow


In [17]:
tokenized_train_dataset[0]

{'input_ids': [21603,
  10,
  44,
  12042,
  25427,
  1092,
  257,
  3,
  9,
  89,
  19,
  8,
  167,
  1017,
  14399,
  1584,
  30793,
  23,
  9,
  16,
  8282,
  1440,
  28,
  46,
  5861,
  770,
  1221,
  4161,
  57,
  640,
  18279,
  2315,
  11,
  3,
  28188,
  2238,
  44,
  12042,
  25427,
  1092,
  257,
  65,
  3,
  9,
  1516,
  1113,
  30,
  8030,
  9824,
  485,
  3,
  4894,
  1341,
  12,
  3976,
  842,
  3338,
  11,
  3,
  8514,
  6310,
  15,
  6310,
  2176,
  984,
  11,
  19,
  8,
  167,
  8325,
  1584,
  30793,
  447,
  1137,
  13,
  2833,
  7209,
  16,
  8,
  178,
  9,
  16,
  811,
  3,
  9,
  89,
  19,
  1968,
  28,
  7240,
  20544,
  13971,
  13,
  3,
  8514,
  6310,
  15,
  6310,
  2176,
  14497,
  12,
  833,
  8,
  167,
  1231,
  1058,
  21,
  3,
  9,
  89,
  19,
  2252,
  30989,
  27594,
  703,
  6105,
  11,
  3,
  26836,
  13290,
  3,
  288,
  2781,
  15997,
  3,
  102,
  900,
  23,
  19,
  8,
  711,
  21545,
  13,
  224,
  46,
  1295,
  8,
  779,
  3314,
  1549,
  13,
  

In [18]:
# Fine-tune the model
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
)

trainer.train()

***** Running training *****
  Num examples = 10
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 9
  Number of trainable parameters = 60506624


Epoch,Training Loss,Validation Loss
1,No log,6.284391
2,No log,6.126451
3,No log,6.063614


***** Running Evaluation *****
  Num examples = 3
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=9, training_loss=6.784933725992839, metrics={'train_runtime': 92.5081, 'train_samples_per_second': 0.324, 'train_steps_per_second': 0.097, 'total_flos': 4060254044160.0, 'train_loss': 6.784933725992839, 'epoch': 3.0})

# Saving the Fine-Tined Model

In [19]:
# Save the model and tokenizer
model.save_pretrained("./fine-tuned-t5")
tokenizer.save_pretrained("./fine-tuned-t5")

Configuration saved in ./fine-tuned-t5\config.json
Configuration saved in ./fine-tuned-t5\generation_config.json
Model weights saved in ./fine-tuned-t5\pytorch_model.bin
tokenizer config file saved in ./fine-tuned-t5\tokenizer_config.json
Special tokens file saved in ./fine-tuned-t5\special_tokens_map.json


('./fine-tuned-t5\\tokenizer_config.json',
 './fine-tuned-t5\\special_tokens_map.json',
 './fine-tuned-t5\\spiece.model',
 './fine-tuned-t5\\added_tokens.json')

# Performance Evaluation and Metric

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    return result

# Load ROUGE Metric
metric = load_metric("rouge")

# Evaluate
results = trainer.evaluate(eval_dataset=tokenized_validation_dataset, metric_key_prefix="eval")
print(results)

  metric = load_metric("rouge")
***** Running Evaluation *****
  Num examples = 3
  Batch size = 4


{'eval_loss': 6.0636138916015625, 'eval_runtime': 2.203, 'eval_samples_per_second': 1.362, 'eval_steps_per_second': 0.454, 'epoch': 3.0}
