# Loading Dataset

In [1]:
from datasets import load_dataset

# Load the PubMed Summarization dataset.
# The config is named explicitly: when it is omitted the library falls back to
# "section" and logs a "No config specified" notice, so this pins the same data
# without depending on the default.
dataset = load_dataset("ccdv/pubmed-summarization", "section")

No config specified, defaulting to: pubmed-summarization/section
Found cached dataset pubmed-summarization (C:/Users/HUZAIFA/.cache/huggingface/datasets/ccdv___pubmed-summarization/section/1.0.0/f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# Show the DatasetDict summary: the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})

# Exploring Dataset

In [3]:
# Bind each split of the DatasetDict to its own name.
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Show the first record of every split; a blank line separates consecutive splits.
splits_to_preview = (
    ('train', train_dataset),
    ('validation', validation_dataset),
    ('test', test_dataset),
)
for position, (split_name, split) in enumerate(splits_to_preview):
    separator = "\n" if position else ""
    print(f"{separator}First example from the {split_name} split:")
    print(split[0])

First example from the train split:
{'article': "a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . \n in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively . \n the prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% . \n anthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight . \n snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states \n there are also some reports regarding

# Preprocessing and Cleaning Text Data

In [4]:
import re

def preprocess(text):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, accented or non-Latin letters) is deleted, runs of whitespace
    are collapsed to one space, and the result is lowercased and stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty when ``text`` contains no ASCII letters.
    """
    # A single deletion pass is sufficient: digits and punctuation already fall
    # outside [a-zA-Z\s], so separate digit / non-word passes can never match
    # anything that this pass leaves behind.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Deleting characters can leave gaps of several spaces/newlines; squeeze them.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Apply preprocessing to the dataset
def _clean_example(example):
    """Return the cleaned 'article' and 'abstract' text for a single record."""
    return {
        'article': preprocess(example['article']),
        'abstract': preprocess(example['abstract']),
    }

dataset = dataset.map(_clean_example)

Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-8ab4d13913cc1dbd.arrow
Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-c81a8497acb16ec9.arrow
Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-2386ad7fc2dd3c65.arrow


In [5]:
# Spot-check the cleaning step on the first training record.
cleaned_train_sample = dataset['train'][0]
print(cleaned_train_sample)

{'article': 'a recent systematic analysis showed that in million children younger than years were mildly moderately or severely stunted and million were mildly moderately or severely underweight in the developing countries in iran a study among high school girls in sistan and baluchestan showed prevalence of and for underweight overweight and obesity respectively the prevalence of malnutrition among elementary school aged children in tehran varied from to anthropometric study of elementary school students in shiraz revealed that of them suffer from malnutrition and low body weight snack should have kcal energy and could provide g of protein day nowadays school nutrition programs are running as the national programs world wide national school lunch program in the united states there are also some reports regarding school feeding programs in developing countries in vietnam school base program showed an improvement in nutrient intakes in iran a national free food program nffp is implement

In [6]:
# Spot-check the cleaning step on the first test record.
cleaned_test_sample = dataset['test'][0]
print(cleaned_test_sample)

{'article': 'anxiety affects quality of life in those living with parkinson s disease pd more so than overall cognitive status motor deficits apathy and depression although anxiety and depression are often related and coexist in pd patients recent research suggests that anxiety rather than depression is the most prominent and prevalent mood disorder in pd yet our current understanding of anxiety and its impact on cognition in pd as well as its neural basis and best treatment practices remains meager and lags far behind that of depression overall neuropsychiatric symptoms in pd have been shown to be negatively associated with cognitive performance for example higher depression scores have been correlated with lower scores on the mini mental state exam mmse as well as tests of memory and executive functions eg attention likewise apathy and anhedonia in pd patients have been associated with executive dysfunction however few studies have specifically investigated the relationship between a

In [7]:
# Spot-check the cleaning step on the first validation record.
cleaned_validation_sample = dataset['validation'][0]
print(cleaned_validation_sample)

{'article': 'venous thromboembolism vte comprising of deep vein thrombosis dvt and pulmonary embolism pe can result in significant mortality morbidity and healthcare expenditure approximately one third of patients with symptomatic vte manifests pe whereas two thirds manifest dvt alone both dvt and pe can be clinically silent asymptomatic and hence not suspected if undiagnosed asymptomatic vte can lead to chronic venous disease or recurrent vte and long term debilitating sequelae such as postthrombotic syndrome and chronic thromboembolic pulmonary hypertension vte is not only disabling but also prolongs hospital stay and increases the cost of treatment along with myocardial infarction and arrhythmia due to electrolyte imbalance pe is one of the commonest causes of sudden unexplained deaths in hospitalized patients it is estimated that million cases of lower extremity dvt occur in the usa alone the prevailing notion that the incidence of vte in asians is less than that in the western pop

# Model Selection and Fine-Tuning

In [8]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# Load the model and tokenizer from the same pretrained checkpoint,
# so the checkpoint name is written only once.
checkpoint_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of article/abstract pairs for T5 summarization.

    Args:
        examples: Batch mapping with an "article" and an "abstract" entry,
            each a list of strings.

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles
        (truncated/padded to 512 tokens) with an added "labels" entry: the
        abstract token ids (truncated/padded to 150 tokens) in which every
        padding id has been replaced by -100.
    """
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    # `text_target=` is the replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["abstract"], max_length=150, truncation=True, padding="max_length")
    # Padding positions must not contribute to the training loss. -100 is the
    # label value the metric code later expects to find (and swaps back to the
    # pad id before decoding), so padded positions are marked with it here.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split, handing whole batches of records to the function at once.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-928c5032f2670a12.arrow
Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-c16a1f1603b00963.arrow
Loading cached processed dataset at C:\Users\HUZAIFA\.cache\huggingface\datasets\ccdv___pubmed-summarization\section\1.0.0\f765ec606c790e8c5694b226814a13f1974ba4ea98280989edaffb152ded5e2b\cache-529e98dfba94d38e.arrow


In [11]:
# Fine-tune the model
# Hyper-parameters of the run, collected in one mapping before being handed over.
hyperparameters = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**hyperparameters)

# Train on the tokenized training split and evaluate on the validation split.
train_split = tokenized_dataset["train"]
validation_split = tokenized_dataset["validation"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, abstract. If article, abstract are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 119924
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 29981
  Number of trainable parameters = 60506624


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# Saving the Fine-Tuned Model

In [None]:
# Save the model and tokenizer side by side in one directory.
fine_tuned_dir = "./fine-tuned-t5"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

# Performance Evaluation and Metrics

In [None]:
# Evaluate the model
# `load_metric` was used without ever being imported, which raises NameError on
# a fresh kernel; it is imported here from the `datasets` package the notebook
# already depends on.
# NOTE(review): newer `datasets` releases deprecate `load_metric` in favour of
# the separate `evaluate` package - confirm against the installed version.
from datasets import load_metric

metric = load_metric("rouge")

def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with the ROUGE metric.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` may be an
            array of token ids, an array of per-token scores, or a tuple whose
            first element is one of those; ``labels`` is an array of token ids
            in which ignored positions are marked with -100.

    Returns:
        A dict mapping each key returned by the metric to its mid F-measure,
        scaled to a percentage.
    """
    # Local import: numpy is not imported anywhere else in the notebook.
    import numpy as np

    predictions, labels = eval_pred
    # When the model returns several outputs they arrive as a tuple; the first
    # element is taken as the predictions.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # A 3-D array is assumed to be scores of shape (batch, seq, vocab) - TODO
    # confirm - and is reduced to the highest-scoring token id per position.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    # Replace -100 in predictions and labels as we can't decode them
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of every score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    return result

# Evaluate
# `Trainer.evaluate` has no `compute_metrics` parameter; the callback has to be
# given to the trainer's constructor. A Seq2SeqTrainer with
# `predict_with_generate=True` is built for evaluation so that the callback
# receives generated token ids instead of raw model outputs.
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

eval_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=150,  # same length cap as the tokenized abstracts
)
eval_trainer = Seq2SeqTrainer(
    model=model,
    args=eval_args,
    compute_metrics=compute_metrics,
)

results = eval_trainer.evaluate(eval_dataset=tokenized_dataset["validation"], metric_key_prefix="eval")
print(results)