In [1]:
import pandas as pd
from datasets import Dataset

# Load train, validation, and test datasets
train_df = pd.read_csv('D:\\data.csv\\cnn_dailymail\\train.csv')
val_df = pd.read_csv('D:\\data.csv\\cnn_dailymail\\validation.csv')
test_df = pd.read_csv('D:\\data.csv\\cnn_dailymail\\test.csv')

In [2]:
train_df=train_df[:8000]
val_df=val_df[:500]
test_df=test_df[:500]

In [3]:
train_df

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...
...,...,...,...
7995,169c8907a7a905036b4f8aec7304e19d4503b8dd,"By . Rob Cooper . PUBLISHED: . 11:01 EST, 7 Ma...",Dam which Ben Miller and Adam Flint cycled dow...
7996,169dc376ac54240e52683830a0054f48af4ba5c4,Severe storms tore through the Midwest and Sou...,Tornadoes touched down in several states this ...
7997,169eab99eec8a418ab60ad29e357ed4412c9f25a,Islamabad (CNN) -- The spotlight on so-called ...,Dania Gharaibeh: Condemning the act and its pe...
7998,169f5f6ab3818fc14b9f2471ee0d8dbd61d5e566,"Pasadena, California (CNN) -- When President B...",Veteran Tracey Cooper-Harris married her same-...


In [4]:
references = test_df['highlights'].tolist()

In [5]:
test_df['article'][0]

"Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.\xa0'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for sp

In [9]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [10]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    inputs = tokenizer(examples['article'], truncation=True, padding='max_length', max_length=512)
    targets = tokenizer(examples['highlights'], truncation=True, padding='max_length', max_length=150)
    return {'input_ids': inputs['input_ids'], 'labels': targets['input_ids']}

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
import torch
from torch.utils.data import DataLoader
# Define DataLoader
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(tokenized_val_dataset, batch_size=4, shuffle=False)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=4, shuffle=False)


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
device

device(type='cuda')

In [14]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [15]:
# Move model to GPU
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

# ***Fine-Tune T5 Model***

In [16]:


training_args = TrainingArguments(
    output_dir="D:\\data.csv\\t5_custom_summarizer",
    per_device_train_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=500,  # Evaluate the model every 500 steps
    save_steps=500,  # Save the model every 500 steps
    num_train_epochs=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)




In [None]:
trainer.train()

Step,Training Loss,Validation Loss
500,1.4764,1.05804
1000,1.0706,1.044348
1500,1.0197,1.038672
2000,1.0179,1.032338
2500,1.0195,1.030331
3000,1.0033,1.028054
3500,0.9778,1.026238
4000,1.0044,1.022729
4500,0.9878,1.025038
5000,0.9673,1.024137


In [55]:

model.save_pretrained("./t5_custom_summarizer_model")
tokenizer.save_pretrained("./t5_custom_summarizer")

('./t5_custom_summarizer\\tokenizer_config.json',
 './t5_custom_summarizer\\special_tokens_map.json',
 './t5_custom_summarizer\\spiece.model',
 './t5_custom_summarizer\\added_tokens.json')

# Evaluate on Test Set

In [56]:
loaded_model = T5ForConditionalGeneration.from_pretrained("./t5_custom_summarizer_model")
tokenizer = T5Tokenizer.from_pretrained("./t5_custom_summarizer")

# Example usage for generating a summary
input_text = "Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.\xa0'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for space in the overhead lockers, crashing elbows and seat back kicking? Tests conducted by the FAA use planes with a 31 inch pitch, a standard which on some airlines has decreased . Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches . Cynthia Corbertt, a human factors researcher with the Federal Aviation Administration, that it conducts tests on how quickly passengers can leave a plane. But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News. The distance between two seats from one point on a seat to the same point on the seat behind it is known as the pitch. While most airlines stick to a pitch of 31 inches or above, some fall below this. While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches. British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31."
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = loaded_model.generate(input_ids, max_length=150, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", summary)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Summary: 'The shrinking space on planes is not only uncomfortable - it's putting our health and safety in danger' Some experts are questioning if having such packed out planes is putting passengers at risk. The government is happy to set standards for animals flying on planes, but it doesn't stipulate a minimum amount of space for humans.


In [59]:
input_text

"Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.\xa0'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for sp

In [57]:
# Assuming 'test_references' is a list of reference summaries for the test dataset
test_references = references[0]

In [58]:
test_references

'Experts question if  packed out planes are putting passengers at risk .\nU.S consumer advisory group says minimum space must be stipulated .\nSafety tests conducted on planes with more leg room than airlines offer .'

In [None]:

# Convert predictions to a list
test_predictions = [prediction["generated_text"] for prediction in test_predictions.predictions]

# Compute ROUGE scores
rouge_scores = rouge_metric.compute(predictions=test_predictions, references=test_references)

# Print ROUGE scores
print(rouge_scores)