In [1]:
!pip install transformers
!pip install datasets
!pip install py7zr
!pip install sentencepiece


from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the dataset
dataset = load_dataset("samsum")
train_data = dataset["train"]

# Load the tokenizer for T5-large
tokenizer = T5Tokenizer.from_pretrained("t5-base")

def preprocess_function(examples):
    inputs = examples["dialogue"]
    targets = examples["summary"]
    inputs = [doc + tokenizer.eos_token for doc in inputs]

    # Adjust max_length for T5-large
    model_inputs = tokenizer(inputs, max_length=1024, padding=True, truncation=True, return_tensors="pt")

    # Setup the target for the loss (classification)
    with tokenizer.as_target_tokenizer():
        # Adjust max_length for T5-large
        labels = tokenizer(targets, max_length=300, padding=True, truncation=True, return_tensors="pt")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Preprocess the data
train_data = train_data.map(preprocess_function, batched=True)

# Load the T5-large model
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Rest of the code for fine-tuning and training remains the same


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.3 MB/s[0m eta [36m0:00:0

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("samsum")
train_data = dataset["train"]

# Load the pre-trained T5-base model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)

# Fine-tune the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create a custom collate function to pad the sequences within the batch
def collate_fn(batch):
    input_text = [item["dialogue"] for item in batch]
    target_text = [item["summary"] for item in batch]

    model_inputs = tokenizer(input_text, padding=True, return_tensors="pt", truncation=True)
    labels = tokenizer(target_text, padding=True, return_tensors="pt", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["labels"] = model_inputs["labels"].to(device)
    model_inputs["input_ids"] = model_inputs["input_ids"].to(device)
    model_inputs["attention_mask"] = model_inputs["attention_mask"].to(device)

    return model_inputs

# Assuming you have defined train_data and batch_size for the fine-tuning data
batch_size = 8  # Define the batch size as desired
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode for each epoch
    total_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_epoch_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss}")

# Save the fine-tuned model and tokenizer to Google Drive
output_dir = "/content/drive/My Drive/NLP_Project_2/FineTunedModel_T5_Base/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Epoch 1/5, Loss: 0.9600868444327553
Epoch 2/5, Loss: 0.7910868771430853
Epoch 3/5, Loss: 0.7167887672959917
Epoch 4/5, Loss: 0.6504265364344423
Epoch 5/5, Loss: 0.5961128326049975


('/content/drive/My Drive/NLP_Project_2/FineTunedModel_T5_Base/tokenizer_config.json',
 '/content/drive/My Drive/NLP_Project_2/FineTunedModel_T5_Base/special_tokens_map.json',
 '/content/drive/My Drive/NLP_Project_2/FineTunedModel_T5_Base/spiece.model',
 '/content/drive/My Drive/NLP_Project_2/FineTunedModel_T5_Base/added_tokens.json')

In [4]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=ddd45b3fa56e00cff95b41f1a3b3d170df7b58af41094e8d2245dc60311256b5
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [5]:
!pip install sacrebleu
!pip install sentence-transformers


Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/118.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/118.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m


In [9]:
import nltk

# Calculate BLEU score for the validation set
val_bleu_score = nltk.translate.bleu_score.corpus_bleu(val_references, val_predictions)


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [19]:
!pip install bert-score


NotImplementedError: ignored

In [20]:
# Assuming you have defined the validation dataset as 'val_data'
val_data = dataset["validation"]

# Assuming you have defined the batch size for the validation data
val_batch_size = 8

# Create a custom collate function for validation data
val_dataloader = DataLoader(val_data, batch_size=val_batch_size, shuffle=False, collate_fn=collate_fn)

# Move the model to the appropriate device for validation
model.eval()  # Set the model to evaluation mode for validation
model.to(device)

# Define a function to generate summaries using the fine-tuned model
def generate_summary(input_ids, attention_mask):
    summary_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

# Initialize lists to store references and predictions for validation set
val_references = []
val_predictions = []

# Validation loop
with torch.no_grad():  # No need to compute gradients during validation
    for val_batch in val_dataloader:
        val_input_ids = val_batch["input_ids"]
        val_attention_mask = val_batch["attention_mask"]
        val_labels = val_batch["labels"]

        val_input_ids = val_input_ids.to(device)
        val_attention_mask = val_attention_mask.to(device)
        val_labels = val_labels.to(device)

        val_outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
        val_loss = val_outputs.loss

        # Convert generated summaries to text using tokenizer
        val_generated_summaries = generate_summary(val_input_ids, val_attention_mask)

        # Convert tensors to text using tokenizer
        val_ground_truth_summaries = tokenizer.batch_decode(val_labels, skip_special_tokens=True)

        # Append references and predictions for ROUGE calculation
        val_references.extend(val_ground_truth_summaries)
        val_predictions.extend(val_generated_summaries)

# Calculate ROUGE scores for the validation set
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

val_rouge1_scores = []
val_rouge2_scores = []
val_rougeL_scores = []

for generated, reference in zip(val_predictions, val_references):
    scores = scorer.score(generated, reference)
    val_rouge1_scores.append(scores["rouge1"].fmeasure)
    val_rouge2_scores.append(scores["rouge2"].fmeasure)
    val_rougeL_scores.append(scores["rougeL"].fmeasure)

# Print the metrics for the validation set
print("Validation Metrics:")
print("Validation Loss:", val_loss.item())
print("Epochs\tValidation Loss\tRouge-1\tRouge-2\tRouge-L")
print(f"{num_epochs}\t{val_loss.item()}\t{sum(val_rouge1_scores) / len(val_rouge1_scores):.4f}\t{sum(val_rouge2_scores) / len(val_rouge2_scores):.4f}\t{sum(val_rougeL_scores) / len(val_rougeL_scores):.4f}")


Validation Metrics:
Validation Loss: 0.8595929741859436
Epochs	Validation Loss	Rouge-1	Rouge-2	Rouge-L
5	0.8595929741859436	0.4805	0.2476	0.4022
