In [1]:
# Step 1: Set up the environment in Google Colab
!pip install transformers
!pip install datasets
!pip install py7zr

# Step 2: Load the dataset and preprocess
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset
dataset = load_dataset("samsum")
train_data = dataset["train"]

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    inputs = examples["dialogue"]
    targets = examples["summary"]
    inputs = [doc + tokenizer.eos_token for doc in inputs]
    model_inputs = tokenizer(inputs, max_length=512, padding=True, truncation=True, return_tensors="pt")

    # Setup the target for the loss (classification)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=150, padding=True, truncation=True, return_tensors="pt")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Preprocess the data
train_data = train_data.map(preprocess_function, batched=True)


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.0 MB/s[0m eta [36m0:00:0

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



In [10]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSeq2SeqLM, AdamW
from torch.nn.utils.rnn import pad_sequence

# Load the pre-trained model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)

# Fine-tune the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create a custom collate function to pad the sequences within the batch
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    input_ids = pad_sequence([torch.tensor(ids) for ids in input_ids], batch_first=True)
    attention_mask = pad_sequence([torch.tensor(mask) for mask in attention_mask], batch_first=True)
    labels = pad_sequence([torch.tensor(ids) for ids in labels], batch_first=True)

    return {
        "input_ids": input_ids.to(device),
        "attention_mask": attention_mask.to(device),
        "labels": labels.to(device)
    }

train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_fn)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Save the fine-tuned model and tokenizer to Google Drive
output_dir = "/content/drive/My Drive/NLP_Project/FineTunedModel/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)




Epoch 1/10, Loss: 0.4791097640991211
Epoch 2/10, Loss: 0.4270494282245636
Epoch 3/10, Loss: 0.8256051540374756
Epoch 4/10, Loss: 0.7209801077842712
Epoch 5/10, Loss: 0.2849361002445221
Epoch 6/10, Loss: 0.5696766972541809
Epoch 7/10, Loss: 0.16968096792697906
Epoch 8/10, Loss: 0.6650962233543396
Epoch 9/10, Loss: 0.49338194727897644
Epoch 10/10, Loss: 0.5467504262924194


('/content/drive/My Drive/NLP_Project/FineTunedModel/tokenizer_config.json',
 '/content/drive/My Drive/NLP_Project/FineTunedModel/special_tokens_map.json',
 '/content/drive/My Drive/NLP_Project/FineTunedModel/tokenizer.json')

In [4]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a0bdf90514de7a3e0d36d9ead3459a8ccb7247ff83870f2eb8b70407a8c504ab
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [6]:
print(test_data[0].keys())


dict_keys(['id', 'dialogue', 'summary'])


In [11]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
from rouge_score import rouge_scorer

# Load the pre-trained model and tokenizer
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the test dataset
dataset = load_dataset("samsum")
test_data = dataset["test"]

# Define a function to generate summaries using the fine-tuned model
def generate_summary(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Calculate ROUGE scores
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
references = [data["summary"] for data in test_data]
generated_summaries = [generate_summary(data["dialogue"]) for data in test_data]

# Calculate ROUGE scores for each generated summary
scores = []
for summary, reference in zip(generated_summaries, references):
    score = rouge_scorer.score(summary, reference)
    scores.append(score)

print("ROUGE Scores:")
for idx, score in enumerate(scores, 1):
    print(f"Summary {idx}: {score}")


ROUGE Scores:
Summary 1: {'rouge1': Score(precision=0.0625, recall=0.14285714285714285, fmeasure=0.08695652173913043), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.0625, recall=0.14285714285714285, fmeasure=0.08695652173913043)}
Summary 2: {'rouge1': Score(precision=0.08333333333333333, recall=0.1111111111111111, fmeasure=0.09523809523809525), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.08333333333333333, recall=0.1111111111111111, fmeasure=0.09523809523809525)}
Summary 3: {'rouge1': Score(precision=0.13793103448275862, recall=0.4, fmeasure=0.20512820512820515), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.13793103448275862, recall=0.4, fmeasure=0.20512820512820515)}
Summary 4: {'rouge1': Score(precision=0.09090909090909091, recall=0.125, fmeasure=0.10526315789473685), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.0909090909