In [1]:
# !pip install transformers


In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW


In [3]:
# Define a custom dataset class to load the CSV file
class CricketDataset(Dataset):
    def __init__(self, csv_path, tokenizer):
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        over = str(self.data.iloc[index]['Over_num'])
        comment = str(self.data.iloc[index]['Comment'])
        input_text = "summarize a cricket match from ball by ball commentary: " + over + " " + comment
        target_text = self.tokenizer.encode(comment, max_length=128, truncation=True)

        return {
            'input_ids': self.tokenizer.encode(input_text, max_length=512, truncation=True, padding='max_length'),
            'attention_mask': self.tokenizer.encode_plus(input_text, max_length=512, truncation=True, padding='max_length', return_attention_mask=True)['attention_mask'],
            'target_ids': target_text.ids,
            'target_attention_mask': target_text.attention_mask
        }

In [4]:
# !pip install sentencepiece

In [5]:
# Define the T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
from google.colab import files
uploaded = files.upload()

Saving IPL_Match_Highlights_Commentary.csv to IPL_Match_Highlights_Commentary.csv


In [7]:
import io
df = pd.read_csv(io.BytesIO(uploaded['IPL_Match_Highlights_Commentary.csv']))

In [8]:
# Define the dataset and dataloader
dataset = CricketDataset('IPL_Match_Highlights_Commentary.csv', tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [9]:
# Define the optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=1e-4)



In [10]:
# Train the model for 5 epochs
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target_ids = batch['target_ids'].to(device)
        target_attention_mask = batch['target_attention_mask'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=target_ids[:, :-1],
            decoder_attention_mask=target_attention_mask[:, :-1],
            labels=target_ids[:, 1:]
        )

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


AttributeError: ignored

In [None]:
print('Epoch:', epoch+1, '  Loss:', total_loss/len(dataloader)))