In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader

In [3]:
# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [4]:
# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [5]:
# Define a dataset class to preprocess the data
class CricketCommentaryDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]
        match_id = row["Match_id"]
        over_num = row["over_num"]
        comment = row["comment"]
        
        # Generate the input sequence for the model
        input_seq = f"Match ID: {match_id} Over Number: {over_num} Comment: {comment}"
        input_seq = input_seq.strip()
        
        # Tokenize the input sequence and truncate to max_len
        input_ids = self.tokenizer.encode(input_seq, truncation=True, max_length=self.max_len, padding="max_length")
        
        return torch.tensor(input_ids)
    

In [6]:
# Define a data loader class to load the preprocessed data
class CricketCommentaryDataLoader(DataLoader):
    def __init__(self, dataset, batch_size):
        super().__init__(dataset, batch_size=batch_size, collate_fn=self.pad_sequence)
        
    def pad_sequence(self, batch):
        # Pad the sequences to the maximum sequence length in the batch
        max_len = max([len(seq) for seq in batch])
        padded_batch = torch.zeros(len(batch), max_len).long()
        
        for i, seq in enumerate(batch):
            padded_batch[i, :len(seq)] = seq
            
        return padded_batch

In [7]:
from google.colab import files
uploaded = files.upload()

Saving Cricket_Commentary.csv to Cricket_Commentary (1).csv


In [8]:
# Load the CSV file with the cricket commentary data
data = pd.read_csv("Cricket_Commentary.csv")

In [9]:
# Truncate comments to maximum length supported by the model
max_length = tokenizer.model_max_length - 2  # Account for special tokens
data['comment'] = data['comment'].str.slice(stop=max_length)

In [10]:
# Initialize the datasets and data loaders
train_data = data.sample(frac=0.8, random_state=42)
val_data = data.drop(train_data.index)

train_dataset = CricketCommentaryDataset(train_data, tokenizer, max_len=256)
val_dataset = CricketCommentaryDataset(val_data, tokenizer, max_len=256)

train_dataloader = CricketCommentaryDataLoader(train_dataset, batch_size=8)
val_dataloader = CricketCommentaryDataLoader(val_dataset, batch_size=8)

In [11]:
# Define the training loop function
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    
    for input_ids in dataloader:
        input_ids = input_ids.to(device)
        
        # Clear the gradients
        optimizer.zero_grad()
        
        # Get the model predictions
        outputs = model(input_ids, labels=input_ids)
        loss, logits = outputs[:2]
        
        # Backpropagate the loss and update the weights
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    return total_loss / len(dataloader)


In [12]:
# Define the evaluation function
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    
    with torch.no_grad():
        for input_ids in dataloader:
            input_ids = input_ids.to(device)
            
            # Get the model predictions
            outputs = model(input_ids, labels=input_ids)
            loss, logits = outputs[:2]
            
            total_loss += loss.item()
            
    return total_loss / len(dataloader)

In [13]:
# Train the model for 5 epochs
num_epochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
for epoch in range(num_epochs):
    # train_loss = train(model, train_dataloader, optimizer, criterion, device)
    # val_loss = evaluate(model, val_dataloader, criterion, device)
    
    # print(f"Epoch {epoch+1}/{num_epochs} Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f}")
    
    # Generate a sample summary after every epoch
    sample_match_ids = val_data["Match_id"].unique()[:3] # Generate sample summary for first 3 matches in validation set
    
    for match_id in sample_match_ids:
        match_data = val_data[val_data["Match_id"] == match_id].reset_index(drop=True)
        summary = ""
        
        for i in range(len(match_data)):
            input_seq = f"Match ID: {match_id} Over Number: {match_data.loc[i, 'over_num']} Comment: {match_data.loc[i, 'comment']}"
            input_seq = input_seq.strip()
            input_ids = tokenizer.encode(input_seq, truncation=True, max_length=256, padding="max_length")
            input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
            
            # Generate the summary using the model
            output_ids = model.generate(input_ids, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
            output_str = tokenizer.decode(output_ids.squeeze(), skip_special_tokens=True)
            
            summary += f"{output_str} "
        
        print(f"Match ID: {match_id} Summary: {summary}")

IndexError: ignored