In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel,GPT2Tokenizer
# Load the first training shard into a DataFrame
data=pd.read_parquet('train-00000-of-00003.parquet')
# Preview only the first rows instead of rendering the whole frame;
# the complete DataFrame remains available as `data` for later cells.
data.head()

In [None]:
# Work on a small subset: the first 100 rows of the loaded frame
data1 = data.head(100)

In [None]:
# Load the tokenizer that matches the pretrained 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# If the tokenizer has no padding token configured, register its
# end-of-sequence token as the padding token.
needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
  pad_spec = {'pad_token': tokenizer.eos_token}
  tokenizer.add_special_tokens(pad_spec)

In [None]:
# Load the pretrained 'gpt2' weights with the language-modelling head
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
# Tokenize the input and the target text
import torch
from torch.utils.data import Dataset
class CustomDataset(Dataset):
  """Map-style dataset that tokenizes article/highlights pairs to fixed-length tensors.

  Each item is a dict with three 1-D tensors, every one exactly ``max_len`` long:

  * ``'input_ids'`` - token ids of the (article, highlights) pair as produced by
    ``tokenizer.encode_plus``, truncated or right-padded to ``max_len``.
  * ``'mask'``      - the matching attention mask (0 on padded positions).
  * ``'target'``    - token ids of the highlights alone, truncated or right-padded.

  Args:
    data: DataFrame with ``article`` and ``highlights`` columns.
    tokenizer: object providing ``encode_plus``, ``encode``, ``pad_token_id``
      and ``eos_token_id``.
    max_len: length every returned tensor is forced to.
  """
  def __init__(self,data,tokenizer,max_len):
    self.data=data
    self.tokenizer=tokenizer
    self.max_len=max_len

  def _fit(self,ids,fill):
    """Return ``ids`` cut or right-padded with ``fill`` to exactly ``self.max_len`` items."""
    ids=list(ids)[:self.max_len]
    return ids+[fill]*(self.max_len-len(ids))

  def __getitem__(self,index):
    # Positional lookup: stays correct when the frame is a slice whose index
    # labels do not run 0..n-1 (a label lookup would raise KeyError there).
    feature=self.data['article'].iloc[index]
    label=self.data['highlights'].iloc[index]
    encoding=self.tokenizer.encode_plus(feature,label)
    target=self.tokenizer.encode(label)
    # Ask the tokenizer for the padding id instead of hard-coding 50256;
    # fall back to EOS when no padding token has been configured.
    pad_id=self.tokenizer.pad_token_id
    if pad_id is None:
      pad_id=self.tokenizer.eos_token_id
    # Over-long sequences are truncated on the right. The previous
    # abs(length-max_len) padding made them even longer instead.
    input_id=self._fit(encoding['input_ids'],pad_id)
    mask=self._fit(encoding['attention_mask'],0)
    targets=self._fit(target,pad_id)
    return{
        'input_ids':torch.tensor(input_id),'mask':torch.tensor(mask),'target':torch.tensor(targets)
    }

  def __len__(self):
    # Size of the frame this dataset wraps, not of the notebook-global `data`.
    return len(self.data)
# Wrap the 100-row subset; every item is padded out to 3000 tokens
x = CustomDataset(data1, tokenizer, max_len=3000)


In [None]:
# Set up the optimizer: AdamW over all model parameters
learning_rate = 2e-5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# Training loop: fine-tune the model as a causal LM on the tokenized
# article+highlights sequences, one fixed-size chunk per optimizer step.
batch_size=10       # samples grouped per progress report (the optimizer still steps per chunk)
data_len=len(x)
max_len=3000        # tokens per sample; keep in sync with the dataset's max_len
chunk_len=1000      # tokens per forward pass (GPT-2 accepts at most 1024 positions)
IGNORE_INDEX=-100   # label value the Hugging Face LM loss skips

model.train()  # training mode only needs to be enabled once, not on every step
for epoch in range(3):
    total_loss = 0.0
    num_batches = 0

    for batch in range(0,data_len,batch_size):
        end=min(batch+batch_size,data_len)
        batch_loss=0.0
        for i in range(batch,end):
            sample=x[i]
            for k in range(0,max_len,chunk_len):
                # Select tensors by key so the slicing does not depend on the
                # order in which the dataset happens to build its dict.
                input_ids=sample['input_ids'][k:k+chunk_len]
                mask=sample['mask'][k:k+chunk_len]
                # Causal-LM labels are the input ids themselves (the model
                # shifts them internally); padded positions are excluded from
                # the loss. The dataset's separate 'target' tensor is not
                # position-aligned with input_ids, so it cannot serve as labels.
                target=input_ids.masked_fill(mask==0,IGNORE_INDEX)
                # After the internal shift only labels[1:] are scored; a chunk
                # with none left (e.g. pure padding) would yield a NaN loss.
                if not bool((target[1:]!=IGNORE_INDEX).any()):
                    continue
                optimizer.zero_grad()
                output=model(input_ids=input_ids,attention_mask=mask,labels=target)
                loss=output.loss
                loss.backward()
                optimizer.step()
                batch_loss+=loss.item()
        total_loss += batch_loss
        num_batches += 1

        print(f"Epoch {epoch + 1}, Batch {num_batches}, Loss: {batch_loss:.4f}")

    # Guard against an empty dataset, where no batch was processed.
    epoch_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {epoch + 1} Loss: {epoch_loss:.4f}")

print(f"Total samples processed: {data_len}")