In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the corpus once and carve out a held-out validation split.
# Reading the same CSV into both frames would make the "validation" loss a
# second measurement on the training rows, hiding any overfitting.
DATA_PATH = '/content/personality.csv'
VAL_FRACTION = 0.1   # share of rows held out for validation
SPLIT_SEED = 42      # fixed so the split is identical on every Restart & Run All

conversations_df = pd.read_csv(DATA_PATH)
train_df, val_df = train_test_split(
    conversations_df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)
# Drop the shuffled original index so each frame is labelled 0..n-1.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Instantiate the GPT2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

class ConversationDataset(Dataset):
    """Map-style dataset that tokenizes one persona/chat row per item.

    Each item is the row's 'Persona' text and 'chat' text encoded together as
    a text pair. Items are returned unpadded, so their lengths differ from row
    to row; batching them requires a padding ``collate_fn``.

    Args:
        dataframe: pandas DataFrame with a 'Persona' column and a 'chat' column.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts ``(text, text_pair, ...)`` and returns a mapping holding
            'input_ids' and 'attention_mask' tensors.
        max_length: Optional cap on tokens per example, forwarded to the
            tokenizer. ``None`` (the default) leaves the limit to whatever the
            tokenizer applies for ``truncation=True``.
    """

    def __init__(self, dataframe, tokenizer, max_length=None):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return 1-D 'input_ids' / 'attention_mask' tensors."""
        # Fetch the row once instead of materialising it per column.
        row = self.data.iloc[idx]

        # Call the tokenizer directly; `encode_plus` is the deprecated spelling
        # of the same single-pair encoding.
        encoding = self.tokenizer(
            row['Persona'],
            row['chat'],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis; flatten it away so
        # every item is a plain 1-D sequence.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

# Create datasets and dataloaders
BATCH_SIZE = 4
LEARNING_RATE = 5e-5
SEED = 42
IGNORE_INDEX = -100  # label value skipped by the model's built-in LM loss
# GPT-2 defines no padding token, so EOS is reused as filler. Padded positions
# are masked out of attention and of the loss, so the filler id never matters.
PAD_TOKEN_ID = tokenizer.eos_token_id

# Seed torch so shuffling order is reproducible across runs.
torch.manual_seed(SEED)


def collate_conversations(examples):
    """Pad variable-length examples into one rectangular batch.

    The default DataLoader collate stacks tensors and therefore fails with
    "stack expects each tensor to be equal size" when sequences differ in
    length; this pads every sequence on the right to the longest in the batch.

    Args:
        examples: list of dicts, each with 1-D 'input_ids' and
            'attention_mask' tensors (as produced by ConversationDataset).

    Returns:
        dict with 2-D 'input_ids', 'attention_mask' and 'labels' tensors.
        'labels' copies 'input_ids' but holds IGNORE_INDEX wherever the
        attention mask is 0, so padding contributes nothing to the loss.
    """
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [example['input_ids'] for example in examples],
        batch_first=True,
        padding_value=PAD_TOKEN_ID,
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [example['attention_mask'] for example in examples],
        batch_first=True,
        padding_value=0,
    )
    labels = input_ids.masked_fill(attention_mask == 0, IGNORE_INDEX)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


def run_epoch(model, loader, device, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Args:
        model: language model whose forward accepts input_ids, attention_mask
            and labels and returns an object with a ``loss`` attribute.
        loader: iterable of batches shaped like collate_conversations' output.
        device: torch.device the batches are moved to before the forward pass.
        optimizer: when given, the pass trains (backward + step); when None,
            the pass only evaluates, with autograd disabled.

    Returns:
        float mean of the batch losses, or NaN if the loader yields no batches.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    batch_count = 0
    # Disable graph construction during evaluation to save memory and time.
    with torch.set_grad_enabled(training):
        for batch in loader:
            batch = {key: value.to(device) for key, value in batch.items()}
            loss = model(**batch).loss
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            batch_count += 1

    return loss_sum / batch_count if batch_count else float('nan')


train_dataset = ConversationDataset(train_df, tokenizer)
val_dataset = ConversationDataset(val_df, tokenizer)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_conversations
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_conversations
)

# Load pre-trained GPT-2 model and place it on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
epochs = 3

for epoch in range(epochs):
    # Report epoch-level means rather than whichever batch happened to be last.
    train_loss = run_epoch(model, train_loader, device, optimizer)
    val_loss = run_epoch(model, val_loader, device)

    print(f"Epoch {epoch+1}: Train Loss - {train_loss}, Val Loss - {val_loss}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_gpt2_model")


RuntimeError: stack expects each tensor to be equal size, but got [213] at entry 0 and [196] at entry 1

In [None]:
!pip install transformers