# Importing Libraries

In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


# Loading The Dataset

In [None]:
# Load the training data from train.csv (path is relative to the notebook's working directory).
df = pd.read_csv('train.csv')
# Bare expression: displayed as the cell's output for a quick look at the loaded frame.
df

# Splitting The Dataset

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Initializing GPT2 Model And Tokenizer

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token. Reuse the end-of-text token for padding instead of
# registering a new '[PAD]' entry: a new entry enlarges the tokenizer's vocabulary beyond
# the model's embedding matrix (which is never resized in this notebook), so its id would
# be out of range for the model, and the mismatch would be saved with the tokenizer.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Setting The Device

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters to the chosen device (the returned module is the cell output).
model.to(device)

# Creating a custom dataset class for text generation

In [None]:
class CustomDataset(Dataset):
    """Context/response pairs tokenized for causal language-model fine-tuning.

    Each item is the context and response joined by a single space, tokenized,
    then truncated/padded to ``max_length`` and returned as 1-D tensors.
    """

    def __init__(self, context, response, tokenizer, max_length):
        """Store the raw text columns and tokenization settings.

        Args:
            context: Sequence supporting ``len()`` and ``.iloc`` (e.g. a pandas
                Series) holding the context texts.
            response: Sequence supporting ``.iloc`` holding the matching responses;
                expected to be aligned with ``context`` (only ``len(context)`` is used).
            tokenizer: Callable returning a mapping with ``'input_ids'`` and
                ``'attention_mask'`` tensors.
            max_length: Length every example is padded or truncated to.
        """
        self.context = context
        self.response = response
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (one per context entry)."""
        return len(self.context)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th pair.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and ``labels``.
            ``labels`` equals ``input_ids`` except at padding positions, which are
            set to -100 so the language-modeling loss ignores them.
        """
        context = str(self.context.iloc[idx])
        response = str(self.response.iloc[idx])

        input_text = f'{context} {response}'
        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # Use input_ids as labels for language modeling, but mask out padding:
        # otherwise the model is trained to predict the pad token at every padded
        # position, which dominates the loss for short examples. -100 is the index
        # the loss ignores. Clone first so input_ids itself is left untouched.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Creating instances of the custom dataset

In [None]:
# Wrap each split's context/response columns in a CustomDataset, padding/truncating
# every example to 512 tokens.
train_dataset, val_dataset = (
    CustomDataset(split['context'], split['response'], tokenizer, max_length=512)
    for split in (train_df, val_df)
)

# Creating data loaders

In [None]:
# Batches of two examples each; only the training split is shuffled.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=2,
    shuffle=False,
)

# Initializing optimizer and training parameters

In [None]:
# Use PyTorch's AdamW: the implementation exported by `transformers` is deprecated in
# favour of torch.optim.AdamW. eps and weight_decay are passed explicitly to keep the
# same effective hyper-parameters as before (the deprecated class's defaults), since
# PyTorch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Number of full passes over the training set.
epochs = 3

# Training loop

In [None]:
# Fine-tune for `epochs` passes over the training data, printing the mean batch loss per epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        # Move this batch's tensors onto the same device as the model.
        input_ids, attention_mask, labels = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step before the new forward pass.
        optimizer.zero_grad()

        # Passing labels makes the model compute the language-modeling loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        # Accumulate as a Python float so no autograd graph is retained.
        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f'Training Loss: {average_loss}')

# Validation Loop
# Runs once, after all training epochs, and reports the mean batch loss on the held-out split.
model.eval()
val_loss = 0

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Validation'):
        input_ids, attention_mask, labels = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        )

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        val_loss += loss.item()

average_val_loss = val_loss / len(val_loader)
print(f'Validation Loss: {average_val_loss}')


# Saving The Model And Tokenizer

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory so the two can be
# reloaded together later — presumably via from_pretrained('fine_tuned_gpt2_model').
model.save_pretrained('fine_tuned_gpt2_model')
tokenizer.save_pretrained('fine_tuned_gpt2_model')