# In [None]:
import os
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
    DataCollatorForLanguageModeling,
    AdamW,
)
from datasets import load_dataset
from tqdm import tqdm

# Checkpoint name shared by the tokenizer and the model weights.
model_name = 'gpt2'

# Causal language model that will be fine-tuned below.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Matching tokenizer; its end-of-sequence token is reused as the padding
# token so that sequences of different lengths can be padded into a batch.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the training corpus with the plain-text loader; the file is exposed
# as the 'train' split.
data_files = dict(train='Your_Data.txt')
datasets = load_dataset('text', data_files=data_files)

# Tokenize the dataset
def tokenize_function(examples):
    """Run the module-level tokenizer over the ``text`` field of ``examples``.

    Anything longer than 512 tokens is truncated. The tokenizer's output is
    returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, max_length=512, truncation=True)
    return encoded

# Apply the tokenizer batch-wise and drop the raw "text" column afterwards.
tokenized_datasets = datasets.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Filter out empty sequences
def filter_empty_examples(example):
    """Return True when ``example['input_ids']`` holds at least one token."""
    token_ids = example['input_ids']
    return len(token_ids) != 0

# Keep only the examples accepted by `filter_empty_examples`.
tokenized_datasets = tokenized_datasets.filter(
    filter_empty_examples,
)

# Collator for causal language modelling (mlm=False); it pads the inputs of
# each batch dynamically.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Shuffled mini-batches of two examples drawn from the training split.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=2,
)

# Set up the optimizer and scheduler.
# `torch.optim.AdamW` is used instead of the `AdamW` class exported by
# `transformers`, which is deprecated upstream in favour of the PyTorch
# implementation. `eps` and `weight_decay` are pinned to the defaults of the
# transformers class (1e-6 and 0.0) to stay close to the previous
# configuration; PyTorch's own defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Number of training epochs
epochs = 3

# Total number of training steps: one optimizer step per batch, per epoch
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler: linear schedule over `total_steps`
# with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU, and
# place the model on the chosen device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop: for every epoch, run one optimizer + scheduler step per
# batch and report the mean loss over the batches that were actually trained
# on.
model.train()

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    epoch_loss = 0.0
    # Number of batches that contributed to `epoch_loss`; skipped batches
    # are excluded so they do not drag the reported average down.
    batches_seen = 0
    for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Debugging statements. `tqdm.write` is used instead of `print` so
        # the messages do not garble the progress bar.
        tqdm.write(f"input_ids shape: {input_ids.shape}")
        tqdm.write(f"labels shape: {labels.shape}")

        # Check for empty sequences
        if input_ids.size(1) == 0:
            tqdm.write("Empty input_ids encountered. Skipping this batch.")
            continue  # Skip this batch

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients only for batches that are really processed.
        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss on `outputs.loss`.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
        batches_seen += 1

    # Guard against an empty dataloader (or every batch being skipped),
    # which would otherwise raise ZeroDivisionError.
    avg_loss = epoch_loss / batches_seen if batches_seen else float('nan')
    print(f'Average loss: {avg_loss}')

# Write the fine-tuned model and the tokenizer into the same directory.
output_dir = './fine_tuned_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
