### Training data URL:
https://drive.google.com/drive/folders/1lN7tD2R5Zb4pnzTXx2nA0fjWlIJxAL6N?usp=sharing

In [None]:
pip install torch transformers datasets flash_attn

In [20]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_scheduler
from datasets import load_dataset
import os
import glob
import tqdm
from torch.amp import autocast, GradScaler
from torch.utils.tensorboard import SummaryWriter

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")


In [13]:
##############################################
# 1. Data Loading and Preparation
##############################################
# Load Parquet dataset
class ParquetDataset(Dataset):
    """Map-style dataset that tokenizes rows read from one or more Parquet files.

    Every row must provide either a ``text`` entry or a ``conversations``
    entry (an iterable of mappings with ``role`` and ``content`` keys).
    Items are returned as 1-D tensors of token ids produced by ``tokenizer``
    with truncation and padding to ``block_size``.

    The object returned by ``load_dataset`` for each file is kept as-is (one
    "shard" per file) instead of being copied row by row into a Python list,
    so construction does not materialize every row up front, and files with
    different columns can still be mixed.

    Args:
        file_paths: Iterable of Parquet file paths.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``.
        block_size: Sequence length every item is truncated/padded to.
    """

    def __init__(self, file_paths, tokenizer, block_size=512):
        # One loaded split per file; rows are only read on access.
        self.shards = [
            load_dataset("parquet", data_files=file_path)["train"]
            for file_path in file_paths
        ]
        self.tokenizer = tokenizer
        self.block_size = block_size

    def __len__(self):
        """Return the total number of rows across all shards."""
        return sum(len(shard) for shard in self.shards)

    def _row(self, idx):
        """Return the raw row at global position ``idx`` (negative allowed).

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        position = idx + total if idx < 0 else idx
        if not 0 <= position < total:
            raise IndexError(f"index {idx} is out of range for dataset of size {total}")
        # Walk the shards in file order, consuming the offset as we go.
        for shard in self.shards:
            shard_size = len(shard)
            if position < shard_size:
                return shard[position]
            position -= shard_size
        raise IndexError(f"index {idx} is out of range for dataset of size {total}")

    def __getitem__(self, idx):
        """Tokenize the row at ``idx`` and return its ``input_ids`` tensor.

        Raises:
            ValueError: If the row has neither ``text`` nor ``conversations``.
            IndexError: If ``idx`` is out of range.
        """
        data_entry = self._row(idx)

        # Validate the row and extract its text.
        if "text" in data_entry:
            text = data_entry["text"]
        elif "conversations" in data_entry:
            # Flatten the conversation turns into a single "role: content" string.
            text = " ".join(f"{convo['role']}: {convo['content']}" for convo in data_entry["conversations"])
        else:
            raise ValueError("Dataset entry must contain 'text' or 'conversations' column")

        tokenized = self.tokenizer(
            text,
            truncation=True,
            max_length=self.block_size,
            padding="max_length",
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; drop that leading dimension.
        return tokenized["input_ids"].squeeze(0)


def load_parquet_files_from_directory(directory_path):
    """Return the ``*.parquet`` files directly inside ``directory_path``.

    The paths are sorted because ``glob.glob`` yields them in arbitrary
    order; a stable order keeps the index-to-sample mapping of datasets built
    from this list the same across runs and machines.

    Returns:
        A sorted list of matching paths; empty if nothing matches or the
        directory does not exist.
    """
    return sorted(glob.glob(os.path.join(directory_path, "*.parquet")))

# Hyperparameters
# -- batching / sequence length
BLOCK_SIZE = 512                 # tokens per example after truncation/padding
BATCH_SIZE = 2                   # examples per micro-batch
GRADIENT_ACCUMULATION_STEPS = 4  # micro-batches accumulated per optimizer update

# -- optimization schedule
EPOCHS = 3
LEARNING_RATE = 3e-5
WARMUP_STEPS = 500               # scheduler warm-up steps
WEIGHT_DECAY = 0.01
MAX_GRAD_NORM = 1.0              # gradient-norm clipping threshold

# -- regularization
DROPOUT_PROB = 0.1

# Parquet shards for each split, discovered on the mounted drive.
train_files = load_parquet_files_from_directory("/content/drive/MyDrive/data/train")
test_files = load_parquet_files_from_directory("/content/drive/MyDrive/data/test")
eval_files = load_parquet_files_from_directory("/content/drive/MyDrive/data/eval")

In [None]:
##############################################
# 2. Model Initialization and Regularization
##############################################
# Load the tokenizer and the language model from the same pretrained checkpoint.
MODEL_NAME = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# The dataset pads every example to a fixed length, so the tokenizer needs a
# pad token. When none is defined, the end-of-sequence token is reused.
# (Alternative: register a dedicated '[PAD]' token via
# tokenizer.add_special_tokens and then call
# model.resize_token_embeddings(len(tokenizer)).)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Apply regularization (dropout)
def add_regularization_to_model(model, dropout_prob):
    """Set the drop probability of every ``torch.nn.Dropout`` submodule in place.

    Args:
        model: Module whose submodules are scanned via ``model.modules()``.
        dropout_prob: Value assigned to each dropout layer's ``p`` attribute.
    """
    dropout_layers = (
        layer for layer in model.modules() if isinstance(layer, torch.nn.Dropout)
    )
    for layer in dropout_layers:
        layer.p = dropout_prob

# Overwrite the drop probability of every dropout layer with DROPOUT_PROB.
add_regularization_to_model(model, DROPOUT_PROB)

# Attempt to enable Flash Attention (best effort: any ImportError raised in the
# try body falls through to the message below and training continues).
# NOTE(review): confirm that the installed flash_attn release actually exports
# `enable_flash_attention`; if it does not, the import itself raises
# ImportError and the fallback branch runs even when the package is installed.
try:
    from flash_attn import enable_flash_attention
    enable_flash_attention(model)
    print("Flash Attention enabled.")
except ImportError:
    print("Flash Attention is not available. Continuing without it.")

# Move the model parameters to the selected device.
model.to(device)

In [14]:
##############################################
# 3. DataLoader Creation
##############################################
# Prepare datasets and dataloaders
def create_dataloader(file_paths, tokenizer, block_size, batch_size, shuffle=True):
    """Build a ``DataLoader`` over a ``ParquetDataset`` created from ``file_paths``.

    Args:
        file_paths: Parquet files backing the dataset.
        tokenizer: Tokenizer forwarded to ``ParquetDataset``.
        block_size: Sequence length forwarded to ``ParquetDataset``.
        batch_size: Number of examples per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``True`` (the previous, only behavior); pass ``False`` for
            evaluation splits.

    Returns:
        The configured ``DataLoader``.
    """
    dataset = ParquetDataset(file_paths, tokenizer, block_size=block_size)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


train_dataloader = create_dataloader(train_files, tokenizer, BLOCK_SIZE, BATCH_SIZE)
# Held-out splits are only averaged over, so shuffling buys nothing there and
# a fixed order keeps evaluation runs comparable.
test_dataloader = create_dataloader(test_files, tokenizer, BLOCK_SIZE, BATCH_SIZE, shuffle=False)
eval_dataloader = create_dataloader(eval_files, tokenizer, BLOCK_SIZE, BATCH_SIZE, shuffle=False)


In [None]:
##############################################
# 4. Optimizer and Training Setup
##############################################
# Optimizer and scheduler
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favor of torch.optim.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# The scheduler advances once per optimizer update, i.e. once per gradient
# accumulation window, not once per micro-batch. Counting micro-batches here
# would stretch the schedule so the linear decay never completes.
updates_per_epoch = (
    len(train_dataloader) + GRADIENT_ACCUMULATION_STEPS - 1
) // GRADIENT_ACCUMULATION_STEPS
total_steps = updates_per_epoch * EPOCHS
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

# Mixed precision training: loss scaling only applies to CUDA runs, so the
# scaler is explicitly turned into a pass-through on CPU.
scaler = GradScaler(enabled=device.type == "cuda")

# TensorBoard writer
writer = SummaryWriter()

In [None]:
##############################################
# 5. Training Loop with tqdm
##############################################
# Training loop
model.train()
num_batches = len(train_dataloader)
for epoch in range(EPOCHS):
    total_loss = 0.0
    optimizer.zero_grad()
    # `tqdm` is imported as a module in this file, so the progress-bar class
    # is `tqdm.tqdm`; calling the module itself raises TypeError.
    progress_bar = tqdm.tqdm(enumerate(train_dataloader), total=num_batches, desc=f"Epoch {epoch + 1}")

    for step, batch in progress_bar:
        batch = batch.to(device)

        # NOTE(review): the batch holds input ids only and is padded to a fixed
        # length with the pad token, so padded positions presumably contribute
        # to the loss below. Consider masking them (label -100) in both the
        # training and evaluation loops together.
        # Autocast must target the device actually in use, not a fixed 'cuda'.
        with autocast(device_type=device.type):
            outputs = model(input_ids=batch, labels=batch)
            # Scale down so gradients summed over the window average out.
            loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS

        scaler.scale(loss).backward()

        # Update at the end of each accumulation window, and also on the last
        # batch of the epoch so a partial window is applied instead of being
        # thrown away by the zero_grad() at the start of the next epoch.
        window_complete = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
        last_batch = (step + 1) == num_batches
        if window_complete or last_batch:
            # Gradients are still multiplied by the loss scale here; unscale
            # them first so MAX_GRAD_NORM is compared against true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        # Report the real per-batch loss, not the value divided for
        # accumulation; otherwise the averages and perplexity are understated.
        batch_loss = outputs.loss.item()
        total_loss += batch_loss
        writer.add_scalar("Loss/train", batch_loss, epoch * num_batches + step)

        # Update tqdm progress bar description
        progress_bar.set_postfix({"Loss": batch_loss})

    average_loss = total_loss / num_batches
    perplexity = torch.exp(torch.tensor(average_loss))
    print(f"Epoch {epoch + 1} completed. Average Loss: {average_loss}. Perplexity: {perplexity}.")
    writer.add_scalar("Loss/epoch", average_loss, epoch)

    # Save checkpoint (every second epoch)
    if (epoch + 1) % 2 == 0:
        checkpoint_dir = f"./checkpoint_epoch_{epoch + 1}"
        os.makedirs(checkpoint_dir, exist_ok=True)
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Checkpoint saved at {checkpoint_dir}")

writer.close()


In [None]:
##############################################
# 6. Evaluation with tqdm
##############################################
# Evaluation loop
model.eval()
total_eval_loss = 0.0
# `tqdm` is imported as a module in this file, so the progress-bar class is
# `tqdm.tqdm`; calling the module itself raises TypeError.
progress_bar = tqdm.tqdm(eval_dataloader, desc="Evaluating", total=len(eval_dataloader))

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in progress_bar:
        batch = batch.to(device)
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss.item()
        total_eval_loss += loss

        # Update tqdm progress bar description
        progress_bar.set_postfix({"Loss": loss})

# Mean of the per-batch losses; perplexity is its exponential.
average_eval_loss = total_eval_loss / len(eval_dataloader)
perplexity = torch.exp(torch.tensor(average_eval_loss))
print(f"Evaluation Loss: {average_eval_loss}, Perplexity: {perplexity}")


In [None]:
##############################################
# 7. Save Final Model
##############################################
# Persist the fine-tuned weights and the tokenizer side by side in one
# directory.
output_dir = "./gpt2_finetuned"
os.makedirs(output_dir, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")