In [1]:
# !pip install 'transformers[torch]' datasets wandb

# GPT2 fine-tuning on Masnavi dataset

In [2]:
!nvidia-smi

Tue Mar  4 21:44:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  |   00000000:07:00.0 Off |                    0 |
| N/A   33C    P0             73W /  400W |       1MiB /  40960MiB |    100%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, GPT2Config, GPT2LMHeadModel
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from datasets import load_dataset
import random
import numpy as np
import time
import datetime
import os
import wandb
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2025-03-04 21:44:33.996231: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741124674.014183    3798 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741124674.019648    3798 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
df = pd.read_csv('./masnavi.csv')  # With 25,664 verses

In [5]:
class GPT2Dataset(Dataset):

  def __init__(self, df, tokenizer, gpt2_type="gpt2", max_length=768):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for i, row in df.iterrows():
        
        encodings_dict = tokenizer('<|startoftext|>'+ row['prompt']+ '<|tab|>' + row['completion'] + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

        self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
        self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
    
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx] 


In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium


In [7]:
print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

The max model length is 1024 for this model, although the actual embedding size for GPT small is 768
The beginning of sequence token <|startoftext|> token has the id 50257
The end of sequence token <|endoftext|> has the id 50256
The padding token <|pad|> has the id 50258


In [8]:
dataset = GPT2Dataset(df, tokenizer, max_length=128)

In [9]:
# Split into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

23,097 training samples
2,567 validation samples


In [10]:
batch_size = 128

In [11]:
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = 4 # Evaluate with this batch size.
        )

In [12]:
# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [13]:
# some parameters I cooked up that work reasonably well

epochs = 10
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [14]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

In [15]:
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [16]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [17]:
# Initialize wandb - do this before training
wandb.login(key="5dcb5df4678703c1f6af0964344aa79cc51893d5")
run = wandb.init(
    project="masnavi",  # Choose a project name
    name="training-masnavi",    # Optional: name this specific run
    config={
        "learning_rate": learning_rate,  # Add your hyperparameters
        "epochs": epochs,
        "batch_size": batch_size,
        "model_name": "gpt2",
        # Add any other hyperparameters you want to track
    }
    # Remove the settings parameter
)

# After defining your model but before training
# This is where you control gradient logging:
# wandb.watch(model, log="all", log_freq=100)  # Logs gradients and parameters

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ubuntu/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mminarezaei82[0m ([33mminarezaei82-plexure[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [18]:
total_t0 = time.time()

training_stats = []

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    # Add tqdm for training progress
    with tqdm(train_dataloader, unit="batch") as tepoch:
        for batch in tepoch:
    
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)
    
            model.zero_grad()        
    
            outputs = model(b_input_ids,
                            labels=b_labels, 
                            attention_mask=b_masks,
                            token_type_ids=None
                          )
    
            loss = outputs[0]  
    
            batch_loss = loss.item()
            total_train_loss += batch_loss
    
            # # Get sample every x batches.
            # if step % sample_every == 0 and not step == 0:
            #     elapsed = format_time(time.time() - t0)
            #     print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
            #     model.eval()
            #     sample_outputs = model.generate(
            #                             bos_token_id=random.randint(1,30000),
            #                             do_sample=True,   
            #                             top_k=50, 
            #                             max_length = 200,
            #                             top_p=0.95, 
            #                             num_return_sequences=1
            #                         )
            #     for i, sample_output in enumerate(sample_outputs):
            #           print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            #     model.train()
    
            loss.backward()
    
            optimizer.step()
    
            scheduler.step()
            tepoch.set_postfix(loss=loss.item())
            
    
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       
    run.log({'train_loss':avg_train_loss})
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.6f}".format(avg_train_loss))  # Changed to 6 decimal places
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Add tqdm for validation progress
    with tqdm(validation_dataloader, unit="batch") as vepoch:
        for batch in vepoch:
            
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)
            
            with torch.no_grad():        
                outputs = model(
                    b_input_ids,
                    attention_mask=b_masks,
                    labels=b_labels,
                    token_type_ids=None
                )
                
                loss = outputs[0]
                
            batch_loss = loss.item()
            total_eval_loss += batch_loss
            vepoch.set_postfix(loss=loss.item())

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    run.log({'val_loss':avg_val_loss})
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.6f}".format(avg_val_loss))  # Changed to 6 decimal places
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
run.finish()
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.821]



  Average training loss: 2.006683
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 87.01batch/s, loss=0.766]


  Validation Loss: 0.793254
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.776]



  Average training loss: 0.784107
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 87.47batch/s, loss=0.704]


  Validation Loss: 0.733813
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.719]



  Average training loss: 0.733242
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 87.66batch/s, loss=0.682]


  Validation Loss: 0.703753
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.697]



  Average training loss: 0.703311
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 88.02batch/s, loss=0.671]


  Validation Loss: 0.687138
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.682]



  Average training loss: 0.681270
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 87.69batch/s, loss=0.66] 


  Validation Loss: 0.670796
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.662]



  Average training loss: 0.663124
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 86.72batch/s, loss=0.663]


  Validation Loss: 0.660859
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.648]



  Average training loss: 0.647517
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 87.24batch/s, loss=0.651]


  Validation Loss: 0.651765
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.624]



  Average training loss: 0.633742
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 86.81batch/s, loss=0.639]


  Validation Loss: 0.645734
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.627]



  Average training loss: 0.622330
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 87.08batch/s, loss=0.642]


  Validation Loss: 0.642185
  Validation took: 0:00:07

Training...


100%|██████████| 181/181 [02:28<00:00,  1.22batch/s, loss=0.611]



  Average training loss: 0.612821
  Training epoch took: 0:02:29

Running Validation...


100%|██████████| 642/642 [00:07<00:00, 87.23batch/s, loss=0.636]

  Validation Loss: 0.640872
  Validation took: 0:00:07





0,1
train_loss,█▂▂▁▁▁▁▁▁▁
val_loss,█▅▄▃▂▂▂▁▁▁

0,1
train_loss,0.61282
val_loss,0.64087



Training complete!
Total training took 0:26:02 (h:mm:ss)


In [19]:
torch.save(model, "./model_masnavi.pth")

In [24]:
# Set the model to evaluation mode
model.eval()

# Define the input prompt
prompt = "بشنو از نی چون حکایت می‌کند" 
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Move model & tensors to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# Generate text
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,  # Avoid padding issues
    eos_token_id=tokenizer.eos_token_id,  # Stop generation correctly
)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print(generated_text)


بشنو از نی چون حکایت می‌کند<|tab|>مرده بيرون كه در خوف و جنس رند
