## Loading libraries

In [None]:
import time
import json
import os
import logging
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from datetime import datetime
from tqdm.notebook import tqdm

# Local imports
from utils.text_encoder import TextEncoder
from Random_generator import RandomGenerator, TextDecoder

logging.basicConfig(level=logging.INFO)

## Loading h_sample, h_population

In [2]:
# Read h_sample.csv and h_population.csv (in that order) from the working
# directory into two DataFrames.
h_sample, h_population = (
    pd.read_csv(csv_name) for csv_name in ('h_sample.csv', 'h_population.csv')
)
print("Data loaded successfully!")

Data loaded successfully!


## Encoding h_sample with Random Ordering

In [3]:
# Instantiate the project's TextEncoder; the same instance is reused by the
# training cell further down.
encoder = TextEncoder()

# Encoding: run the whole of h_sample through the encoder once.
train_texts = encoder.encode_dataset(h_sample)

print("Encoding h_sample is completed.\n\n Test of 'Random Ordering' in first sample (profile):")

# Preview: show the first ten encoded texts, numbered from 1, each preceded
# by a separator line.
separator = "------------------------------"
for idx, text in enumerate(train_texts[:10], start=1):
    print(separator, f"Example{idx}: \n\"{text}\"", sep="\n")

Encoding h_sample is completed.

 Test of 'Random Ordering' in first sample (profile):
------------------------------
Example1: 
"Work days is 5 days, Home type is Apartment, Age group is [35,40), Major travel mode is Car, Number of household members is 4, Car ownership of household is Yes, Household monthly income level is 3M-5M KRW, Gender is Male, Education status is Not student, Kid in household is Yes, Work type is Simple labor, Driver license is Yes, Major departure time is Peak."
------------------------------
Example2: 
"Kid in household is No, Household monthly income level is 1M-3M KRW, Driver license is Yes, Age group is [25,30), Major departure time is Peak, Work days is 5 days, Home type is Multi-family, Education status is Not student, Major travel mode is Public Transportation, Car ownership of household is No, Gender is Female, Work type is Manager/Office, Number of household members is 2."
------------------------------
Example3: 
"Work days is Inoccupation/non-regular

## Build the LLM-Random model: fine-tuning GPT-2 (distilgpt2) with random ordering

In [4]:
from torch.utils.data import Dataset, DataLoader
import time
import torch
import json
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
import pandas as pd

class DynamicTextDataset(Dataset):
    """
    Dataset that encodes rows lazily, at access time.

    Each ``__getitem__`` call hands the requested row of the wrapped DataFrame
    to ``encoder.encode_row`` and returns whatever that call produces, so the
    encoding is redone on every access. The intent is that a different valid
    linear extension is applied each time; that depends on the encoder's
    ``encode_row`` implementation, which lives outside this notebook.

    Args:
        dataframe: pandas object indexed positionally via ``.iloc``.
        encoder: object exposing ``encode_row(row)``.
    """

    def __init__(self, dataframe, encoder):
        self.encoder = encoder
        self.dataframe = dataframe

    def __len__(self):
        # One item per row of the wrapped frame.
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        # Positional lookup, then encode on the fly (nothing is cached).
        return self.encoder.encode_row(self.dataframe.iloc[idx])

# Creating DataLoader (using DynamicTextDataset for dynamic encoding)
# DynamicTextDataset calls encoder.encode_row on every item access, so the
# texts are re-encoded each time the loader iterates over the data.
dynamic_dataset = DynamicTextDataset(h_sample, encoder)
batch_size = 8
dataloader = DataLoader(dynamic_dataset, batch_size=batch_size, shuffle=True)
num_batches = len(dataloader)

# -------------------------------------------
# Training
# -------------------------------------------
print("\nFine-tuning the LLM-Random model")
generator = RandomGenerator(model_name='distilgpt2')  # gpt2-medium, gpt2-large
learning_rate = 5e-5
epochs = 40
generator.init_optimizer(learning_rate=learning_rate)

# Everything collected here is dumped to JSON once training finishes.
training_metrics = {
    'epoch_losses': [],      # mean batch loss per epoch
    'epoch_times': [],       # wall-clock seconds per epoch
    'epoch_tokens': [],      # token positions fed to the model per epoch
    'gpu_memory_usage': [],  # MB allocated above the pre-training baseline (CUDA only)
    'batch_metrics': []      # per-epoch dicts holding the per-batch records
}

results_dir = Path('Training Metrics')
results_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

use_cuda = torch.cuda.is_available()
# max_memory_allocated() is a peak since process start (or the last reset).
# Reset it so that re-running this cell in the same kernel reports the peak of
# THIS run rather than a stale one. Guarded by is_initialized() so the reset is
# only attempted once a CUDA context exists.
if use_cuda and torch.cuda.is_initialized():
    torch.cuda.reset_peak_memory_stats()

start_time = time.time()
# Baseline is taken after the generator was created, so memory figures below
# are relative to whatever was already allocated at this point.
initial_memory = torch.cuda.memory_allocated() if use_cuda else 0

for epoch in range(epochs):
    # Hugging Face models come back from from_pretrained() in eval mode
    # (dropout disabled). Make sure the model is in training mode; this is a
    # no-op if RandomGenerator already did so.
    generator.model.train()

    epoch_start_time = time.time()
    epoch_metrics = {
        'loss': 0,
        'tokens': 0,
        'batches': []
    }

    progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}')
    for batch_idx, batch in enumerate(progress_bar):
        inputs = generator.tokenizer(
            batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        ).to(generator.device)

        # NOTE(review): the batch is padded and the padded input_ids are passed
        # unchanged as labels. If the model's loss does not ignore those
        # positions they contribute to the loss — confirm this is intended
        # (e.g. to teach end-of-text) before masking them with -100.
        outputs = generator.model(**inputs, labels=inputs['input_ids'])
        loss = outputs.loss.item()

        generator.optimizer.zero_grad()
        outputs.loss.backward()
        generator.optimizer.step()

        # numel() counts every position of the padded batch, padding included.
        batch_tokens = inputs['input_ids'].numel()
        epoch_metrics['batches'].append({
            'batch_idx': batch_idx,
            'loss': loss,
            'tokens': batch_tokens
        })
        epoch_metrics['tokens'] += batch_tokens
        epoch_metrics['loss'] += loss

        progress_bar.set_postfix({
            'batch': f'{batch_idx+1}/{num_batches}',
            'loss': f'{loss:.4f}',
            'tokens': batch_tokens
        })

    epoch_time = time.time() - epoch_start_time
    avg_loss = epoch_metrics['loss'] / num_batches

    # Query GPU memory once so the stored value and the printed value agree.
    epoch_gpu_mb = None
    if use_cuda:
        epoch_gpu_mb = (torch.cuda.memory_allocated() - initial_memory) / 1e6

    training_metrics['epoch_losses'].append(avg_loss)
    training_metrics['epoch_times'].append(epoch_time)
    training_metrics['epoch_tokens'].append(epoch_metrics['tokens'])
    if epoch_gpu_mb is not None:
        training_metrics['gpu_memory_usage'].append(epoch_gpu_mb)
    training_metrics['batch_metrics'].append(epoch_metrics)

    print(f"\nEpoch {epoch+1}/{epochs}")
    print(f"Average Loss: {avg_loss:.4f}")
    print(f"Epoch Time: {epoch_time:.2f}s")
    print(f"Tokens Processed: {epoch_metrics['tokens']:,}")
    if epoch_gpu_mb is not None:
        print(f"GPU Memory: {epoch_gpu_mb:.2f}MB")

    # Save model every 5 epochs
    if (epoch + 1) % 5 == 0:
        epoch_save_path = f"saved_models/LLM-Random_distilgpt2_epoch{epoch+1}"
        generator.save_model(epoch_save_path)
        # NOTE(review): the recorded output shows this message twice per
        # checkpoint, which suggests save_model prints it as well — confirm
        # and drop one of the two.
        print(f"Model saved to {epoch_save_path}")

training_metrics['total_time'] = time.time() - start_time
training_metrics['total_tokens'] = sum(training_metrics['epoch_tokens'])
training_metrics['peak_memory'] = (torch.cuda.max_memory_allocated() - initial_memory) / 1e6 if use_cuda else 0

print("\nTraining Complete!")
print(f"Total Training Time: {training_metrics['total_time']:.2f}s")
print(f"Total Tokens Processed: {training_metrics['total_tokens']:,}")
print(f"Peak GPU Memory Usage: {training_metrics['peak_memory']:.2f}MB")

# Persist the collected metrics next to a timestamp taken before training began.
metrics_file = results_dir / f'training_metrics_{timestamp}.json'
with open(metrics_file, 'w') as f:
    json.dump(training_metrics, f, indent=2)
print(f"\nTraining metrics saved to {metrics_file}")


Fine-tuning the LLM-Random model
Using device: cuda


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.00:00<?, ?it/s]
Epoch 1/40: 100%|████████████████████████| 6665/6665 [03:10<00:00, 35.03it/s, batch=6665/6665, loss=0.3264, tokens=294]



Epoch 1/40
Average Loss: 0.3634
Epoch Time: 190.25s
Tokens Processed: 5,123,734
GPU Memory: 1080.16MB


Epoch 2/40: 100%|████████████████████████| 6665/6665 [03:14<00:00, 34.33it/s, batch=6665/6665, loss=0.3600, tokens=276]



Epoch 2/40
Average Loss: 0.3411
Epoch Time: 194.12s
Tokens Processed: 5,123,444
GPU Memory: 1079.27MB


Epoch 3/40: 100%|████████████████████████| 6665/6665 [03:13<00:00, 34.48it/s, batch=6665/6665, loss=0.3438, tokens=279]



Epoch 3/40
Average Loss: 0.3388
Epoch Time: 193.29s
Tokens Processed: 5,120,927
GPU Memory: 1077.57MB


Epoch 4/40: 100%|████████████████████████| 6665/6665 [03:14<00:00, 34.21it/s, batch=6665/6665, loss=0.3254, tokens=291]



Epoch 4/40
Average Loss: 0.3374
Epoch Time: 194.83s
Tokens Processed: 5,122,915
GPU Memory: 1078.93MB


Epoch 5/40: 100%|████████████████████████| 6665/6665 [03:09<00:00, 35.20it/s, batch=6665/6665, loss=0.3004, tokens=291]



Epoch 5/40
Average Loss: 0.3366
Epoch Time: 189.34s
Tokens Processed: 5,124,451
GPU Memory: 1080.62MB
Model saved to saved_models/LLM-Random_distilgpt2_epoch5
Model saved to saved_models/LLM-Random_distilgpt2_epoch5


Epoch 6/40: 100%|████████████████████████| 6665/6665 [03:09<00:00, 35.18it/s, batch=6665/6665, loss=0.3287, tokens=282]



Epoch 6/40
Average Loss: 0.3360
Epoch Time: 189.47s
Tokens Processed: 5,123,458
GPU Memory: 1077.89MB


Epoch 7/40: 100%|████████████████████████| 6665/6665 [03:08<00:00, 35.41it/s, batch=6665/6665, loss=0.3597, tokens=288]



Epoch 7/40
Average Loss: 0.3357
Epoch Time: 188.21s
Tokens Processed: 5,122,512
GPU Memory: 1080.69MB


Epoch 8/40: 100%|████████████████████████| 6665/6665 [03:08<00:00, 35.32it/s, batch=6665/6665, loss=0.3166, tokens=291]



Epoch 8/40
Average Loss: 0.3352
Epoch Time: 188.70s
Tokens Processed: 5,124,371
GPU Memory: 1081.99MB


Epoch 9/40: 100%|████████████████████████| 6665/6665 [03:08<00:00, 35.40it/s, batch=6665/6665, loss=0.3342, tokens=288]



Epoch 9/40
Average Loss: 0.3351
Epoch Time: 188.27s
Tokens Processed: 5,122,552
GPU Memory: 1081.13MB


Epoch 10/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.30it/s, batch=6665/6665, loss=0.3301, tokens=279]



Epoch 10/40
Average Loss: 0.3349
Epoch Time: 188.84s
Tokens Processed: 5,123,151
GPU Memory: 1078.32MB
Model saved to saved_models/LLM-Random_distilgpt2_epoch10
Model saved to saved_models/LLM-Random_distilgpt2_epoch10


Epoch 11/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.38it/s, batch=6665/6665, loss=0.3338, tokens=291]



Epoch 11/40
Average Loss: 0.3347
Epoch Time: 188.37s
Tokens Processed: 5,123,587
GPU Memory: 1082.04MB


Epoch 12/40: 100%|███████████████████████| 6665/6665 [03:09<00:00, 35.14it/s, batch=6665/6665, loss=0.3512, tokens=282]



Epoch 12/40
Average Loss: 0.3344
Epoch Time: 189.65s
Tokens Processed: 5,124,482
GPU Memory: 1077.52MB


Epoch 13/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.32it/s, batch=6665/6665, loss=0.3283, tokens=297]



Epoch 13/40
Average Loss: 0.3343
Epoch Time: 188.74s
Tokens Processed: 5,123,497
GPU Memory: 1082.50MB


Epoch 14/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.37it/s, batch=6665/6665, loss=0.3294, tokens=282]



Epoch 14/40
Average Loss: 0.3341
Epoch Time: 188.42s
Tokens Processed: 5,124,578
GPU Memory: 1081.56MB


Epoch 15/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.28it/s, batch=6665/6665, loss=0.3431, tokens=276]



Epoch 15/40
Average Loss: 0.3342
Epoch Time: 188.90s
Tokens Processed: 5,122,036
GPU Memory: 1075.29MB
Model saved to saved_models/LLM-Random_distilgpt2_epoch15
Model saved to saved_models/LLM-Random_distilgpt2_epoch15


Epoch 16/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.42it/s, batch=6665/6665, loss=0.3203, tokens=291]



Epoch 16/40
Average Loss: 0.3339
Epoch Time: 188.15s
Tokens Processed: 5,124,035
GPU Memory: 1081.00MB


Epoch 17/40: 100%|███████████████████████| 6665/6665 [03:09<00:00, 35.18it/s, batch=6665/6665, loss=0.3455, tokens=285]



Epoch 17/40
Average Loss: 0.3339
Epoch Time: 189.48s
Tokens Processed: 5,122,837
GPU Memory: 1081.09MB


Epoch 18/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.43it/s, batch=6665/6665, loss=0.4044, tokens=267]



Epoch 18/40
Average Loss: 0.3338
Epoch Time: 188.15s
Tokens Processed: 5,122,971
GPU Memory: 1075.68MB


Epoch 19/40: 100%|███████████████████████| 6665/6665 [03:09<00:00, 35.26it/s, batch=6665/6665, loss=0.3267, tokens=291]



Epoch 19/40
Average Loss: 0.3335
Epoch Time: 189.04s
Tokens Processed: 5,125,011
GPU Memory: 1080.68MB


Epoch 20/40: 100%|███████████████████████| 6665/6665 [03:09<00:00, 35.19it/s, batch=6665/6665, loss=0.3499, tokens=279]



Epoch 20/40
Average Loss: 0.3336
Epoch Time: 189.43s
Tokens Processed: 5,123,255
GPU Memory: 1078.76MB
Model saved to saved_models/LLM-Random_distilgpt2_epoch20
Model saved to saved_models/LLM-Random_distilgpt2_epoch20


Epoch 21/40: 100%|███████████████████████| 6665/6665 [03:09<00:00, 35.20it/s, batch=6665/6665, loss=0.3287, tokens=285]



Epoch 21/40
Average Loss: 0.3335
Epoch Time: 189.36s
Tokens Processed: 5,124,629
GPU Memory: 1077.97MB


Epoch 22/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.39it/s, batch=6665/6665, loss=0.3198, tokens=297]



Epoch 22/40
Average Loss: 0.3333
Epoch Time: 188.34s
Tokens Processed: 5,125,433
GPU Memory: 1082.86MB


Epoch 23/40: 100%|███████████████████████| 6665/6665 [03:09<00:00, 35.12it/s, batch=6665/6665, loss=0.3180, tokens=288]



Epoch 23/40
Average Loss: 0.3333
Epoch Time: 189.79s
Tokens Processed: 5,123,984
GPU Memory: 1079.33MB


Epoch 24/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.34it/s, batch=6665/6665, loss=0.3350, tokens=294]



Epoch 24/40
Average Loss: 0.3334
Epoch Time: 188.59s
Tokens Processed: 5,122,318
GPU Memory: 1082.41MB


Epoch 25/40: 100%|███████████████████████| 6665/6665 [03:10<00:00, 34.96it/s, batch=6665/6665, loss=0.3224, tokens=282]



Epoch 25/40
Average Loss: 0.3333
Epoch Time: 190.67s
Tokens Processed: 5,123,250
GPU Memory: 1078.51MB
Model saved to saved_models/LLM-Random_distilgpt2_epoch25
Model saved to saved_models/LLM-Random_distilgpt2_epoch25


Epoch 26/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.62it/s, batch=6665/6665, loss=0.3160, tokens=282]



Epoch 26/40
Average Loss: 0.3332
Epoch Time: 187.11s
Tokens Processed: 5,122,986
GPU Memory: 1077.94MB


Epoch 27/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.43it/s, batch=6665/6665, loss=0.3248, tokens=282]



Epoch 27/40
Average Loss: 0.3331
Epoch Time: 188.15s
Tokens Processed: 5,124,210
GPU Memory: 1076.70MB


Epoch 28/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.60it/s, batch=6665/6665, loss=0.3655, tokens=279]



Epoch 28/40
Average Loss: 0.3331
Epoch Time: 187.23s
Tokens Processed: 5,124,431
GPU Memory: 1080.52MB


Epoch 29/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.49it/s, batch=6665/6665, loss=0.3831, tokens=273]



Epoch 29/40
Average Loss: 0.3332
Epoch Time: 187.80s
Tokens Processed: 5,122,265
GPU Memory: 1076.13MB


Epoch 30/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.49it/s, batch=6665/6665, loss=0.3360, tokens=291]



Epoch 30/40
Average Loss: 0.3330
Epoch Time: 187.81s
Tokens Processed: 5,122,819
GPU Memory: 1082.67MB
Model saved to saved_models/LLM-Random_distilgpt2_epoch30
Model saved to saved_models/LLM-Random_distilgpt2_epoch30


Epoch 31/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.47it/s, batch=6665/6665, loss=0.3465, tokens=264]



Epoch 31/40
Average Loss: 0.3329
Epoch Time: 187.91s
Tokens Processed: 5,123,104
GPU Memory: 1074.60MB


Epoch 32/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.63it/s, batch=6665/6665, loss=0.3214, tokens=291]



Epoch 32/40
Average Loss: 0.3328
Epoch Time: 187.05s
Tokens Processed: 5,124,243
GPU Memory: 1082.50MB


Epoch 33/40: 100%|███████████████████████| 6665/6665 [03:06<00:00, 35.69it/s, batch=6665/6665, loss=0.3157, tokens=291]



Epoch 33/40
Average Loss: 0.3328
Epoch Time: 186.76s
Tokens Processed: 5,123,955
GPU Memory: 1080.48MB


Epoch 34/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.61it/s, batch=6665/6665, loss=0.3983, tokens=273]



Epoch 34/40
Average Loss: 0.3327
Epoch Time: 187.19s
Tokens Processed: 5,124,393
GPU Memory: 1077.77MB


Epoch 35/40: 100%|███████████████████████| 6665/6665 [03:05<00:00, 35.86it/s, batch=6665/6665, loss=0.3210, tokens=294]



Epoch 35/40
Average Loss: 0.3326
Epoch Time: 185.88s
Tokens Processed: 5,124,886
GPU Memory: 1081.36MB
Model saved to saved_models/LLM-Random_distilgpt2_epoch35
Model saved to saved_models/LLM-Random_distilgpt2_epoch35


Epoch 36/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.47it/s, batch=6665/6665, loss=0.3668, tokens=279]



Epoch 36/40
Average Loss: 0.3327
Epoch Time: 187.90s
Tokens Processed: 5,123,935
GPU Memory: 1081.04MB


Epoch 37/40: 100%|███████████████████████| 6665/6665 [03:07<00:00, 35.57it/s, batch=6665/6665, loss=0.3309, tokens=288]



Epoch 37/40
Average Loss: 0.3327
Epoch Time: 187.37s
Tokens Processed: 5,123,432
GPU Memory: 1080.70MB


Epoch 38/40: 100%|███████████████████████| 6665/6665 [03:08<00:00, 35.36it/s, batch=6665/6665, loss=0.3353, tokens=282]



Epoch 38/40
Average Loss: 0.3326
Epoch Time: 188.47s
Tokens Processed: 5,123,410
GPU Memory: 1080.18MB


Epoch 39/40: 100%|███████████████████████| 6665/6665 [03:04<00:00, 36.05it/s, batch=6665/6665, loss=0.3418, tokens=282]



Epoch 39/40
Average Loss: 0.3325
Epoch Time: 184.91s
Tokens Processed: 5,123,962
GPU Memory: 1077.29MB


Epoch 40/40: 100%|███████████████████████| 6665/6665 [03:09<00:00, 35.25it/s, batch=6665/6665, loss=0.3305, tokens=282]



Epoch 40/40
Average Loss: 0.3325
Epoch Time: 189.07s
Tokens Processed: 5,122,466
GPU Memory: 1078.28MB
Model saved to saved_models/LLM-Random_distilgpt2_epoch40
Model saved to saved_models/LLM-Random_distilgpt2_epoch40

Training Complete!
Total Training Time: 7554.89s
Total Tokens Processed: 204,941,915
Peak GPU Memory Usage: 2153.47MB

Training metrics saved to Training Metrics\training_metrics_20250505_163534.json
