In [1]:
import wandb
wandb.login()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrbchwang[0m ([33mmrbchwang-hanyang-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
import os
import math
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from datasets import load_dataset #HUGGINGFACE
from transformers import GPT2Tokenizer, AutoTokenizer
from tqdm import tqdm
from typing import Dict, List, Tuple
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# config setting

In [4]:
CONFIG = {
    # 모델 아키텍처 (경량화)
    'vocab_size': 50257,
    'max_seq_length': 512,
    'embedding_dim': 384,      # 작은 임베딩 차원
    'num_heads': 8,            # 8개 헤드
    'num_layers': 6,           # 6개 레이어 (가벼움)
    'ff_dim': 1536,            # 4x embedding_dim
    'hidden_dropout': 0.1,
    'attention_dropout': 0.1,
    
    # 학습 설정
    'batch_size': 32,
    'gradient_accumulation_steps': 1,
    'num_epochs': 10,
    'learning_rate': 7e-4,
    'weight_decay': 0.01,
    'warmup_steps': 500,
    'max_grad_norm': 1.0,
    
    # 데이터
    'dataset_name': 'wikitext',
    'dataset_config': 'wikitext-103-v1',
    'train_split': 'train',
    'val_split': 'validation',
    
    # 기술
    'use_amp': True,           # Mixed Precision Training
    'use_flash_attention': False,  # Ampere 아키텍처 미지원
    'use_gradient_checkpointing': True,
    
    # 체크포인트
    'save_steps': 500,
    'eval_steps': 500,
    'save_total_limit': 3,
}

In [7]:
def init_wandb(config: Dict):
    wandb.init(
        project='llm-rtx3060ti',
        name='GPT-RoPE-wikitext103',
        config=config,
    )
    return wandb.run

In [9]:
class MultiHeadAttention(nn.Module):
    "Multi-Head Attention"

    def __init__(self, embedding_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__()
        assert embedding_dim % num_heads == 0

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        self.query = nn.Linear(embedding_dim, embedding_dim, bias=True)
        self.key = nn.Linear(embedding_dim, embedding_dim, bias=True)
        self.value = nn.Linear(embedding_dim, embedding_dim, bias=True)
        self.output = nn.Linear(embedding_dim, embedding_dim, bias=True)

        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.head_dim) #d_k

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        batch_size, seq_length, _ = x.shape

        Q = self.query(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = torch.softmax(scores, dim= -1)
        attention_weights = self.dropout(attention_weights)

        context = torch.matmul(attention_weights, V) #QK/d_k * V 
        context = context.transpose(1,2).contiguous()
        context = context.view(batch_size, seq_length, self.embedding_dim)

        output = self.output(context)

        return output

In [11]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dim: int, ff_dim:int, dropout: float=0.1):
        super().__init__()
        self.linear1 = nn.Linear(embedding_dim, ff_dim)
        self.linear2 = nn.Linear(ff_dim, embedding_dim)
        self.activation = nn.GELU() #RELU or GELU
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.linear2(x)

        return x 


In [13]:
import math 

def build_rope_cache(seq_len: int, head_dim: int, device: torch.device):
    """ RoPE
        cos, sin
        generating cache"""
    theta = 1.0 / (10000 ** (torch.arange(0, head_dim,2, device=device).float()/ head_dim))
    seq_idx = torch.arange(seq_len, device=device).float()
    freqs = torch.einsum("i,j->ij", seq_idx, theta)

    cos = freqs.cos()
    sin = freqs.sin()

    cos = torch.stack([cos,cos], dim=-1).reshape(seq_len, -1)
    sin = torch.stack([sin,sin], dim=-1).reshape(seq_len, -1)
    return cos,sin

def apply_rope(x: torch.Tensor, cos:torch.Tensor, sin:torch.Tensor):
    cos = cos[None, None, :, :]
    sin = sin[None, None, :, :]

    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    x_rot = torch.stack([-x2,x1],dim= -1).reshape_as(x)

    return x*cos + x_rot*sin

In [15]:
class MultiHeadAttention(nn.Module):
    "Multi-Head Attention"

    def __init__(self, embedding_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__()
        assert embedding_dim % num_heads == 0

        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        self.query = nn.Linear(embedding_dim, embedding_dim, bias=True)
        self.key = nn.Linear(embedding_dim, embedding_dim, bias=True)
        self.value = nn.Linear(embedding_dim, embedding_dim, bias=True)
        self.output = nn.Linear(embedding_dim, embedding_dim, bias=True)

        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.head_dim) #d_k

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        batch_size, seq_length, _ = x.shape

        Q = self.query(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        #apply RoPE
        device = x.device
        cos,sin = build_rope_cache(seq_length, self.head_dim, device)
        Q = apply_rope(Q,cos, sin)
        K = apply_rope(K, cos, sin)
        
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = torch.softmax(scores, dim= -1)
        attention_weights = self.dropout(attention_weights)

        context = torch.matmul(attention_weights, V) #QK/d_k * V 
        context = context.transpose(1,2).contiguous()
        context = context.view(batch_size, seq_length, self.embedding_dim)

        output = self.output(context)

        return output

In [17]:
class TransformerBlock(nn.Module):
    def __init__(self, embedding_dim: int, num_heads: int, ff_dim: int, dropout: float=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(embedding_dim, num_heads, dropout)
        self.feed_forward = FeedForward(embedding_dim, ff_dim, dropout)

        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x:torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        attention_output = self.attention(self.ln1(x), mask)
        x = x + self.dropout(attention_output)

        ff_output = self.feed_forward(self.ln2(x))
        x = x + self.dropout(ff_output)

        return x 

In [19]:
class GPTModel(nn.Module):
    def __init__(self, config:Dict):
        super().__init__()

        self.vocab_size = config['vocab_size']
        self.embedding_dim = config['embedding_dim']
        self.max_seq_length = config['max_seq_length']

        self.token_embedding = nn.Embedding(config['vocab_size'], config['embedding_dim'])
        
        self.embedding_dropout = nn.Dropout(config['hidden_dropout'])

        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(
                config['embedding_dim'],
                config['num_heads'],
                config['ff_dim'],
                config['hidden_dropout']
            )
            for _ in range(config['num_layers'])
        ])

        self.ln_final = nn.LayerNorm(config['embedding_dim'])
        self.lm_head = nn.Linear(config['embedding_dim'], config['vocab_size'],bias=False)

        #weight tying (sharing parameter)
        self.lm_head.weight = self.token_embedding.weight 

        self._init_weights()

    def _init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
                
    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None) -> torch.Tensor:
        batch_size, seq_length = input_ids.shape

        token_embeds = self.token_embedding(input_ids)
        x = token_embeds
        x = self.embedding_dropout(x)

        causal_mask = torch.tril(torch.ones(seq_length, seq_length, device=input_ids.device))

        for block in self.transformer_blocks:
            x = block(x, causal_mask)

        x = self.ln_final(x)
        logits = self.lm_head(x)

        return logits

    def generate(self, input_ids: torch.Tensor, max_new_tokens: int = 100, 
                 temperature: float = 0.8, top_k: int = 50,
                eos_token_id: int = 50256, repetition_penalty: float = 1.1)->torch.Tensor:

        device = input_ids.device

        for step in range(max_new_tokens):
            input_ids_cond = input_ids[:, -self.max_seq_length:]

            with torch.no_grad():
                logits = self(input_ids_cond)[:,-1,:]

                for i, token_id in enumerate(input_ids[0, -50:]):
                    logits[0,token_id] /= repetition_penalty ** (1.0/(i+1))

                logits /= temperature
    
                if top_k > 0:
                    v, _ = torch.topk(logits, top_k)
                    threshold = v[:,[-1]]
                    logits[logits < threshold] = float('-inf')
    
                probs = torch.softmax(logits, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1)
    
            input_ids = torch.cat([input_ids, next_token_id], dim=1)

            if next_token_id.item() == eos_token_id:
                break

        return input_ids
        

In [25]:
x = torch.randint(0, CONFIG['vocab_size'], (2, 16))  # (batch, seq)
model = GPTModel(CONFIG)
logits = model(x)  # 에러 없이 (2,16,vocab) 나오면 OK

print(logits.shape)

torch.Size([2, 16, 50257])


In [27]:
class TextDataset(Dataset):
    def __init__(self,tokenized_data, block_size: int):
        self.examples = []
        self.block_size = block_size

        all_tokens = []
        for example in tokenized_data['input_ids']:
            all_tokens.extend(example)

        # block_size window sliding
        ## divide all token sequence with block size
        ### result : fixed size chunk
        for i in range(0, len(all_tokens)-block_size, block_size):
            self.examples.append(all_tokens[i:i+block_size])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        tokens = self.examples[idx]
        x = torch.tensor(tokens[:-1], dtype=torch.long)
        y = torch.tensor(tokens[1:], dtype=torch.long)
        return x, y

def load_and_tokenize_dataset(config: Dict, max_examples: int = None):
    dataset = load_dataset(
        config['dataset_name'],
        config['dataset_config'],
        split=config['train_split']
    )

    if max_examples:
        dataset = dataset.select(range(min(max_examples, len(dataset))))

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=config['max_seq_length'],
            padding = 'max_length',
            return_tensors=None,
        )
    
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['text'],
        num_proc=4,
    )

    return tokenized_dataset, tokenizer

In [35]:
class Trainer:
    
    def __init__(self, model: nn.Module, config: Dict, device: torch.device):
        self.model = model
        self.config = config
        self.device = device
        
        # Optimizer
        self.optimizer = AdamW(
            model.parameters(),
            lr=config['learning_rate'],
            weight_decay=config['weight_decay']
        )
        
        # Scheduler
        self.scheduler = CosineAnnealingLR(
            self.optimizer,
            T_max=config['num_epochs']
        )
        
        # Loss function
        self.loss_fn = nn.CrossEntropyLoss()
        
        # Gradient scaler (AMP용)
        if config['use_amp']:
            self.scaler = torch.cuda.amp.GradScaler()
        else:
            self.scaler = None
        
        self.global_step = 0
        self.best_loss = float('inf')
    
    def train_epoch(self, train_loader: DataLoader, epoch: int) -> Dict:

        self.model.train()
        total_loss = 0.0
        num_batches = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{self.config['num_epochs']}")
        
        for batch_idx, (input_ids, labels) in enumerate(pbar):
            input_ids = input_ids.to(self.device)
            labels = labels.to(self.device)

            
            # Forward pass with AMP
            if self.config['use_amp']:
                with torch.cuda.amp.autocast():
                    logits = self.model(input_ids)
                    loss = self.loss_fn(logits.reshape(-1, self.config['vocab_size']),
                                      labels.reshape(-1))
                
                # Backward with scaling
                self.scaler.scale(loss).backward()
                
                if (batch_idx + 1) % self.config['gradient_accumulation_steps'] == 0:
                    self.scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                  self.config['max_grad_norm'])
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                    self.optimizer.zero_grad()
                    self.global_step += 1
            else:
                logits = self.model(input_ids)
                loss = self.loss_fn(logits.reshape(-1, self.config['vocab_size']),
                                  labels.reshape(-1))
                
                loss.backward()
                
                if (batch_idx + 1) % self.config['gradient_accumulation_steps'] == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                  self.config['max_grad_norm'])
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    self.global_step += 1
            
            total_loss += loss.item()
            num_batches += 1
            
            # 진행상황 업데이트
            pbar.set_postfix({'loss': f"{loss.item():.4f}"})
            
            # W&B 로깅
            if self.global_step % 100 == 0:
                wandb.log({
                    'train/loss': loss.item(),
                    'train/learning_rate': self.optimizer.param_groups[0]['lr'],
                    'train/epoch': epoch,
                    'global_step': self.global_step,
                })
        
        avg_loss = total_loss / num_batches
        self.scheduler.step()
        
        return {'loss': avg_loss}
    
    @torch.no_grad()
    def evaluate(self, val_loader: DataLoader, epoch: int) -> Dict:
        
        self.model.eval()
        total_loss = 0.0
        num_batches = 0
        
        for input_ids, labels in tqdm(val_loader, desc="Evaluating"):
            input_ids = input_ids.to(self.device)
            labels = labels.to(self.device)
            
            logits = self.model(input_ids)
            loss = self.loss_fn(logits.reshape(-1, self.config['vocab_size']),
                              labels.reshape(-1))
            
            total_loss += loss.item()
            num_batches += 1
        
        avg_loss = total_loss / num_batches
        perplexity = math.exp(avg_loss)
        
        # W&B 로깅
        wandb.log({
            'val/loss': avg_loss,
            'val/perplexity': perplexity,
            'epoch': epoch,
        })
        
        return {'loss': avg_loss, 'perplexity': perplexity}
    
    def save_checkpoint(self, path: str):
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'global_step': self.global_step,
        }, path)
        wandb.save(path)
        print(f"✓ Checkpoint saved: {path}")
    
    def generate_samples(self, tokenizer, num_samples: int = 3) -> List[str]:
        self.model.eval()
        samples = []
        
        prompts = [
            "The future of artificial intelligence",
            "In the beginning,",
            "Machine learning is",
        ]
        
        with torch.no_grad():
            for prompt in prompts[:num_samples]:
                input_ids = torch.tensor(
                    tokenizer.encode(prompt),
                    dtype=torch.long
                ).unsqueeze(0).to(self.device)
                
                output_ids = self.model.generate(
                    input_ids,
                    max_new_tokens=50,
                    temperature=0.8,
                    top_k=50
                )
                
                generated_text = tokenizer.decode(output_ids[0])
                samples.append(generated_text)
        
        return samples    

In [37]:
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    #wandb.login()
    init_wandb(CONFIG)

    tokenized_data, tokenizer = load_and_tokenize_dataset(CONFIG, max_examples=10000)
    dataset = TextDataset(tokenized_data, CONFIG['max_seq_length'])

    train_size = int(0.95*len(dataset))
    val_size = len(dataset)-train_size
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size]
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=True,
        num_workers =0,
        pin_memory = True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        num_workers =0,
        pin_memory = True,
    )

    model = GPTModel(CONFIG).to(device)

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f" Model - Total params: {total_params/1e6:.2f}M, Trainable: {trainable_params/1e6:.2f}M")

    wandb.watch(model, log_freq=100)

    trainer = Trainer(model,CONFIG, device)

    print("Training Start")

    for epoch in range(CONFIG['num_epochs']):
        train_results = trainer.train_epoch(train_loader, epoch)
        print(f"Epoch {epoch+1} - Train Loss: {train_results['loss']:.4f}")

        val_results = trainer.evaluate(val_loader, epoch)
        print(f"Epoch {epoch+1} - Val Loss: {val_results['loss']:.4f}, Perplexity: {val_results['perplexity']:.2f}")

        samples = trainer.generate_samples(tokenizer, num_samples=3)
        for i, sample in enumerate(samples):
            wandb.log({f'sample_{i}': wandb.Html(f"<p>{sample}<p>")})

        if val_results['loss'] < trainer.best_loss:
            trainer.best_loss = val_results['loss']
            trainer.save_checkpoint(f"checkpoint_epoch_{epoch+1}_best.pt")

        print()

    trainer.save_checkpoint("checkpoint_final.pt")

    print("Training completed")

    wandb.finish()


In [39]:
if __name__ == "__main__":
    main()

 Model - Total params: 29.95M, Trainable: 29.95M
Training Start


Epoch 1/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:43<00:00,  2.57s/it, loss=0.8445]


Epoch 1 - Train Loss: 1.0543


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 1 - Val Loss: 0.8043, Perplexity: 2.24




✓ Checkpoint saved: checkpoint_epoch_1_best.pt



Epoch 2/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:38<00:00,  2.55s/it, loss=0.2645]


Epoch 2 - Train Loss: 0.7487


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 2 - Val Loss: 0.7326, Perplexity: 2.08




✓ Checkpoint saved: checkpoint_epoch_2_best.pt



Epoch 3/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:38<00:00,  2.56s/it, loss=0.7531]


Epoch 3 - Train Loss: 0.6797


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 3 - Val Loss: 0.6932, Perplexity: 2.00




✓ Checkpoint saved: checkpoint_epoch_3_best.pt



Epoch 4/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:39<00:00,  2.56s/it, loss=0.6083]


Epoch 4 - Train Loss: 0.6269


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 4 - Val Loss: 0.6695, Perplexity: 1.95




✓ Checkpoint saved: checkpoint_epoch_4_best.pt



Epoch 5/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:39<00:00,  2.56s/it, loss=0.4883]


Epoch 5 - Train Loss: 0.5820


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 5 - Val Loss: 0.6532, Perplexity: 1.92




✓ Checkpoint saved: checkpoint_epoch_5_best.pt



Epoch 6/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:38<00:00,  2.56s/it, loss=0.5008]


Epoch 6 - Train Loss: 0.5426


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 6 - Val Loss: 0.6429, Perplexity: 1.90




✓ Checkpoint saved: checkpoint_epoch_6_best.pt



Epoch 7/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:38<00:00,  2.56s/it, loss=0.4254]


Epoch 7 - Train Loss: 0.5085


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 7 - Val Loss: 0.6390, Perplexity: 1.89




✓ Checkpoint saved: checkpoint_epoch_7_best.pt



Epoch 8/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:38<00:00,  2.56s/it, loss=0.3515]


Epoch 8 - Train Loss: 0.4812


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 8 - Val Loss: 0.6362, Perplexity: 1.89




✓ Checkpoint saved: checkpoint_epoch_8_best.pt



Epoch 9/10: 100%|███████████████████████████████████████████████████████| 297/297 [12:39<00:00,  2.56s/it, loss=0.4582]


Epoch 9 - Train Loss: 0.4617


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 9 - Val Loss: 0.6363, Perplexity: 1.89



Epoch 10/10: 100%|██████████████████████████████████████████████████████| 297/297 [12:39<00:00,  2.56s/it, loss=0.4576]


Epoch 10 - Train Loss: 0.4514


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.04it/s]


Epoch 10 - Val Loss: 0.6362, Perplexity: 1.89



[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


✓ Checkpoint saved: checkpoint_final.pt
Training completed


0,1
epoch,▁▂▃▃▄▅▆▆▇█
global_step,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇██
train/epoch,▁▁▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇███
train/learning_rate,█████▇▇▇▇▇▇▆▆▆▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▅▅█▅▅▄▃▃▅▄▄▇▇▄▂▃▄▄▃▁▁▄▃▄▁▃▃▄
val/loss,█▅▃▂▂▁▁▁▁▁
val/perplexity,█▅▃▂▂▁▁▁▁▁

0,1
epoch,9.0
global_step,2900.0
train/epoch,9.0
train/learning_rate,2e-05
train/loss,0.59225
val/loss,0.63619
val/perplexity,1.88926


In [40]:
import torch

def load_model_and_tokenizer(checkpoint_path: str, config: Dict):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 토크나이저 (학습 때와 동일)
    _, tokenizer = load_and_tokenize_dataset(config, max_examples=10)  # 토크나이징은 안 써도 되니 소량만
    tokenizer.pad_token = tokenizer.eos_token

    # 모델 생성 후 체크포인트 로드
    model = GPTModel(config)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    return model, tokenizer, device


def generate_from_prompts(model, tokenizer, device, prompts, 
                          max_new_tokens: int = 50, 
                          temperature: float = 0.8, 
                          top_k: int = 50):
    results = []
    with torch.no_grad():
        for prompt in prompts:
            input_ids = torch.tensor(
                tokenizer.encode(prompt),
                dtype=torch.long
            ).unsqueeze(0).to(device)

            output_ids = model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_k=top_k,
            )

            text = tokenizer.decode(output_ids[0])
            results.append((prompt, text))
    return results

# 1) 사용할 체크포인트 경로 선택
ckpt_path = "checkpoint_final.pt"        

# 2) 모델 + 토크나이저 로드
model, tokenizer, device = load_model_and_tokenizer(ckpt_path, CONFIG)

# 3) 프롬프트 정의
prompts = [
    "The future of artificial intelligence",
    "In the beginning,",
    "Machine learning is",
]

# 4) 문장 생성
samples = generate_from_prompts(model, tokenizer, device, prompts,
                                max_new_tokens=50,
                                temperature=0.8,
                                top_k=50)

# 5) 출력
for i, (prompt, text) in enumerate(samples):
    print(f"=== Sample {i} ===")
    print(f"[Prompt] {prompt}")
    print(f"[Generated] {text}")
    print()


=== Sample 0 ===
[Prompt] The future of artificial intelligence
[Generated] The future of artificial intelligence , the Egyptians called the gods who were to be a tradition of deities and was not known as the gods 's ba . The gods were the deities of their respective deities in other deities , and the gods were said to increase the gods . The gods

=== Sample 1 ===
[Prompt] In the beginning,
[Generated] In the beginning, Baltimore Pike was later ordered to a new ship officer , but no effect of the rear ship attacked and replaced the remainder of the line . The ship was appointed as a major ship and was a well @-@ up area at Mahé 's junction

=== Sample 2 ===
[Prompt] Machine learning is
[Generated] Machine learning is a very useful , but in a cell , are said to be a mildly image of a person 's desire , but there is a well @-@ known woman . Although the male is usually influenced by its bones , the wolf is revealed that



In [41]:
prompts = [
    "South Korea is",
    "In the beginning,",
    "Germany is",
]

# 4) 문장 생성
samples = generate_from_prompts(model, tokenizer, device, prompts,
                                max_new_tokens=50,
                                temperature=0.8,
                                top_k=50)

# 5) 출력
for i, (prompt, text) in enumerate(samples):
    print(f"=== Sample {i} ===")
    print(f"[Prompt] {prompt}")
    print(f"[Generated] {text}")
    print()


=== Sample 0 ===
[Prompt] South Korea is
[Generated] South Korea is a strong tourist occurrence of nuclear airfields , but the <unk> ( April 26 , 1972 ) has been built in the region . The Romanian Land Forces created the National Historic Land Forces in the National Register of Historic Places . 
<|endoftext|>

=== Sample 1 ===
[Prompt] In the beginning,
[Generated] In the beginning, Zealand , the Germans was used as a brief armed vessel in the region . 
<|endoftext|>

=== Sample 2 ===
[Prompt] Germany is
[Generated] Germany is a short @-@ down by the NS @-@ 10 . The song has a low @-@ long @-@ hand , and the Sun @-@ off hand , and his second hand , and the color of the eyes , and

