In [33]:
!pip install sentencepiece datasets
!pip install wandb




In [38]:
import torch
from torch import nn
import numpy as np
import tensorflow as tf
from typing import Optional
import sentencepiece as spm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from random import shuffle
import math
import wandb
from tqdm import tqdm
import torch.nn.functional as F

In [31]:
# Authenticate with Weights & Biases before any run is started.
# As the output below shows, this prompts for an API key when none is stored.
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mguhan-karthik25[0m ([33mguhan-karthik25-kumaraguru-college-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
def generate_causal_mask(seq_len: int, device: Optional[torch.device] = None):
    """
    Build an additive causal attention mask of shape (seq_len, seq_len).

    Entry (i, j) is 0.0 when key position j is allowed for query position i
    (j <= i) and -1e9 when j lies in the future (j > i), so the result can be
    added directly to attention logits before the softmax.

    Args:
        seq_len: Number of positions T.
        device: Device to allocate the mask on. ``None`` uses PyTorch's
            default device.

    Returns:
        A float32 tensor of shape (seq_len, seq_len).
    """
    # Allocate on the target device straight away instead of building the mask
    # on the default device and copying it over on every call.
    blocked = torch.full((seq_len, seq_len), -1e9, dtype=torch.float32, device=device)
    # Keeping only the strictly-upper triangle zeroes the diagonal and everything
    # below it, i.e. the positions that are allowed to be attended to.
    return torch.triu(blocked, diagonal=1)

In [7]:
class MultiHeadAttention(nn.Module):
    """
    Multi-head attention with rotary position embeddings on queries and keys.

    Two entry points share ``out_proj``:
      * ``forward`` - self-attention through the fused ``qkv`` projection.
      * ``forward_cross_attention`` - attention over separately supplied
        query / key / value inputs through ``q_linear`` / ``k_linear`` /
        ``v_linear``.

    Every instance owns both sets of projections, whichever entry point is used.
    """

    def __init__(self, input_dim, d_model, num_heads, max_len=2048):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # Self-attention: a single linear layer produces q, k and v together.
        self.qkv = nn.Linear(input_dim, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        # Cross-attention: one projection per role.
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        # Rotary embedding table sized for up to max_len positions per head.
        self.rope = RotaryEmbedding(self.head_dim, max_len=max_len)

    def _to_heads(self, projected, batch, length):
        """Reshape (batch, length, H * Dh) into (batch, H, length, Dh)."""
        return projected.view(batch, length, self.num_heads, self.head_dim).transpose(1, 2)

    def _apply_rope(self, q, k):
        """Rotate q and k, each of shape (B, H, T, Dh), by its own positions."""
        rotated_q = self.rope(q)
        rotated_k = self.rope(k)
        return rotated_q, rotated_k

    def _attention(self, q, k, v, mask=None):
        """
        Scaled dot-product attention.

        ``mask`` is additive; a 2-D mask is given two leading singleton
        dimensions so it broadcasts over batch and heads.
        """
        logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            additive = mask[None, None] if mask.dim() == 2 else mask
            logits = logits + additive.to(logits.device)
        return torch.matmul(logits.softmax(dim=-1), v)

    def forward(self, x, mask=None):
        """Self-attention over ``x`` of shape (B, T, input_dim); returns (B, T, d_model)."""
        batch, seq_len, _ = x.shape

        # (B, T, 3D) -> (B, H, T, 3 * Dh); each head's slice holds its q, k, v back to back.
        fused = self.qkv(x).view(batch, seq_len, self.num_heads, 3 * self.head_dim).transpose(1, 2)
        q, k, v = fused.split(self.head_dim, dim=-1)

        q, k = self._apply_rope(q, k)

        context = self._attention(q, k, v, mask)
        merged = context.transpose(1, 2).reshape(batch, seq_len, -1)
        return self.out_proj(merged)

    def forward_cross_attention(self, q_in, k_in, v_in, mask=None):
        """
        Attention with queries from ``q_in`` and keys / values from ``k_in`` / ``v_in``.

        The batch size is read from ``q_in``; the key length read from ``k_in``
        is also used to reshape the values. RoPE is applied to q and k, each
        indexed from position 0 of its own sequence.
        """
        batch, q_len, _ = q_in.shape
        _, kv_len, _ = k_in.shape

        q = self._to_heads(self.q_linear(q_in), batch, q_len)
        k = self._to_heads(self.k_linear(k_in), batch, kv_len)
        v = self._to_heads(self.v_linear(v_in), batch, kv_len)

        q, k = self._apply_rope(q, k)

        context = self._attention(q, k, v, mask)
        merged = context.transpose(1, 2).reshape(batch, q_len, -1)
        return self.out_proj(merged)


Positional Encoding
Rotary position embeddings (RoPE), applied to the query and key vectors inside attention.



In [8]:
class RotaryEmbedding(nn.Module):
    """
    Rotary position embedding (RoPE) using the "rotate-half" layout.

    Channel i (for i < head_dim / 2) is paired with channel i + head_dim / 2,
    and each pair is rotated by ``position * inv_freq[i]``. The cos / sin
    tables for positions ``0 .. max_len - 1`` are precomputed once and stored
    as buffers ``cos_emb`` / ``sin_emb`` of shape (max_len, head_dim).

    Args:
        head_dim: Size of the last dimension of the tensors to rotate; must be even.
        max_len: Number of positions to precompute.

    Raises:
        ValueError: If ``head_dim`` is odd.
    """

    def __init__(self, head_dim, max_len=2048):
        super().__init__()
        if head_dim % 2 != 0:
            # An odd size would yield tables wider than head_dim and only fail later in forward().
            raise ValueError(f"head_dim must be even for rotary embeddings, got {head_dim}")
        self.head_dim = head_dim

        # Base frequencies, one per channel pair: 10000^(-2i / head_dim).
        inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))

        # Positions are created as floats so the outer product does not depend on
        # implicit integer-to-float promotion.
        positions = torch.arange(max_len, dtype=inv_freq.dtype)
        freqs = torch.outer(positions, inv_freq)  # (max_len, head_dim / 2)

        # Concatenate two copies (not interleaved): column i and column
        # i + head_dim / 2 carry the same angle, matching the half-split in forward().
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_emb", emb.cos())
        self.register_buffer("sin_emb", emb.sin())

    def forward(self, x):
        """
        Rotate ``x`` of shape (B, H, T, D) along its last dimension.

        Position t along dimension 2 uses row t of the precomputed tables.

        Raises:
            ValueError: If T exceeds the precomputed table length or D differs
                from the table width.
        """
        B, H, T, D = x.shape
        table_len, table_dim = self.cos_emb.shape
        if T > table_len:
            # Slicing [:T] would silently return only table_len rows.
            raise ValueError(
                f"sequence length {T} exceeds the rotary table length {table_len}; "
                "construct RotaryEmbedding with a larger max_len"
            )
        if D != table_dim:
            raise ValueError(f"last dimension {D} does not match rotary head_dim {table_dim}")

        # (T, D) -> (1, 1, T, D) so the tables broadcast over batch and heads.
        cos = self.cos_emb[:T].unsqueeze(0).unsqueeze(0)
        sin = self.sin_emb[:T].unsqueeze(0).unsqueeze(0)

        first_half = x[..., : D // 2]
        second_half = x[..., D // 2 :]

        # 90-degree rotation of each (first, second) pair: (a, b) -> (-b, a).
        x_rot = torch.cat([-second_half, first_half], dim=-1)
        return (x * cos) + (x_rot * sin)

class PositionalEncoding(nn.Module):
    """
    Identity placeholder for an absolute positional encoding.

    It has no parameters or buffers and returns its input unchanged.
    """

    def __init__(self, d_model, max_len=2048):
        # d_model and max_len are accepted but not used.
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x   # RoPE replaces absolute positional embeddings


**Feed Forward Neural Network**
```
Project the input to a higher dimension (d_ffn) to capture more complex features.
Apply a ReLU activation to introduce non-linearity.
Project back down to the model dimension d_model.
```

In [9]:
class FeedForwardNetwork(nn.Module):
    """
    Position-wise feed-forward block: Linear(d_model -> d_ffn), activation,
    Linear(d_ffn -> d_model).

    Args:
        d_model: Input and output feature size.
        d_ffn: Hidden feature size.
        activation: Module applied between the two linear layers. ``None``
            (the default) creates a fresh ``nn.ReLU()`` for this instance.
    """

    def __init__(self, d_model: int, d_ffn: int, activation: Optional[nn.Module] = None):
        super().__init__()
        if activation is None:
            # A module instance as a default argument is created once at function
            # definition time and would be shared by every network built with the default.
            activation = nn.ReLU()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ffn),
            activation,
            nn.Linear(d_ffn, d_model)
        )

    def forward(self, x):
        """Apply the feed-forward stack to the last dimension of ``x``."""
        return self.net(x)



**Encoding layer**
```
Structure of an encoder layer:
1. Positional encoding
2. Multi-head attention
3. Add and normalization
4. Feed-forward neural network

```

In [10]:
class EncodingLayer(nn.Module):
    """
    Pre-norm Transformer encoder block.

    Self-attention and the feed-forward network are each applied to a
    layer-normalised copy of the input and added back to the residual stream.

    Args:
        d_model: Feature size of the residual stream.
        num_heads: Number of attention heads.
        d_ffn: Hidden size of the feed-forward network.
        max_len: Length of the rotary position table built by the attention
            module. The default matches MultiHeadAttention's own default, so
            existing three-argument callers are unaffected.
    """

    def __init__(self, d_model: int, num_heads: int, d_ffn: int, max_len: int = 2048):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        # Forward max_len so callers can size the RoPE table to their context length.
        self.mha = MultiHeadAttention(d_model, d_model, num_heads, max_len=max_len)
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = FeedForwardNetwork(d_model, d_ffn)

    def forward(self, x, src_mask: Optional[torch.Tensor] = None):
        """
        Args:
            x: Input passed through ln1 and the attention module.
            src_mask: Optional additive attention mask handed to the attention module.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Pre-norm: LN -> MHA -> resid
        x = x + self.mha(self.ln1(x), mask=src_mask)
        # FFN block
        x = x + self.ff(self.ln2(x))
        return x



**Transformer Encoder**


A stack of one or more encoder layers.

In [11]:
class Encoder(nn.Module):
    """
    Token embedding followed by a stack of ``EncodingLayer`` blocks and a
    final LayerNorm.

    ``max_len`` is handed to ``PositionalEncoding`` only; the layers are
    constructed without it.
    """

    def __init__(self, vocab_size: int, d_model: int, num_layers: int, num_heads: int, d_ffn: int, max_len: int = 2048):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len=max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(EncodingLayer(d_model, num_heads, d_ffn))
        self.layers = nn.ModuleList(blocks)

        self.ln = nn.LayerNorm(d_model)

    def forward(self, input_ids: torch.LongTensor, src_mask: Optional[torch.Tensor] = None):
        """
        Encode a batch of token ids.

        Args:
            input_ids: (B, T) token ids.
            src_mask: Optional attention mask passed to every layer.

        Returns:
            (B, T, d_model) hidden states after the final LayerNorm.
        """
        hidden = self.pos_enc(self.token_emb(input_ids))  # (B, T, d_model)
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return self.ln(hidden)

**Decoder layer**
```
Structure:
1. Positional encoding for the decoder input.
2. Masked multi-head self-attention.
3. Layer normalization.
4. Multi-head cross-attention (no causal mask) over the encoder output, with the decoder states as queries.
5. Layer normalization.
6. Feed-forward network.
7. Layer normalization.
```


In [12]:
class DecodingLayer(nn.Module):
    """
    Pre-norm Transformer decoder block: masked self-attention, optional
    cross-attention over an encoder output, then a feed-forward network, each
    added back to the residual stream.

    Args:
        d_model: Feature size of the residual stream.
        num_heads: Number of attention heads.
        d_ffn: Hidden size of the feed-forward network.
        max_len: Length of the rotary position table built by both attention
            modules. The default matches MultiHeadAttention's own default, so
            existing three-argument callers are unaffected.
    """

    def __init__(self, d_model: int, num_heads: int, d_ffn: int, max_len: int = 2048):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        # Forward max_len so callers can size the RoPE tables to their context length.
        self.self_att = MultiHeadAttention(d_model, d_model, num_heads, max_len=max_len)

        self.ln2 = nn.LayerNorm(d_model)
        self.cross_att = MultiHeadAttention(d_model, d_model, num_heads, max_len=max_len)

        self.ln3 = nn.LayerNorm(d_model)
        self.ff = FeedForwardNetwork(d_model, d_ffn)

    def forward(self, x: torch.Tensor, enc_output: Optional[torch.Tensor] = None, src_mask: Optional[torch.Tensor] = None, tgt_mask: Optional[torch.Tensor] = None):
        """
        Args:
            x: Decoder hidden states.
            enc_output: Encoder output used as keys and values for
                cross-attention. When ``None`` the cross-attention step is skipped.
            src_mask: Optional mask for the cross-attention step.
            tgt_mask: Optional mask for the self-attention step.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # masked self-attention (pre-norm)
        x = x + self.self_att(self.ln1(x), mask=tgt_mask)

        # cross-attention only if encoder output is provided; enc_output is
        # passed as given, only the query side goes through ln2.
        if enc_output is not None:
            x = x + self.cross_att.forward_cross_attention(self.ln2(x), enc_output, enc_output, mask=src_mask)

        # feed-forward
        x = x + self.ff(self.ln3(x))
        return x



**Decoder**

In [13]:
class Decoder(nn.Module):
    """
    Token embedding followed by a stack of ``DecodingLayer`` blocks and a
    final LayerNorm.

    ``max_len`` is handed to ``PositionalEncoding`` only; the layers are
    constructed without it.
    """

    def __init__(self, vocab_size: int, d_model: int, num_layers: int, num_heads: int, d_ffn: int, max_len: int = 2048):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len=max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(DecodingLayer(d_model, num_heads, d_ffn))
        self.layers = nn.ModuleList(blocks)

        self.ln = nn.LayerNorm(d_model)

    def forward(self, input_ids: torch.LongTensor, enc_output: Optional[torch.Tensor] = None, src_mask: Optional[torch.Tensor] = None, tgt_mask: Optional[torch.Tensor] = None):
        """
        Run the decoder stack over a batch of token ids.

        Args:
            input_ids: 2-D tensor of token ids.
            enc_output: Optional encoder output forwarded to every layer.
            src_mask: Optional cross-attention mask forwarded to every layer.
            tgt_mask: Optional self-attention mask forwarded to every layer.

        Returns:
            Hidden states after the final LayerNorm.
        """
        # The unpacking is kept for its side effect: it fails unless input_ids is 2-D.
        _batch, _seq_len = input_ids.shape

        hidden = self.pos_enc(self.token_emb(input_ids))
        for block in self.layers:
            hidden = block(hidden, enc_output=enc_output, src_mask=src_mask, tgt_mask=tgt_mask)
        return self.ln(hidden)


Encoder-only wrapper


In [14]:
class EncoderOnlyTransformer(nn.Module):
    """
    ``Encoder`` followed by a LayerNorm and a linear head that maps each
    position to vocabulary logits.
    """

    def __init__(self, vocab_size: int, d_model: int, num_layers: int, num_heads: int, d_ffn: int, max_len: int = 2048):
        super().__init__()
        self.encoder = Encoder(vocab_size, d_model, num_layers, num_heads, d_ffn, max_len=max_len)
        self.ln = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids: torch.LongTensor, src_mask: Optional[torch.Tensor] = None):
        """
        Args:
            input_ids: Token ids passed to the encoder.
            src_mask: Optional attention mask passed to the encoder.

        Returns:
            Logits of shape (B, T, V).
        """
        hidden = self.encoder(input_ids, src_mask=src_mask)
        # self.ln is applied on top of the LayerNorm the encoder already ends with.
        normed = self.ln(hidden)
        return self.head(normed)  # (B, T, V)


Decoder-only wrapper

In [15]:
class DecoderOnlyTransformer(nn.Module):
    """
    Causal language model: a ``Decoder`` run without encoder output, followed
    by a LayerNorm and a linear head producing vocabulary logits.
    """

    def __init__(self, vocab_size: int, d_model: int, num_layers: int, num_heads: int, d_ffn: int, max_len: int = 2048):
        super().__init__()
        # The decoder's token embedding is sized to the vocabulary.
        self.decoder = Decoder(vocab_size, d_model, num_layers, num_heads, d_ffn, max_len=max_len)
        self.ln = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids: torch.LongTensor):
        """
        Args:
            input_ids: (B, T) token ids.

        Returns:
            Logits of shape (B, T, V).
        """
        _, seq_len = input_ids.shape
        # A fresh causal mask is built for the current sequence length on every call.
        causal = generate_causal_mask(seq_len, device=input_ids.device)
        hidden = self.decoder(input_ids, enc_output=None, src_mask=None, tgt_mask=causal)
        # self.ln is applied on top of the LayerNorm the decoder already ends with.
        normed = self.ln(hidden)
        return self.head(normed)


In [16]:
def init_weights_xavier(module):
    """
    Initialise one module in place; intended for use with ``model.apply``.

    * ``nn.Linear``: Xavier-uniform weight, zero bias (when a bias exists).
    * ``nn.Embedding``: normal weight with mean 0.0 and std 0.02.
    * ``nn.LayerNorm``: weight set to ones and bias to zeros, where present.

    Any other module type is left untouched.
    """
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        bias = module.bias
        if bias is not None:
            nn.init.zeros_(bias)
        return

    if isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=0.02)
        return

    if isinstance(module, nn.LayerNorm):
        # weight / bias are looked up defensively and skipped when None.
        for attr_name, fill_ in (("weight", nn.init.ones_), ("bias", nn.init.zeros_)):
            tensor = getattr(module, attr_name, None)
            if tensor is not None:
                fill_(tensor)

In [17]:
# Hyperparameters passed to DecoderOnlyTransformer in the next cell.
vocab_size = 2000   # the tokenizer-training cell below assigns the same value
d_model = 128       # width of the residual stream
num_layers = 4      # number of decoder layers
num_heads = 4       # attention heads per layer (d_model must be divisible by this)
d_ffn = 512         # hidden width of the feed-forward network
max_len = 256       # passed as max_len to the model

In [72]:
# Build the decoder-only model, then re-initialise every submodule with init_weights_xavier.
# The value of the last expression (the model) is what produces the repr shown below.
model = DecoderOnlyTransformer(vocab_size, d_model, num_layers, num_heads, d_ffn, max_len=max_len)
model.apply(init_weights_xavier)

DecoderOnlyTransformer(
  (decoder): Decoder(
    (token_emb): Embedding(2000, 128)
    (pos_enc): PositionalEncoding()
    (layers): ModuleList(
      (0-3): 4 x DecodingLayer(
        (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (self_att): MultiHeadAttention(
          (qkv): Linear(in_features=128, out_features=384, bias=True)
          (out_proj): Linear(in_features=128, out_features=128, bias=True)
          (q_linear): Linear(in_features=128, out_features=128, bias=True)
          (k_linear): Linear(in_features=128, out_features=128, bias=True)
          (v_linear): Linear(in_features=128, out_features=128, bias=True)
          (rope): RotaryEmbedding()
        )
        (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (cross_att): MultiHeadAttention(
          (qkv): Linear(in_features=128, out_features=384, bias=True)
          (out_proj): Linear(in_features=128, out_features=128, bias=True)
          (q_linear): Linear(in_features=1

In [73]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DecoderOnlyTransformer(
  (decoder): Decoder(
    (token_emb): Embedding(2000, 128)
    (pos_enc): PositionalEncoding()
    (layers): ModuleList(
      (0-3): 4 x DecodingLayer(
        (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (self_att): MultiHeadAttention(
          (qkv): Linear(in_features=128, out_features=384, bias=True)
          (out_proj): Linear(in_features=128, out_features=128, bias=True)
          (q_linear): Linear(in_features=128, out_features=128, bias=True)
          (k_linear): Linear(in_features=128, out_features=128, bias=True)
          (v_linear): Linear(in_features=128, out_features=128, bias=True)
          (rope): RotaryEmbedding()
        )
        (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (cross_att): MultiHeadAttention(
          (qkv): Linear(in_features=128, out_features=384, bias=True)
          (out_proj): Linear(in_features=128, out_features=128, bias=True)
          (q_linear): Linear(in_features=1

In [20]:
# Load the TinyStories dataset; per the output below it has "train" and
# "validation" splits, each with a single 'text' feature.
dataset = load_dataset("roneneldan/TinyStories")
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [None]:
# Dump the training split to a plain-text file, one story per line, as input
# for tokenizer training.
corpus_path = "tinystories_corpus.txt"

with open(corpus_path, "w", encoding="utf-8") as corpus_file:
    for record in dataset["train"]:
        # Collapse embedded newlines so each story occupies exactly one line.
        single_line = record["text"].replace("\n", " ").strip()
        corpus_file.write(single_line + "\n")


In [None]:
# Train a BPE SentencePiece tokenizer on the corpus file written above.
# NOTE: vocab_size is assigned again here; it has to stay equal to the value
# the model was built with, since the model's embedding and output head are
# sized from it.
vocab_size = 2000
model_prefix = "tinystories_sp"   # the next cell loads f"{model_prefix}.model"

spm.SentencePieceTrainer.Train(
    input=corpus_path,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=1.0,
    model_type="bpe",
    # Reserved ids for the special tokens; id 0 is also the value
    # TinyStoriesDataset pads short sequences with.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3
)

In [23]:
# Load the trained tokenizer model from disk; load() returns True here (see output).
sp = spm.SentencePieceProcessor()
sp.load(f"{model_prefix}.model")

True

In [40]:
class TinyStoriesDataset(Dataset):
    """
    Next-token-prediction dataset over stories.

    Each item tokenizes one story and returns a random window of
    ``block_size + 1`` token ids split into an input ``x`` (all but the last
    id) and a target ``y`` (all but the first id), i.e. ``y`` is ``x`` shifted
    left by one.

    Args:
        hf_dataset: Indexable collection whose items expose a ``"text"`` entry.
        tokenizer: Object with ``encode(text, out_type=int)`` returning a list
            of token ids.
        block_size: Length of ``x`` and ``y``.
        pad_id: Token id appended to stories shorter than ``block_size + 1``
            tokens. Defaults to 0.
    """

    def __init__(self, hf_dataset, tokenizer, block_size=128, pad_id=0):
        self.data = hf_dataset
        self.sp = tokenizer
        self.block_size = block_size
        self.pad_id = pad_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        ids = self.sp.encode(text, out_type=int)

        # x and y overlap by block_size - 1 tokens, so one extra token is needed.
        window = self.block_size + 1

        # Right-pad short stories so a full window always exists.
        if len(ids) < window:
            ids = ids + [self.pad_id] * (window - len(ids))

        # Valid window starts are 0 .. max_start inclusive.
        max_start = len(ids) - window
        if max_start > 0:
            # torch.randint's upper bound is exclusive, hence the + 1; without
            # it the final window of every story could never be drawn.
            start = torch.randint(0, max_start + 1, (1,)).item()
        else:
            # max_start == 0 -> only one valid window
            start = 0

        chunk = ids[start : start + window]

        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y


In [25]:
# Quick check of the tokenizer: encode a sample sentence into token ids.
sp.encode("Hello! I am training a tiny transformer.", out_type=int)

[62,
 1692,
 1891,
 81,
 885,
 1438,
 37,
 6,
 1723,
 454,
 47,
 1867,
 1878,
 49,
 1064,
 1873]

In [41]:
# Window length and batch size, defined once and used for both splits so the
# datasets and loaders cannot drift apart from these values.
block_size = 128
batch_size = 32

train_ds = TinyStoriesDataset(dataset["train"], sp, block_size=block_size)
val_ds = TinyStoriesDataset(dataset["validation"], sp, block_size=block_size)

# Only the training loader shuffles; validation iterates in dataset order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)



In [28]:
# Sanity check: push one training batch through the untrained model.
inp, tgt = next(iter(train_loader))
inp = inp.to(device)
tgt = tgt.to(device)

logits = model(inp)   # (B, T, vocab_size)
# Flatten batch and time so every position is scored as one classification example.
# A uniform guess over 2000 tokens would give ln(2000) ~= 7.60, which the
# value shown below is close to.
loss = nn.CrossEntropyLoss()(logits.view(-1, vocab_size), tgt.view(-1))
loss


tensor(7.6714, device='cuda:0', grad_fn=<NllLossBackward0>)

In [75]:
# Training configuration; read by the optimizer, scheduler and training-loop
# cells and recorded with the W&B run.
config = {
    "epochs": 1,
    "batch_size": 32,
    "block_size": 128,
    "learning_rate": 3e-4,   # peak learning rate, scaled by get_lr() each step
    "weight_decay": 0.1,
    "warmup_steps": 200,     # steps of linear warmup before cosine decay
    "max_steps": 10000,  # safety stop
    "grad_clip": 1.0,        # max gradient norm
    "log_every": 100,        # print the training loss every N steps
    "val_every": 1000,       # run validation every N steps
}

# Start a W&B run and attach the configuration to it.
wandb.init(project="tiny-gpt-training", config=config)

0,1
lr,‚ñÅ‚ñÇ‚ñÑ‚ñÖ‚ñÖ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñá
step,‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà
train/loss,‚ñà‚ñà‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
val/loss,‚ñÅ

0,1
lr,0.00028
step,2000.0
train/loss,3.1292
val/loss,3.48248


In [74]:
# AdamW over all model parameters. The lr given here is the peak value; the
# training loop overwrites each param group's lr on every step.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config["learning_rate"],
    weight_decay=config["weight_decay"]
)

# Cosine scheduler with warmup
def get_lr(step):
    """
    Learning-rate multiplier for optimisation step ``step``.

    Rises linearly from 0 to 1 over ``config["warmup_steps"]`` steps, then
    follows a half cosine from 1 down to 0 at ``config["max_steps"]``. For
    steps beyond ``max_steps`` the multiplier stays at 0.

    Returns:
        A float in [0, 1] to multiply the base learning rate by.
    """
    warmup_steps = config["warmup_steps"]
    if step < warmup_steps:
        return step / warmup_steps

    decay_steps = max(1, config["max_steps"] - warmup_steps)
    # Clamp so the cosine does not start rising again once step > max_steps.
    progress = min(1.0, (step - warmup_steps) / decay_steps)
    return 0.5 * (1.0 + math.cos(math.pi * progress))



In [76]:
# ------------------------------
# Device + AMP scaler
# ------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# float16 autocast and loss scaling are only switched on for CUDA; with
# enabled=False the scaler and the autocast context are pass-throughs, so the
# same loop also runs on CPU.
use_amp = device.type == "cuda"
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)


def amp_autocast():
    """Autocast context shared by the training and validation forward passes."""
    return torch.autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp)


def run_validation():
    """Return the mean of the per-batch cross-entropy over all of ``val_loader``."""
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for v_inp, v_tgt in val_loader:
            v_inp, v_tgt = v_inp.to(device), v_tgt.to(device)
            with amp_autocast():
                v_logits = model(v_inp)
                v_loss = F.cross_entropy(
                    v_logits.view(-1, v_logits.size(-1)),
                    v_tgt.view(-1)
                )
            batch_losses.append(v_loss.item())
    return sum(batch_losses) / len(batch_losses)


# ------------------------------
# Tracking
# ------------------------------
best_val_loss = float("inf")
steps_since_improve = 0
patience = 1000            # stop if no improvement this long
global_step = 0
stopped_early = False

# ------------------------------
# TRAINING LOOP
# ------------------------------
for epoch in range(config["epochs"]):
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config['epochs']}")

    for inp, tgt in pbar:
        # Re-enter train mode every step because validation switches to eval mode.
        model.train()
        inp, tgt = inp.to(device), tgt.to(device)

        # ------------------------------
        # Learning rate scheduling
        # ------------------------------
        lr = config["learning_rate"] * get_lr(global_step)
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr

        optimizer.zero_grad(set_to_none=True)

        # ------------------------------
        # Forward pass (mixed precision)
        # ------------------------------
        with amp_autocast():
            logits = model(inp)
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                tgt.view(-1)
            )

        # Backward
        scaler.scale(loss).backward()

        # Gradient clipping (gradients must be unscaled first so the norm is meaningful)
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), config["grad_clip"])

        scaler.step(optimizer)
        scaler.update()

        # Read the loss back once; every .item() call forces a host-device sync.
        loss_value = loss.item()

        # ------------------------------
        # Logging to W&B
        # ------------------------------
        wandb.log({
            "train/loss": loss_value,
            "lr": lr,
            "step": global_step
        })

        # Update progress bar
        pbar.set_postfix({"loss": f"{loss_value:.4f}"})

        # ------------------------------
        # PRINT EVERY log_every STEPS
        # ------------------------------
        if global_step % config["log_every"] == 0:
            print(f"\n[TRAIN] Step {global_step} | Loss: {loss_value:.4f} | LR: {lr:.6f}")

        # ------------------------------
        # VALIDATION
        # ------------------------------
        if global_step % config["val_every"] == 0 and global_step > 0:

            print(f"\n[VAL] Running validation at step {global_step}...")
            mean_val_loss = run_validation()

            print(f"[VAL] Step {global_step} | Val Loss: {mean_val_loss:.4f}")

            wandb.log({"val/loss": mean_val_loss, "step": global_step})

            # ------------------------------
            # SMART CHECKPOINTING
            # ------------------------------
            if mean_val_loss < best_val_loss:
                best_val_loss = mean_val_loss
                steps_since_improve = 0

                ckpt_path = f"best_model_step_{global_step}.pt"
                torch.save(model.state_dict(), ckpt_path)
                wandb.save(ckpt_path)

                print(f"✔ New Best Model Saved! Val loss improved to {best_val_loss:.4f}")
            else:
                steps_since_improve += config["val_every"]
                print(f"✘ No improvement for {steps_since_improve} steps")

            # Rolling checkpoint
            torch.save(model.state_dict(), "latest_model.pt")

            # ------------------------------
            # EARLY STOPPING
            # ------------------------------
            if steps_since_improve >= patience:
                print("\n############################")
                print("### EARLY STOPPING FIRED ###")
                print("### No improvement observed ###")
                print("############################\n")

                # Leave both loops via a flag instead of raising SystemExit, so
                # the shared final save / wandb.finish() below still runs.
                stopped_early = True
                break

        # ------------------------------
        # UPDATE STEP + STOP IF MAX REACHED
        # ------------------------------
        global_step += 1
        if global_step >= config["max_steps"]:
            break

    if stopped_early or global_step >= config["max_steps"]:
        break

# ------------------------------
# FINAL SAVE
# ------------------------------
torch.save(model.state_dict(), "final_model.pt")
if stopped_early:
    print("EARLY STOPPING TRIGGERED.")
else:
    print("\n🎉 Training Complete!")
wandb.finish()


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast(dtype=torch.float16):
Epoch 1/1:   0%|          | 3/66242 [00:00<2:13:33,  8.27it/s, loss=7.6095]


[TRAIN] Step 0 | Loss: 7.6084 | LR: 0.000000


Epoch 1/1:   0%|          | 104/66242 [00:06<56:36, 19.47it/s, loss=6.4328]


[TRAIN] Step 100 | Loss: 6.4987 | LR: 0.000150


Epoch 1/1:   0%|          | 203/66242 [00:11<49:54, 22.05it/s, loss=5.4134]


[TRAIN] Step 200 | Loss: 5.3771 | LR: 0.000300


Epoch 1/1:   0%|          | 303/66242 [00:16<1:07:51, 16.20it/s, loss=4.6844]


[TRAIN] Step 300 | Loss: 4.6462 | LR: 0.000300


Epoch 1/1:   1%|          | 405/66242 [00:21<49:59, 21.95it/s, loss=4.2286]


[TRAIN] Step 400 | Loss: 4.3363 | LR: 0.000300


Epoch 1/1:   1%|          | 504/66242 [00:26<51:44, 21.18it/s, loss=4.0772]


[TRAIN] Step 500 | Loss: 4.1990 | LR: 0.000299


Epoch 1/1:   1%|          | 604/66242 [00:31<1:08:04, 16.07it/s, loss=3.9619]


[TRAIN] Step 600 | Loss: 3.8487 | LR: 0.000299


Epoch 1/1:   1%|          | 703/66242 [00:37<51:57, 21.02it/s, loss=3.7039]


[TRAIN] Step 700 | Loss: 3.8600 | LR: 0.000298


Epoch 1/1:   1%|          | 805/66242 [00:41<48:48, 22.35it/s, loss=3.7573]


[TRAIN] Step 800 | Loss: 3.8262 | LR: 0.000297


Epoch 1/1:   1%|‚ñè         | 902/66242 [00:47<1:14:33, 14.60it/s, loss=3.6614]


[TRAIN] Step 900 | Loss: 3.6479 | LR: 0.000296


  with torch.cuda.amp.autocast(dtype=torch.float16):



[TRAIN] Step 1000 | Loss: 3.3487 | LR: 0.000295

[VAL] Running validation at step 1000...


Epoch 1/1:   2%|‚ñè         | 1001/66242 [01:13<42:32:04,  2.35s/it, loss=3.5375]

[VAL] Step 1000 | Val Loss: 3.4878
‚úî New Best Model Saved! Val loss improved to 3.4878


Epoch 1/1:   2%|‚ñè         | 1104/66242 [01:19<53:59, 20.11it/s, loss=3.4845]


[TRAIN] Step 1100 | Loss: 3.3838 | LR: 0.000294


Epoch 1/1:   2%|‚ñè         | 1203/66242 [01:24<50:55, 21.28it/s, loss=3.3855]


[TRAIN] Step 1200 | Loss: 3.3155 | LR: 0.000292


Epoch 1/1:   2%|‚ñè         | 1302/66242 [01:28<50:02, 21.63it/s, loss=3.3642]


[TRAIN] Step 1300 | Loss: 3.2858 | LR: 0.000291


Epoch 1/1:   2%|‚ñè         | 1405/66242 [01:35<50:23, 21.44it/s, loss=3.1658]


[TRAIN] Step 1400 | Loss: 3.2560 | LR: 0.000289


Epoch 1/1:   2%|‚ñè         | 1504/66242 [01:39<50:02, 21.56it/s, loss=3.2093]


[TRAIN] Step 1500 | Loss: 3.2150 | LR: 0.000287


Epoch 1/1:   2%|‚ñè         | 1603/66242 [01:44<1:04:19, 16.75it/s, loss=3.2269]


[TRAIN] Step 1600 | Loss: 3.1539 | LR: 0.000285


Epoch 1/1:   3%|‚ñé         | 1704/66242 [01:50<49:24, 21.77it/s, loss=3.1813]


[TRAIN] Step 1700 | Loss: 3.0738 | LR: 0.000283


Epoch 1/1:   3%|‚ñé         | 1803/66242 [01:55<54:23, 19.75it/s, loss=3.1425]


[TRAIN] Step 1800 | Loss: 3.3045 | LR: 0.000281


Epoch 1/1:   3%|‚ñé         | 1903/66242 [02:00<1:05:41, 16.32it/s, loss=3.0450]


[TRAIN] Step 1900 | Loss: 3.0550 | LR: 0.000278


Epoch 1/1:   3%|‚ñé         | 1999/66242 [02:05<49:12, 21.76it/s, loss=3.1225]


[TRAIN] Step 2000 | Loss: 3.1225 | LR: 0.000276

[VAL] Running validation at step 2000...


Epoch 1/1:   3%|‚ñé         | 2001/66242 [02:26<41:49:55,  2.34s/it, loss=2.9564]

[VAL] Step 2000 | Val Loss: 3.0214
‚úî New Best Model Saved! Val loss improved to 3.0214


Epoch 1/1:   3%|‚ñé         | 2102/66242 [02:32<1:13:37, 14.52it/s, loss=3.0815]


[TRAIN] Step 2100 | Loss: 3.0272 | LR: 0.000273


Epoch 1/1:   3%|‚ñé         | 2203/66242 [02:37<49:46, 21.44it/s, loss=3.0116]


[TRAIN] Step 2200 | Loss: 2.9338 | LR: 0.000270


Epoch 1/1:   3%|‚ñé         | 2303/66242 [02:42<50:15, 21.20it/s, loss=2.9075]


[TRAIN] Step 2300 | Loss: 3.0908 | LR: 0.000267


Epoch 1/1:   4%|‚ñé         | 2403/66242 [02:48<1:16:08, 13.97it/s, loss=2.9526]


[TRAIN] Step 2400 | Loss: 3.0045 | LR: 0.000264


Epoch 1/1:   4%|‚ñç         | 2503/66242 [02:52<48:47, 21.77it/s, loss=2.9542]


[TRAIN] Step 2500 | Loss: 2.9315 | LR: 0.000261


Epoch 1/1:   4%|‚ñç         | 2605/66242 [02:57<49:34, 21.40it/s, loss=3.0590]


[TRAIN] Step 2600 | Loss: 2.9884 | LR: 0.000258


Epoch 1/1:   4%|‚ñç         | 2704/66242 [03:03<56:18, 18.81it/s, loss=2.8960]


[TRAIN] Step 2700 | Loss: 2.7539 | LR: 0.000254


Epoch 1/1:   4%|‚ñç         | 2803/66242 [03:08<49:45, 21.25it/s, loss=2.8717]


[TRAIN] Step 2800 | Loss: 2.7260 | LR: 0.000251


Epoch 1/1:   4%|‚ñç         | 2905/66242 [03:12<46:48, 22.55it/s, loss=2.7526]


[TRAIN] Step 2900 | Loss: 2.7950 | LR: 0.000247


Epoch 1/1:   5%|‚ñç         | 3000/66242 [03:18<50:58, 20.68it/s, loss=2.8931]


[TRAIN] Step 3000 | Loss: 2.8931 | LR: 0.000244

[VAL] Running validation at step 3000...


Epoch 1/1:   5%|‚ñç         | 3001/66242 [03:39<48:04:18,  2.74s/it, loss=2.8161]

[VAL] Step 3000 | Val Loss: 2.8210
‚úî New Best Model Saved! Val loss improved to 2.8210


Epoch 1/1:   5%|‚ñç         | 3103/66242 [03:44<1:05:00, 16.19it/s, loss=2.8616]


[TRAIN] Step 3100 | Loss: 2.8174 | LR: 0.000240


Epoch 1/1:   5%|‚ñç         | 3204/66242 [03:50<47:45, 22.00it/s, loss=2.8060]


[TRAIN] Step 3200 | Loss: 2.7718 | LR: 0.000236


Epoch 1/1:   5%|‚ñç         | 3303/66242 [03:54<49:39, 21.12it/s, loss=2.7246]


[TRAIN] Step 3300 | Loss: 2.8598 | LR: 0.000232


Epoch 1/1:   5%|‚ñå         | 3403/66242 [03:59<1:08:46, 15.23it/s, loss=2.6749]


[TRAIN] Step 3400 | Loss: 2.7459 | LR: 0.000228


Epoch 1/1:   5%|‚ñå         | 3505/66242 [04:05<48:36, 21.51it/s, loss=2.7189]


[TRAIN] Step 3500 | Loss: 2.7964 | LR: 0.000224


Epoch 1/1:   5%|‚ñå         | 3604/66242 [04:10<47:11, 22.12it/s, loss=2.7631]


[TRAIN] Step 3600 | Loss: 2.6458 | LR: 0.000219


Epoch 1/1:   6%|‚ñå         | 3702/66242 [04:15<1:11:24, 14.60it/s, loss=2.7687]


[TRAIN] Step 3700 | Loss: 2.7866 | LR: 0.000215


Epoch 1/1:   6%|‚ñå         | 3803/66242 [04:20<47:44, 21.80it/s, loss=2.6329]


[TRAIN] Step 3800 | Loss: 2.7299 | LR: 0.000211


Epoch 1/1:   6%|‚ñå         | 3902/66242 [04:25<48:27, 21.44it/s, loss=2.5964]


[TRAIN] Step 3900 | Loss: 2.6314 | LR: 0.000206


Epoch 1/1:   6%|‚ñå         | 3999/66242 [04:30<1:16:35, 13.54it/s, loss=2.7093]


[TRAIN] Step 4000 | Loss: 2.7093 | LR: 0.000202

[VAL] Running validation at step 4000...


Epoch 1/1:   6%|‚ñå         | 4001/66242 [04:51<55:10:56,  3.19s/it, loss=2.7056]

[VAL] Step 4000 | Val Loss: 2.7064
‚úî New Best Model Saved! Val loss improved to 2.7064


Epoch 1/1:   6%|‚ñå         | 4103/66242 [04:56<47:10, 21.96it/s, loss=2.6357]


[TRAIN] Step 4100 | Loss: 2.7126 | LR: 0.000197


Epoch 1/1:   6%|‚ñã         | 4203/66242 [05:02<48:33, 21.29it/s, loss=2.6611]


[TRAIN] Step 4200 | Loss: 2.6523 | LR: 0.000193


Epoch 1/1:   6%|‚ñã         | 4305/66242 [05:07<46:26, 22.23it/s, loss=2.7197]


[TRAIN] Step 4300 | Loss: 2.6677 | LR: 0.000188


Epoch 1/1:   7%|‚ñã         | 4404/66242 [05:11<1:00:33, 17.02it/s, loss=2.6956]


[TRAIN] Step 4400 | Loss: 2.6842 | LR: 0.000183


Epoch 1/1:   7%|‚ñã         | 4503/66242 [05:17<49:54, 20.62it/s, loss=2.6400]


[TRAIN] Step 4500 | Loss: 2.8097 | LR: 0.000179


Epoch 1/1:   7%|‚ñã         | 4605/66242 [05:22<48:00, 21.40it/s, loss=2.5035]


[TRAIN] Step 4600 | Loss: 2.7069 | LR: 0.000174


Epoch 1/1:   7%|‚ñã         | 4703/66242 [05:27<1:05:22, 15.69it/s, loss=2.6032]


[TRAIN] Step 4700 | Loss: 2.7062 | LR: 0.000169


Epoch 1/1:   7%|‚ñã         | 4805/66242 [05:33<47:02, 21.76it/s, loss=2.6707]


[TRAIN] Step 4800 | Loss: 2.7291 | LR: 0.000164


Epoch 1/1:   7%|‚ñã         | 4904/66242 [05:37<46:45, 21.86it/s, loss=2.6499]


[TRAIN] Step 4900 | Loss: 2.6425 | LR: 0.000160


Epoch 1/1:   8%|‚ñä         | 5000/66242 [05:43<1:04:50, 15.74it/s, loss=2.6395]


[TRAIN] Step 5000 | Loss: 2.6395 | LR: 0.000155

[VAL] Running validation at step 5000...


Epoch 1/1:   8%|‚ñä         | 5001/66242 [06:04<64:57:46,  3.82s/it, loss=2.7036]

[VAL] Step 5000 | Val Loss: 2.6271
‚úî New Best Model Saved! Val loss improved to 2.6271


Epoch 1/1:   8%|‚ñä         | 5104/66242 [06:09<45:29, 22.40it/s, loss=2.6014]


[TRAIN] Step 5100 | Loss: 2.6303 | LR: 0.000150


Epoch 1/1:   8%|‚ñä         | 5204/66242 [06:15<1:00:17, 16.87it/s, loss=2.5560]


[TRAIN] Step 5200 | Loss: 2.5755 | LR: 0.000145


Epoch 1/1:   8%|‚ñä         | 5305/66242 [06:19<48:01, 21.15it/s, loss=2.5373]


[TRAIN] Step 5300 | Loss: 2.6051 | LR: 0.000140


Epoch 1/1:   8%|‚ñä         | 5404/66242 [06:24<46:42, 21.71it/s, loss=2.7008]


[TRAIN] Step 5400 | Loss: 2.6534 | LR: 0.000136


Epoch 1/1:   8%|‚ñä         | 5505/66242 [06:30<49:22, 20.50it/s, loss=2.6074]


[TRAIN] Step 5500 | Loss: 2.5631 | LR: 0.000131


Epoch 1/1:   8%|‚ñä         | 5604/66242 [06:35<45:21, 22.28it/s, loss=2.7346]


[TRAIN] Step 5600 | Loss: 2.5908 | LR: 0.000126


Epoch 1/1:   9%|‚ñä         | 5703/66242 [06:39<48:45, 20.69it/s, loss=2.4686]


[TRAIN] Step 5700 | Loss: 2.5962 | LR: 0.000121


Epoch 1/1:   9%|‚ñâ         | 5804/66242 [06:46<46:21, 21.73it/s, loss=2.6472]


[TRAIN] Step 5800 | Loss: 2.5355 | LR: 0.000117


Epoch 1/1:   9%|‚ñâ         | 5903/66242 [06:50<46:58, 21.41it/s, loss=2.6424]


[TRAIN] Step 5900 | Loss: 2.5560 | LR: 0.000112


Epoch 1/1:   9%|‚ñâ         | 5999/66242 [06:55<59:18, 16.93it/s, loss=2.5788]


[TRAIN] Step 6000 | Loss: 2.5788 | LR: 0.000107

[VAL] Running validation at step 6000...


Epoch 1/1:   9%|‚ñâ         | 6001/66242 [07:16<49:21:34,  2.95s/it, loss=2.7368]

[VAL] Step 6000 | Val Loss: 2.5693
‚úî New Best Model Saved! Val loss improved to 2.5693


Epoch 1/1:   9%|‚ñâ         | 6105/66242 [07:21<46:27, 21.58it/s, loss=2.5199]


[TRAIN] Step 6100 | Loss: 2.5216 | LR: 0.000103


Epoch 1/1:   9%|‚ñâ         | 6203/66242 [07:27<1:09:56, 14.31it/s, loss=2.6698]


[TRAIN] Step 6200 | Loss: 2.6154 | LR: 0.000098


Epoch 1/1:  10%|‚ñâ         | 6305/66242 [07:32<46:31, 21.47it/s, loss=2.5246]


[TRAIN] Step 6300 | Loss: 2.6677 | LR: 0.000094


Epoch 1/1:  10%|‚ñâ         | 6404/66242 [07:37<45:49, 21.76it/s, loss=2.5925]


[TRAIN] Step 6400 | Loss: 2.5255 | LR: 0.000089


Epoch 1/1:  10%|‚ñâ         | 6503/66242 [07:43<1:11:19, 13.96it/s, loss=2.5033]


[TRAIN] Step 6500 | Loss: 2.6460 | LR: 0.000085


Epoch 1/1:  10%|‚ñâ         | 6604/66242 [07:48<46:24, 21.42it/s, loss=2.5875]


[TRAIN] Step 6600 | Loss: 2.6110 | LR: 0.000081


Epoch 1/1:  10%|‚ñà         | 6703/66242 [07:52<45:30, 21.80it/s, loss=2.5524]


[TRAIN] Step 6700 | Loss: 2.6852 | LR: 0.000076


Epoch 1/1:  10%|‚ñà         | 6805/66242 [07:58<1:00:03, 16.49it/s, loss=2.5153]


[TRAIN] Step 6800 | Loss: 2.5382 | LR: 0.000072


Epoch 1/1:  10%|‚ñà         | 6904/66242 [08:03<46:13, 21.39it/s, loss=2.6546]


[TRAIN] Step 6900 | Loss: 2.6066 | LR: 0.000068


Epoch 1/1:  11%|‚ñà         | 7000/66242 [08:07<46:57, 21.02it/s, loss=2.5531]


[TRAIN] Step 7000 | Loss: 2.5531 | LR: 0.000064

[VAL] Running validation at step 7000...


Epoch 1/1:  11%|‚ñà         | 7001/66242 [08:29<45:32:13,  2.77s/it, loss=2.4677]

[VAL] Step 7000 | Val Loss: 2.5291
‚úî New Best Model Saved! Val loss improved to 2.5291


Epoch 1/1:  11%|‚ñà         | 7103/66242 [08:34<45:46, 21.53it/s, loss=2.5895]


[TRAIN] Step 7100 | Loss: 2.6354 | LR: 0.000060


Epoch 1/1:  11%|‚ñà         | 7202/66242 [08:39<1:04:55, 15.15it/s, loss=2.4906]


[TRAIN] Step 7200 | Loss: 2.5659 | LR: 0.000056


Epoch 1/1:  11%|‚ñà         | 7305/66242 [08:45<45:35, 21.55it/s, loss=2.4452]


[TRAIN] Step 7300 | Loss: 2.5752 | LR: 0.000053


Epoch 1/1:  11%|‚ñà         | 7404/66242 [08:49<46:57, 20.88it/s, loss=2.4135]


[TRAIN] Step 7400 | Loss: 2.5212 | LR: 0.000049


Epoch 1/1:  11%|‚ñà‚ñè        | 7503/66242 [08:55<1:00:57, 16.06it/s, loss=2.5144]


[TRAIN] Step 7500 | Loss: 2.7107 | LR: 0.000046


Epoch 1/1:  11%|‚ñà‚ñè        | 7605/66242 [09:00<45:21, 21.54it/s, loss=2.5496]


[TRAIN] Step 7600 | Loss: 2.6204 | LR: 0.000042


Epoch 1/1:  12%|‚ñà‚ñè        | 7704/66242 [09:05<45:45, 21.32it/s, loss=2.5772]


[TRAIN] Step 7700 | Loss: 2.5775 | LR: 0.000039


Epoch 1/1:  12%|‚ñà‚ñè        | 7803/66242 [09:10<1:06:57, 14.55it/s, loss=2.6621]


[TRAIN] Step 7800 | Loss: 2.4164 | LR: 0.000036


Epoch 1/1:  12%|‚ñà‚ñè        | 7903/66242 [09:16<44:39, 21.77it/s, loss=2.5606]


[TRAIN] Step 7900 | Loss: 2.5152 | LR: 0.000033


Epoch 1/1:  12%|‚ñà‚ñè        | 7999/66242 [09:20<46:48, 20.74it/s, loss=2.4633]


[TRAIN] Step 8000 | Loss: 2.4633 | LR: 0.000030

[VAL] Running validation at step 8000...


Epoch 1/1:  12%|‚ñà‚ñè        | 8001/66242 [09:42<40:00:13,  2.47s/it, loss=2.4608]

[VAL] Step 8000 | Val Loss: 2.5035
‚úî New Best Model Saved! Val loss improved to 2.5035


Epoch 1/1:  12%|‚ñà‚ñè        | 8103/66242 [09:47<45:38, 21.23it/s, loss=2.5603]


[TRAIN] Step 8100 | Loss: 2.5545 | LR: 0.000027


Epoch 1/1:  12%|‚ñà‚ñè        | 8202/66242 [09:51<50:53, 19.01it/s, loss=2.4514]


[TRAIN] Step 8200 | Loss: 2.5716 | LR: 0.000024


Epoch 1/1:  13%|‚ñà‚ñé        | 8304/66242 [09:58<45:25, 21.25it/s, loss=2.5595]


[TRAIN] Step 8300 | Loss: 2.4044 | LR: 0.000022


Epoch 1/1:  13%|‚ñà‚ñé        | 8403/66242 [10:02<45:19, 21.27it/s, loss=2.5738]


[TRAIN] Step 8400 | Loss: 2.4463 | LR: 0.000019


Epoch 1/1:  13%|‚ñà‚ñé        | 8502/66242 [10:07<59:36, 16.15it/s, loss=2.5141]


[TRAIN] Step 8500 | Loss: 2.5133 | LR: 0.000017


Epoch 1/1:  13%|‚ñà‚ñé        | 8605/66242 [10:13<44:00, 21.83it/s, loss=2.4362]


[TRAIN] Step 8600 | Loss: 2.5449 | LR: 0.000015


Epoch 1/1:  13%|‚ñà‚ñé        | 8704/66242 [10:18<45:11, 21.22it/s, loss=2.3813]


[TRAIN] Step 8700 | Loss: 2.4375 | LR: 0.000013


Epoch 1/1:  13%|‚ñà‚ñé        | 8804/66242 [10:23<58:57, 16.24it/s, loss=2.5229]


[TRAIN] Step 8800 | Loss: 2.4703 | LR: 0.000011


Epoch 1/1:  13%|‚ñà‚ñé        | 8904/66242 [10:29<44:49, 21.32it/s, loss=2.4999]


[TRAIN] Step 8900 | Loss: 2.4938 | LR: 0.000009


Epoch 1/1:  14%|‚ñà‚ñé        | 9000/66242 [10:33<42:34, 22.41it/s, loss=2.5038]


[TRAIN] Step 9000 | Loss: 2.5038 | LR: 0.000008

[VAL] Running validation at step 9000...


Epoch 1/1:  14%|‚ñà‚ñé        | 9001/66242 [10:55<43:32:34,  2.74s/it, loss=2.5038]

[VAL] Step 9000 | Val Loss: 2.4911
‚úî New Best Model Saved! Val loss improved to 2.4911


Epoch 1/1:  14%|‚ñà‚ñé        | 9104/66242 [11:00<46:26, 20.51it/s, loss=2.4341]


[TRAIN] Step 9100 | Loss: 2.5406 | LR: 0.000006


Epoch 1/1:  14%|‚ñà‚ñç        | 9203/66242 [11:04<46:01, 20.66it/s, loss=2.4911]


[TRAIN] Step 9200 | Loss: 2.3538 | LR: 0.000005


Epoch 1/1:  14%|‚ñà‚ñç        | 9304/66242 [11:11<50:27, 18.81it/s, loss=2.4700]


[TRAIN] Step 9300 | Loss: 2.4117 | LR: 0.000004


Epoch 1/1:  14%|‚ñà‚ñç        | 9403/66242 [11:15<43:20, 21.86it/s, loss=2.4274]


[TRAIN] Step 9400 | Loss: 2.5149 | LR: 0.000003


Epoch 1/1:  14%|‚ñà‚ñç        | 9505/66242 [11:20<44:22, 21.31it/s, loss=2.3962]


[TRAIN] Step 9500 | Loss: 2.5160 | LR: 0.000002


Epoch 1/1:  14%|‚ñà‚ñç        | 9605/66242 [11:26<44:17, 21.31it/s, loss=2.5366]


[TRAIN] Step 9600 | Loss: 2.4462 | LR: 0.000001


Epoch 1/1:  15%|‚ñà‚ñç        | 9704/66242 [11:31<45:26, 20.74it/s, loss=2.4017]


[TRAIN] Step 9700 | Loss: 2.6009 | LR: 0.000001


Epoch 1/1:  15%|‚ñà‚ñç        | 9803/66242 [11:35<50:25, 18.65it/s, loss=2.4127]


[TRAIN] Step 9800 | Loss: 2.6875 | LR: 0.000000


Epoch 1/1:  15%|‚ñà‚ñç        | 9904/66242 [11:41<43:39, 21.50it/s, loss=2.5456]


[TRAIN] Step 9900 | Loss: 2.4251 | LR: 0.000000


Epoch 1/1:  15%|‚ñà‚ñå        | 9999/66242 [11:46<1:06:13, 14.15it/s, loss=2.5031]



üéâ Training Complete!


0,1
lr,‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
step,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà
train/loss,‚ñà‚ñà‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÅ
val/loss,‚ñà‚ñÖ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ

0,1
lr,0.0
step,9999.0
train/loss,2.5031
val/loss,2.49107
