In [6]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchmetrics import Accuracy, Precision, Recall, F1Score
import pandas as pd
from setup_torch import *

# Shared preprocessing for both splits: convert each image to a tensor, then
# normalize all three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 with train=True and train=False, in that order; both are stored
# under ./data and the second one serves as the validation set.
train_dataset, val_dataset = (
    datasets.CIFAR10(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Lower batch_size on CPU; it can be raised on GPU.
batch_size = 64
# Only the training loader shuffles, so validation batches keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

# ----------------------------
# 4. Model
# ----------------------------
# The class count is shared by the model head and every metric below.
num_classes = 10
task = 'multiclass'

# ResNet-18 built without pretrained weights (weights=None).
model = torchvision.models.resnet18(weights=None, num_classes=num_classes)

# ----------------------------
# 5. Device
# ----------------------------
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# ----------------------------
# 6. Metrics (from torchmetrics)
# ----------------------------
# Accuracy keeps its default averaging; precision, recall and F1 are
# macro-averaged. Every metric object is moved to the same device as the model.
metrics = {
    'acc': Accuracy(task=task, num_classes=num_classes).to(device),
    **{
        key: metric_cls(task=task, num_classes=num_classes, average='macro').to(device)
        for key, metric_cls in (('prec', Precision), ('rec', Recall), ('f1', F1Score))
    },
}

# torchmetrics returns a tensor -> wrap the metric in a plain function for
# compatibility with the training loop.
def make_metric_fn(metric_obj):
    """Return a two-argument function that forwards to ``metric_obj``.

    Args:
        metric_obj: Any callable accepting ``(preds, target)``.

    Returns:
        A function ``(preds, target)`` whose result is whatever
        ``metric_obj(preds, target)`` returns.
    """
    def _metric_fn(preds, target):
        return metric_obj(preds, target)

    return _metric_fn

# One plain-function wrapper per metric, keyed by the metric's short name.
wrapped_metrics = {key: make_metric_fn(metric_obj) for key, metric_obj in metrics.items()}

# ----------------------------
# 7. Optimizer and criterion
# ----------------------------
criterion = nn.CrossEntropyLoss()

# Effective batch size is 64 (no gradient accumulation).
# To emulate batch_size=256, use accumulation_steps=4 and multiply lr by 4.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)  # base LR

# Scheduler: halve the LR once the tracked value (mode='max', i.e. higher is
# better) has not improved for more than `patience` epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3,
)

# ----------------------------
# 8. Start training
# ----------------------------
# NOTE(review): in the logged run validation scores start near chance while
# training scores are already high; if fit() validates with EMA weights, the
# 0.999 decay may explain the lag -- confirm against fit().
history = fit(
    # What is trained, on which data and where.
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    is_distributed=False,  # single GPU or CPU
    # Optimization settings.
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=20,
    accumulation_steps=1,  # no accumulation (can be set to 2, 4, ...)
    grad_clip=1.0,
    use_amp=(device.type == 'cuda'),  # AMP only on GPU
    ema_decay=0.999,
    # Metrics, early stopping and checkpointing.
    metrics=wrapped_metrics,
    monitor_metric='f1',  # early stopping on F1
    mode='max',
    patience=7,
    min_delta=0.001,
    checkpoint_path='best_model_checkpoint.pt',
    verbose=True,
)

# Summary of the run; best_epoch is shifted by one for display
# (presumably it is stored zero-based -- confirm against fit()).
print("\n Обучение завершено!")
print(f"Лучшая эпоха: {history.attrs['best_epoch'] + 1}")
print(f"Лучший F1: {history.attrs['best_valid_score']:.4f}")

# Save the training history.
history.to_csv('training_history.csv', index=False)

  scaler = torch.cuda.amp.GradScaler() if (use_amp and device != 'cpu') else None


Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Training:   0%|          | 0/782 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast() if scaler else torch.no_grad():


Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 01 | Time: 0m 45s | LR: 0.001000
	Train loss: 1.3843 | Train acc: 50.90% | Train prec: 51.46% | Train rec: 51.68% | Train f1: 48.41%
	Val loss: 2.7320 | Val acc: 10.04% | Val prec: 1.01% | Val rec: 10.03% | Val f1: 1.82%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 02 | Time: 0m 28s | LR: 0.001000
	Train loss: 0.9746 | Train acc: 65.98% | Train prec: 66.34% | Train rec: 67.01% | Train f1: 64.22%
	Val loss: 2.3681 | Val acc: 26.51% | Val prec: 13.26% | Val rec: 26.39% | Val f1: 14.05%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 03 | Time: 0m 28s | LR: 0.001000
	Train loss: 0.7918 | Train acc: 72.45% | Train prec: 72.31% | Train rec: 73.49% | Train f1: 70.86%
	Val loss: 1.6335 | Val acc: 41.95% | Val prec: 52.55% | Val rec: 41.87% | Val f1: 37.77%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 04 | Time: 0m 28s | LR: 0.001000
	Train loss: 0.6619 | Train acc: 77.18% | Train prec: 77.01% | Train rec: 78.26% | Train f1: 75.77%
	Val loss: 1.0453 | Val acc: 63.19% | Val prec: 73.69% | Val rec: 63.03% | Val f1: 61.72%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 05 | Time: 0m 28s | LR: 0.001000
	Train loss: 0.5510 | Train acc: 80.89% | Train prec: 80.55% | Train rec: 81.85% | Train f1: 79.52%
	Val loss: 0.7738 | Val acc: 73.31% | Val prec: 75.75% | Val rec: 72.94% | Val f1: 71.62%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 06 | Time: 0m 29s | LR: 0.001000
	Train loss: 0.4579 | Train acc: 84.07% | Train prec: 83.51% | Train rec: 84.83% | Train f1: 82.76%
	Val loss: 0.7206 | Val acc: 75.19% | Val prec: 76.94% | Val rec: 74.72% | Val f1: 73.54%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 07 | Time: 0m 28s | LR: 0.001000
	Train loss: 0.3662 | Train acc: 87.32% | Train prec: 86.83% | Train rec: 88.06% | Train f1: 86.24%
	Val loss: 0.7258 | Val acc: 76.80% | Val prec: 77.53% | Val rec: 76.23% | Val f1: 75.00%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 08 | Time: 0m 29s | LR: 0.001000
	Train loss: 0.2966 | Train acc: 89.63% | Train prec: 88.98% | Train rec: 90.34% | Train f1: 88.61%
	Val loss: 0.6810 | Val acc: 78.87% | Val prec: 78.56% | Val rec: 78.25% | Val f1: 76.66%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

✅ Saved best checkpoint to best_model_checkpoint.pt
Epoch: 09 | Time: 0m 28s | LR: 0.001000
	Train loss: 0.2340 | Train acc: 91.86% | Train prec: 91.37% | Train rec: 92.40% | Train f1: 91.03%
	Val loss: 0.7072 | Val acc: 79.37% | Val prec: 79.29% | Val rec: 78.85% | Val f1: 77.63%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 10 | Time: 0m 28s | LR: 0.001000
	Train loss: 0.1963 | Train acc: 93.18% | Train prec: 92.59% | Train rec: 93.77% | Train f1: 92.46%
	Val loss: 0.7469 | Val acc: 78.82% | Val prec: 78.79% | Val rec: 78.33% | Val f1: 77.00%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 11 | Time: 0m 29s | LR: 0.001000
	Train loss: 0.1660 | Train acc: 94.30% | Train prec: 93.78% | Train rec: 94.74% | Train f1: 93.66%
	Val loss: 0.9056 | Val acc: 76.45% | Val prec: 77.30% | Val rec: 76.01% | Val f1: 74.49%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 12 | Time: 0m 29s | LR: 0.001000
	Train loss: 0.1459 | Train acc: 94.94% | Train prec: 94.46% | Train rec: 95.46% | Train f1: 94.42%
	Val loss: 0.8267 | Val acc: 79.02% | Val prec: 79.11% | Val rec: 78.42% | Val f1: 77.21%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 13 | Time: 0m 28s | LR: 0.000500
	Train loss: 0.1236 | Train acc: 95.69% | Train prec: 95.19% | Train rec: 96.07% | Train f1: 95.15%
	Val loss: 0.9941 | Val acc: 77.49% | Val prec: 79.32% | Val rec: 77.37% | Val f1: 76.14%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 14 | Time: 0m 28s | LR: 0.000500
	Train loss: 0.0470 | Train acc: 98.43% | Train prec: 98.19% | Train rec: 98.61% | Train f1: 98.24%
	Val loss: 1.1015 | Val acc: 78.99% | Val prec: 78.92% | Val rec: 78.77% | Val f1: 77.36%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 15 | Time: 0m 28s | LR: 0.000500
	Train loss: 0.0296 | Train acc: 98.98% | Train prec: 98.77% | Train rec: 99.16% | Train f1: 98.85%
	Val loss: 1.2538 | Val acc: 79.00% | Val prec: 79.19% | Val rec: 78.62% | Val f1: 77.24%


Training:   0%|          | 0/782 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchmetrics import Accuracy, Precision, Recall, F1Score
import pandas as pd

In [2]:
from torch_trainer_v1 import *

In [5]:

# 1. Setup
# setup_experiment returns the device used by everything below.
device = setup_experiment(seed=42, device_preference="auto")

# 2. Data
# Shared preprocessing: convert to a tensor, then normalize each of the three
# channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)
# Only the training split requests a download; the validation split reads
# from the same "data" root.
train_ds = datasets.CIFAR10(root="data", train=True, download=True, transform=transform)
val_ds = datasets.CIFAR10(root="data", train=False, transform=transform)

# Validation uses a larger batch and keeps a fixed order.
train_loader = DataLoader(dataset=train_ds, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=256, shuffle=False)

# 3. Model
# ResNet-18 without pretrained weights, moved to the selected device.
model = torchvision.models.resnet18(weights=None, num_classes=10)
model = model.to(device)
# model = torch.compile(model, mode="reduce-overhead")  # <- the main speed-up!

# 4. Metrics
# Accuracy keeps its default averaging; precision, recall and F1 are
# macro-averaged. All metric objects live on the same device as the model.
num_classes = 10
metrics = {
    "acc": Accuracy(task="multiclass", num_classes=num_classes).to(device),
    **{
        key: metric_cls(task="multiclass", num_classes=num_classes, average="macro").to(device)
        for key, metric_cls in (("prec", Precision), ("rec", Recall), ("f1", F1Score))
    },
}

# Wrap for compatibility: each entry is a plain function of (pred, target).
# The `fn=...` default binds the current metric when the lambda is created, so
# the wrappers do not all end up pointing at the last metric.
wrapped_metrics = {
    key: (lambda pred, target, fn=metric_obj: fn(pred, target))
    for key, metric_obj in metrics.items()
}

# 5. Optimizer
optimizer = torch.optim.AdamW(params=model.parameters(), lr=3e-4, weight_decay=1e-4)

# mode="max": the scheduler treats a higher tracked value as an improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",
    patience=3,
)


# 6. Run
# NOTE(review): in the logged run Val f1 stays near 0.04 for two epochs while
# the train loss keeps falling; if fit() validates with EMA weights, the 0.999
# decay may explain the lag -- confirm against fit().
history = fit(
    # What is trained, on which data and where.
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    # Optimization settings.
    criterion=torch.nn.CrossEntropyLoss().to(device),
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=3,
    accumulation_steps=1,
    grad_clip=1.0,
    use_amp=(device.type == "cuda"),  # AMP is enabled only when running on CUDA
    ema_decay=0.999,
    # Metrics, early stopping and checkpointing.
    metrics=wrapped_metrics,
    monitor_metric="f1",
    mode="max",
    patience=7,
    checkpoint_path="best_model.pt",
)

[✓] Device: cuda | Seed: 42 | TF32: True


Train:   0%|          | 0/391 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch 01 | Time: 19.8s | LR: 3.00e-04 | Throughput: 2528 samples/s | Train Loss: 1.4298 | Val Loss: 2.3937 | Val f1: 0.0406 ★


Train:   0%|          | 0/391 [00:00<?, ?it/s]

 Async checkpoint saved to best_model.pt


Val:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch 02 | Time: 20.6s | LR: 3.00e-04 | Throughput: 2427 samples/s | Train Loss: 1.0081 | Val Loss: 2.3072 | Val f1: 0.0405 


Train:   0%|          | 0/391 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch 03 | Time: 20.9s | LR: 3.00e-04 | Throughput: 2392 samples/s | Train Loss: 0.8078 | Val Loss: 1.8973 | Val f1: 0.2157 ★
 Async checkpoint saved to best_model.pt
