In [1]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchmetrics import Accuracy, Precision, Recall, F1Score
import pandas as pd
from setup_torch import *

# ----------------------------
# Data: CIFAR-10 train / validation splits
# ----------------------------
# Per-channel mean and std handed to Normalize (one entry per RGB channel).
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(channel_mean, channel_std)]
)

# Both splits share the same root directory and the same preprocessing.
data_root = './data'
train_dataset = datasets.CIFAR10(root=data_root, train=True, download=True, transform=transform)
val_dataset = datasets.CIFAR10(root=data_root, train=False, download=True, transform=transform)

# Keep batch_size small on CPU; it can be raised on GPU.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# ----------------------------
# 4. Model
# ----------------------------
# ResNet-18 without pretrained weights, with a 10-way classification head.
model = torchvision.models.resnet18(weights=None, num_classes=10)

# ----------------------------
# 5. Device
# ----------------------------
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

# ----------------------------
# 6. Metrics (from torchmetrics)
# ----------------------------
num_classes = 10
task = 'multiclass'

# Precision, recall and F1 are macro-averaged; accuracy is built without an
# explicit `average` argument, so it uses the library default.
macro_kwargs = dict(task=task, num_classes=num_classes, average='macro')
metrics = {
    'acc': Accuracy(task=task, num_classes=num_classes),
    'prec': Precision(**macro_kwargs),
    'rec': Recall(**macro_kwargs),
    'f1': F1Score(**macro_kwargs),
}
# Move every metric object to the same device as the model.
metrics = {name: metric.to(device) for name, metric in metrics.items()}

# torchmetrics returns a tensor -> wrap each metric in a plain callable for compatibility
def make_metric_fn(metric_obj):
    """Return a callable that forwards ``(preds, target)`` to ``metric_obj``.

    Args:
        metric_obj: Any callable that accepts ``(preds, target)`` positionally.

    Returns:
        A function taking ``preds`` and ``target`` whose result is exactly
        what ``metric_obj(preds, target)`` returns; no conversion is applied.
    """
    def metric_fn(preds, target):
        return metric_obj(preds, target)

    return metric_fn

# One plain (preds, target) callable per metric, keyed by the metric's name.
wrapped_metrics = dict(zip(metrics, map(make_metric_fn, metrics.values())))

# ----------------------------
# 7. Optimizer and criterion
# ----------------------------
criterion = nn.CrossEntropyLoss()

# Effective batch_size = 64 (no accumulation).
# To emulate batch_size=256 -> accumulation_steps=4 and lr *= 4.
base_lr = 0.001  # base LR
optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr)

# Scheduler: mode='max' treats a larger monitored value as an improvement;
# which value is passed to scheduler.step() is decided inside fit().
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3,
)

# ----------------------------
# 8. Run training
# ----------------------------
fit_kwargs = dict(
    # core objects
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    metrics=wrapped_metrics,
    scheduler=scheduler,
    device=device,
    # schedule / stopping
    epochs=20,
    patience=7,
    min_delta=0.001,
    monitor_metric='f1',                  # early stopping on F1
    mode='max',
    # optimisation details
    grad_clip=1.0,
    use_amp=(device.type == 'cuda'),      # AMP only on GPU
    ema_decay=0.999,
    accumulation_steps=1,                 # no accumulation (can be set to 2, 4, etc.)
    is_distributed=False,                 # single GPU or CPU
    # bookkeeping
    checkpoint_path='best_model_checkpoint.pt',
    verbose=True,
)
history = fit(**fit_kwargs)

# Summary of the run; best_epoch is stored zero-based, hence the +1 for display.
best_epoch_display = history.attrs['best_epoch'] + 1
best_f1 = history.attrs['best_valid_score']
print("\n Обучение завершено!")
print(f"Лучшая эпоха: {best_epoch_display}")
print(f"Лучший F1: {best_f1:.4f}")

# Persist the training history.
history.to_csv('training_history.csv', index=False)

ModuleNotFoundError: No module named 'setup_torch'

In [1]:
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchmetrics import Accuracy, Precision, Recall, F1Score
import pandas as pd

In [2]:
from torch_trainer_v1 import *

In [5]:

# 1. Setup
device = setup_experiment(seed=42, device_preference="auto")

# 2. Data
# Per-channel mean and std handed to Normalize (one entry per RGB channel).
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(channel_mean, channel_std)]
)
data_root = "data"
train_ds = datasets.CIFAR10(data_root, train=True, download=True, transform=transform)
# No download=True here: the train split above already requested the download.
val_ds = datasets.CIFAR10(data_root, train=False, transform=transform)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256, shuffle=False)

# 3. Model
model = torchvision.models.resnet18(weights=None, num_classes=10)
model = model.to(device)
# model = torch.compile(model, mode="reduce-overhead")  # <- the main speed-up!

# 4. Metrics
num_classes = 10
shared_metric_args = {"task": "multiclass", "num_classes": num_classes}
metrics = {
    "acc": Accuracy(**shared_metric_args).to(device),
    "prec": Precision(average="macro", **shared_metric_args).to(device),
    "rec": Recall(average="macro", **shared_metric_args).to(device),
    "f1": F1Score(average="macro", **shared_metric_args).to(device),
}

# Wrap for compatibility; the `fn=metric` default binds each metric at
# definition time so every lambda calls its own metric object.
wrapped_metrics = {
    key: (lambda pred, target, fn=metric: fn(pred, target))
    for key, metric in metrics.items()
}

# 5. Optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,
    weight_decay=1e-4,
)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",
    patience=3,
)


# 6. Run
# NOTE(review): in the logged 3-epoch run, validation F1 stays near 0.04 for
# two epochs while train loss falls from 1.43 to 1.01. If fit() validates with
# the EMA weights, ema_decay=0.999 over 391 steps per epoch would explain the
# lag -- TODO confirm in torch_trainer_v1.
# NOTE(review): with epochs=3, patience=7 (and the scheduler's patience=3)
# presumably can never trigger -- verify this is intended for a smoke test.
fit_kwargs = dict(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=torch.nn.CrossEntropyLoss().to(device),
    metrics=wrapped_metrics,
    device=device,
    epochs=3,
    patience=7,
    monitor_metric="f1",
    mode="max",
    use_amp=(device.type == "cuda"),
    ema_decay=0.999,
    grad_clip=1.0,
    accumulation_steps=1,
    checkpoint_path="best_model.pt",
)
history = fit(**fit_kwargs)

[✓] Device: cuda | Seed: 42 | TF32: True


Train:   0%|          | 0/391 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch 01 | Time: 19.8s | LR: 3.00e-04 | Throughput: 2528 samples/s | Train Loss: 1.4298 | Val Loss: 2.3937 | Val f1: 0.0406 ★


Train:   0%|          | 0/391 [00:00<?, ?it/s]

 Async checkpoint saved to best_model.pt


Val:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch 02 | Time: 20.6s | LR: 3.00e-04 | Throughput: 2427 samples/s | Train Loss: 1.0081 | Val Loss: 2.3072 | Val f1: 0.0405 


Train:   0%|          | 0/391 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

Epoch 03 | Time: 20.9s | LR: 3.00e-04 | Throughput: 2392 samples/s | Train Loss: 0.8078 | Val Loss: 1.8973 | Val f1: 0.2157 ★
 Async checkpoint saved to best_model.pt
