In [1]:
import torchvision
import torchvision.transforms.v2 as T
import torch
import torch.nn as nn
import torchmetrics
import numpy as np

In [2]:
torch.__version__

'2.9.1+cu128'

In [3]:
toTensor = T.Compose([T.ToImage(), T.ToDtype(torch.float32, scale=True)])

train_and_valid_data = torchvision.datasets.FashionMNIST(
    root="dataset", train=True, transform=toTensor,download=True
)

test_data = torchvision.datasets.FashionMNIST(
    root="dataset", train=False, transform=toTensor, download=True
)

torch.manual_seed(42)
train_data, valid_data = torch.utils.data.random_split(
    train_and_valid_data, [55000, 5000]
)


In [4]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)

In [5]:
X_sample, y_sample = train_data[0]
X_sample.shape, X_sample.dtype

(torch.Size([1, 28, 28]), torch.float32)

In [6]:
train_and_valid_data.classes

['T-shirt/top',
 'Trouser',
 'Pullover',
 'Dress',
 'Coat',
 'Sandal',
 'Shirt',
 'Sneaker',
 'Bag',
 'Ankle boot']

#### Building the Classifier

In [7]:
class ImageClassifier(nn.Module):
    def __init__(self, n_inputs, n_hidden1, n_hidden2, n_classes):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_inputs, n_hidden1),
            nn.ReLU(),
            nn.Linear(n_hidden1, n_hidden2),
            nn.ReLU(),
            nn.Linear(n_hidden2, n_classes)
        )

    def forward(self, X):
        return self.mlp(X)

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [9]:
torch.manual_seed(42)
model = ImageClassifier(n_inputs=1*28*28, n_hidden1=300, n_hidden2=100,
                        n_classes=10).to(device)

xentropy = nn.CrossEntropyLoss()

In [10]:
import torchmetrics

def evaluate_tm(model, data_Loader, metric):
    model.eval()
    metric.reset()
    with torch.inference_mode():
        for X_batch, y_batch in data_Loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            metric.update(y_pred, y_batch)
    return metric.compute()


In [74]:
def train2(model, optimizer, criterion, metric, train_loader, valid_loader, n_epochs):
    history = {"train_losses":[],
               "train_metrics":[],
               "valid_metrics": []}
    for epoch in range(n_epochs):
        total_loss = 0.
        metric.reset()
        for X_batch, y_batch in train_loader:
            model.train()
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric(y_pred, y_batch)
        mean_loss = total_loss / len(train_loader)
        history["train_losses"].append(mean_loss)
        history["train_metrics"].append(metric.compute().item())
        history["valid_metrics"].append(evaluate_tm(model, valid_loader, metric).item())

        print(f"Epoch: {epoch+1}/{n_epochs},",
              f"Train Loss: {history["train_losses"][-1]:.4f}",
              f"Train metric: {history["train_metrics"][-1]:.4f}",
              f"Valid Metrics: {history["valid_metrics"][-1]:.4f}",
              f"Total Loss: {total_loss:.4f}",
              f"Loss.item(): {loss}"
              )
    return history


Un comment the line to train the model and see metrics

In [12]:
optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
n_epochs = 20
# _ = train2(model, optimizer, xentropy, accuracy, train_loader, valid_loader, n_epochs)

In [13]:
model.eval()
X_new, y_new = next(iter(valid_loader))
X_new = X_new[:3].to(device)
with torch.inference_mode():
    y_pred_logits = model(X_new)
y_pred = y_pred_logits.argmax(dim = 1)
y_pred

tensor([5, 5, 5], device='cuda:0')

In [14]:
[train_and_valid_data.classes[idx] for idx in y_pred]

['Sandal', 'Sandal', 'Sandal']

In [15]:
y_new[:3]

tensor([7, 4, 2])

In [16]:
import torch.nn.functional as F
y_proba = F.softmax(y_pred_logits, dim = 1)
if device == "mps":
    y_proba = y_proba.cpu()
y_proba.round(decimals=3)

tensor([[0.1040, 0.1010, 0.0960, 0.0950, 0.1010, 0.1060, 0.1010, 0.0930, 0.0990,
         0.1030],
        [0.1020, 0.1000, 0.0990, 0.0970, 0.0930, 0.1150, 0.1000, 0.0960, 0.1000,
         0.0970],
        [0.1010, 0.1040, 0.0990, 0.0920, 0.0940, 0.1160, 0.0970, 0.0970, 0.1010,
         0.0990]], device='cuda:0')

In [17]:
y_top4_logits, y_top4_indices = torch.topk(y_pred_logits, k = 4, dim = 1)
y_top4_proba = F.softmax(y_top4_logits, dim = 1)
y_top4_proba.round(decimals = 3)

tensor([[0.2550, 0.2510, 0.2490, 0.2440],
        [0.2750, 0.2450, 0.2400, 0.2400],
        [0.2740, 0.2470, 0.2400, 0.2390]], device='cuda:0')

In [18]:
y_top4_indices

tensor([[5, 0, 9, 1],
        [5, 0, 1, 6],
        [5, 1, 0, 8]], device='cuda:0')

### Fine-tuning Hyperparameters with optuna

In [19]:
import optuna

def objective(trial, train_loader, valid_loader):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log = True)
    n_hidden = trial.suggest_int("n_hidden", 20, 300)
    model = ImageClassifier(n_inputs=1*28*28,n_hidden1=n_hidden,
                            n_hidden2=n_hidden,n_classes=10).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
    xentropy = nn.CrossEntropyLoss()
    accuracy = torchmetrics.Accuracy(task = "multiclass", num_classes=10)
    accuracy = accuracy.to(device)
    best_validation_accuracy = 0.
    
    for epoch in range(n_epochs):
        history = train2(model, optimizer, xentropy, accuracy, train_loader,
                     valid_loader, n_epochs=1)
        validation_accuracy = max(history["valid_metrics"])
        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
        trial.report(validation_accuracy, epoch)

        if trial.should_prune():
            raise optuna.TrialPruned
    
    return best_validation_accuracy

In [20]:
torch.manual_seed(42)
sampler = optuna.samplers.TPESampler(seed=42)
objective_with_data = lambda trial: objective(
    trial, train_loader, valid_loader
)
pruner = optuna.pruners.MedianPruner(n_min_trials=5, n_warmup_steps=0,
                                     interval_steps=1)
study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)
study.optimize(objective_with_data, n_trials=5)

[I 2026-01-09 16:12:07,688] A new study created in memory with name: no-name-679b7a6b-09dc-49ea-ba53-c0d0a64b1ee7


Epoch: 1/1, Train Loss: 2.2769 Train metric: 0.1471 Valid Metrics: 0.1860
Epoch: 1/1, Train Loss: 2.2093 Train metric: 0.2794 Valid Metrics: 0.3500
Epoch: 1/1, Train Loss: 2.1164 Train metric: 0.4109 Valid Metrics: 0.4554
Epoch: 1/1, Train Loss: 1.9776 Train metric: 0.5137 Valid Metrics: 0.5560
Epoch: 1/1, Train Loss: 1.7867 Train metric: 0.5826 Valid Metrics: 0.6026
Epoch: 1/1, Train Loss: 1.5775 Train metric: 0.6184 Valid Metrics: 0.6228
Epoch: 1/1, Train Loss: 1.3978 Train metric: 0.6288 Valid Metrics: 0.6326
Epoch: 1/1, Train Loss: 1.2605 Train metric: 0.6360 Valid Metrics: 0.6372
Epoch: 1/1, Train Loss: 1.1572 Train metric: 0.6467 Valid Metrics: 0.6424
Epoch: 1/1, Train Loss: 1.0782 Train metric: 0.6537 Valid Metrics: 0.6436
Epoch: 1/1, Train Loss: 1.0162 Train metric: 0.6611 Valid Metrics: 0.6530
Epoch: 1/1, Train Loss: 0.9665 Train metric: 0.6689 Valid Metrics: 0.6620
Epoch: 1/1, Train Loss: 0.9258 Train metric: 0.6761 Valid Metrics: 0.6700
Epoch: 1/1, Train Loss: 0.8919 Train m

[I 2026-01-09 16:14:03,657] Trial 0 finished with value: 0.7089999914169312 and parameters: {'learning_rate': 0.00031489116479568613, 'n_hidden': 287}. Best is trial 0 with value: 0.7089999914169312.


Epoch: 1/1, Train Loss: 0.7647 Train metric: 0.7196 Valid Metrics: 0.7082


[W 2026-01-09 16:14:04,596] Trial 1 failed with parameters: {'learning_rate': 0.008471801418819975, 'n_hidden': 188} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/krekken/Hands-on-ML/env/lib/python3.12/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_9915/4164084884.py", line 3, in <lambda>
    objective_with_data = lambda trial: objective(
                                        ^^^^^^^^^^
  File "/tmp/ipykernel_9915/1814299302.py", line 15, in objective
    history = train2(model, optimizer, xentropy, accuracy, train_loader,
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_9915/3590758802.py", line 8, in train2
    for X_batch, y_batch in train_loader:
  File "/home/krekken/Hands-on-ML/env/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 732, in __next__
    

KeyboardInterrupt: 

In [None]:
study.best_params

{'learning_rate': 0.008471801418819975, 'n_hidden': 188}

In [None]:
study.best_value

0.8677999973297119

### Saving and loading PyTroch models

In [None]:
torch.save(model.state_dict(), "my_fashion_mnist_weights.pt")
new_model = ImageClassifier(n_inputs=1*28*28, n_hidden1 = 300, n_hidden2 = 100,
                            n_classes = 10)
loaded_weights = torch.load("my_fashion_mnist_weights.pt", weights_only=True)
new_model.load_state_dict(loaded_weights)


<All keys matched successfully>

#### Compiling and Optimizing a PyTroch model

In [None]:
torchscript_model = torch.jit.trace(model, X_new)

### Exercises

13

In [21]:
x = torch.tensor([1.2], requires_grad=True)
y = torch.tensor([3.4], requires_grad=True)

def f(x,y):
    return torch.sin((x**2) * y)

result = f(x,y)
result.backward()

x.grad.item(), y.grad.item()

(1.489864706993103, 0.26291730999946594)

14

In [22]:
class Dense2(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, X):
        z = X @ self.weight.T + self.bias
        return F.relu(z)

In [23]:
torch.manual_seed(42)
dense = Dense2(3,5)
X = torch.randn(2,3)
y_pred = dense(X)
y_pred.shape


torch.Size([2, 5])

In [24]:
y_pred_check = F.relu(X @ dense.weight.T + dense.bias)
torch.allclose(y_pred, y_pred_check)

True

Kaiming Initialization

In [25]:
class Dense3(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
        self.bias = nn.Parameter(torch.zeros(out_features))
    
    def forward(self, X):
        z = X @ self.weight.T + self.bias
        return F.relu(z)

In [26]:
torch.manual_seed(42)
dense3 = Dense3(3,5)
X = torch.randn(2,3)
y_pred3 = dense3(X)
y_pred3.shape

torch.Size([2, 5])

In [27]:
y_pred_check = F.relu(X @ dense3.weight.T + dense3.bias)
torch.allclose(y_pred3, y_pred_check)

True

15 - CoverType Dataset

In [28]:
from sklearn.datasets import fetch_covtype
covertype = fetch_covtype(data_home="../datasets/", download_if_missing=True)

In [87]:
X, y = covertype.data, covertype.target

In [88]:
class_names = ["Spruce/Fir", "Lodgepole Pine", "Ponderosa Pine", "Cottonwood/Willow","Aspen", "Douglas-fir","Krummholz"]

In [89]:
y[0]

np.int32(5)

In [90]:
class_names[y[0]]

'Douglas-fir'

In [91]:
# X = torch.from_numpy(X)

In [92]:
# The target values start from 1 
y = y - 1

In [93]:
# y = y + 1

In [94]:
import pandas as pd

cover_df = pd.DataFrame(X, columns=covertype.feature_names)

In [95]:
cover_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We first convert meters columns to kms

cover_df["Elevation"] = cover_df["Elevation"] / 1000
cover_df["Horizontal_Distance_To_Hydrology"] = cover_df["Horizontal_Distance_To_Hydrology"] / 1000
cover_df["Vertical_Distance_To_Hydrology"] = cover_df["Vertical_Distance_To_Hydrology"] / 1000
cover_df["Horizontal_Distance_To_Roadways"] = cover_df["Horizontal_Distance_To_Roadways"] / 1000
cover_df["Horizontal_Distance_To_Fire_Points"] = cover_df["Horizontal_Distance_To_Fire_Points"] / 1000

Then we scale the hillshade columns as these are 0 - 255 indexed.

In [96]:
cover_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


cover_df["Hillshade_9am"] = cover_df["Hillshade_9am"] / 255
cover_df["Hillshade_Noon"] = cover_df["Hillshade_Noon"] / 255
cover_df["Hillshade_3pm"] = cover_df["Hillshade_3pm"] / 255

In [97]:
cover_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [98]:
X = cover_df.values

In [99]:
X.shape, y.shape

((581012, 54), (581012,))

In [100]:
type(X), type(y)

(numpy.ndarray, numpy.ndarray)

In [101]:
mean = X.mean(axis = 0)
std = X.std(axis = 0)
X = (X - mean) / std

X.shape

(581012, 54)

Shuffling and choosing the first 100,000 instances for training.

In [102]:
X_tensor = torch.from_numpy(X).float()
y_tensor = torch.from_numpy(y).to(torch.int64)

In [103]:
X_tensor.shape, y_tensor.shape

(torch.Size([581012, 54]), torch.Size([581012]))

In [104]:
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
torch.manual_seed(42)

covtype_dataset = TensorDataset(X_tensor, y_tensor)

train_size = len(covtype_dataset) * 80 // 100
valid_size = len(covtype_dataset) * 10 // 100
test_size = len(covtype_dataset) - train_size - valid_size

train_data, valid_data, test_data = random_split(covtype_dataset, [train_size, valid_size, test_size])

The below X and y will be our final dataset where which will extract the train, validation and test set.

But before that, let us normalize our dataset.

Creating dataloaders

In [105]:
train_loader = DataLoader(train_data, batch_size=256)
valid_loader = DataLoader(valid_data, batch_size=256)
test_loader = DataLoader(test_data, batch_size=256)

In [106]:
device

'cuda'

In [107]:
class CovTypeClassifier(nn.Module):
    def __init__(self, n_inputs, num_hidden_neurons, n_classes):
        super().__init__()
        layers = [
            Dense3(n_in, n_out)
            for n_in, n_out in zip([n_inputs] + num_hidden_neurons, num_hidden_neurons)
        ] + [nn.Linear(num_hidden_neurons[-1], n_classes)]
        self.layers = nn.ModuleList(layers)

    def forward(self, X):
        for layer in self.layers:
            X = layer(X)
        return X

In [108]:
model = CovTypeClassifier(n_inputs=54, num_hidden_neurons=[200,100,50], n_classes=7).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
loss = nn.CrossEntropyLoss().to(device)
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=7).to(device)

history = train2(model, optimizer, loss, accuracy, train_loader, valid_loader, 40)

Epoch: 1/40, Train Loss: 0.7672 Train metric: 0.6916 Valid Metrics: 0.7338 Total Loss: 1393.2309 Loss.item(): 0.632655680179596
Epoch: 2/40, Train Loss: 0.6205 Train metric: 0.7426 Valid Metrics: 0.7481 Total Loss: 1126.9004 Loss.item(): 0.5522188544273376
Epoch: 3/40, Train Loss: 0.5843 Train metric: 0.7546 Valid Metrics: 0.7586 Total Loss: 1061.0636 Loss.item(): 0.5153619050979614
Epoch: 4/40, Train Loss: 0.5593 Train metric: 0.7641 Valid Metrics: 0.7671 Total Loss: 1015.6307 Loss.item(): 0.49745941162109375
Epoch: 5/40, Train Loss: 0.5391 Train metric: 0.7714 Valid Metrics: 0.7744 Total Loss: 979.0868 Loss.item(): 0.4835699796676636
Epoch: 6/40, Train Loss: 0.5219 Train metric: 0.7784 Valid Metrics: 0.7821 Total Loss: 947.8314 Loss.item(): 0.4691464602947235
Epoch: 7/40, Train Loss: 0.5074 Train metric: 0.7851 Valid Metrics: 0.7879 Total Loss: 921.4667 Loss.item(): 0.45597681403160095
Epoch: 8/40, Train Loss: 0.4949 Train metric: 0.7914 Valid Metrics: 0.7944 Total Loss: 898.7273 Los

In [109]:
test_metrics = evaluate_tm(model, test_loader, accuracy)
test_metrics

tensor(0.8643, device='cuda:0')

Tuning hyperparameters with optuna

In [120]:
def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1.0, log = True)
    n_layers = trial.suggest_int("n_layers", 1, 5)
    n_hidden_neurons = trial.suggest_int("n_hidden_neurons",30, 200)
    covtype_model = CovTypeClassifier(n_inputs=54, num_hidden_neurons=[n_hidden_neurons] * n_layers, n_classes = 7).to(device)
    optimizer = torch.optim.SGD(covtype_model.parameters(), lr = learning_rate)
    xentropy = nn.CrossEntropyLoss()
    accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=7).to(device)

    best_validation_accuracy = 0
    for epoch in range(n_epochs):
        history = train2(covtype_model, optimizer, xentropy, accuracy, train_loader, valid_loader, n_epochs=1)
        validation_accuracy = max(history["valid_metrics"])

        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
        trial.report(validation_accuracy, step = epoch)

        if trial.should_prune():
            raise optuna.TrialPruned()
    return best_validation_accuracy

In [130]:
torch.manual_seed(42)
sampler = optuna.samplers.TPESampler(42)
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials = 10)

Positional arguments ['self', 'consider_prior', 'prior_weight', 'consider_magic_clip', 'consider_endpoints', 'n_startup_trials', 'n_ei_candidates', 'gamma', 'weights', 'seed'] in __init__() have been deprecated since v4.4.0. They will be replaced with the corresponding keyword arguments in v6.0.0, so please use the keyword specification instead. See https://github.com/optuna/optuna/releases/tag/v4.4.0 for details.
  sampler = optuna.samplers.TPESampler(42)
[I 2026-01-09 18:38:43,320] A new study created in memory with name: no-name-580a3335-9c8d-42c1-bb0c-bdb1e331c592


Epoch: 1/1, Train Loss: 1.6214 Train metric: 0.4443 Valid Metrics: 0.5088 Total Loss: 2944.5202 Loss.item(): 1.5545099973678589
Epoch: 1/1, Train Loss: 1.3711 Train metric: 0.5239 Valid Metrics: 0.5427 Total Loss: 2489.9606 Loss.item(): 1.4520944356918335
Epoch: 1/1, Train Loss: 1.2768 Train metric: 0.5510 Valid Metrics: 0.5674 Total Loss: 2318.5948 Loss.item(): 1.3720388412475586
Epoch: 1/1, Train Loss: 1.2049 Train metric: 0.5773 Valid Metrics: 0.5907 Total Loss: 2188.0976 Loss.item(): 1.3077129125595093
Epoch: 1/1, Train Loss: 1.1456 Train metric: 0.5954 Valid Metrics: 0.6019 Total Loss: 2080.3376 Loss.item(): 1.2537262439727783
Epoch: 1/1, Train Loss: 1.0948 Train metric: 0.6038 Valid Metrics: 0.6081 Total Loss: 1988.1478 Loss.item(): 1.2069717645645142
Epoch: 1/1, Train Loss: 1.0505 Train metric: 0.6098 Valid Metrics: 0.6143 Total Loss: 1907.7526 Loss.item(): 1.1657072305679321
Epoch: 1/1, Train Loss: 1.0115 Train metric: 0.6157 Valid Metrics: 0.6212 Total Loss: 1836.9346 Loss.ite

[I 2026-01-09 18:40:02,618] Trial 0 finished with value: 0.6801948547363281 and parameters: {'learning_rate': 0.00011228684920439126, 'n_layers': 3, 'n_hidden_neurons': 149}. Best is trial 0 with value: 0.6801948547363281.


Epoch: 1/1, Train Loss: 0.7907 Train metric: 0.6799 Valid Metrics: 0.6802 Total Loss: 1435.9212 Loss.item(): 0.8862407803535461
Epoch: 1/1, Train Loss: 0.6190 Train metric: 0.7362 Valid Metrics: 0.7753 Total Loss: 1124.0395 Loss.item(): 0.45074692368507385
Epoch: 1/1, Train Loss: 0.4957 Train metric: 0.7902 Valid Metrics: 0.8095 Total Loss: 900.1868 Loss.item(): 0.40645739436149597
Epoch: 1/1, Train Loss: 0.4430 Train metric: 0.8149 Valid Metrics: 0.8245 Total Loss: 804.4723 Loss.item(): 0.37614771723747253
Epoch: 1/1, Train Loss: 0.4067 Train metric: 0.8311 Valid Metrics: 0.8360 Total Loss: 738.5960 Loss.item(): 0.3484655022621155
Epoch: 1/1, Train Loss: 0.3799 Train metric: 0.8434 Valid Metrics: 0.8475 Total Loss: 689.9675 Loss.item(): 0.3328278362751007
Epoch: 1/1, Train Loss: 0.3593 Train metric: 0.8523 Valid Metrics: 0.8533 Total Loss: 652.3983 Loss.item(): 0.3219091296195984
Epoch: 1/1, Train Loss: 0.3422 Train metric: 0.8599 Valid Metrics: 0.8560 Total Loss: 621.5023 Loss.item()

[I 2026-01-09 18:41:29,413] Trial 1 finished with value: 0.8903117179870605 and parameters: {'learning_rate': 0.081660788983419, 'n_layers': 5, 'n_hidden_neurons': 59}. Best is trial 1 with value: 0.8903117179870605.


Epoch: 1/1, Train Loss: 0.2477 Train metric: 0.9005 Valid Metrics: 0.8901 Total Loss: 449.7513 Loss.item(): 0.2164967805147171
Epoch: 1/1, Train Loss: 1.8934 Train metric: 0.2837 Valid Metrics: 0.4918 Total Loss: 3438.3242 Loss.item(): 1.6668553352355957
Epoch: 1/1, Train Loss: 1.5262 Train metric: 0.4900 Valid Metrics: 0.4995 Total Loss: 2771.6401 Loss.item(): 1.545752763748169
Epoch: 1/1, Train Loss: 1.4191 Train metric: 0.4954 Valid Metrics: 0.5074 Total Loss: 2577.0818 Loss.item(): 1.5043952465057373
Epoch: 1/1, Train Loss: 1.3685 Train metric: 0.5048 Valid Metrics: 0.5180 Total Loss: 2485.1869 Loss.item(): 1.478681206703186
Epoch: 1/1, Train Loss: 1.3348 Train metric: 0.5166 Valid Metrics: 0.5298 Total Loss: 2424.0310 Loss.item(): 1.4563469886779785
Epoch: 1/1, Train Loss: 1.3076 Train metric: 0.5276 Valid Metrics: 0.5398 Total Loss: 2374.6740 Loss.item(): 1.4351063966751099
Epoch: 1/1, Train Loss: 1.2837 Train metric: 0.5378 Valid Metrics: 0.5513 Total Loss: 2331.1968 Loss.item()

[I 2026-01-09 18:42:53,595] Trial 2 finished with value: 0.6207466125488281 and parameters: {'learning_rate': 5.410274966873259e-05, 'n_layers': 4, 'n_hidden_neurons': 74}. Best is trial 1 with value: 0.8903117179870605.


Epoch: 1/1, Train Loss: 1.0537 Train metric: 0.6140 Valid Metrics: 0.6207 Total Loss: 1913.5367 Loss.item(): 1.1988117694854736
Epoch: 1/1, Train Loss: 1.5733 Train metric: 0.4362 Valid Metrics: 0.4816 Total Loss: 2857.0565 Loss.item(): 1.4841258525848389
Epoch: 1/1, Train Loss: 1.3312 Train metric: 0.5051 Valid Metrics: 0.5388 Total Loss: 2417.4621 Loss.item(): 1.3662372827529907
Epoch: 1/1, Train Loss: 1.2216 Train metric: 0.5593 Valid Metrics: 0.5856 Total Loss: 2218.4886 Loss.item(): 1.2788829803466797
Epoch: 1/1, Train Loss: 1.1425 Train metric: 0.5919 Valid Metrics: 0.6038 Total Loss: 2074.7656 Loss.item(): 1.2113802433013916
Epoch: 1/1, Train Loss: 1.0809 Train metric: 0.6070 Valid Metrics: 0.6170 Total Loss: 1962.9282 Loss.item(): 1.1573632955551147
Epoch: 1/1, Train Loss: 1.0317 Train metric: 0.6207 Valid Metrics: 0.6286 Total Loss: 1873.4900 Loss.item(): 1.1134366989135742
Epoch: 1/1, Train Loss: 0.9918 Train metric: 0.6321 Valid Metrics: 0.6387 Total Loss: 1801.1289 Loss.ite

[I 2026-01-09 18:44:10,743] Trial 3 finished with value: 0.6846697926521301 and parameters: {'learning_rate': 0.00025904067027019215, 'n_layers': 2, 'n_hidden_neurons': 57}. Best is trial 1 with value: 0.8903117179870605.


Epoch: 1/1, Train Loss: 0.7832 Train metric: 0.6834 Valid Metrics: 0.6847 Total Loss: 1422.3329 Loss.item(): 0.8477583527565002
Epoch: 1/1, Train Loss: 1.8360 Train metric: 0.3420 Valid Metrics: 0.4026 Total Loss: 3334.2102 Loss.item(): 1.8164129257202148
Epoch: 1/1, Train Loss: 1.6481 Train metric: 0.4438 Valid Metrics: 0.4693 Total Loss: 2993.0082 Loss.item(): 1.6952406167984009
Epoch: 1/1, Train Loss: 1.5530 Train metric: 0.4803 Valid Metrics: 0.4900 Total Loss: 2820.1792 Loss.item(): 1.6360918283462524
Epoch: 1/1, Train Loss: 1.4973 Train metric: 0.4921 Valid Metrics: 0.4979 Total Loss: 2719.0353 Loss.item(): 1.6001348495483398
Epoch: 1/1, Train Loss: 1.4585 Train metric: 0.4974 Valid Metrics: 0.5011 Total Loss: 2648.5876 Loss.item(): 1.5734277963638306
Epoch: 1/1, Train Loss: 1.4283 Train metric: 0.5006 Valid Metrics: 0.5036 Total Loss: 2593.7436 Loss.item(): 1.5511537790298462
Epoch: 1/1, Train Loss: 1.4031 Train metric: 0.5029 Valid Metrics: 0.5065 Total Loss: 2548.0192 Loss.ite

[I 2026-01-09 18:45:31,700] Trial 4 finished with value: 0.5274608135223389 and parameters: {'learning_rate': 4.039315626348698e-05, 'n_layers': 4, 'n_hidden_neurons': 41}. Best is trial 1 with value: 0.8903117179870605.


Epoch: 1/1, Train Loss: 1.2090 Train metric: 0.5247 Valid Metrics: 0.5275 Total Loss: 2195.5211 Loss.item(): 1.3523309230804443
Epoch: 1/1, Train Loss: 1.2736 Train metric: 0.5549 Valid Metrics: 0.6270 Total Loss: 2312.9048 Loss.item(): 1.1698559522628784
Epoch: 1/1, Train Loss: 1.0034 Train metric: 0.6339 Valid Metrics: 0.6452 Total Loss: 1822.0914 Loss.item(): 1.0219173431396484
Epoch: 1/1, Train Loss: 0.9101 Train metric: 0.6551 Valid Metrics: 0.6672 Total Loss: 1652.7050 Loss.item(): 0.9437324404716492
Epoch: 1/1, Train Loss: 0.8551 Train metric: 0.6717 Valid Metrics: 0.6777 Total Loss: 1552.8969 Loss.item(): 0.8934961557388306
Epoch: 1/1, Train Loss: 0.8175 Train metric: 0.6806 Valid Metrics: 0.6845 Total Loss: 1484.5161 Loss.item(): 0.8567821979522705
Epoch: 1/1, Train Loss: 0.7900 Train metric: 0.6882 Valid Metrics: 0.6939 Total Loss: 1434.7124 Loss.item(): 0.8300954699516296
Epoch: 1/1, Train Loss: 0.7700 Train metric: 0.6955 Valid Metrics: 0.7013 Total Loss: 1398.3142 Loss.ite

[I 2026-01-09 18:46:51,598] Trial 5 finished with value: 0.7242732644081116 and parameters: {'learning_rate': 0.000531963670579898, 'n_layers': 1, 'n_hidden_neurons': 147}. Best is trial 1 with value: 0.8903117179870605.


Epoch: 1/1, Train Loss: 0.6866 Train metric: 0.7226 Valid Metrics: 0.7243 Total Loss: 1246.8748 Loss.item(): 0.7130694389343262
Epoch: 1/1, Train Loss: 1.2191 Train metric: 0.5381 Valid Metrics: 0.6123 Total Loss: 2213.9545 Loss.item(): 1.094815731048584
Epoch: 1/1, Train Loss: 0.8995 Train metric: 0.6364 Valid Metrics: 0.6554 Total Loss: 1633.5654 Loss.item(): 0.9045801758766174
Epoch: 1/1, Train Loss: 0.7865 Train metric: 0.6706 Valid Metrics: 0.6839 Total Loss: 1428.2862 Loss.item(): 0.8131862878799438
Epoch: 1/1, Train Loss: 0.7387 Train metric: 0.6934 Valid Metrics: 0.6993 Total Loss: 1341.4843 Loss.item(): 0.752269446849823
Epoch: 1/1, Train Loss: 0.7099 Train metric: 0.7058 Valid Metrics: 0.7105 Total Loss: 1289.1963 Loss.item(): 0.7090075612068176
Epoch: 1/1, Train Loss: 0.6892 Train metric: 0.7150 Valid Metrics: 0.7184 Total Loss: 1251.5736 Loss.item(): 0.6780322790145874
Epoch: 1/1, Train Loss: 0.6732 Train metric: 0.7221 Valid Metrics: 0.7250 Total Loss: 1222.4889 Loss.item(

[I 2026-01-09 18:48:26,359] Trial 6 finished with value: 0.7613294124603271 and parameters: {'learning_rate': 0.0005924645281328737, 'n_layers': 5, 'n_hidden_neurons': 176}. Best is trial 1 with value: 0.8903117179870605.


Epoch: 1/1, Train Loss: 0.5796 Train metric: 0.7596 Valid Metrics: 0.7613 Total Loss: 1052.4674 Loss.item(): 0.5384606719017029


[I 2026-01-09 18:48:30,447] Trial 7 pruned. 


Epoch: 1/1, Train Loss: 1.8352 Train metric: 0.3186 Valid Metrics: 0.4148 Total Loss: 3332.8004 Loss.item(): 1.6880027055740356
Epoch: 1/1, Train Loss: 0.6151 Train metric: 0.7381 Valid Metrics: 0.7523 Total Loss: 1117.0456 Loss.item(): 0.5107054710388184
Epoch: 1/1, Train Loss: 0.5497 Train metric: 0.7620 Valid Metrics: 0.7665 Total Loss: 998.3174 Loss.item(): 0.47460365295410156
Epoch: 1/1, Train Loss: 0.5290 Train metric: 0.7698 Valid Metrics: 0.7704 Total Loss: 960.6778 Loss.item(): 0.47202277183532715
Epoch: 1/1, Train Loss: 0.5204 Train metric: 0.7739 Valid Metrics: 0.7743 Total Loss: 945.0767 Loss.item(): 0.4709990918636322
Epoch: 1/1, Train Loss: 0.5160 Train metric: 0.7757 Valid Metrics: 0.7785 Total Loss: 937.0778 Loss.item(): 0.4651038646697998
Epoch: 1/1, Train Loss: 0.5111 Train metric: 0.7781 Valid Metrics: 0.7797 Total Loss: 928.1360 Loss.item(): 0.47053030133247375
Epoch: 1/1, Train Loss: 0.5065 Train metric: 0.7806 Valid Metrics: 0.7811 Total Loss: 919.8783 Loss.item()

[I 2026-01-09 18:49:48,635] Trial 8 finished with value: 0.7884373664855957 and parameters: {'learning_rate': 0.723817198715453, 'n_layers': 1, 'n_hidden_neurons': 37}. Best is trial 1 with value: 0.8903117179870605.


Epoch: 1/1, Train Loss: 0.4837 Train metric: 0.7926 Valid Metrics: 0.7876 Total Loss: 878.3201 Loss.item(): 0.46785929799079895
Epoch: 1/1, Train Loss: 0.5778 Train metric: 0.7540 Valid Metrics: 0.7916 Total Loss: 1049.2761 Loss.item(): 0.42991164326667786
Epoch: 1/1, Train Loss: 0.4594 Train metric: 0.8056 Valid Metrics: 0.8218 Total Loss: 834.3017 Loss.item(): 0.38393089175224304
Epoch: 1/1, Train Loss: 0.4133 Train metric: 0.8269 Valid Metrics: 0.8378 Total Loss: 750.5589 Loss.item(): 0.3615058362483978
Epoch: 1/1, Train Loss: 0.3845 Train metric: 0.8397 Valid Metrics: 0.8460 Total Loss: 698.2707 Loss.item(): 0.3287977874279022
Epoch: 1/1, Train Loss: 0.3643 Train metric: 0.8487 Valid Metrics: 0.8510 Total Loss: 661.6439 Loss.item(): 0.3366223871707916
Epoch: 1/1, Train Loss: 0.3489 Train metric: 0.8555 Valid Metrics: 0.8567 Total Loss: 633.5183 Loss.item(): 0.32302919030189514
Epoch: 1/1, Train Loss: 0.3363 Train metric: 0.8613 Valid Metrics: 0.8602 Total Loss: 610.7516 Loss.item()

[I 2026-01-09 18:51:13,658] Trial 9 finished with value: 0.8850622177124023 and parameters: {'learning_rate': 0.37939618915180745, 'n_layers': 2, 'n_hidden_neurons': 93}. Best is trial 1 with value: 0.8903117179870605.


Epoch: 1/1, Train Loss: 0.2673 Train metric: 0.8917 Valid Metrics: 0.8851 Total Loss: 485.4479 Loss.item(): 0.21110489964485168


In [135]:
tuned_model = CovTypeClassifier(n_inputs=54, num_hidden_neurons=[study.best_params["n_hidden_neurons"]] * study.best_params["n_layers"], n_classes=7).to(device)
optimizer = torch.optim.SGD(tuned_model.parameters(), lr = study.best_params["learning_rate"])
xentropy = nn.CrossEntropyLoss()
accuracy = torchmetrics.Accuracy(task = "multiclass", num_classes=7).to(device)
tuned_history = train2(tuned_model, optimizer, xentropy, accuracy, train_loader, valid_loader, n_epochs=50)


Epoch: 1/50, Train Loss: 0.6183 Train metric: 0.7378 Valid Metrics: 0.7804 Total Loss: 1122.7506 Loss.item(): 0.4630967974662781
Epoch: 2/50, Train Loss: 0.4905 Train metric: 0.7913 Valid Metrics: 0.8071 Total Loss: 890.7216 Loss.item(): 0.4119083881378174
Epoch: 3/50, Train Loss: 0.4366 Train metric: 0.8170 Valid Metrics: 0.8259 Total Loss: 792.8174 Loss.item(): 0.3939302861690521
Epoch: 4/50, Train Loss: 0.4013 Train metric: 0.8329 Valid Metrics: 0.8398 Total Loss: 728.7733 Loss.item(): 0.39695993065834045
Epoch: 5/50, Train Loss: 0.3769 Train metric: 0.8437 Valid Metrics: 0.8493 Total Loss: 684.3770 Loss.item(): 0.3961508572101593
Epoch: 6/50, Train Loss: 0.3574 Train metric: 0.8528 Valid Metrics: 0.8522 Total Loss: 649.1121 Loss.item(): 0.3849705457687378
Epoch: 7/50, Train Loss: 0.3415 Train metric: 0.8593 Valid Metrics: 0.8565 Total Loss: 620.0989 Loss.item(): 0.3624454140663147
Epoch: 8/50, Train Loss: 0.3279 Train metric: 0.8652 Valid Metrics: 0.8634 Total Loss: 595.4455 Loss.i

In [136]:
X_new = X_tensor[0]
tuned_model(X_new.to(device))

tensor([-1.0704,  2.1556, -5.7544, -4.8794,  7.8187, -3.6812,  0.4009],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [137]:
test_results = evaluate_tm(tuned_model, test_loader, accuracy)

In [138]:
test_results

tensor(0.9126, device='cuda:0')