---

# University of Liverpool

## COMP534 - Applied AI

---

This notebook is associated with Assignment 2. Use it to complete the assignment by following the instructions provided in each section. Each section includes a text cell outlining the requirements. For additional details, refer to Canvas.

Use this first cell to import the necessary libraries.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
import time
import copy
import itertools
import random
import math
import optuna  # We use optuna for hyperparameter tuning

  from .autonotebook import tqdm as notebook_tqdm


# 1. **Data Management**


In this part, you need to:

1.  define your experimental protocol (such as k-fold cross-validation, etc.)
2.	create the dataloader to load the data; remember to include here any normalization, data augmentation, or other technique used to pre-process the data


In [2]:
# Choose the compute device in priority order: Apple Silicon GPU (MPS) first,
# then CUDA, and finally the CPU when no accelerator is available.
if torch.backends.mps.is_available():
    device_name, device_message = "mps", "Using MPS (Apple Silicon GPU)"
elif torch.cuda.is_available():
    device_name, device_message = "cuda", "Using CUDA GPU"
else:
    device_name, device_message = "cpu", "Using CPU"

device = torch.device(device_name)
print(device_message)

# Smoke test: allocate a small random tensor and move it onto the chosen device.
x = torch.randn(3, 3).to(device)
print(x)


class CustomDataset(Dataset):
    """
    View onto a subset of a base dataset with an optional per-subset transform.

    Args:
        base_dataset: Indexable dataset whose items are (image, label) pairs.
        indices: Sequence of positions in ``base_dataset`` that make up this subset.
        custom_transform: Optional callable applied to each image before it is
            returned. ``None`` (the default) returns the image unchanged.

    Indexing with ``idx`` returns the (image, label) pair stored at
    ``base_dataset[indices[idx]]``, with the transform applied to the image.
    """
    def __init__(self, base_dataset, indices, custom_transform=None):
        self.base_dataset = base_dataset
        self.indices = indices
        self.custom_transform = custom_transform

    def __len__(self):
        # The subset is exactly as long as the list of selected indices.
        return len(self.indices)

    def __getitem__(self, idx):
        img, label = self.base_dataset[self.indices[idx]]
        # Compare against None explicitly: a truthiness test would silently skip
        # any callable that defines __len__/__bool__ and evaluates as falsy
        # (for example an empty nn.Sequential).
        if self.custom_transform is not None:
            img = self.custom_transform(img)
        return img, label


def get_kfold_loaders(base_dataset, index_list, batch_size, k_folds=5, train_tf=None, val_tf=None):
    """
    Build one (train_loader, val_loader) pair per cross-validation fold.

    The entries of ``index_list`` are partitioned with scikit-learn's KFold
    (shuffled, fixed random_state=42). For every fold, the training part is
    wrapped with ``train_tf`` and served shuffled, while the held-out part is
    wrapped with ``val_tf`` and served in order.

    Returns:
        A list of ``k_folds`` tuples ``(train_loader, val_loader)``.
    """
    all_indices = np.array(index_list)
    splitter = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    def _subset_and_loader(positions, transform, shuffle):
        # Map fold positions back to dataset indices, then wrap them in a loader.
        subset = CustomDataset(base_dataset, all_indices[positions].tolist(), custom_transform=transform)
        loader = DataLoader(subset, batch_size=batch_size, shuffle=shuffle, num_workers=0)
        return subset, loader

    loaders = []
    for fold_number, (train_pos, val_pos) in enumerate(splitter.split(all_indices), start=1):
        train_subset, train_loader = _subset_and_loader(train_pos, train_tf, True)
        val_subset, val_loader = _subset_and_loader(val_pos, val_tf, False)
        loaders.append((train_loader, val_loader))
        print(f"Fold {fold_number}: Train samples = {len(train_subset)}, Val samples = {len(val_subset)}")

    return loaders


def load_data(dataset_path, batch_size=32, k_folds=5, seed=42):
    """
    Load an image-folder dataset and build the loaders used by the experiments.

    The samples are shuffled and split 70% / 15% / remainder into train,
    validation and test index lists. The training indices are further divided
    into ``k_folds`` cross-validation folds.

    Args:
        dataset_path: Root directory passed to ``datasets.ImageFolder``.
        batch_size: Batch size used by every DataLoader created here.
        k_folds: Number of cross-validation folds built from the training split.
        seed: Seed for the train/val/test shuffle, so the held-out test set is
            the same on every run. Pass ``None`` for a different split each run.

    Returns:
        Tuple ``(fold_loaders, val_loader, test_loader, class_names)`` where
        ``fold_loaders`` is a list of ``(train_loader, val_loader)`` pairs.
    """
    # These are the normalization values (using ImageNet statistics here)
    mean_vals = [0.485, 0.456, 0.406]
    std_vals  = [0.229, 0.224, 0.225]

    # Set up data augmentation for training images (256x256)
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(256, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(mean_vals, std_vals)
    ])

    # For validation and testing, we just resize and center crop (no augmentation).
    val_test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(256),
        transforms.ToTensor(),
        transforms.Normalize(mean_vals, std_vals)
    ])

    # Load the dataset without transforms; each subset applies its own later.
    full_data = datasets.ImageFolder(root=dataset_path)
    total_samples = len(full_data)
    indices = list(range(total_samples))
    # Shuffle with a private, seeded RNG: the split is reproducible and the
    # global NumPy / Python random state is left untouched.
    random.Random(seed).shuffle(indices)

    # Split the indices: 70% train, 15% validation, and the remainder as test.
    train_end = int(0.7 * total_samples)
    val_end = train_end + int(0.15 * total_samples)

    train_indices = indices[:train_end]
    val_indices = indices[train_end:val_end]
    test_indices = indices[val_end:]

    # Create k-fold loaders for the training set.
    fold_loaders = get_kfold_loaders(full_data, train_indices, batch_size, k_folds,
                                     train_tf=train_transform, val_tf=val_test_transform)

    # Create DataLoaders for global validation and test sets.
    val_dataset = CustomDataset(full_data, val_indices, custom_transform=val_test_transform)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    test_dataset = CustomDataset(full_data, test_indices, custom_transform=val_test_transform)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    print(f"Total samples: {total_samples} | Train: {len(train_indices)}, Val: {len(val_indices)}, Test: {len(test_indices)}")
    print(f"Classes: {full_data.classes}")

    return fold_loaders, val_loader, test_loader, full_data.classes

Using MPS (Apple Silicon GPU)
tensor([[-0.2294, -1.0395, -0.1732],
        [-0.2959, -1.1598, -1.4945],
        [-0.7959, -0.3982, -0.1433]], device='mps:0')


---

# 2. **Neural Networks**

Here, you need to:

1.	propose your own Convolutional Neural Network (CNN) to tackle the problem;
2.	define at least one existing CNN (such as AlexNet, VGG, ResNet, DenseNet, etc) to tackle the problem;
3.	define the necessary components to train the networks (that is, loss function, optimizers, etc);
4.	train your proposed architecture from scratch using your training set;
5.	train the existing architecture using at least 2 different strategies (e.g., training from scratch, fine-tuning, feature extraction, etc.);
6.	for all training procedures, separately plot the loss and accuracy with respect to the epoch/iteration.



In [3]:
class CustomCNN(nn.Module):
    """
    Five-stage convolutional classifier.

    Each stage is a 3x3 convolution (padding 1) followed by batch norm, ReLU and
    a 2x2 max pool, so every stage halves the spatial size. Stage widths are
    16/32/64/128/256 channels scaled by ``channels_mult``. The feature map is
    then globally average-pooled to 1x1 and classified by a hidden linear layer
    of ``fc_size`` units (ReLU + dropout) followed by the output layer.

    Args:
        num_classes: Number of output logits.
        channels_mult: Multiplier applied to the base channel widths.
        fc_size: Width of the hidden fully connected layer.
        dropout_rate: Dropout probability applied after the hidden layer.
    """
    def __init__(self, num_classes, channels_mult=1.0, fc_size=128, dropout_rate=0.5):
        super().__init__()
        # Output width of each convolutional stage after scaling.
        c1, c2, c3, c4, c5 = (int(base * channels_mult) for base in (16, 32, 64, 128, 256))

        # Stage 1 (also owns the ReLU that is shared by every stage and the head)
        self.conv1 = nn.Conv2d(3, c1, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(c1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)

        # Stage 2
        self.conv2 = nn.Conv2d(c1, c2, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(c2)
        self.pool2 = nn.MaxPool2d(2, 2)

        # Stage 3
        self.conv3 = nn.Conv2d(c2, c3, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(c3)
        self.pool3 = nn.MaxPool2d(2, 2)

        # Stage 4
        self.conv4 = nn.Conv2d(c3, c4, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(c4)
        self.pool4 = nn.MaxPool2d(2, 2)

        # Stage 5
        self.conv5 = nn.Conv2d(c4, c5, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(c5)
        self.pool5 = nn.MaxPool2d(2, 2)

        # Classification head: global average pool -> FC -> dropout -> FC
        self.global_pool = nn.AdaptiveAvgPool2d((1,1))
        self.fc1 = nn.Linear(c5, fc_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(fc_size, num_classes)

    def forward(self, x):
        # Run the five conv -> batch norm -> ReLU -> max pool stages in order.
        stages = (
            (self.conv1, self.bn1, self.pool),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
            (self.conv4, self.bn4, self.pool4),
            (self.conv5, self.bn5, self.pool5),
        )
        for conv, bn, pool in stages:
            x = pool(self.relu(bn(conv(x))))

        pooled = self.global_pool(x)              # (B, channels, 1, 1)
        flat = pooled.view(pooled.size(0), -1)    # (B, channels)
        hidden = self.dropout(self.relu(self.fc1(flat)))
        return self.fc2(hidden)


def get_resnet18_model(num_classes, strategy="finetune"):
    """
    Build a ResNet18 whose final layer outputs ``num_classes`` logits.

    Args:
        num_classes: Size of the replacement classification layer.
        strategy: How the network is initialised and which parts are trainable.
            - "scratch": randomly initialised weights, everything trainable.
            - "feature_extractor": ImageNet weights, every pre-trained
              parameter frozen; only the new final layer is trainable.
            - any other value (including the default "finetune"): ImageNet
              weights, everything trainable.

    Returns:
        The configured ``torchvision`` ResNet18 module (not yet moved to a device).
    """
    # Use the `weights=` API rather than the deprecated `pretrained=` flag.
    # IMAGENET1K_V1 is the checkpoint that `pretrained=True` used to select.
    weights = None if strategy == "scratch" else models.ResNet18_Weights.IMAGENET1K_V1
    net = models.resnet18(weights=weights)

    if strategy == "feature_extractor":
        # Freeze the backbone before the head is replaced, so the new head
        # (created below) keeps requires_grad=True.
        for param in net.parameters():
            param.requires_grad = False

    # Update the final fully connected layer to match our number of classes
    num_features = net.fc.in_features
    net.fc = nn.Linear(num_features, num_classes)

    return net


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, model_label="Model"):
    """
    Train ``model`` for ``num_epochs`` epochs and plot its learning curves.

    After every epoch the model is evaluated on ``val_loader``. The weights of
    the epoch with the highest validation accuracy are restored before
    returning. The learning rate is multiplied by 0.1 every 3 epochs (StepLR).

    Args:
        model: Network to train; expected to already live on ``device``.
        train_loader: Iterable of (images, labels) batches used for optimisation.
        val_loader: Iterable of (images, labels) batches used for validation.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        model_label: Name used in log lines and plot titles.

    Returns:
        Tuple ``(model, history)`` where ``history`` maps "train_loss",
        "val_loss", "train_acc" and "val_acc" to per-epoch lists of floats.
    """
    # We use a StepLR scheduler to decay the learning rate every 3 epochs
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}
    best_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        running_corrects = 0
        total_samples = 0

        # Loop through the training data
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean, so weight it by the batch size.
            running_loss += loss.item() * images.size(0)
            _, predictions = torch.max(outputs, 1)
            # Accumulate a plain Python int: keeping a device tensor and calling
            # .double() on it later fails on MPS, which has no float64 support.
            running_corrects += (predictions == labels).sum().item()
            total_samples += images.size(0)

        epoch_loss = running_loss / total_samples
        epoch_acc = running_corrects / total_samples
        history["train_loss"].append(epoch_loss)
        history["train_acc"].append(epoch_acc)

        # Evaluate on the validation set
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        total_val = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)

                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)
                _, predictions = torch.max(outputs, 1)
                val_corrects += (predictions == labels).sum().item()
                total_val += images.size(0)

        epoch_val_loss = val_loss / total_val
        epoch_val_acc = val_corrects / total_val
        history["val_loss"].append(epoch_val_loss)
        history["val_acc"].append(epoch_val_acc)

        print(f"{model_label} Epoch {epoch+1}/{num_epochs} | Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc*100:.2f}% | "
              f"Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_val_acc*100:.2f}%", flush=True)

        # Keep track of the best model based on validation accuracy
        if epoch_val_acc > best_acc:
            best_acc = epoch_val_acc
            best_model_wts = copy.deepcopy(model.state_dict())

        scheduler.step()  # Update the learning rate

    # Load the best model weights we found (not necessarily the last epoch's)
    model.load_state_dict(best_model_wts)

    # Plot the loss curves for training and validation
    epochs_range = range(1, num_epochs + 1)
    plt.figure()
    plt.plot(epochs_range, history["train_loss"], label="Train Loss", color='blue')
    plt.plot(epochs_range, history["val_loss"], label="Val Loss", color='red')
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title(f"{model_label} Loss Curve")
    plt.legend()
    plt.show()

    # Plot the accuracy curves for training and validation
    plt.figure()
    plt.plot(epochs_range, history["train_acc"], label="Train Accuracy", color='green')
    plt.plot(epochs_range, history["val_acc"], label="Val Accuracy", color='orange')
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title(f"{model_label} Accuracy Curve")
    plt.legend()
    plt.show()

    return model, history


def evaluate_model(model, test_loader, criterion, device):
    """
    Evaluate ``model`` on ``test_loader`` and report classification metrics.

    Args:
        model: Network to evaluate; expected to already live on ``device``.
        test_loader: Iterable of (images, labels) batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(avg_loss, test_accuracy, precision, recall, f1)``.
        ``avg_loss`` is the sample-weighted mean loss, ``test_accuracy`` is a
        0-dim float64 CPU tensor in [0, 1], and precision / recall / F1 are
        macro-averaged over the classes.
    """
    model.eval()
    test_loss = 0.0
    correct_preds = 0
    total_test = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            # criterion returns the batch mean, so weight it by the batch size.
            test_loss += loss.item() * images.size(0)
            _, predictions = torch.max(outputs, 1)
            # Count with a Python int; a device-side counter followed by
            # .double() fails on MPS, which does not support float64.
            correct_preds += (predictions == labels).sum().item()
            total_test += images.size(0)
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = test_loss / total_test
    # Keep returning a 0-dim float64 tensor (built on the CPU) so callers that
    # treat the accuracy as a tensor keep working on every backend.
    test_accuracy = torch.tensor(correct_preds / total_test, dtype=torch.float64)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"\nTest Loss: {avg_loss:.4f} | Test Accuracy: {test_accuracy*100:.2f}%")
    return avg_loss, test_accuracy, precision, recall, f1


def plot_confusion_matrix(model, test_loader, class_names, device):
    """
    Predict every sample in ``test_loader`` and plot the confusion matrix.

    Args:
        model: Network to evaluate; expected to already live on ``device``.
        test_loader: Iterable of (images, labels) batches with integer labels.
        class_names: Class names used as tick labels; position ``i`` is assumed
            to name integer label ``i``.
        device: Device the batches are moved to.

    Rows of the plot are true labels and columns are predicted labels.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Pass the full label set explicitly. Otherwise a class that is absent from
    # both the labels and the predictions is dropped from the matrix, which
    # shrinks it and misaligns it with the class-name ticks below.
    cm = confusion_matrix(all_labels, all_preds, labels=np.arange(len(class_names)))
    plt.figure(figsize=(10,10))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=90, fontweight='normal', fontstyle='normal')
    plt.yticks(tick_marks, class_names, fontweight='normal', fontstyle='normal')

    # Annotate each cell with its count; use white text on dark cells.
    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()


def hyperband_search(train_loader, val_loader, num_classes, device, max_trials=10, epochs=10):
    """
    Search hyperparameters for the custom CNN by random sampling.

    Despite the name, this does not do Hyperband's successive halving: every
    trial samples one configuration at random and trains it for the full
    ``epochs`` budget.

    Args:
        train_loader: Training batches handed to ``train_model``.
        val_loader: Validation batches handed to ``train_model``.
        num_classes: Number of output classes for ``CustomCNN``.
        device: Device the models are trained on.
        max_trials: Number of random configurations to try.
        epochs: Training epochs per trial.

    Returns:
        Dict with keys "channels_mult", "fc_size", "dropout_rate", "lr" and
        "weight_decay" for the best trial (empty only when ``max_trials`` is 0).
    """
    best_val_acc = 0.0
    best_params = {}

    for trial in range(max_trials):
        channels_mult = random.choice([0.5, 1.0, 1.5, 2.0])
        fc_size = random.choice([64, 128, 256])
        dropout_rate = random.choice([0.3, 0.5, 0.7])
        lr = random.choice([0.0001, 0.001, 0.01])
        # Pick a weight decay value from a loguniform distribution between 1e-5 and 1e-3
        weight_decay = math.exp(random.uniform(math.log(1e-5), math.log(1e-3)))
        print(f"\nHyperband Trial {trial+1}: channels_mult={channels_mult}, fc_size={fc_size}, dropout_rate={dropout_rate}, lr={lr}, weight_decay={weight_decay:.5f}")

        model = CustomCNN(num_classes, channels_mult=channels_mult, fc_size=fc_size, dropout_rate=dropout_rate).to(device)
        criterion = nn.CrossEntropyLoss()
        # Use AdamW optimizer with the sampled weight decay
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

        # Train the model for a few epochs to see how it performs
        model, history = train_model(model, train_loader, val_loader, criterion, optimizer, epochs, device,
                                     model_label=f"Hyperband Trial {trial+1}")
        # Score the trial by its best epoch: train_model restores the weights
        # of the best validation epoch, so the last epoch may be a worse model.
        trial_val_acc = max(history["val_acc"], default=0.0)
        print(f"Hyperband Trial {trial+1} Validation Accuracy: {trial_val_acc*100:.2f}%")

        # Always record the first trial so the result is never empty, even if
        # every configuration scores 0.0.
        if not best_params or trial_val_acc > best_val_acc:
            best_val_acc = trial_val_acc
            best_params = {"channels_mult": channels_mult, "fc_size": fc_size, "dropout_rate": dropout_rate, "lr": lr, "weight_decay": weight_decay}

    print("\nBest Hyperparameters:", best_params, "\nWith validation accuracy:", best_val_acc*100, "%")
    return best_params

---

# 3. **Evaluate models**

Here, you need to:

1.	evaluate the model (the best one you obtained in the above stage) on the testing dataset.


In [None]:
if __name__ == "__main__":
    # Seed every RNG used below so the hyperparameter sampling, weight
    # initialisation and data shuffling are repeatable across runs.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # Main script: set up our device (CUDA GPU if available, otherwise CPU).
    # NOTE(review): this rebinds the notebook-level `device` and never selects
    # MPS, although the Data Management cell prefers it. Confirm this is intended.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # Define some parameters for our experiment
    dataset_directory = "Images"
    batch_size_val = 32
    num_epochs_val = 5
    k_folds_val = 5

    # Load the dataset and get our DataLoaders for training, validation, and testing
    folds, val_loader_global, test_loader, class_list = load_data(dataset_directory, batch_size=batch_size_val, k_folds=k_folds_val)
    num_classes = len(class_list)

    # For simplicity, we use the first fold for training and validation
    train_loader_fold, val_loader_fold = folds[0]

    # Run Hyperband search to get good hyperparameters for our custom CNN
    print("\nRunning Hyperband Search for Custom CNN model...")
    best_params = hyperband_search(train_loader_fold, val_loader_fold, num_classes, device, max_trials=10, epochs=10)

    # Train our Custom CNN from scratch using the best hyperparameters found
    print("\nNow training Custom CNN with Best Hyperparameters from Hyperband...")
    custom_net = CustomCNN(num_classes, channels_mult=best_params["channels_mult"],
                           fc_size=best_params["fc_size"], dropout_rate=best_params["dropout_rate"]).to(device)
    criterion_custom = nn.CrossEntropyLoss()
    optimizer_custom = optim.AdamW(custom_net.parameters(), lr=best_params["lr"], weight_decay=best_params["weight_decay"])
    custom_net, history_custom = train_model(custom_net, train_loader_fold, val_loader_fold,
                                             criterion_custom, optimizer_custom, num_epochs_val, device,
                                             model_label="Custom CNN")

    # Train ResNet18 using fine-tuning
    print("\nNow training ResNet18 with Fine-tuning...")
    resnet_finetune = get_resnet18_model(num_classes, strategy="finetune").to(device)
    criterion_resnet = nn.CrossEntropyLoss()
    optimizer_resnet = optim.Adam(resnet_finetune.parameters(), lr=0.001)
    resnet_finetune, history_resnet_ft = train_model(resnet_finetune, train_loader_fold, val_loader_fold,
                                                     criterion_resnet, optimizer_resnet, num_epochs_val, device,
                                                     model_label="ResNet18 Fine-tune")

    # Train ResNet18 as a Feature Extractor (only update the final layer)
    print("\nNow training ResNet18 as a Feature Extractor...")
    resnet_feat = get_resnet18_model(num_classes, strategy="feature_extractor").to(device)
    optimizer_resnet_feat = optim.Adam(resnet_feat.fc.parameters(), lr=0.001)
    resnet_feat, history_resnet_feat = train_model(resnet_feat, train_loader_fold, val_loader_fold,
                                                   criterion_resnet, optimizer_resnet_feat, num_epochs_val, device,
                                                   model_label="ResNet18 Feature Extractor")

    # Validation accuracy of each returned model. train_model restores the
    # weights of the best validation epoch, so take the per-run maximum
    # rather than the last epoch's value.
    best_val_acc_custom = max(history_custom["val_acc"])
    best_val_acc_ft = max(history_resnet_ft["val_acc"])
    best_val_acc_feat = max(history_resnet_feat["val_acc"])

    print("\nValidation Accuracy Comparison of the three models:")
    print(f"Custom CNN: {best_val_acc_custom*100:.2f}%")
    print(f"ResNet18 (Fine-tune): {best_val_acc_ft*100:.2f}%")
    print(f"ResNet18 (Feature Extractor): {best_val_acc_feat*100:.2f}%")

    # Plot confusion matrices for each model
    print("\nPlotting Confusion Matrix for Custom CNN...")
    plot_confusion_matrix(custom_net, test_loader, class_list, device)

    print("\nPlotting Confusion Matrix for ResNet18 Fine-tune...")
    plot_confusion_matrix(resnet_finetune, test_loader, class_list, device)

    print("\nPlotting Confusion Matrix for ResNet18 Feature Extractor...")
    plot_confusion_matrix(resnet_feat, test_loader, class_list, device)

    # Choose the best model based on the highest validation accuracy.
    # max() returns the first of several equal maxima, so ties resolve in the
    # listed order and a tie between the two ResNets can no longer fall back
    # to a lower-scoring Custom CNN.
    candidates = [
        ("Custom CNN", custom_net, best_val_acc_custom),
        ("ResNet18 Fine-tune", resnet_finetune, best_val_acc_ft),
        ("ResNet18 Feature Extractor", resnet_feat, best_val_acc_feat),
    ]
    best_model_name, best_model, _ = max(candidates, key=lambda candidate: candidate[2])
    print(f"\nSelected Best Model out of three:- {best_model_name}")

    # Evaluate the best model on the test set and print all the metrics.
    # Both criterion objects are plain nn.CrossEntropyLoss(), so either works here.
    print(f"\nEvaluating the best model: {best_model_name} on the Test Set...")
    test_loss, test_accuracy, precision, recall, f1 = evaluate_model(best_model, test_loader, criterion_resnet, device)
    print(f"Test Accuracy: {test_accuracy*100:.2f}%")
    print(f"Test Precision: {precision*100:.2f}%")
    print(f"Test Recall: {recall*100:.2f}%")
    print(f"Test F1 Score: {f1*100:.2f}%")

    # Plot confusion matrix for the best model
    print(f"\nPlotting Confusion Matrix for the Best Model: {best_model_name}...")
    plot_confusion_matrix(best_model, test_loader, class_list, device)

Using device: cpu
Fold 1: Train samples = 1176, Val samples = 294
Fold 2: Train samples = 1176, Val samples = 294
Fold 3: Train samples = 1176, Val samples = 294
Fold 4: Train samples = 1176, Val samples = 294
Fold 5: Train samples = 1176, Val samples = 294
Total samples: 2100 | Train: 1470, Val: 315, Test: 315
Classes: ['agricultural', 'airplane', 'baseballdiamond', 'beach', 'buildings', 'chaparral', 'denseresidential', 'forest', 'freeway', 'golfcourse', 'harbor', 'intersection', 'mediumresidential', 'mobilehomepark', 'overpass', 'parkinglot', 'river', 'runway', 'sparseresidential', 'storagetanks', 'tenniscourt']

Running Hyperband Search for Custom CNN model...

Hyperband Trial 1: channels_mult=2.0, fc_size=128, dropout_rate=0.5, lr=0.001, weight_decay=0.00001
