The following code is based on [this tutorial](https://www.youtube.com/watch?v=zp8clK9yCro).

In [2]:
# Import dependencies
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

In [3]:
# Get data
transform = transforms.ToTensor()

mnist_data = datasets.MNIST(root="./data", train=True, download=True, transform=transform)

data_loader = DataLoader(mnist_data, batch_size=64, shuffle=True)

In [4]:
# Get Duckeneers Data locally
from datasets import load_dataset
import os

base_path = "/Users/maksym/git/duckeneers/palettscan/setup/Data/datasets/card_detection/data"
dataset_dict = load_dataset(
    "imagefolder",
    data_files={
        "train": os.path.join(base_path, "train", "**"),
        "test": os.path.join(base_path, "test", "**"),
        "valid": os.path.join(base_path, "valid", "**"),
    },
)

  from .autonotebook import tqdm as notebook_tqdm
Resolving data files: 100%|██████████| 702/702 [00:00<00:00, 22857.60it/s]
Resolving data files: 100%|██████████| 202/202 [00:00<00:00, 150863.50it/s]
Resolving data files: 100%|██████████| 937/937 [00:00<00:00, 17826.24it/s]
Found cached dataset imagefolder (/Users/maksym/.cache/huggingface/datasets/imagefolder/default-9e429bb76d21d33c/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)
100%|██████████| 3/3 [00:00<00:00, 75.69it/s]


In [5]:
from tqdm import tqdm 
from torchvision.transforms.functional import rgb_to_grayscale
images = [rgb_to_grayscale(sample["image"]) for sample in tqdm(dataset_dict["train"])]

tensors = [transform(image) for image in tqdm(images)]


100%|██████████| 700/700 [01:01<00:00, 11.41it/s]
100%|██████████| 700/700 [00:46<00:00, 15.12it/s]


In [6]:
data_loader = DataLoader(tensors, batch_size=32, shuffle=False)

In [7]:
iterator = iter(data_loader)
print(next(iterator))

tensor([[[[0.0157, 0.0157, 0.0157,  ..., 0.0235, 0.0235, 0.0275],
          [0.0157, 0.0157, 0.0118,  ..., 0.0275, 0.0235, 0.0235],
          [0.0157, 0.0157, 0.0157,  ..., 0.0235, 0.0235, 0.0235],
          ...,
          [0.0314, 0.0353, 0.0353,  ..., 0.0706, 0.0745, 0.0667],
          [0.0353, 0.0353, 0.0353,  ..., 0.0706, 0.0667, 0.0667],
          [0.0353, 0.0353, 0.0353,  ..., 0.0667, 0.0627, 0.0706]]],


        [[[0.0667, 0.0627, 0.0667,  ..., 0.0431, 0.0471, 0.0471],
          [0.0627, 0.0627, 0.0667,  ..., 0.0471, 0.0431, 0.0471],
          [0.0667, 0.0627, 0.0706,  ..., 0.0471, 0.0431, 0.0510],
          ...,
          [0.1098, 0.1216, 0.1137,  ..., 0.0667, 0.0667, 0.0706],
          [0.1176, 0.1137, 0.1137,  ..., 0.0667, 0.0706, 0.0667],
          [0.1137, 0.1059, 0.1137,  ..., 0.0706, 0.0706, 0.0667]]],


        [[[0.0392, 0.0392, 0.0431,  ..., 0.0275, 0.0235, 0.0275],
          [0.0392, 0.0471, 0.0471,  ..., 0.0275, 0.0235, 0.0235],
          [0.0431, 0.0431, 0.0431,  ..

In [8]:
# Define a linear AutoEncoder 
class LinearAutoEncoder(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # N (batch size), 2064*3088
        self.encoder = nn.Sequential(
            nn.Linear(2064*3088, 20),
            nn.ReLU()
            # nn.Linear(2064*3088, 2**20), # N, 20264*3088 -> N, 2**20
            # nn.ReLU(),
            # nn.Linear(2**20, 2**14),
            # nn.ReLU(),
            # nn.Linear(2**14, 2**7),
            # nn.ReLU(),
            # nn.Linear(128, 20), # N, 20
        )

        self.decoder = nn.Sequential(
            nn.Linear(20, 2064*3088),
            # nn.Linear(20, 128), # N, 20
            # nn.ReLU(),
            # nn.Linear(2**7, 2**14),
            # nn.ReLU(),
            # nn.Linear(2**14, 2**20),
            # nn.ReLU(),
            # nn.Linear(2**20, 2064*3088), # N, 784
            nn.Sigmoid() # IMPORTTANT! Depending on data we might need different activation here!
        )
# NOTE: Last activation: [0, 1] -> nn.ReLU(), [-1, 1] -> nn.Tanh

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    


In [4]:
# Define a basic AutoEncoder 
class AutoEncoder(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # N (batch size), 1 (channels), 2064 (height), 3088 (width)
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1), # N, 1, 2064, 3088 -> N, 16, 14, 14
            nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), # N, 16, 14, 14 -> N, 32, 7, 7
            nn.ReLU(),
            nn.Conv2d(32, 64, 7), # N, 32, 7, 7 -> N, 64, 1, 1
            nn.ReLU(),
        )

        # decoder input size: N, 64, 1, 1
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, 7), #  N, 64, 1, 1 -> N, 32, 7, 7 
            nn.ReLU(),
            # nn.ConvTranspose2d(32, 16, 3), #  N, 32, 7, 7 -> N, 16, 13, 13 THE DIMENSIONS WOULD NOT ADD UP!!
            nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1), #  N, 32, 7, 7 -> N, 16, 14, 14 
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1), #  N, 16, 14, 14 -> N, 1, 28, 28
            nn.Sigmoid() # IMPORTTANT! Depending on data we might need different activation here!
        )
# NOTE: Last activation: [0, 1] -> nn.ReLU(), [-1, 1] -> nn.Tanh
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    

# NOTE: if you use nn.MaxPool2d() to reduce size, its inverse is nn.MaxUnpool2d()

In [9]:
# Instantiate neural net and optimizer
model = LinearAutoEncoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)


In [11]:
# Train the Autoencoder
num_epochs = 10
outputs = []
for epoch in range(num_epochs):
    for img in data_loader:
        recon = model(img)
        loss = criterion(recon, img)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch:{epoch+1}, Loss:{loss.item():.4f}")
    outputs.append((epoch, img, recon))


RuntimeError: mat1 and mat2 shapes cannot be multiplied (66048x3088 and 6373632x20)

In [1]:
# Plot the reconstructed images
for k in range(0, num_epochs, 4):
    plt.axis('off')
    plt.figure(figsize=(9, 2))
    plt.gray()
    imgs = outputs[k][1].detach().numpy()
    recon = outputs[k][2].detach().numpy()

    for i, item in enumerate(imgs):
        if i >= 9: break
        plt.subplot(2, 9, i+1)
        plt.imshow(item[0])

    for i, item in enumerate(recon):
        if i >= 9: break
        plt.subplot(2, 9, 9+i+1)
        plt.imshow(item[0])

NameError: name 'num_epochs' is not defined