## 1. Setup - Upload Source Code

In [None]:
# Create the project directory (with its src/ subfolder) and make it the
# working directory for the cells below.
!mkdir -p AutoencoderCpu/src
%cd AutoencoderCpu

In [None]:
%%writefile src/common.h
/**
 * @file common.h
 * @brief Common definitions and includes for the Autoencoder project
 */

#ifndef COMMON_H
#define COMMON_H

#include <iostream>
#include <vector>
#include <cmath>
#include <cstring>
#include <fstream>
#include <random>
#include <chrono>
#include <iomanip>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <map>

/// Scalar element type used by the tensor storage below.
using DataType = float;

/**
 * @brief Dense 4-D tensor stored contiguously in a std::vector.
 *
 * Elements are laid out so that the last index (channels) varies fastest:
 * offset(n, h, w, c) = ((n * height + h) * width + w) * channels + c.
 * All element counts and offsets are computed in size_t so that tensors
 * with more elements than INT_MAX do not overflow during index arithmetic.
 */
class Tensor4D {
public:
    int batch, height, width, channels;
    std::vector<DataType> data;

    /// Creates an empty tensor (all dimensions zero, no storage).
    Tensor4D() : batch(0), height(0), width(0), channels(0) {}

    /// Creates a zero-filled tensor with dimensions (n, h, w, c).
    Tensor4D(int n, int h, int w, int c) : batch(n), height(h), width(w), channels(c) {
        data.resize(size(), 0.0f);
    }

    /// Number of elements implied by the four dimensions.
    size_t size() const {
        // Widen before multiplying: the product of four ints can exceed INT_MAX.
        return static_cast<size_t>(batch) * height * width * channels;
    }

    /// Size in bytes of size() elements.
    size_t memorySize() const { return size() * sizeof(DataType); }

    /// Unchecked element access; the caller must pass in-range indices.
    DataType& at(int n, int h, int w, int c) {
        return data[offset(n, h, w, c)];
    }
    const DataType& at(int n, int h, int w, int c) const {
        return data[offset(n, h, w, c)];
    }

    /// Sets every stored element to @p value.
    void fill(DataType value) { std::fill(data.begin(), data.end(), value); }

    /**
     * @brief Overwrites every element with a sample from N(0, scale).
     *
     * The generator is seeded from std::random_device on every call, so
     * results are not reproducible between runs.
     */
    void randomInit(float scale = 1.0f) {
        std::random_device rd;
        std::mt19937 gen(rd());
        std::normal_distribution<float> dist(0.0f, scale);
        for (auto& val : data) val = dist(gen);
    }

    /// Copies both the dimensions and the element storage of @p other.
    void copyFrom(const Tensor4D& other) {
        batch = other.batch; height = other.height;
        width = other.width; channels = other.channels;
        data = other.data;
    }

private:
    /// Flat offset of element (n, h, w, c), computed in size_t.
    size_t offset(int n, int h, int w, int c) const {
        return ((static_cast<size_t>(n) * height + h) * width + w) * channels + c;
    }
};

/**
 * @brief Simple stopwatch for measuring elapsed wall time.
 *
 * Uses std::chrono::steady_clock, which is monotonic, so a measured interval
 * can never come out negative if the system time is adjusted mid-measurement.
 */
class Timer {
private:
    using Clock = std::chrono::steady_clock;
    Clock::time_point startTime, endTime;
    bool running;
public:
    Timer() : running(false) {}

    /// Records the start instant and marks the timer as running.
    void start() { startTime = Clock::now(); running = true; }

    /// Records the end instant and marks the timer as stopped.
    void stop() { endTime = Clock::now(); running = false; }

    /**
     * @brief Elapsed time in milliseconds.
     * @return Time from start() to now while running, otherwise from
     *         start() to the last stop().
     */
    double elapsedMs() const {
        const Clock::time_point end = running ? Clock::now() : endTime;
        return std::chrono::duration<double, std::milli>(end - startTime).count();
    }

    /// Elapsed time in seconds (elapsedMs() / 1000).
    double elapsedSec() const { return elapsedMs() / 1000.0; }
};

/**
 * @brief Accumulates named timing samples and prints a summary table.
 */
class Profiler {
public:
    /// Accumulated time and number of samples for one named operation.
    struct Stats {
        double totalTime = 0;
        int callCount = 0;
        /// Mean time per call, or 0 when nothing has been recorded.
        double avgTime() const { return callCount > 0 ? totalTime / callCount : 0; }
    };
    std::map<std::string, Stats> stats;

    /// Adds one sample of @p timeMs milliseconds to the entry called @p name.
    void addTime(const std::string& name, double timeMs) {
        Stats& entry = stats[name];  // single lookup; creates the entry on first use
        entry.totalTime += timeMs;
        entry.callCount++;
    }

    /// Discards all recorded samples.
    void reset() { stats.clear(); }

    /**
     * @brief Prints one row per operation, sorted by total time (descending).
     *
     * The formatting flags and precision of std::cout are restored before
     * returning so the table's manipulators do not leak into later output.
     */
    void printReport() const {
        const std::ios_base::fmtflags savedFlags = std::cout.flags();
        const std::streamsize savedPrecision = std::cout.precision();

        std::cout << "\n========== PROFILER REPORT ==========\n";
        double total = 0;
        for (const auto& entry : stats) total += entry.second.totalTime;

        // std::map is ordered by name; copy to a vector to order by time instead.
        std::vector<std::pair<std::string, Stats>> sorted(stats.begin(), stats.end());
        std::sort(sorted.begin(), sorted.end(),
                  [](const auto& a, const auto& b) { return a.second.totalTime > b.second.totalTime; });

        std::cout << std::left << std::setw(20) << "Operation" << std::right << std::setw(15) << "Total (ms)"
                  << std::setw(10) << "Calls" << std::setw(15) << "Percentage" << "\n";
        for (const auto& [name, stat] : sorted) {
            double pct = total > 0 ? stat.totalTime / total * 100 : 0;
            std::cout << std::left << std::setw(20) << name << std::right << std::setw(15) << std::fixed
                      << std::setprecision(2) << stat.totalTime << std::setw(10) << stat.callCount
                      << std::setw(14) << pct << "%\n";
        }
        std::cout << "======================================\n";

        std::cout.flags(savedFlags);
        std::cout.precision(savedPrecision);
    }
};

/**
 * @brief Running byte counters for four categories of memory.
 *
 * Every add* call increases its counter; nothing here ever decreases one,
 * so the reported figures are cumulative totals of what was registered.
 */
class MemoryTracker {
public:
    size_t weightsMemory = 0;
    size_t activationsMemory = 0;
    size_t gradientsMemory = 0;
    size_t dataMemory = 0;

    void addWeights(size_t bytes) { weightsMemory = weightsMemory + bytes; }
    void addActivations(size_t bytes) { activationsMemory = activationsMemory + bytes; }
    void addGradients(size_t bytes) { gradientsMemory = gradientsMemory + bytes; }
    void addData(size_t bytes) { dataMemory = dataMemory + bytes; }

    /// Sum of the four counters, in bytes.
    size_t totalMemory() const {
        size_t sum = weightsMemory;
        sum += activationsMemory;
        sum += gradientsMemory;
        sum += dataMemory;
        return sum;
    }

    /// Prints each counter and the total, converted from bytes to MB.
    void printReport() const {
        const auto printLine = [](const char* label, size_t bytes) {
            std::cout << label << (bytes / 1024.0 / 1024.0) << " MB\n";
        };
        std::cout << "\n========== MEMORY USAGE ==========\n";
        printLine("Weights:     ", weightsMemory);
        printLine("Activations: ", activationsMemory);
        printLine("Gradients:   ", gradientsMemory);
        printLine("Data:        ", dataMemory);
        printLine("TOTAL:       ", totalMemory());
        std::cout << "==================================\n";
    }
};

extern Profiler gProfiler;
extern MemoryTracker gMemoryTracker;

#endif

In [None]:
%%writefile src/cifar10.h
#ifndef CIFAR10_H
#define CIFAR10_H

#include "common.h"
#include <string>

/**
 * @brief In-memory loader for the CIFAR-10 binary batch files.
 *
 * Each record in a batch file is read as 1 label byte followed by
 * IMAGE_SIZE pixel bytes. Pixels are kept in file order (indexed below as
 * channel-major planes) and scaled to [0, 1]; getTrainBatch() rearranges
 * them into the channels-last order used by Tensor4D::at.
 */
class CIFAR10Dataset {
public:
    static constexpr int IMAGE_HEIGHT = 32, IMAGE_WIDTH = 32, IMAGE_CHANNELS = 3;
    static constexpr int IMAGE_SIZE = 3072, NUM_CLASSES = 10, IMAGES_PER_BATCH = 10000;
    static constexpr int NUM_TRAIN_BATCHES = 5, TOTAL_TRAIN_IMAGES = 50000, TOTAL_TEST_IMAGES = 10000;

private:
    std::vector<DataType> trainImages, testImages;
    std::vector<int> trainLabels, testLabels;
    bool loaded = false;
    int numTrainImages = 0, numTestImages = 0;

public:
    /**
     * @brief Loads training and test images from the directory @p path.
     *
     * @param path            Directory containing data_batch_1..5.bin and test_batch.bin.
     * @param maxTrainSamples Upper bound on training images; values <= 0 (or
     *                        >= TOTAL_TRAIN_IMAGES) load the full training set.
     * @param testRatio       When maxTrainSamples > 0, the test set holds
     *                        max(1, train * testRatio) images, capped at
     *                        TOTAL_TEST_IMAGES; otherwise the full test set is used.
     * @return true on success; false if a file cannot be opened or is too short.
     */
    bool load(const std::string& path, int maxTrainSamples = 0, float testRatio = 0.2f) {
        int targetTrain = (maxTrainSamples > 0 && maxTrainSamples < TOTAL_TRAIN_IMAGES) ? maxTrainSamples : TOTAL_TRAIN_IMAGES;
        int targetTest = (maxTrainSamples > 0) ? std::max(1, (int)(targetTrain * testRatio)) : TOTAL_TEST_IMAGES;
        targetTest = std::min(targetTest, TOTAL_TEST_IMAGES);

        std::cout << "Loading CIFAR-10 from: " << path << std::endl;
        if (maxTrainSamples > 0) std::cout << "  [LIMITED] Train: " << targetTrain << ", Test: " << targetTest << std::endl;

        trainImages.resize(targetTrain * IMAGE_SIZE);
        trainLabels.resize(targetTrain);

        // Fill the training buffers batch file by batch file until the target is met.
        int loadedTrain = 0;
        for (int batch = 1; batch <= NUM_TRAIN_BATCHES && loadedTrain < targetTrain; batch++) {
            std::string filename = path + "/data_batch_" + std::to_string(batch) + ".bin";
            int toLoad = std::min(IMAGES_PER_BATCH, targetTrain - loadedTrain);
            if (!loadBatch(filename, trainImages.data() + loadedTrain * IMAGE_SIZE, trainLabels.data() + loadedTrain, toLoad)) return false;
            loadedTrain += toLoad;
            std::cout << "  Loaded batch " << batch << " (" << toLoad << " images)" << std::endl;
        }
        numTrainImages = loadedTrain;

        testImages.resize(targetTest * IMAGE_SIZE);
        testLabels.resize(targetTest);
        if (!loadBatch(path + "/test_batch.bin", testImages.data(), testLabels.data(), targetTest)) return false;
        numTestImages = targetTest;

        loaded = true;
        gMemoryTracker.addData((trainImages.size() + testImages.size()) * sizeof(DataType));
        std::cout << "Dataset loaded! Train: " << numTrainImages << ", Test: " << numTestImages << std::endl;
        return true;
    }

    /**
     * @brief Copies @p batchSize training images starting at @p startIdx into a tensor.
     *
     * The requested range must lie inside [0, getNumTrainImages()]; this is
     * asserted because an out-of-range request would read past the image buffer.
     *
     * @return Tensor of shape (batchSize, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS).
     */
    Tensor4D getTrainBatch(int startIdx, int batchSize) const {
        assert(startIdx >= 0 && batchSize >= 0 && startIdx + batchSize <= numTrainImages);
        Tensor4D batch(batchSize, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS);
        for (int i = 0; i < batchSize; i++) {
            const DataType* src = trainImages.data() + (startIdx + i) * IMAGE_SIZE;
            // Source is indexed as [c][h][w]; destination is written as (h, w, c).
            for (int h = 0; h < IMAGE_HEIGHT; h++)
                for (int w = 0; w < IMAGE_WIDTH; w++)
                    for (int c = 0; c < IMAGE_CHANNELS; c++)
                        batch.at(i, h, w, c) = src[(c * IMAGE_HEIGHT + h) * IMAGE_WIDTH + w];
        }
        return batch;
    }

    /**
     * @brief Randomly permutes the training images together with their labels.
     *
     * The permutation is seeded from std::random_device, so it differs between runs.
     */
    void shuffleTrainData() {
        std::vector<int> idx(numTrainImages);
        std::iota(idx.begin(), idx.end(), 0);
        std::random_device rd; std::mt19937 gen(rd());
        std::shuffle(idx.begin(), idx.end(), gen);
        std::vector<DataType> tmpImg(trainImages.size());
        std::vector<int> tmpLbl(trainLabels.size());
        for (int i = 0; i < numTrainImages; i++) {
            std::copy(trainImages.begin() + idx[i] * IMAGE_SIZE, trainImages.begin() + (idx[i] + 1) * IMAGE_SIZE, tmpImg.begin() + i * IMAGE_SIZE);
            tmpLbl[i] = trainLabels[idx[i]];
        }
        trainImages = std::move(tmpImg); trainLabels = std::move(tmpLbl);
    }

    int getNumTrainImages() const { return numTrainImages; }
    int getNumTestImages() const { return numTestImages; }

private:
    /**
     * @brief Reads @p numImages records from @p filename into the given buffers.
     *
     * @param images Destination for numImages * IMAGE_SIZE pixel values in [0, 1].
     * @param labels Destination for numImages label values.
     * @return false if the file cannot be opened or ends before all records are read.
     */
    bool loadBatch(const std::string& filename, DataType* images, int* labels, int numImages) {
        std::ifstream file(filename, std::ios::binary);
        if (!file) { std::cerr << "Cannot open: " << filename << std::endl; return false; }
        std::vector<unsigned char> buffer(1 + IMAGE_SIZE);
        for (int i = 0; i < numImages; i++) {
            // A failed or short read would otherwise reuse stale buffer contents.
            if (!file.read(reinterpret_cast<char*>(buffer.data()), buffer.size())) {
                std::cerr << "Unexpected end of file: " << filename << " (record " << i << ")" << std::endl;
                return false;
            }
            labels[i] = buffer[0];
            for (int j = 0; j < IMAGE_SIZE; j++) images[i * IMAGE_SIZE + j] = buffer[1 + j] / 255.0f;
        }
        return true;
    }
};

#endif

In [None]:
%%writefile src/optimizer.h
#ifndef OPTIMIZER_H
#define OPTIMIZER_H

#include "common.h"
#include <cmath>

class AdamOptimizer {
public:
    float learningRate, beta1, beta2, epsilon;
    int timestep = 0;
    std::map<int, std::vector<DataType>> m, v;

    AdamOptimizer(float lr = 0.001f, float b1 = 0.9f, float b2 = 0.999f, float eps = 1e-8f)
        : learningRate(lr), beta1(b1), beta2(b2), epsilon(eps) {}

    void update(std::vector<DataType>& weights, const std::vector<DataType>& gradients, int paramId) {
        if (m.find(paramId) == m.end()) {
            m[paramId].resize(weights.size(), 0.0f);
            v[paramId].resize(weights.size(), 0.0f);
        }
        auto& m_t = m[paramId]; auto& v_t = v[paramId];
        float bc1 = 1.0f - std::pow(beta1, timestep);
        float bc2 = 1.0f - std::pow(beta2, timestep);
        for (size_t i = 0; i < weights.size(); i++) {
            m_t[i] = beta1 * m_t[i] + (1 - beta1) * gradients[i];
            v_t[i] = beta2 * v_t[i] + (1 - beta2) * gradients[i] * gradients[i];
            weights[i] -= learningRate * (m_t[i] / bc1) / (std::sqrt(v_t[i] / bc2) + epsilon);
        }
    }
    void step() { timestep++; }
    void reset() { timestep = 0; m.clear(); v.clear(); }
};

#endif

In [None]:
%%writefile src/layers_cpu.h
#ifndef LAYERS_CPU_H
#define LAYERS_CPU_H

#include "common.h"
#include <cmath>
#include <limits>

/**
 * @brief Direct (nested-loop) 2-D convolution layer with zero padding.
 *
 * Activations are indexed as (batch, row, column, channel) and weights as
 * (kernelRow, kernelColumn, inChannel, outChannel). forward() caches its
 * input so that backward() can compute the weight gradients.
 */
class Conv2D {
public:
    int inChannels, outChannels, kernelSize, padding, stride;
    Tensor4D weights, gradWeights, inputCache;
    std::vector<DataType> bias, gradBias;

    /**
     * @brief Allocates weights, biases and their gradient buffers.
     *
     * Weights are drawn from a zero-mean normal distribution with standard
     * deviation sqrt(2 / (kSize * kSize * inCh)); biases start at zero.
     * The allocated sizes are registered with gMemoryTracker.
     */
    Conv2D(int inCh, int outCh, int kSize = 3, int pad = 1, int str = 1)
        : inChannels(inCh), outChannels(outCh), kernelSize(kSize), padding(pad), stride(str) {
        // stride is the divisor in the output-size formula used by forward().
        assert(stride > 0);
        assert(kernelSize > 0 && inChannels > 0 && outChannels > 0);
        weights = Tensor4D(kernelSize, kernelSize, inChannels, outChannels);
        weights.randomInit(std::sqrt(2.0f / (kernelSize * kernelSize * inChannels)));
        bias.resize(outChannels, 0.0f);
        gradWeights = Tensor4D(kernelSize, kernelSize, inChannels, outChannels);
        gradBias.resize(outChannels, 0.0f);
        gMemoryTracker.addWeights(weights.memorySize() + bias.size() * sizeof(DataType));
        gMemoryTracker.addGradients(gradWeights.memorySize() + gradBias.size() * sizeof(DataType));
    }

    /**
     * @brief Computes the convolution of @p input with the layer weights.
     *
     * @param input Tensor whose channel count must equal inChannels.
     * @return Tensor of shape (N, H_out, W_out, outChannels) where
     *         H_out = (H_in + 2*padding - kernelSize) / stride + 1 (likewise for W).
     */
    Tensor4D forward(const Tensor4D& input) {
        // A channel mismatch would index past the end of the input/weight buffers.
        assert(input.channels == inChannels);
        Timer timer; timer.start();
        inputCache.copyFrom(input);
        int N = input.batch, H_in = input.height, W_in = input.width;
        int H_out = (H_in + 2*padding - kernelSize) / stride + 1;
        int W_out = (W_in + 2*padding - kernelSize) / stride + 1;
        Tensor4D output(N, H_out, W_out, outChannels);
        gMemoryTracker.addActivations(output.memorySize());

        for (int n = 0; n < N; n++)
            for (int oh = 0; oh < H_out; oh++)
                for (int ow = 0; ow < W_out; ow++)
                    for (int oc = 0; oc < outChannels; oc++) {
                        DataType sum = bias[oc];
                        for (int kh = 0; kh < kernelSize; kh++)
                            for (int kw = 0; kw < kernelSize; kw++) {
                                int ih = oh*stride - padding + kh, iw = ow*stride - padding + kw;
                                // Positions outside the input act as zero padding and are skipped.
                                if (ih >= 0 && ih < H_in && iw >= 0 && iw < W_in)
                                    for (int ic = 0; ic < inChannels; ic++)
                                        sum += input.at(n,ih,iw,ic) * weights.at(kh,kw,ic,oc);
                            }
                        output.at(n,oh,ow,oc) = sum;
                    }
        timer.stop(); gProfiler.addTime("Conv2D_forward", timer.elapsedMs());
        return output;
    }

    /**
     * @brief Back-propagates @p gradOutput through the layer.
     *
     * Overwrites gradWeights and gradBias with the gradients for the input
     * cached by the most recent forward() call.
     *
     * @param gradOutput Gradient w.r.t. the output of that forward() call.
     * @return Gradient w.r.t. the cached input, with the cached input's shape.
     */
    Tensor4D backward(const Tensor4D& gradOutput) {
        // gradOutput must correspond to the cached input, otherwise the loops
        // below would index outside gradOutput or inputCache.
        assert(gradOutput.batch == inputCache.batch);
        assert(gradOutput.channels == outChannels);
        Timer timer; timer.start();
        int N = inputCache.batch, H_in = inputCache.height, W_in = inputCache.width;
        int H_out = gradOutput.height, W_out = gradOutput.width;
        Tensor4D gradInput(N, H_in, W_in, inChannels);  // zero-initialised by the constructor
        gradWeights.fill(0); std::fill(gradBias.begin(), gradBias.end(), 0);

        for (int n = 0; n < N; n++)
            for (int oh = 0; oh < H_out; oh++)
                for (int ow = 0; ow < W_out; ow++)
                    for (int oc = 0; oc < outChannels; oc++) {
                        DataType grad = gradOutput.at(n,oh,ow,oc);
                        gradBias[oc] += grad;
                        for (int kh = 0; kh < kernelSize; kh++)
                            for (int kw = 0; kw < kernelSize; kw++) {
                                int ih = oh*stride - padding + kh, iw = ow*stride - padding + kw;
                                if (ih >= 0 && ih < H_in && iw >= 0 && iw < W_in)
                                    for (int ic = 0; ic < inChannels; ic++) {
                                        gradWeights.at(kh,kw,ic,oc) += inputCache.at(n,ih,iw,ic) * grad;
                                        gradInput.at(n,ih,iw,ic) += weights.at(kh,kw,ic,oc) * grad;
                                    }
                            }
                    }
        timer.stop(); gProfiler.addTime("Conv2D_backward", timer.elapsedMs());
        return gradInput;
    }

    /// Plain gradient-descent step: parameter -= lr * gradient, for weights and biases.
    void updateWeights(float lr) {
        for (size_t i = 0; i < weights.data.size(); i++) weights.data[i] -= lr * gradWeights.data[i];
        for (size_t i = 0; i < bias.size(); i++) bias[i] -= lr * gradBias[i];
    }
};

/**
 * @brief Element-wise rectifier: passes positive values, zeroes the rest.
 *
 * forward() stores a 0/1 mask of the positive inputs; backward() multiplies
 * the incoming gradient by that mask.
 */
class ReLU {
public:
    Tensor4D maskCache;

    /// Returns max(input, 0) element-wise and records which inputs were positive.
    Tensor4D forward(const Tensor4D& input) {
        Timer clock;
        clock.start();

        Tensor4D activated(input.batch, input.height, input.width, input.channels);
        maskCache = Tensor4D(input.batch, input.height, input.width, input.channels);

        const size_t count = input.data.size();
        for (size_t idx = 0; idx < count; ++idx) {
            const DataType value = input.data[idx];
            if (value > 0) {
                activated.data[idx] = value;
                maskCache.data[idx] = 1.0f;
            } else {
                activated.data[idx] = 0;
                maskCache.data[idx] = 0.0f;
            }
        }

        gMemoryTracker.addActivations(activated.memorySize());
        clock.stop();
        gProfiler.addTime("ReLU_forward", clock.elapsedMs());
        return activated;
    }

    /// Returns gradOutput multiplied element-wise by the mask from the last forward().
    Tensor4D backward(const Tensor4D& gradOutput) {
        Timer clock;
        clock.start();

        Tensor4D masked(gradOutput.batch, gradOutput.height, gradOutput.width, gradOutput.channels);
        const size_t count = gradOutput.data.size();
        for (size_t idx = 0; idx < count; ++idx) {
            masked.data[idx] = gradOutput.data[idx] * maskCache.data[idx];
        }

        clock.stop();
        gProfiler.addTime("ReLU_backward", clock.elapsedMs());
        return masked;
    }
};

/**
 * @brief Max pooling over poolSize x poolSize windows, without padding.
 *
 * forward() remembers, for every output element, the input row and column
 * of the selected maximum; backward() routes each gradient to that position.
 * The positions are stored in DataType tensors and cast back to int on use.
 */
class MaxPool2D {
public:
    // Cached input size starts at 0 so backward() before forward() yields an
    // empty-spatial tensor instead of reading indeterminate values.
    int poolSize, stride, cachedH_in = 0, cachedW_in = 0;
    Tensor4D maxIndicesH, maxIndicesW;

    MaxPool2D(int size = 2, int str = 2) : poolSize(size), stride(str) {
        // stride is the divisor in the output-size formula used by forward().
        assert(stride > 0 && poolSize > 0);
    }

    /**
     * @brief Pools @p input.
     * @return Tensor of shape (N, (H_in - poolSize) / stride + 1,
     *         (W_in - poolSize) / stride + 1, C).
     */
    Tensor4D forward(const Tensor4D& input) {
        Timer timer; timer.start();
        int N = input.batch, H_in = input.height, W_in = input.width, C = input.channels;
        cachedH_in = H_in; cachedW_in = W_in;
        int H_out = (H_in - poolSize) / stride + 1, W_out = (W_in - poolSize) / stride + 1;
        Tensor4D output(N, H_out, W_out, C);
        maxIndicesH = Tensor4D(N, H_out, W_out, C); maxIndicesW = Tensor4D(N, H_out, W_out, C);
        gMemoryTracker.addActivations(output.memorySize());

        for (int n = 0; n < N; n++)
            for (int oh = 0; oh < H_out; oh++)
                for (int ow = 0; ow < W_out; ow++)
                    for (int c = 0; c < C; c++) {
                        DataType maxVal = -std::numeric_limits<DataType>::infinity();
                        // Default to the window's own top-left element, so that if no
                        // element compares greater (all NaN or -inf) the recorded
                        // position still lies inside this window.
                        int maxH = oh*stride, maxW = ow*stride;
                        for (int ph = 0; ph < poolSize; ph++)
                            for (int pw = 0; pw < poolSize; pw++) {
                                int ih = oh*stride + ph, iw = ow*stride + pw;
                                if (input.at(n,ih,iw,c) > maxVal) { maxVal = input.at(n,ih,iw,c); maxH = ih; maxW = iw; }
                            }
                        output.at(n,oh,ow,c) = maxVal;
                        maxIndicesH.at(n,oh,ow,c) = maxH; maxIndicesW.at(n,oh,ow,c) = maxW;
                    }
        timer.stop(); gProfiler.addTime("MaxPool2D_forward", timer.elapsedMs());
        return output;
    }

    /**
     * @brief Scatters @p gradOutput back to the positions chosen in forward().
     * @return Tensor with the spatial size of the last forward() input; positions
     *         that were not selected as a maximum stay zero.
     */
    Tensor4D backward(const Tensor4D& gradOutput) {
        Timer timer; timer.start();
        Tensor4D gradInput(gradOutput.batch, cachedH_in, cachedW_in, gradOutput.channels);
        gradInput.fill(0);
        for (int n = 0; n < gradOutput.batch; n++)
            for (int oh = 0; oh < gradOutput.height; oh++)
                for (int ow = 0; ow < gradOutput.width; ow++)
                    for (int c = 0; c < gradOutput.channels; c++) {
                        int mh = (int)maxIndicesH.at(n,oh,ow,c), mw = (int)maxIndicesW.at(n,oh,ow,c);
                        // += because overlapping windows (stride < poolSize) can pick the same element.
                        gradInput.at(n,mh,mw,c) += gradOutput.at(n,oh,ow,c);
                    }
        timer.stop(); gProfiler.addTime("MaxPool2D_backward", timer.elapsedMs());
        return gradInput;
    }
};

/**
 * @brief Upsampling that repeats each input element scaleFactor times along
 *        both spatial axes.
 *
 * Output element (oh, ow) copies input element (oh / scaleFactor, ow / scaleFactor);
 * backward() sums the gradients of all outputs that copied the same input.
 */
class UpSample2D {
public:
    int scaleFactor, cachedH_in, cachedW_in;
    UpSample2D(int scale = 2) : scaleFactor(scale) {}

    /// Returns a tensor of shape (N, H * scaleFactor, W * scaleFactor, C).
    Tensor4D forward(const Tensor4D& input) {
        Timer clock;
        clock.start();

        cachedH_in = input.height;
        cachedW_in = input.width;
        const int outH = input.height * scaleFactor;
        const int outW = input.width * scaleFactor;

        Tensor4D upsampled(input.batch, outH, outW, input.channels);
        gMemoryTracker.addActivations(upsampled.memorySize());

        for (int n = 0; n < input.batch; ++n) {
            for (int oh = 0; oh < outH; ++oh) {
                for (int ow = 0; ow < outW; ++ow) {
                    for (int c = 0; c < input.channels; ++c) {
                        const int srcH = oh / scaleFactor;
                        const int srcW = ow / scaleFactor;
                        upsampled.at(n, oh, ow, c) = input.at(n, srcH, srcW, c);
                    }
                }
            }
        }

        clock.stop();
        gProfiler.addTime("UpSample2D_forward", clock.elapsedMs());
        return upsampled;
    }

    /// Returns a tensor with the spatial size cached by the last forward() call.
    Tensor4D backward(const Tensor4D& gradOutput) {
        Timer clock;
        clock.start();

        Tensor4D reduced(gradOutput.batch, cachedH_in, cachedW_in, gradOutput.channels);
        reduced.fill(0);

        for (int n = 0; n < gradOutput.batch; ++n) {
            for (int oh = 0; oh < gradOutput.height; ++oh) {
                for (int ow = 0; ow < gradOutput.width; ++ow) {
                    for (int c = 0; c < gradOutput.channels; ++c) {
                        const int srcH = oh / scaleFactor;
                        const int srcW = ow / scaleFactor;
                        reduced.at(n, srcH, srcW, c) += gradOutput.at(n, oh, ow, c);
                    }
                }
            }
        }

        clock.stop();
        gProfiler.addTime("UpSample2D_backward", clock.elapsedMs());
        return reduced;
    }
};

/**
 * @brief Mean squared error over all tensor elements.
 *
 * forward() caches both arguments; backward() returns the gradient of the
 * loss with respect to the cached output.
 */
class MSELoss {
public:
    Tensor4D outputCache, targetCache;

    /**
     * @brief Computes mean((output - target)^2) over every element.
     *
     * @param output Predicted tensor.
     * @param target Reference tensor; must hold the same number of elements.
     * @return The mean squared error, or 0 when the tensors are empty.
     */
    DataType forward(const Tensor4D& output, const Tensor4D& target) {
        // Reading target.data[i] past its end would be undefined behaviour.
        assert(output.data.size() == target.data.size());
        Timer timer; timer.start();
        outputCache.copyFrom(output); targetCache.copyFrom(target);

        // Accumulate in double so that summing many small squared errors does
        // not lose precision to float rounding.
        double sumSq = 0.0;
        for (size_t i = 0; i < output.data.size(); i++) {
            const double diff = static_cast<double>(output.data[i]) - static_cast<double>(target.data[i]);
            sumSq += diff * diff;
        }
        timer.stop(); gProfiler.addTime("MSELoss_forward", timer.elapsedMs());

        // Avoid 0 / 0 (NaN) for empty tensors.
        if (output.data.empty()) return 0;
        return static_cast<DataType>(sumSq / output.data.size());
    }

    /**
     * @brief Gradient of the loss w.r.t. the output passed to the last forward().
     * @return Tensor shaped like the cached output holding 2 * (output - target) / count.
     */
    Tensor4D backward() {
        Timer timer; timer.start();
        Tensor4D grad(outputCache.batch, outputCache.height, outputCache.width, outputCache.channels);
        if (!outputCache.data.empty()) {
            DataType scale = 2.0f / outputCache.data.size();
            for (size_t i = 0; i < outputCache.data.size(); i++)
                grad.data[i] = scale * (outputCache.data[i] - targetCache.data[i]);
        }
        timer.stop(); gProfiler.addTime("MSELoss_backward", timer.elapsedMs());
        return grad;
    }
};

#endif

In [None]:
%%writefile src/autoencoder.h
#ifndef AUTOENCODER_H
#define AUTOENCODER_H

#include "common.h"
#include "layers_cpu.h"
#include "optimizer.h"

class Autoencoder {
public:
    Conv2D enc_conv1{3, 256, 3, 1, 1}, enc_conv2{256, 128, 3, 1, 1};
    ReLU enc_relu1, enc_relu2;
    MaxPool2D enc_pool1{2, 2}, enc_pool2{2, 2};
    Conv2D dec_conv1{128, 128, 3, 1, 1}, dec_conv2{128, 256, 3, 1, 1}, dec_conv3{256, 3, 3, 1, 1};
    ReLU dec_relu1, dec_relu2;
    UpSample2D dec_up1{2}, dec_up2{2};
    MSELoss lossFn;
    Tensor4D dec_conv3_out;

    Autoencoder() {
        std::cout << "Autoencoder initialized!\n";
        std::cout << "Architecture: Input(32,32,3) -> Conv(256) -> Pool -> Conv(128) -> Pool -> LATENT(8,8,128)\n";
        std::cout << "           -> Conv(128) -> Up -> Conv(256) -> Up -> Conv(3) -> Output(32,32,3)\n";
        std::cout << "Total params: ~751,875\n";
    }

    Tensor4D forward(const Tensor4D& input) {
        auto x = enc_relu1.forward(enc_conv1.forward(input));
        x = enc_pool1.forward(x);
        x = enc_relu2.forward(enc_conv2.forward(x));
        x = enc_pool2.forward(x);
        x = dec_relu1.forward(dec_conv1.forward(x));
        x = dec_up1.forward(x);
        x = dec_relu2.forward(dec_conv2.forward(x));
        x = dec_up2.forward(x);
        dec_conv3_out = dec_conv3.forward(x);
        return dec_conv3_out;
    }

    DataType backward(const Tensor4D& target) {
        DataType loss = lossFn.forward(dec_conv3_out, target);
        auto grad = lossFn.backward();
        grad = dec_conv3.backward(grad);
        grad = dec_up2.backward(grad); grad = dec_relu2.backward(grad); grad = dec_conv2.backward(grad);
        grad = dec_up1.backward(grad); grad = dec_relu1.backward(grad); grad = dec_conv1.backward(grad);
        grad = enc_pool2.backward(grad); grad = enc_relu2.backward(grad); grad = enc_conv2.backward(grad);
        grad = enc_pool1.backward(grad); grad = enc_relu1.backward(grad); enc_conv1.backward(grad);
        return loss;
    }

    void updateWeights(float lr) {
        enc_conv1.updateWeights(lr); enc_conv2.updateWeights(lr);
        dec_conv1.updateWeights(lr); dec_conv2.updateWeights(lr); dec_conv3.updateWeights(lr);
    }

    void updateWeightsAdam(AdamOptimizer& opt) {
        opt.update(enc_conv1.weights.data, enc_conv1.gradWeights.data, 0);
        opt.update(enc_conv1.bias, enc_conv1.gradBias, 1);
        opt.update(enc_conv2.weights.data, enc_conv2.gradWeights.data, 2);
        opt.update(enc_conv2.bias, enc_conv2.gradBias, 3);
        opt.update(dec_conv1.weights.data, dec_conv1.gradWeights.data, 4);
        opt.update(dec_conv1.bias, dec_conv1.gradBias, 5);
        opt.update(dec_conv2.weights.data, dec_conv2.gradWeights.data, 6);
        opt.update(dec_conv2.bias, dec_conv2.gradBias, 7);
        opt.update(dec_conv3.weights.data, dec_conv3.gradWeights.data, 8);
        opt.update(dec_conv3.bias, dec_conv3.gradBias, 9);
    }
};

#endif

In [None]:
%%writefile src/main.cpp
#include "common.h"
#include "cifar10.h"
#include "autoencoder.h"

// Definitions of the globals declared `extern` in common.h.
Profiler gProfiler;
MemoryTracker gMemoryTracker;

/**
 * @brief Trains the autoencoder on CIFAR-10 and prints timing and memory reports.
 *
 * Usage: autoencoder_cpu [data_path] [epochs] [batch_size] [max_samples] [optimizer]
 *   data_path   directory with the CIFAR-10 .bin files (default ./cifar-10-batches-bin)
 *   epochs      number of passes over the training data (default 3)
 *   batch_size  images per step, must be positive (default 32)
 *   max_samples cap on training images, 0 = all (default 0)
 *   optimizer   "sgd" selects plain SGD, anything else selects Adam (default Adam)
 *
 * @return 0 on success, 1 on invalid arguments or if the dataset cannot be loaded.
 */
int main(int argc, char* argv[]) {
    std::cout << "=== AUTOENCODER CPU BASELINE ===\n";

    // Single learning rate shared by both optimizers.
    const float learningRate = 0.001f;

    std::string dataPath = argc > 1 ? argv[1] : "./cifar-10-batches-bin";
    int epochs = 3, batchSize = 32, maxSamples = 0;
    try {
        if (argc > 2) epochs = std::stoi(argv[2]);
        if (argc > 3) batchSize = std::stoi(argv[3]);
        if (argc > 4) maxSamples = std::stoi(argv[4]);
    } catch (...) {
        // std::stoi throws on non-numeric or out-of-range text; report it as a
        // usage error instead of terminating on an uncaught exception.
        std::cerr << "Invalid numeric argument.\n"
                  << "Usage: " << argv[0] << " [data_path] [epochs] [batch_size] [max_samples] [optimizer]\n";
        return 1;
    }
    bool useAdam = argc > 5 ? (std::string(argv[5]) != "sgd") : true;

    // batchSize is used as a divisor below; zero would crash, negative would skip training.
    if (batchSize <= 0) {
        std::cerr << "batch_size must be a positive integer (got " << batchSize << ")\n";
        return 1;
    }

    std::cout << "Config: epochs=" << epochs << ", batch=" << batchSize
              << ", samples=" << (maxSamples > 0 ? std::to_string(maxSamples) : "ALL")
              << ", optimizer=" << (useAdam ? "Adam" : "SGD") << "\n\n";

    CIFAR10Dataset dataset;
    if (!dataset.load(dataPath, maxSamples)) return 1;

    Autoencoder model;
    AdamOptimizer adamOpt(learningRate);

    int numImages = dataset.getNumTrainImages();
    int numBatches = (numImages + batchSize - 1) / batchSize;  // ceiling division

    Timer totalTimer; totalTimer.start();

    for (int epoch = 0; epoch < epochs; epoch++) {
        Timer epochTimer; epochTimer.start();
        dataset.shuffleTrainData();
        double epochLoss = 0;

        for (int b = 0; b < numBatches; b++) {
            int start = b * batchSize;
            int currBatch = std::min(batchSize, numImages - start);  // last batch may be smaller

            auto batch = dataset.getTrainBatch(start, currBatch);
            model.forward(batch);
            epochLoss += model.backward(batch);  // the input is its own reconstruction target

            // step() must precede the update so that Adam's bias correction uses t >= 1.
            if (useAdam) { adamOpt.step(); model.updateWeightsAdam(adamOpt); }
            else model.updateWeights(learningRate);
        }

        epochTimer.stop();
        std::cout << "Epoch " << (epoch+1) << "/" << epochs
                  << " | Loss: " << std::fixed << std::setprecision(6) << (epochLoss/numBatches)
                  << " | Time: " << std::setprecision(2) << epochTimer.elapsedSec() << "s\n";
    }

    totalTimer.stop();
    std::cout << "\nTotal training time: " << totalTimer.elapsedSec() << " seconds\n";

    gProfiler.printReport();
    gMemoryTracker.printReport();

    return 0;
}

## 2. Download CIFAR-10 Dataset

In [None]:
# Download và extract CIFAR-10
!wget -q https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
!tar -xzf cifar-10-binary.tar.gz
!ls cifar-10-batches-bin/

## 3. Compile

In [None]:
# Compile with optimizations. The success message is printed only when g++
# exits with status 0; a failed build raises instead of being reported as OK.
import subprocess

compile_cmd = ["g++", "-std=c++17", "-O3", "-march=native", "-o", "autoencoder_cpu", "src/main.cpp"]
compile_result = subprocess.run(compile_cmd, capture_output=True, text=True)

# Show compiler diagnostics (warnings or errors), if any.
compiler_output = compile_result.stdout + compile_result.stderr
if compiler_output:
    print(compiler_output, end="")

if compile_result.returncode != 0:
    raise RuntimeError(f"Compilation failed with exit code {compile_result.returncode}")
print("✅ Compilation successful!")

## 4. Run Training

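Cú pháp: `./autoencoder_cpu [data_path] [epochs] [batch_size] [max_samples] [optimizer]` (mọi tham số đều tùy chọn; `max_samples = 0` nghĩa là dùng toàn bộ dữ liệu, `optimizer` là `adam` hoặc `sgd`)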

In [None]:
# Quick test: 100 samples, 2 epochs, Adam optimizer
!./autoencoder_cpu ./cifar-10-batches-bin 2 32 100 adam

In [None]:
# Same quick test with the SGD optimizer, for comparison
!./autoencoder_cpu ./cifar-10-batches-bin 2 32 100 sgd

In [None]:
# Train on more samples (500 samples, 3 epochs)
!./autoencoder_cpu ./cifar-10-batches-bin 3 32 500 adam

In [None]:
# ⚠️ FULL DATASET - very slow on CPU! (run only when needed)
# Uncomment the line below to run on the full dataset
# !./autoencoder_cpu ./cifar-10-batches-bin 5 32 0 adam

## 5. Phân tích kết quả

Sau khi chạy, bạn sẽ thấy:

### Profiler Report:
- **Conv2D_forward** và **Conv2D_backward** chiếm ~99% thời gian
- Đây là bottleneck cần tối ưu trong Phase 2 (GPU)

### Memory Report:
- Weights: ~3 MB
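- Activations: Phụ thuộc batch size; lưu ý con số in ra được cộng dồn qua mọi lần gọi `forward`, nên nó tăng theo số batch đã chạy chứ không phải mức bộ nhớ dùng tại một thời điểm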

### Kết luận cho GPU Optimization:
1. **Ưu tiên 1**: Conv2D operations (~99% time)
2. **Kỹ thuật**: im2col + GEMM, shared memory tiling
3. **Target speedup**: >20x so với CPU