In [1]:
%%writefile load_data.h
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TRAIN_NUM    50000
#define TEST_NUM     10000
#define IMG_SIZE     (32*32*3)     // 3072

typedef struct {
    float* train_images;   // [50000 * 3072]
    float* test_images;    // [10000 * 3072]
    uint8_t* train_labels; // [50000]
    uint8_t* test_labels;  // [10000]
    int* train_indices;
} Cifar10;

void load_cifar10(Cifar10* data);
void normalize_cifar10(Cifar10* data);
void shuffle_cifar10(Cifar10* data);
void get_next_batch(Cifar10* data, size_t batch_size, size_t batch_id, float* batch_images);
void print_cifar10(Cifar10* data);
void free_cifar10(Cifar10* data);

Writing load_data.h


In [2]:
%%writefile load_data.cu
#include "load_data.h"

static void read_batch(const char* filename, float* images_start, uint8_t* labels) {
    FILE* f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        exit(EXIT_FAILURE);
    }

    uint8_t buffer[3073];
    for (int i = 0; i < 10000; i++) {
        if (fread(buffer, 1, 3073, f) != 3073) {
            fprintf(stderr, "Error: incomplete read in %s at image %d\n", filename, i);
            fclose(f);
            exit(EXIT_FAILURE);
        }
        labels[i] = buffer[0];
        for (int j = 0; j < 3072; j++) {
            images_start[i * 3072 + j] = (float)buffer[1 + j];  //Covert unit8 to float
        }
    }
    fclose(f);
}

void load_cifar10(Cifar10* data) {
    data->train_images = (float*)malloc(TRAIN_NUM * IMG_SIZE * sizeof(float));
    data->test_images  = (float*)malloc(TEST_NUM  * IMG_SIZE * sizeof(float));
    data->train_labels = (uint8_t*)malloc(TRAIN_NUM * sizeof(uint8_t));
    data->test_labels  = (uint8_t*)malloc(TEST_NUM  * sizeof(uint8_t));

    if (!data->train_images || !data->test_images ||
        !data->train_labels  || !data->test_labels) {
        fprintf(stderr, "ERROR: Memory allocation failed!\n");
        exit(EXIT_FAILURE);
    }

    data->train_indices = (int*)malloc(TRAIN_NUM * sizeof(int));
    for (int i = 0; i < TRAIN_NUM; i++) {
        data->train_indices[i] = i;
    }

    //Load training data
    for (int i = 1; i <= 5; i++) {
        char filename[100];
        snprintf(filename, sizeof(filename), "cifar-10-batches-bin/data_batch_%d.bin", i);
        read_batch(filename,
                   data->train_images + (i-1) * 10000 * IMG_SIZE,
                   data->train_labels + (i-1) * 10000);
    }

    //Load test data
    read_batch("cifar-10-batches-bin/test_batch.bin",
               data->test_images, data->test_labels);

    printf("CIFAR-10 loaded successfully\n");
}

void normalize_cifar10(Cifar10* data) {
    for (size_t i = 0; i < TRAIN_NUM * IMG_SIZE; i++) {
        data->train_images[i] /= 255.0f;
    }
    for (size_t i = 0; i < TEST_NUM * IMG_SIZE; i++) {
        data->test_images[i] /= 255.0f;
    }
}

// Shuffle indices
void shuffle_cifar10(Cifar10* data) {
    for (int i = TRAIN_NUM - 1; i > 0; i--) {
        int j = rand() % (i + 1);
        int temp = data->train_indices[i];
        data->train_indices[i] = data->train_indices[j];
        data->train_indices[j] = temp;
    }
}

void get_next_batch(Cifar10* data, size_t batch_size, size_t batch_id, float* batch_images) {
    size_t start = batch_id * batch_size;
    for (size_t i = 0; i < batch_size; i++) {
        int idx = data->train_indices[start + i];

        memcpy(batch_images + i * IMG_SIZE,
               data->train_images + idx * IMG_SIZE,
               IMG_SIZE * sizeof(float));
    }
}

void print_cifar10(Cifar10* data){
    for (int i = 0; i < 2; i++) {
        printf("Label: %d\n", data->train_labels[i]);
        for (int j = 0; j < IMG_SIZE; j++) {
            printf("%f ", data->train_images[i*IMG_SIZE + j]);
        }
        printf("\n");
    }
}

void free_cifar10(Cifar10* data) {
    free(data->train_images);
    free(data->test_images);
    free(data->train_labels);
    free(data->test_labels);
    free(data->train_indices);

    data->train_images = data->test_images = NULL;
    data->train_labels = data->test_labels = NULL;
    data->train_indices = NULL;
}

Writing load_data.cu


In [3]:
%%writefile gpu_layers.h
#pragma once
#include <cuda_runtime.h>
#include <stdio.h>

#define CHECK_CUDA(call)                                                \
    do {                                                                \
        cudaError_t err = call;                                         \
        if (err != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA Error %s:%d: %s\n",                   \
                    __FILE__, __LINE__, cudaGetErrorString(err));       \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

// NCHW layout: [N, C, H, W]
__device__ __host__ inline int idx4(int n, int c, int h, int w,
                                    int C, int H, int W) {
    return ((n * C + c) * H + h) * W + w;
}

// ==== KERNEL DECLARATIONS ====

__global__ void conv2d_forward_naive(
    const float* __restrict__ input,    // [N, C_in, H, W]
    const float* __restrict__ weight,   // [C_out, C_in, K, K]
    const float* __restrict__ bias,     // [C_out]
    float* __restrict__ output,         // [N, C_out, H_out, W_out]
    int N, int C_in, int H, int W,
    int C_out, int K, int pad, int stride);

__global__ void relu_forward(float* x, int size);

__global__ void maxpool2x2_forward(
    const float* __restrict__ input,  // [N, C, H, W]
    float* __restrict__ output,       // [N, C, H/2, W/2]
    int N, int C, int H, int W);

__global__ void upsample2x2_forward(
    const float* __restrict__ input,  // [N, C, H, W]
    float* __restrict__ output,       // [N, C, 2H, 2W]
    int N, int C, int H, int W);

__global__ void mse_loss_forward(
    const float* __restrict__ output,
    const float* __restrict__ target,
    float* __restrict__ loss,   // single float on device
    int size);

__global__ void relu_backward(
    const float* __restrict__ x,       // forward output/input to ReLU
    const float* __restrict__ grad_y,  // dL/dy
    float* __restrict__ grad_x,        // dL/dx
    int size);

__global__ void maxpool2x2_backward(
    const float* __restrict__ input,
    const float* __restrict__ grad_out,
    float* __restrict__ grad_in,
    int N, int C, int H, int W);

__global__ void upsample2x2_backward(
    const float* __restrict__ grad_out,
    float* __restrict__ grad_in,
    int N, int C, int H, int W);

__global__ void mse_loss_backward(
    const float* __restrict__ output,
    const float* __restrict__ target,
    float* __restrict__ grad_out,
    int size);

__global__ void conv2d_backward_input_naive(
    const float* __restrict__ dY,
    const float* __restrict__ weight,
    float* __restrict__ dX,
    int N, int C_in, int H, int W,
    int C_out, int K, int pad, int stride);

__global__ void conv2d_backward_weight_naive(
    const float* __restrict__ input,
    const float* __restrict__ dY,
    float* __restrict__ dW,
    int N, int C_in, int H, int W,
    int C_out, int K, int pad, int stride);

__global__ void conv2d_backward_bias_naive(
    const float* __restrict__ dY,
    float* __restrict__ dB,
    int N, int C_out, int H_out, int W_out);

__global__ void sgd_update(
    float* __restrict__ param,
    const float* __restrict__ grad,
    int size,
    float lr);


Writing gpu_layers.h


In [4]:
%%writefile gpu_layers.cu
#include "gpu_layers.h"

// --------------- Conv2D forward ------------------
__global__ void conv2d_forward_naive(
    const float* __restrict__ input,
    const float* __restrict__ weight,
    const float* __restrict__ bias,
    float* __restrict__ output,
    int N, int C_in, int H, int W,
    int C_out, int K, int pad, int stride)
{
    int H_out = (H + 2 * pad - K) / stride + 1;
    int W_out = (W + 2 * pad - K) / stride + 1;

    int w_out = blockIdx.x * blockDim.x + threadIdx.x;
    int h_out = blockIdx.y * blockDim.y + threadIdx.y;
    int nc    = blockIdx.z;

    if (w_out >= W_out || h_out >= H_out) return;

    int n      = nc / C_out;
    int c_out  = nc % C_out;
    if (n >= N) return;

    float sum = bias ? bias[c_out] : 0.0f;

    for (int c_in = 0; c_in < C_in; ++c_in) {
        for (int kh = 0; kh < K; ++kh) {
            for (int kw = 0; kw < K; ++kw) {
                int h_in = h_out * stride + kh - pad;
                int w_in = w_out * stride + kw - pad;
                if (h_in < 0 || h_in >= H || w_in < 0 || w_in >= W)
                    continue;

                int in_idx = ((n * C_in + c_in) * H + h_in) * W + w_in;
                int w_idx = (((c_out * C_in + c_in) * K) + kh) * K + kw;
                sum += weight[w_idx] * input[in_idx];
            }
        }
    }
    int out_idx = ((n * C_out + c_out) * H_out + h_out) * W_out + w_out;
    output[out_idx] = sum;
}

// --------------- ReLU ------------------
__global__ void relu_forward(float* x, int size)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        float v = x[i];
        x[i] = v > 0.0f ? v : 0.0f;
    }
}

// --------------- MaxPool 2x2 ------------------
__global__ void maxpool2x2_forward(
    const float* __restrict__ input,
    float* __restrict__ output,
    int N, int C, int H, int W)
{
    int H_out = H / 2;
    int W_out = W / 2;

    int w_out = blockIdx.x * blockDim.x + threadIdx.x;
    int h_out = blockIdx.y * blockDim.y + threadIdx.y;
    int nc    = blockIdx.z;

    if (w_out >= W_out || h_out >= H_out) return;

    int n = nc / C;
    int c = nc % C;
    if (n >= N) return;

    int h_in0 = h_out * 2;
    int w_in0 = w_out * 2;

    float m = -1e30f;
    for (int dh = 0; dh < 2; ++dh) {
        for (int dw = 0; dw < 2; ++dw) {
            int h_in = h_in0 + dh;
            int w_in = w_in0 + dw;
            int idx = idx4(n, c, h_in, w_in, C, H, W);
            float v = input[idx];
            if (v > m) m = v;
        }
    }

    int out_idx = idx4(n, c, h_out, w_out, C, H_out, W_out);
    output[out_idx] = m;
}

// --------------- UpSample 2x2 ------------------
__global__ void upsample2x2_forward(
    const float* __restrict__ input,
    float* __restrict__ output,
    int N, int C, int H, int W)
{
    int H_out = H * 2;
    int W_out = W * 2;

    int w_out = blockIdx.x * blockDim.x + threadIdx.x;
    int h_out = blockIdx.y * blockDim.y + threadIdx.y;
    int nc    = blockIdx.z;

    if (w_out >= W_out || h_out >= H_out) return;

    int n = nc / C;
    int c = nc % C;
    if (n >= N) return;

    int h_in = h_out / 2;
    int w_in = w_out / 2;

    int idx_in  = idx4(n, c, h_in, w_in, C, H, W);
    int idx_out = idx4(n, c, h_out, w_out, C, H_out, W_out);
    output[idx_out] = input[idx_in];
}

// --------------- MSE loss ------------------
__global__ void mse_loss_forward(
    const float* __restrict__ output,
    const float* __restrict__ target,
    float* __restrict__ loss,
    int size)
{
    extern __shared__ float sdata[];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    float val = 0.0f;
    if (idx < size) {
        float diff = output[idx] - target[idx];
        val = diff * diff;
    }
    sdata[tid] = val;
    __syncthreads();

    // reduce trong block
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) {
        atomicAdd(loss, sdata[0]);
    }
}

// --------------- ReLU backward ------------------
__global__ void relu_backward(
    const float* __restrict__ x,
    const float* __restrict__ grad_y,
    float* __restrict__ grad_x,
    int size)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        float v = x[i];
        grad_x[i] = (v > 0.0f) ? grad_y[i] : 0.0f;
    }
}

// --------------- MaxPool 2x2 backward ------------------
__global__ void maxpool2x2_backward(
    const float* __restrict__ input,
    const float* __restrict__ grad_out,
    float* __restrict__ grad_in,
    int N, int C, int H, int W)
{
    int H_out = H / 2;
    int W_out = W / 2;

    int w_out = blockIdx.x * blockDim.x + threadIdx.x;
    int h_out = blockIdx.y * blockDim.y + threadIdx.y;
    int nc    = blockIdx.z;

    if (w_out >= W_out || h_out >= H_out) return;

    int n = nc / C;
    int c = nc % C;
    if (n >= N) return;

    int h_in0 = h_out * 2;
    int w_in0 = w_out * 2;

    int idx00 = idx4(n, c, h_in0 + 0, w_in0 + 0, C, H, W);
    int idx01 = idx4(n, c, h_in0 + 0, w_in0 + 1, C, H, W);
    int idx10 = idx4(n, c, h_in0 + 1, w_in0 + 0, C, H, W);
    int idx11 = idx4(n, c, h_in0 + 1, w_in0 + 1, C, H, W);

    float v00 = input[idx00];
    float v01 = input[idx01];
    float v10 = input[idx10];
    float v11 = input[idx11];

    float g = grad_out[idx4(n, c, h_out, w_out, C, H_out, W_out)];

    // Tìm max
    float m = v00;
    int max_idx = 0;
    if (v01 > m) { m = v01; max_idx = 1; }
    if (v10 > m) { m = v10; max_idx = 2; }
    if (v11 > m) { m = v11; max_idx = 3; }

    // Chỉ ghi vào vị trí max (grad_in đã được zero trước đó)
    // Mỗi pooling window độc lập, không có conflict
    if (max_idx == 0) grad_in[idx00] = g;
    else if (max_idx == 1) grad_in[idx01] = g;
    else if (max_idx == 2) grad_in[idx10] = g;
    else grad_in[idx11] = g;
}

// --------------- UpSample 2x2 backward ------------------
__global__ void upsample2x2_backward(
    const float* __restrict__ grad_out,
    float* __restrict__ grad_in,
    int N, int C, int H, int W)
{
    int H_out = H * 2;
    int W_out = W * 2;

    int w_in = blockIdx.x * blockDim.x + threadIdx.x;
    int h_in = blockIdx.y * blockDim.y + threadIdx.y;
    int nc   = blockIdx.z;

    if (w_in >= W || h_in >= H) return;

    int n = nc / C;
    int c = nc % C;
    if (n >= N) return;

    int h_out0 = h_in * 2;
    int w_out0 = w_in * 2;

    float sum = 0.0f;
    for (int dh = 0; dh < 2; ++dh) {
        for (int dw = 0; dw < 2; ++dw) {
            int h_out = h_out0 + dh;
            int w_out = w_out0 + dw;
            int idx_o = idx4(n, c, h_out, w_out, C, H_out, W_out);
            sum += grad_out[idx_o];
        }
    }

    int idx_in = idx4(n, c, h_in, w_in, C, H, W);
    grad_in[idx_in] = sum;
}

// --------------- MSE loss backward ------------------
__global__ void mse_loss_backward(
    const float* __restrict__ output,
    const float* __restrict__ target,
    float* __restrict__ grad_out,
    int size)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= size) return;
    grad_out[i] = 2.0f * (output[i] - target[i]) / size;
}

// --------------- Conv2D backward: dX ------------------
__global__ void conv2d_backward_input_naive(
    const float* __restrict__ dY,
    const float* __restrict__ weight,
    float* __restrict__ dX,
    int N, int C_in, int H, int W,
    int C_out, int K, int pad, int stride)
{
    int H_out = (H + 2 * pad - K) / stride + 1;
    int W_out = (W + 2 * pad - K) / stride + 1;

    int w = blockIdx.x * blockDim.x + threadIdx.x;
    int h = blockIdx.y * blockDim.y + threadIdx.y;
    int nc = blockIdx.z;

    if (w >= W || h >= H) return;

    int n = nc / C_in;
    int c_in = nc % C_in;
    if (n >= N) return;

    float sum = 0.0f;
    for (int c_out = 0; c_out < C_out; ++c_out) {
        for (int kh = 0; kh < K; ++kh) {
            for (int kw = 0; kw < K; ++kw) {
                int h_out = h + pad - kh;
                int w_out = w + pad - kw;

                if (h_out % stride != 0 || w_out % stride != 0) continue;

                h_out /= stride;
                w_out /= stride;

                if (h_out < 0 || h_out >= H_out ||
                    w_out < 0 || w_out >= W_out)
                    continue;

                int dy_idx = idx4(n, c_out, h_out, w_out,
                                  C_out, H_out, W_out);

                int kh_flip = K - 1 - kh;
                int kw_flip = K - 1 - kw;
                int w_idx = (((c_out * C_in + c_in) * K) + kh_flip) * K + kw_flip;
                sum += dY[dy_idx] * weight[w_idx];
            }
        }
    }

    int dx_idx = idx4(n, c_in, h, w, C_in, H, W);
    dX[dx_idx] = sum;
}

// --------------- Conv2D backward: dW ------------------
// Mỗi thread tính toàn bộ gradient cho 1 weight element
// Không có conflict vì mỗi thread ghi vào vị trí riêng biệt
__global__ void conv2d_backward_weight_naive(
    const float* __restrict__ input,
    const float* __restrict__ dY,
    float* __restrict__ dW,
    int N, int C_in, int H, int W,
    int C_out, int K, int pad, int stride)
{
    int H_out = (H + 2 * pad - K) / stride + 1;
    int W_out = (W + 2 * pad - K) / stride + 1;

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total = C_out * C_in * K * K;
    if (idx >= total) return;

    int kw = idx % K;
    int tmp = idx / K;
    int kh = tmp % K;
    tmp /= K;
    int c_in = tmp % C_in;
    int c_out = tmp / C_in;

    float sum = 0.0f;
    for (int n = 0; n < N; ++n) {
        for (int h_out = 0; h_out < H_out; ++h_out) {
            for (int w_out = 0; w_out < W_out; ++w_out) {
                int h_in = h_out * stride + kh - pad;
                int w_in = w_out * stride + kw - pad;
                if (h_in < 0 || h_in >= H || w_in < 0 || w_in >= W)
                    continue;

                int in_idx = idx4(n, c_in, h_in, w_in, C_in, H, W);
                int dy_idx = idx4(n, c_out, h_out, w_out,
                                  C_out, H_out, W_out);
                sum += dY[dy_idx] * input[in_idx];
            }
        }
    }
    dW[idx] += sum;
}

// --------------- Conv2D backward: dB ------------------
// Mỗi thread tính gradient cho 1 bias element
__global__ void conv2d_backward_bias_naive(
    const float* __restrict__ dY,
    float* __restrict__ dB,
    int N, int C_out, int H_out, int W_out)
{
    int c_out = blockIdx.x * blockDim.x + threadIdx.x;
    if (c_out >= C_out) return;

    float sum = 0.0f;
    for (int n = 0; n < N; ++n) {
        for (int h = 0; h < H_out; ++h) {
            for (int w = 0; w < W_out; ++w) {
                int idx = idx4(n, c_out, h, w, C_out, H_out, W_out);
                sum += dY[idx];
            }
        }
    }
    dB[c_out] += sum;
}

// --------------- SGD update ------------------
__global__ void sgd_update(
    float* __restrict__ param,
    const float* __restrict__ grad,
    int size,
    float lr)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        param[i] -= lr * grad[i];
    }
}

Writing gpu_layers.cu


In [5]:
%%writefile gpu_autoencoder.h
// header for GPUAutoencoder (the struct + declarations)
#pragma once
#include "gpu_layers.h"

// latent 128 x 8 x 8
static const int AE_LATENT_C   = 128;
static const int AE_LATENT_H   = 8;
static const int AE_LATENT_W   = 8;
static const int AE_LATENT_DIM = AE_LATENT_C * AE_LATENT_H * AE_LATENT_W;

// This autoencoder matches the project architecture exactly.
// Layout: NCHW [batch, channels, height, width]
struct GPUAutoencoder {
    int N;   // batch size
    int H;   // 32
    int W;   // 32;

    // --- Conv layer parameters ---
    // conv1: 3 -> 256 (3x3)
    float *d_w1, *d_b1;
    // conv2: 256 -> 128
    float *d_w2, *d_b2;
    // conv3: 128 -> 128
    float *d_w3, *d_b3;
    // conv4: 128 -> 256
    float *d_w4, *d_b4;
    // conv5: 256 -> 3
    float *d_w5, *d_b5;

    // --- Activations ---
    // Input batch
    float *d_x0;   // [N, 3, 32, 32]

    // Encoder
    float *d_h1;   // conv1 out: [N, 256, 32, 32]
    float *d_p1;   // pool1   : [N, 256, 16, 16]
    float *d_h2;   // conv2   : [N, 128, 16, 16]
    float *d_p2;   // pool2   : [N, 128,  8,  8]   (latent)

    // Decoder
    float *d_h3;   // conv3   : [N, 128,  8,  8]
    float *d_u1;   // up1     : [N, 128, 16, 16]
    float *d_h4;   // conv4   : [N, 256, 16, 16]
    float *d_u2;   // up2     : [N, 256, 32, 32]
    float *d_out;  // conv5   : [N,   3, 32, 32]

    // Loss buffer
    float *d_loss; // single float on device

    // ---- gradients for activations ----
    float *d_gx0;
    float *d_gh1;
    float *d_gp1;
    float *d_gh2;
    float *d_gp2;
    float *d_gh3;
    float *d_gu1;
    float *d_gh4;
    float *d_gu2;
    float *d_gout;

    // ---- gradients for weights ----
    float *d_gw1, *d_gb1;
    float *d_gw2, *d_gb2;
    float *d_gw3, *d_gb3;
    float *d_gw4, *d_gb4;
    float *d_gw5, *d_gb5;
};

// API
void gpu_autoencoder_init(GPUAutoencoder *ae, int batch_size);
void gpu_autoencoder_free(GPUAutoencoder *ae);

void gpu_autoencoder_copy_weights_to_host(
    GPUAutoencoder *ae,
    float *h_w1, float *h_b1,
    float *h_w2, float *h_b2,
    float *h_w3, float *h_b3,
    float *h_w4, float *h_b4,
    float *h_w5, float *h_b5);

void gpu_autoencoder_copy_weights_to_device(
    GPUAutoencoder *ae,
    const float *h_w1, const float *h_b1,
    const float *h_w2, const float *h_b2,
    const float *h_w3, const float *h_b3,
    const float *h_w4, const float *h_b4,
    const float *h_w5, const float *h_b5);


// Forward on GPU:
//   h_input  : host pointer [N * 3 * 32 * 32]
//   h_output : host pointer [N * 3 * 32 * 32]
//   returns loss value (MSE(x_hat, x)) if compute_loss=true;
//   otherwise returns 0.0f.
float gpu_autoencoder_forward(
    GPUAutoencoder *ae,
    const float *h_input,
    float *h_output,
    bool compute_loss = true);

void gpu_autoencoder_backward(GPUAutoencoder *ae, float lr);

void gpu_autoencoder_save_weights(GPUAutoencoder *ae, const char *filename);

void gpu_autoencoder_load_weights(GPUAutoencoder *ae, const char *filename);

// encode only: lấy latent [N_batch, 128, 8, 8] -> h_latent [N_batch, AE_LATENT_DIM]
void gpu_autoencoder_encode_batch(
    GPUAutoencoder *ae,
    const float *h_input,
    float *h_latent,
    int N_batch);


Writing gpu_autoencoder.h


In [6]:
%%writefile gpu_autoencoder.cu
// your GPUAutoencoder implementation
#include <cstdlib>
#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>
#include "gpu_layers.h"
#include "gpu_autoencoder.h"

static inline float rand_uniform(float min_val, float max_val) {
    float r = (float)rand() / (float)RAND_MAX;
    return min_val + r * (max_val - min_val);
}


void gpu_autoencoder_init(GPUAutoencoder *ae, int batch_size) {
    ae->N = batch_size;
    ae->H = 32;
    ae->W = 32;

    const int N = ae->N;
    const int H = ae->H;
    const int W = ae->W;

    // ---------- allocate weights ----------
    const int K = 3;

    int C_in1 = 3,   C_out1 = 256;
    int C_in2 = 256, C_out2 = 128;
    int C_in3 = 128, C_out3 = 128;
    int C_in4 = 128, C_out4 = 256;
    int C_in5 = 256, C_out5 = 3;

    size_t w1_bytes = C_out1 * C_in1 * K * K * sizeof(float);
    size_t b1_bytes = C_out1 * sizeof(float);
    size_t w2_bytes = C_out2 * C_in2 * K * K * sizeof(float);
    size_t b2_bytes = C_out2 * sizeof(float);
    size_t w3_bytes = C_out3 * C_in3 * K * K * sizeof(float);
    size_t b3_bytes = C_out3 * sizeof(float);
    size_t w4_bytes = C_out4 * C_in4 * K * K * sizeof(float);
    size_t b4_bytes = C_out4 * sizeof(float);
    size_t w5_bytes = C_out5 * C_in5 * K * K * sizeof(float);
    size_t b5_bytes = C_out5 * sizeof(float);

    CHECK_CUDA(cudaMalloc(&ae->d_w1, w1_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_b1, b1_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_w2, w2_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_b2, b2_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_w3, w3_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_b3, b3_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_w4, w4_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_b4, b4_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_w5, w5_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_b5, b5_bytes));

    // init weights on host
    // find max weight bytes
    size_t max_w_bytes = w1_bytes;
    if (w2_bytes > max_w_bytes) max_w_bytes = w2_bytes;
    if (w3_bytes > max_w_bytes) max_w_bytes = w3_bytes;
    if (w4_bytes > max_w_bytes) max_w_bytes = w4_bytes;
    if (w5_bytes > max_w_bytes) max_w_bytes = w5_bytes;

    // find max bias bytes
    size_t max_b_bytes = b1_bytes;
    if (b2_bytes > max_b_bytes) max_b_bytes = b2_bytes;
    if (b3_bytes > max_b_bytes) max_b_bytes = b3_bytes;
    if (b4_bytes > max_b_bytes) max_b_bytes = b4_bytes;
    if (b5_bytes > max_b_bytes) max_b_bytes = b5_bytes;

    float *h_w = (float*)malloc(max_w_bytes);
    float *h_b = (float*)malloc(max_b_bytes);

    auto init_wb = [&](float *d_w, size_t w_bytes, float *d_b, size_t b_bytes) {
        size_t w_cnt = w_bytes / sizeof(float);
        size_t b_cnt = b_bytes / sizeof(float);
        for (size_t i = 0; i < w_cnt; ++i) h_w[i] = rand_uniform(-0.05f, 0.05f);
        for (size_t i = 0; i < b_cnt; ++i) h_b[i] = rand_uniform(-0.05f, 0.05f);
        CHECK_CUDA(cudaMemcpy(d_w, h_w, w_bytes, cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(d_b, h_b, b_bytes, cudaMemcpyHostToDevice));
    };

    init_wb(ae->d_w1, w1_bytes, ae->d_b1, b1_bytes);
    init_wb(ae->d_w2, w2_bytes, ae->d_b2, b2_bytes);
    init_wb(ae->d_w3, w3_bytes, ae->d_b3, b3_bytes);
    init_wb(ae->d_w4, w4_bytes, ae->d_b4, b4_bytes);
    init_wb(ae->d_w5, w5_bytes, ae->d_b5, b5_bytes);

    free(h_w);
    free(h_b);

    // ---------- allocate activations ----------
    size_t bytes_x0  = N * 3   * 32 * 32 * sizeof(float);
    size_t bytes_h1  = N * 256 * 32 * 32 * sizeof(float);
    size_t bytes_p1  = N * 256 * 16 * 16 * sizeof(float);
    size_t bytes_h2  = N * 128 * 16 * 16 * sizeof(float);
    size_t bytes_p2  = N * 128 *  8 *  8 * sizeof(float);
    size_t bytes_h3  = N * 128 *  8 *  8 * sizeof(float);
    size_t bytes_u1  = N * 128 * 16 * 16 * sizeof(float);
    size_t bytes_h4  = N * 256 * 16 * 16 * sizeof(float);
    size_t bytes_u2  = N * 256 * 32 * 32 * sizeof(float);
    size_t bytes_out = N * 3   * 32 * 32 * sizeof(float);

    CHECK_CUDA(cudaMalloc(&ae->d_x0,  bytes_x0));
    CHECK_CUDA(cudaMalloc(&ae->d_h1,  bytes_h1));
    CHECK_CUDA(cudaMalloc(&ae->d_p1,  bytes_p1));
    CHECK_CUDA(cudaMalloc(&ae->d_h2,  bytes_h2));
    CHECK_CUDA(cudaMalloc(&ae->d_p2,  bytes_p2));
    CHECK_CUDA(cudaMalloc(&ae->d_h3,  bytes_h3));
    CHECK_CUDA(cudaMalloc(&ae->d_u1,  bytes_u1));
    CHECK_CUDA(cudaMalloc(&ae->d_h4,  bytes_h4));
    CHECK_CUDA(cudaMalloc(&ae->d_u2,  bytes_u2));
    CHECK_CUDA(cudaMalloc(&ae->d_out, bytes_out));

    // loss buffer
    CHECK_CUDA(cudaMalloc(&ae->d_loss, sizeof(float)));

    // ---------- allocate activation gradients ----------
    CHECK_CUDA(cudaMalloc(&ae->d_gx0,  bytes_x0));
    CHECK_CUDA(cudaMalloc(&ae->d_gh1,  bytes_h1));
    CHECK_CUDA(cudaMalloc(&ae->d_gp1,  bytes_p1));
    CHECK_CUDA(cudaMalloc(&ae->d_gh2,  bytes_h2));
    CHECK_CUDA(cudaMalloc(&ae->d_gp2,  bytes_p2));
    CHECK_CUDA(cudaMalloc(&ae->d_gh3,  bytes_h3));
    CHECK_CUDA(cudaMalloc(&ae->d_gu1,  bytes_u1));
    CHECK_CUDA(cudaMalloc(&ae->d_gh4,  bytes_h4));
    CHECK_CUDA(cudaMalloc(&ae->d_gu2,  bytes_u2));
    CHECK_CUDA(cudaMalloc(&ae->d_gout, bytes_out));

    // ---------- allocate weight gradients ----------
    CHECK_CUDA(cudaMalloc(&ae->d_gw1, w1_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gb1, b1_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gw2, w2_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gb2, b2_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gw3, w3_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gb3, b3_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gw4, w4_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gb4, b4_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gw5, w5_bytes));
    CHECK_CUDA(cudaMalloc(&ae->d_gb5, b5_bytes));
}

void gpu_autoencoder_free(GPUAutoencoder *ae) {
    // weights
    cudaFree(ae->d_w1); cudaFree(ae->d_b1);
    cudaFree(ae->d_w2); cudaFree(ae->d_b2);
    cudaFree(ae->d_w3); cudaFree(ae->d_b3);
    cudaFree(ae->d_w4); cudaFree(ae->d_b4);
    cudaFree(ae->d_w5); cudaFree(ae->d_b5);

    // activations
    cudaFree(ae->d_x0);
    cudaFree(ae->d_h1);
    cudaFree(ae->d_p1);
    cudaFree(ae->d_h2);
    cudaFree(ae->d_p2);
    cudaFree(ae->d_h3);
    cudaFree(ae->d_u1);
    cudaFree(ae->d_h4);
    cudaFree(ae->d_u2);
    cudaFree(ae->d_out);

    cudaFree(ae->d_loss);

    // activation gradients
    cudaFree(ae->d_gx0);
    cudaFree(ae->d_gh1);
    cudaFree(ae->d_gp1);
    cudaFree(ae->d_gh2);
    cudaFree(ae->d_gp2);
    cudaFree(ae->d_gh3);
    cudaFree(ae->d_gu1);
    cudaFree(ae->d_gh4);
    cudaFree(ae->d_gu2);
    cudaFree(ae->d_gout);

    // weight gradients
    cudaFree(ae->d_gw1); cudaFree(ae->d_gb1);
    cudaFree(ae->d_gw2); cudaFree(ae->d_gb2);
    cudaFree(ae->d_gw3); cudaFree(ae->d_gb3);
    cudaFree(ae->d_gw4); cudaFree(ae->d_gb4);
    cudaFree(ae->d_gw5); cudaFree(ae->d_gb5);
}

void gpu_autoencoder_copy_weights_to_host(
    GPUAutoencoder *ae,
    float *h_w1, float *h_b1,
    float *h_w2, float *h_b2,
    float *h_w3, float *h_b3,
    float *h_w4, float *h_b4,
    float *h_w5, float *h_b5)
{
    const int K = 3;
    int C_in1 = 3,   C_out1 = 256;
    int C_in2 = 256, C_out2 = 128;
    int C_in3 = 128, C_out3 = 128;
    int C_in4 = 128, C_out4 = 256;
    int C_in5 = 256, C_out5 = 3;

    size_t w1_bytes = C_out1 * C_in1 * K * K * sizeof(float);
    size_t b1_bytes = C_out1 * sizeof(float);
    size_t w2_bytes = C_out2 * C_in2 * K * K * sizeof(float);
    size_t b2_bytes = C_out2 * sizeof(float);
    size_t w3_bytes = C_out3 * C_in3 * K * K * sizeof(float);
    size_t b3_bytes = C_out3 * sizeof(float);
    size_t w4_bytes = C_out4 * C_in4 * K * K * sizeof(float);
    size_t b4_bytes = C_out4 * sizeof(float);
    size_t w5_bytes = C_out5 * C_in5 * K * K * sizeof(float);
    size_t b5_bytes = C_out5 * sizeof(float);

    CHECK_CUDA(cudaMemcpy(h_w1, ae->d_w1, w1_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_b1, ae->d_b1, b1_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_w2, ae->d_w2, w2_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_b2, ae->d_b2, b2_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_w3, ae->d_w3, w3_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_b3, ae->d_b3, b3_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_w4, ae->d_w4, w4_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_b4, ae->d_b4, b4_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_w5, ae->d_w5, w5_bytes, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(h_b5, ae->d_b5, b5_bytes, cudaMemcpyDeviceToHost));
}

void gpu_autoencoder_copy_weights_to_device(
    GPUAutoencoder *ae,
    const float *h_w1, const float *h_b1,
    const float *h_w2, const float *h_b2,
    const float *h_w3, const float *h_b3,
    const float *h_w4, const float *h_b4,
    const float *h_w5, const float *h_b5)
{
    const int K = 3;
    int C_in1 = 3,   C_out1 = 256;
    int C_in2 = 256, C_out2 = 128;
    int C_in3 = 128, C_out3 = 128;
    int C_in4 = 128, C_out4 = 256;
    int C_in5 = 256, C_out5 = 3;

    size_t w1_bytes = C_out1 * C_in1 * K * K * sizeof(float);
    size_t b1_bytes = C_out1 * sizeof(float);
    size_t w2_bytes = C_out2 * C_in2 * K * K * sizeof(float);
    size_t b2_bytes = C_out2 * sizeof(float);
    size_t w3_bytes = C_out3 * C_in3 * K * K * sizeof(float);
    size_t b3_bytes = C_out3 * sizeof(float);
    size_t w4_bytes = C_out4 * C_in4 * K * K * sizeof(float);
    size_t b4_bytes = C_out4 * sizeof(float);
    size_t w5_bytes = C_out5 * C_in5 * K * K * sizeof(float);
    size_t b5_bytes = C_out5 * sizeof(float);

    CHECK_CUDA(cudaMemcpy(ae->d_w1, h_w1, w1_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_b1, h_b1, b1_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_w2, h_w2, w2_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_b2, h_b2, b2_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_w3, h_w3, w3_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_b3, h_b3, b3_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_w4, h_w4, w4_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_b4, h_b4, b4_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_w5, h_w5, w5_bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(ae->d_b5, h_b5, b5_bytes, cudaMemcpyHostToDevice));
}

float gpu_autoencoder_forward(
    GPUAutoencoder *ae,
    const float *h_input,
    float *h_output,
    bool compute_loss)
{
    const int N = ae->N;
    const int H = ae->H;
    const int W = ae->W;
    const int K = 3;
    const int pad = 1;
    const int stride = 1;

    // ------------- copy input to device -------------
    size_t in_bytes = N * 3 * H * W * sizeof(float);
    CHECK_CUDA(cudaMemcpy(ae->d_x0, h_input, in_bytes, cudaMemcpyHostToDevice));

    dim3 block2d(16, 16);

    // ========= ENCODER =========
    // conv1: 3 -> 256, same 32x32
    {
        int C_in = 3, C_out = 256;
        int H_out = 32, W_out = 32;
        dim3 gridConv(
            (W_out + block2d.x - 1) / block2d.x,
            (H_out + block2d.y - 1) / block2d.y,
            N * C_out);

        conv2d_forward_naive<<<gridConv, block2d>>>(
            ae->d_x0, ae->d_w1, ae->d_b1, ae->d_h1,
            N, C_in, H, W, C_out, K, pad, stride);

        // ReLU
        int size = N * C_out * H_out * W_out;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_forward<<<b, t>>>(ae->d_h1, size);

        // MaxPool 2x2 -> 16x16
        int Hp = 16, Wp = 16;
        dim3 gridPool(
            (Wp + block2d.x - 1) / block2d.x,
            (Hp + block2d.y - 1) / block2d.y,
            N * C_out);

        maxpool2x2_forward<<<gridPool, block2d>>>(
            ae->d_h1, ae->d_p1,
            N, C_out, H_out, W_out);
    }

    // conv2: 256 -> 128, 16x16, then pool -> 8x8
    {
        int C_in = 256, C_out = 128;
        int H_in = 16, W_in = 16;
        int H_out = 16, W_out = 16;
        dim3 gridConv(
            (W_out + block2d.x - 1) / block2d.x,
            (H_out + block2d.y - 1) / block2d.y,
            N * C_out);

        conv2d_forward_naive<<<gridConv, block2d>>>(
            ae->d_p1, ae->d_w2, ae->d_b2, ae->d_h2,
            N, C_in, H_in, W_in, C_out, K, pad, stride);

        int size = N * C_out * H_out * W_out;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_forward<<<b, t>>>(ae->d_h2, size);

        // pool -> 8x8
        int Hp = 8, Wp = 8;
        dim3 gridPool(
            (Wp + block2d.x - 1) / block2d.x,
            (Hp + block2d.y - 1) / block2d.y,
            N * C_out);

        maxpool2x2_forward<<<gridPool, block2d>>>(
            ae->d_h2, ae->d_p2,
            N, C_out, H_out, W_out);
    }

    // LATENT is ae->d_p2: [N, 128, 8, 8]

    // ========= DECODER =========
    // conv3: 128 -> 128, 8x8
    {
        int C_in = 128, C_out = 128;
        int H_in = 8, W_in = 8;
        int H_out = 8, W_out = 8;

        dim3 gridConv(
            (W_out + block2d.x - 1) / block2d.x,
            (H_out + block2d.y - 1) / block2d.y,
            N * C_out);

        conv2d_forward_naive<<<gridConv, block2d>>>(
            ae->d_p2, ae->d_w3, ae->d_b3, ae->d_h3,
            N, C_in, H_in, W_in, C_out, K, pad, stride);

        int size = N * C_out * H_out * W_out;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_forward<<<b, t>>>(ae->d_h3, size);

        // upsample 8x8 -> 16x16
        int Hu = 16, Wu = 16;
        dim3 gridUp(
            (Wu + block2d.x - 1) / block2d.x,
            (Hu + block2d.y - 1) / block2d.y,
            N * C_out);

        upsample2x2_forward<<<gridUp, block2d>>>(
            ae->d_h3, ae->d_u1,
            N, C_out, H_in, W_in);
    }

    // conv4: 128 -> 256, 16x16, then upsample 16->32
    {
        int C_in = 128, C_out = 256;
        int H_in = 16, W_in = 16;
        int H_out = 16, W_out = 16;

        dim3 gridConv(
            (W_out + block2d.x - 1) / block2d.x,
            (H_out + block2d.y - 1) / block2d.y,
            N * C_out);

        conv2d_forward_naive<<<gridConv, block2d>>>(
            ae->d_u1, ae->d_w4, ae->d_b4, ae->d_h4,
            N, C_in, H_in, W_in, C_out, K, pad, stride);

        int size = N * C_out * H_out * W_out;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_forward<<<b, t>>>(ae->d_h4, size);

        // upsample 16x16 -> 32x32
        int Hu = 32, Wu = 32;
        dim3 gridUp(
            (Wu + block2d.x - 1) / block2d.x,
            (Hu + block2d.y - 1) / block2d.y,
            N * C_out);

        upsample2x2_forward<<<gridUp, block2d>>>(
            ae->d_h4, ae->d_u2,
            N, C_out, H_in, W_in);
    }

    // conv5: 256 -> 3, 32x32 (no activation, usually MSE on raw output)
    {
        int C_in = 256, C_out = 3;
        int H_in = 32, W_in = 32;
        int H_out = 32, W_out = 32;

        dim3 gridConv(
            (W_out + block2d.x - 1) / block2d.x,
            (H_out + block2d.y - 1) / block2d.y,
            N * C_out);

        conv2d_forward_naive<<<gridConv, block2d>>>(
            ae->d_u2, ae->d_w5, ae->d_b5, ae->d_out,
            N, C_in, H_in, W_in, C_out, K, pad, stride);
    }

    // ------------- (optional) compute MSE loss -------------
    float loss_value = 0.0f;
        if (compute_loss) {
        int size = N * 3 * 32 * 32;
        CHECK_CUDA(cudaMemset(ae->d_loss, 0, sizeof(float)));

        int t = 256;
        int b = (size + t - 1) / t;
        size_t shmem_bytes = t * sizeof(float);

        // kernel giờ trả về SUM(diff^2) vào d_loss
        mse_loss_forward<<<b, t, shmem_bytes>>>(
            ae->d_out, ae->d_x0, ae->d_loss, size);

        float loss_sum = 0.0f;
        CHECK_CUDA(cudaMemcpy(&loss_sum, ae->d_loss,
                              sizeof(float),
                              cudaMemcpyDeviceToHost));

        loss_value = loss_sum / size;  // MSE = sum / size
    }


    // ------------- copy output back to host -------------
    size_t out_bytes = N * 3 * 32 * 32 * sizeof(float);
    CHECK_CUDA(cudaMemcpy(h_output, ae->d_out,
                          out_bytes,
                          cudaMemcpyDeviceToHost));

    return loss_value;
}

void gpu_autoencoder_backward(GPUAutoencoder *ae, float lr)
{
    const int N = ae->N;
    const int H0 = ae->H; // 32
    const int W0 = ae->W; // 32
    const int K = 3;
    const int pad = 1;
    const int stride = 1;

    // Zero all gradient buffers
    CHECK_CUDA(cudaMemset(ae->d_gw1, 0, 256 * 3 * K * K * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gb1, 0, 256 * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gw2, 0, 128 * 256 * K * K * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gb2, 0, 128 * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gw3, 0, 128 * 128 * K * K * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gb3, 0, 128 * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gw4, 0, 256 * 128 * K * K * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gb4, 0, 256 * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gw5, 0, 3 * 256 * K * K * sizeof(float)));
    CHECK_CUDA(cudaMemset(ae->d_gb5, 0, 3 * sizeof(float)));

    dim3 block2d(16, 16);

    // ===== 1. dL/dout (MSE) =====
    int size_out = N * 3 * 32 * 32;
    {
        int t = 256;
        int b = (size_out + t - 1) / t;
        mse_loss_backward<<<b, t>>>(
            ae->d_out, ae->d_x0, ae->d_gout, size_out);
        CHECK_CUDA(cudaDeviceSynchronize());
    }

    // ===== 2. Backward conv5: 256->3, 32x32 =====
    {
        int C_in = 256, C_out = 3;
        int H = 32, W = 32;

        dim3 gridIn(
            (W + block2d.x - 1) / block2d.x,
            (H + block2d.y - 1) / block2d.y,
            N * C_in);

        conv2d_backward_input_naive<<<gridIn, block2d>>>(
            ae->d_gout, ae->d_w5, ae->d_gu2,
            N, C_in, H, W, C_out, K, pad, stride);

        int num_w = C_out * C_in * K * K;
        int t = 256;
        int b = (num_w + t - 1) / t;
        conv2d_backward_weight_naive<<<b, t>>>(
            ae->d_u2, ae->d_gout, ae->d_gw5,
            N, C_in, H, W, C_out, K, pad, stride);

        int tb = 256;
        int bb = (C_out + tb - 1) / tb;
        conv2d_backward_bias_naive<<<bb, tb>>>(
            ae->d_gout, ae->d_gb5,
            N, C_out, H, W);

        sgd_update<<<b, t>>>(ae->d_w5, ae->d_gw5, num_w, lr);

        int bbp = (C_out + t - 1) / t;
        sgd_update<<<bbp, t>>>(ae->d_b5, ae->d_gb5, C_out, lr);
    }

    // ===== 3. UpSample2x2 backward =====
    {
        int C = 256;
        int H = 16, W = 16;

        dim3 grid(
            (W + block2d.x - 1) / block2d.x,
            (H + block2d.y - 1) / block2d.y,
            N * C);

        upsample2x2_backward<<<grid, block2d>>>(
            ae->d_gu2, ae->d_gh4,
            N, C, H, W);
    }

    // ===== 4. ReLU backward h4 =====
    {
        int size = N * 256 * 16 * 16;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_backward<<<b, t>>>(
            ae->d_h4, ae->d_gh4, ae->d_gh4, size);
    }

    // ===== 5. conv4 backward =====
    {
        int C_in = 128, C_out = 256;
        int H = 16, W = 16;

        dim3 gridIn(
            (W + block2d.x - 1) / block2d.x,
            (H + block2d.y - 1) / block2d.y,
            N * C_in);

        conv2d_backward_input_naive<<<gridIn, block2d>>>(
            ae->d_gh4, ae->d_w4, ae->d_gu1,
            N, C_in, H, W, C_out, K, pad, stride);

        int num_w = C_out * C_in * K * K;
        int t = 256;
        int b = (num_w + t - 1) / t;
        conv2d_backward_weight_naive<<<b, t>>>(
            ae->d_u1, ae->d_gh4, ae->d_gw4,
            N, C_in, H, W, C_out, K, pad, stride);

        int tb = 256;
        int bb = (C_out + tb - 1) / tb;
        conv2d_backward_bias_naive<<<bb, tb>>>(
            ae->d_gh4, ae->d_gb4,
            N, C_out, H, W);

        sgd_update<<<b, t>>>(ae->d_w4, ae->d_gw4, num_w, lr);

        int bbp = (C_out + t - 1) / t;
        sgd_update<<<bbp, t>>>(ae->d_b4, ae->d_gb4, C_out, lr);
    }

    // ===== 6. UpSample2x2 backward =====
    {
        int C = 128;
        int H = 8, W = 8;

        dim3 grid(
            (W + block2d.x - 1) / block2d.x,
            (H + block2d.y - 1) / block2d.y,
            N * C);

        upsample2x2_backward<<<grid, block2d>>>(
            ae->d_gu1, ae->d_gh3,
            N, C, H, W);
    }

    // ===== 7. ReLU backward h3 =====
    {
        int size = N * 128 * 8 * 8;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_backward<<<b, t>>>(
            ae->d_h3, ae->d_gh3, ae->d_gh3, size);
    }

    // ===== 8. conv3 backward =====
    {
        int C_in = 128, C_out = 128;
        int H = 8, W = 8;

        dim3 gridIn(
            (W + block2d.x - 1) / block2d.x,
            (H + block2d.y - 1) / block2d.y,
            N * C_in);

        conv2d_backward_input_naive<<<gridIn, block2d>>>(
            ae->d_gh3, ae->d_w3, ae->d_gp2,
            N, C_in, H, W, C_out, K, pad, stride);

        int num_w = C_out * C_in * K * K;
        int t = 256;
        int b = (num_w + t - 1) / t;
        conv2d_backward_weight_naive<<<b, t>>>(
            ae->d_p2, ae->d_gh3, ae->d_gw3,
            N, C_in, H, W, C_out, K, pad, stride);

        int tb = 256;
        int bb = (C_out + tb - 1) / tb;
        conv2d_backward_bias_naive<<<bb, tb>>>(
            ae->d_gh3, ae->d_gb3,
            N, C_out, H, W);

        sgd_update<<<b, t>>>(ae->d_w3, ae->d_gw3, num_w, lr);

        int bbp = (C_out + t - 1) / t;
        sgd_update<<<bbp, t>>>(ae->d_b3, ae->d_gb3, C_out, lr);
    }

    // ===== 9. MaxPool2x2 backward: P2 <- H2 =====
    {
        int C = 128;
        int H = 16, W = 16;

        CHECK_CUDA(cudaMemset(ae->d_gh2, 0, N * C * H * W * sizeof(float)));

        dim3 grid(
            (W/2 + block2d.x - 1) / block2d.x,
            (H/2 + block2d.y - 1) / block2d.y,
            N * C);

        maxpool2x2_backward<<<grid, block2d>>>(
            ae->d_h2, ae->d_gp2, ae->d_gh2,
            N, C, H, W);
    }

    // ===== 10. ReLU backward h2 =====
    {
        int size = N * 128 * 16 * 16;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_backward<<<b, t>>>(
            ae->d_h2, ae->d_gh2, ae->d_gh2, size);
    }

    // ===== 11. conv2 backward =====
    {
        int C_in = 256, C_out = 128;
        int H = 16, W = 16;

        dim3 gridIn(
            (W + block2d.x - 1) / block2d.x,
            (H + block2d.y - 1) / block2d.y,
            N * C_in);

        conv2d_backward_input_naive<<<gridIn, block2d>>>(
            ae->d_gh2, ae->d_w2, ae->d_gp1,
            N, C_in, H, W, C_out, K, pad, stride);

        int num_w = C_out * C_in * K * K;
        int t = 256;
        int b = (num_w + t - 1) / t;
        conv2d_backward_weight_naive<<<b, t>>>(
            ae->d_p1, ae->d_gh2, ae->d_gw2,
            N, C_in, H, W, C_out, K, pad, stride);

        int tb = 256;
        int bb = (C_out + tb - 1) / tb;
        conv2d_backward_bias_naive<<<bb, tb>>>(
            ae->d_gh2, ae->d_gb2,
            N, C_out, H, W);

        sgd_update<<<b, t>>>(ae->d_w2, ae->d_gw2, num_w, lr);

        int bbp = (C_out + t - 1) / t;
        sgd_update<<<bbp, t>>>(ae->d_b2, ae->d_gb2, C_out, lr);
    }

    // ===== 12. MaxPool2x2 backward: P1 <- H1 =====
    {
        int C = 256;
        int H = 32, W = 32;

        // Zero gradient buffer
        CHECK_CUDA(cudaMemset(ae->d_gh1, 0, N * C * H * W * sizeof(float)));

        dim3 grid(
            (W/2 + block2d.x - 1) / block2d.x,
            (H/2 + block2d.y - 1) / block2d.y,
            N * C);

        maxpool2x2_backward<<<grid, block2d>>>(
            ae->d_h1, ae->d_gp1, ae->d_gh1,
            N, C, H, W);
    }

    // ===== 13. ReLU backward h1 =====
    {
        int size = N * 256 * 32 * 32;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_backward<<<b, t>>>(
            ae->d_h1, ae->d_gh1, ae->d_gh1, size);
    }

    // ===== 14. conv1 backward =====
    {
        int C_in = 3, C_out = 256;
        int H = 32, W = 32;

        dim3 gridIn(
            (W + block2d.x - 1) / block2d.x,
            (H + block2d.y - 1) / block2d.y,
            N * C_in);

        conv2d_backward_input_naive<<<gridIn, block2d>>>(
            ae->d_gh1, ae->d_w1, ae->d_gx0,
            N, C_in, H, W, C_out, K, pad, stride);

        int num_w = C_out * C_in * K * K;
        int t = 256;
        int b = (num_w + t - 1) / t;
        conv2d_backward_weight_naive<<<b, t>>>(
            ae->d_x0, ae->d_gh1, ae->d_gw1,
            N, C_in, H, W, C_out, K, pad, stride);

        int tb = 256;
        int bb = (C_out + tb - 1) / tb;
        conv2d_backward_bias_naive<<<bb, tb>>>(
            ae->d_gh1, ae->d_gb1,
            N, C_out, H, W);

        sgd_update<<<b, t>>>(ae->d_w1, ae->d_gw1, num_w, lr);

        int bbp = (C_out + t - 1) / t;
        sgd_update<<<bbp, t>>>(ae->d_b1, ae->d_gb1, C_out, lr);
    }
}

void gpu_autoencoder_save_weights(GPUAutoencoder *ae, const char *filename)
{
    const int K = 3;
    int C_in1 = 3,   C_out1 = 256;
    int C_in2 = 256, C_out2 = 128;
    int C_in3 = 128, C_out3 = 128;
    int C_in4 = 128, C_out4 = 256;
    int C_in5 = 256, C_out5 = 3;

    size_t w1_cnt = C_out1 * C_in1 * K * K;
    size_t b1_cnt = C_out1;
    size_t w2_cnt = C_out2 * C_in2 * K * K;
    size_t b2_cnt = C_out2;
    size_t w3_cnt = C_out3 * C_in3 * K * K;
    size_t b3_cnt = C_out3;
    size_t w4_cnt = C_out4 * C_in4 * K * K;
    size_t b4_cnt = C_out4;
    size_t w5_cnt = C_out5 * C_in5 * K * K;
    size_t b5_cnt = C_out5;

    float *h_w1 = (float*)malloc(w1_cnt * sizeof(float));
    float *h_b1 = (float*)malloc(b1_cnt * sizeof(float));
    float *h_w2 = (float*)malloc(w2_cnt * sizeof(float));
    float *h_b2 = (float*)malloc(b2_cnt * sizeof(float));
    float *h_w3 = (float*)malloc(w3_cnt * sizeof(float));
    float *h_b3 = (float*)malloc(b3_cnt * sizeof(float));
    float *h_w4 = (float*)malloc(w4_cnt * sizeof(float));
    float *h_b4 = (float*)malloc(b4_cnt * sizeof(float));
    float *h_w5 = (float*)malloc(w5_cnt * sizeof(float));
    float *h_b5 = (float*)malloc(b5_cnt * sizeof(float));

    gpu_autoencoder_copy_weights_to_host(
        ae,
        h_w1, h_b1,
        h_w2, h_b2,
        h_w3, h_b3,
        h_w4, h_b4,
        h_w5, h_b5);

    FILE *f = fopen(filename, "wb");
    if (!f) {
        fprintf(stderr, "Cannot open %s for writing\n", filename);
    } else {
        fwrite(h_w1, sizeof(float), w1_cnt, f);
        fwrite(h_b1, sizeof(float), b1_cnt, f);
        fwrite(h_w2, sizeof(float), w2_cnt, f);
        fwrite(h_b2, sizeof(float), b2_cnt, f);
        fwrite(h_w3, sizeof(float), w3_cnt, f);
        fwrite(h_b3, sizeof(float), b3_cnt, f);
        fwrite(h_w4, sizeof(float), w4_cnt, f);
        fwrite(h_b4, sizeof(float), b4_cnt, f);
        fwrite(h_w5, sizeof(float), w5_cnt, f);
        fwrite(h_b5, sizeof(float), b5_cnt, f);
        fclose(f);
        printf("Saved weights to %s\n", filename);
    }

    free(h_w1); free(h_b1);
    free(h_w2); free(h_b2);
    free(h_w3); free(h_b3);
    free(h_w4); free(h_b4);
    free(h_w5); free(h_b5);
}

void gpu_autoencoder_load_weights(GPUAutoencoder *ae, const char *filename)
{
    const int K = 3;
    int C_in1 = 3,   C_out1 = 256;
    int C_in2 = 256, C_out2 = 128;
    int C_in3 = 128, C_out3 = 128;
    int C_in4 = 128, C_out4 = 256;
    int C_in5 = 256, C_out5 = 3;

    size_t w1_cnt = C_out1 * C_in1 * K * K;
    size_t b1_cnt = C_out1;
    size_t w2_cnt = C_out2 * C_in2 * K * K;
    size_t b2_cnt = C_out2;
    size_t w3_cnt = C_out3 * C_in3 * K * K;
    size_t b3_cnt = C_out3;
    size_t w4_cnt = C_out4 * C_in4 * K * K;
    size_t b4_cnt = C_out4;
    size_t w5_cnt = C_out5 * C_in5 * K * K;
    size_t b5_cnt = C_out5;

    float *h_w1 = (float*)malloc(w1_cnt * sizeof(float));
    float *h_b1 = (float*)malloc(b1_cnt * sizeof(float));
    float *h_w2 = (float*)malloc(w2_cnt * sizeof(float));
    float *h_b2 = (float*)malloc(b2_cnt * sizeof(float));
    float *h_w3 = (float*)malloc(w3_cnt * sizeof(float));
    float *h_b3 = (float*)malloc(b3_cnt * sizeof(float));
    float *h_w4 = (float*)malloc(w4_cnt * sizeof(float));
    float *h_b4 = (float*)malloc(b4_cnt * sizeof(float));
    float *h_w5 = (float*)malloc(w5_cnt * sizeof(float));
    float *h_b5 = (float*)malloc(b5_cnt * sizeof(float));

    FILE *f = fopen(filename, "rb");
    if (!f) {
        fprintf(stderr, "Cannot open %s for reading\n", filename);
        exit(1);
    }

    size_t r1 = fread(h_w1, sizeof(float), w1_cnt, f);
    size_t r2 = fread(h_b1, sizeof(float), b1_cnt, f);
    size_t r3 = fread(h_w2, sizeof(float), w2_cnt, f);
    size_t r4 = fread(h_b2, sizeof(float), b2_cnt, f);
    size_t r5 = fread(h_w3, sizeof(float), w3_cnt, f);
    size_t r6 = fread(h_b3, sizeof(float), b3_cnt, f);
    size_t r7 = fread(h_w4, sizeof(float), w4_cnt, f);
    size_t r8 = fread(h_b4, sizeof(float), b4_cnt, f);
    size_t r9 = fread(h_w5, sizeof(float), w5_cnt, f);
    size_t r10 = fread(h_b5, sizeof(float), b5_cnt, f);
    fclose(f);

    if (r1 != w1_cnt || r2 != b1_cnt ||
        r3 != w2_cnt || r4 != b2_cnt ||
        r5 != w3_cnt || r6 != b3_cnt ||
        r7 != w4_cnt || r8 != b4_cnt ||
        r9 != w5_cnt || r10 != b5_cnt)
    {
        fprintf(stderr, "Error reading weights from %s\n", filename);
        exit(1);
    }

    gpu_autoencoder_copy_weights_to_device(
        ae,
        h_w1, h_b1,
        h_w2, h_b2,
        h_w3, h_b3,
        h_w4, h_b4,
        h_w5, h_b5
    );

    free(h_w1); free(h_b1);
    free(h_w2); free(h_b2);
    free(h_w3); free(h_b3);
    free(h_w4); free(h_b4);
    free(h_w5); free(h_b5);

    printf("Loaded weights from %s\n", filename);
}

void gpu_autoencoder_encode_batch(
    GPUAutoencoder *ae,
    const float *h_input,
    float *h_latent,
    int N_batch)
{
    const int H = ae->H;    // 32
    const int W = ae->W;    // 32
    const int K = 3;
    const int pad = 1;
    const int stride = 1;

    // Copy input [N_batch, 3, 32, 32] to GPU
    size_t in_bytes = (size_t)N_batch * 3 * H * W * sizeof(float);
    CHECK_CUDA(cudaMemcpy(ae->d_x0, h_input, in_bytes, cudaMemcpyHostToDevice));

    dim3 block2d(16, 16);

    // ===== ENCODER =====
    // conv1: 3 -> 256, 32x32 -> h1, ReLU + MaxPool -> p1 (16x16)
    {
        int C_in = 3, C_out = 256;
        int H_out = 32, W_out = 32;

        dim3 gridConv(
            (W_out + block2d.x - 1) / block2d.x,
            (H_out + block2d.y - 1) / block2d.y,
            N_batch * C_out
        );

        conv2d_forward_naive<<<gridConv, block2d>>>(
            ae->d_x0, ae->d_w1, ae->d_b1, ae->d_h1,
            N_batch, C_in, H, W, C_out, K, pad, stride);

        int size = N_batch * C_out * H_out * W_out;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_forward<<<b, t>>>(ae->d_h1, size);

        int Hp = 16, Wp = 16;
        dim3 gridPool(
            (Wp + block2d.x - 1) / block2d.x,
            (Hp + block2d.y - 1) / block2d.y,
            N_batch * C_out
        );

        maxpool2x2_forward<<<gridPool, block2d>>>(
            ae->d_h1, ae->d_p1,
            N_batch, C_out, H_out, W_out);
    }

    // conv2: 256 -> 128, 16x16 -> h2, ReLU + MaxPool -> p2 (8x8)
    {
        int C_in = 256, C_out = 128;
        int H_in = 16, W_in = 16;
        int H_out = 16, W_out = 16;

        dim3 gridConv(
            (W_out + block2d.x - 1) / block2d.x,
            (H_out + block2d.y - 1) / block2d.y,
            N_batch * C_out
        );

        conv2d_forward_naive<<<gridConv, block2d>>>(
            ae->d_p1, ae->d_w2, ae->d_b2, ae->d_h2,
            N_batch, C_in, H_in, W_in, C_out, K, pad, stride);

        int size = N_batch * C_out * H_out * W_out;
        int t = 256;
        int b = (size + t - 1) / t;
        relu_forward<<<b, t>>>(ae->d_h2, size);

        int Hp = 8, Wp = 8;
        dim3 gridPool(
            (Wp + block2d.x - 1) / block2d.x,
            (Hp + block2d.y - 1) / block2d.y,
            N_batch * C_out
        );

        maxpool2x2_forward<<<gridPool, block2d>>>(
            ae->d_h2, ae->d_p2,
            N_batch, C_out, H_out, W_out);
    }

    // FIX: Copy latent [N_batch, 128, 8, 8] correctly
    size_t latent_bytes = (size_t)N_batch * 128 * 8 * 8 * sizeof(float);
    CHECK_CUDA(cudaMemcpy(h_latent, ae->d_p2, latent_bytes, cudaMemcpyDeviceToHost));
}


Writing gpu_autoencoder.cu


In [7]:
%%writefile main_gpu.cu
#include <cstdio>
#include <ctime>
#include <cuda_runtime.h>
#include <cstdlib>   // exit()


#include "load_data.h"
#include "gpu_autoencoder.h"

// GpuTimer dùng cudaEvent để đo time
struct GpuTimer {
    cudaEvent_t start, stop;
    GpuTimer() {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }
    ~GpuTimer() {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    void tic() {
        cudaEventRecord(start, 0);
    }
    float toc() {
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        return ms;
    }
};

void print_gpu_memory() {
    size_t free_byte, total_byte;
    cudaError_t status = cudaMemGetInfo(&free_byte, &total_byte);

    if (status == cudaSuccess) {
        double total_mb = (double)total_byte / (1024.0 * 1024.0);
        double free_mb = (double)free_byte / (1024.0 * 1024.0);
        double used_mb = total_mb - free_mb;

        printf("[SYSTEM] Memory Usage: %.2f MB / Total: %.2f MB\n", used_mb, total_mb);
    }
}

int main(int argc, char** argv) {
    srand((unsigned int)time(NULL));

    printf("[MAIN] Start program\n");
    fflush(stdout);

    // ---- Check GPU device ----
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        fprintf(stderr, "[MAIN] cudaGetDeviceCount error: %s\n",
                cudaGetErrorString(err));
        return 1;
    }
    printf("[MAIN] Num CUDA devices = %d\n", deviceCount);
    fflush(stdout);

    // ---- Load CIFAR-10 on CPU ----
    Cifar10 data;
    load_cifar10(&data);   // in: CIFAR-10 loaded successfully ...
    printf("[MAIN] After load_cifar10\n");
    fflush(stdout);

    normalize_cifar10(&data);
    printf("[MAIN] After normalize_cifar10\n");
    fflush(stdout);

    // ---- Mở file log GPU (giống format CPU) ----
    FILE* log_gpu = fopen("training_gpu.txt", "w");
    if (!log_gpu) {
        fprintf(stderr, "[MAIN] Cannot open training_gpu.txt for writing\n");
        return 1;
    }

    GpuTimer epoch_timer;

    int batch_size = 64;
    int epochs     = 2;
    float lr       = 1e-3f;

    int num_batches = TRAIN_NUM / batch_size;

    printf("[MAIN] Start training loop (epochs=%d, num_batches=%d)\n",
           epochs, num_batches);
    fflush(stdout);

    float *h_batch  = (float*)malloc(batch_size * IMG_SIZE * sizeof(float));
    float *h_output = (float*)malloc(batch_size * IMG_SIZE * sizeof(float)); // buffer tạm

    // ---- Init GPU autoencoder ----
    GPUAutoencoder ae;
    gpu_autoencoder_init(&ae, batch_size);

    // Biến để tích lũy total time & final loss
    double total_gpu_time_ms = 0.0;
    double final_loss = 0.0;

    for (int epoch = 0; epoch < epochs; ++epoch) {
        shuffle_cifar10(&data);
        double epoch_loss = 0.0;

        // In giống CPU
        printf("Epoch %d/%d\n", epoch + 1, epochs);
        fflush(stdout);
        fprintf(log_gpu, "Epoch %d/%d\n", epoch + 1, epochs);

        epoch_timer.tic();

        for (int b = 0; b < num_batches; ++b) {
            get_next_batch(&data, batch_size, b, h_batch);

            float loss = gpu_autoencoder_forward(&ae, h_batch, h_output, true);
            gpu_autoencoder_backward(&ae, lr);

            epoch_loss += loss;

            if ((b + 1) % 100 == 0) {
                printf("[TRAIN] Epoch %d, batch %d/%d, loss = %f\n",
                       epoch + 1, b + 1, num_batches, loss);
                fflush(stdout);
            }
        }

        float ms = epoch_timer.toc();
        total_gpu_time_ms += ms;

        double avg_loss = epoch_loss / num_batches;
        final_loss = avg_loss;  // loss của epoch cuối sẽ là final loss

        double epoch_time_sec = ms / 1000.0;

        // In ra màn hình
        printf("Epoch %d finished. Avg Loss: %f, time: %.2f seconds\n",
               epoch + 1, avg_loss, epoch_time_sec);
        fflush(stdout);

        // Ghi giống hệt CPU vào file training_gpu.txt
        fprintf(log_gpu,
                "Epoch %d finished. Avg Loss: %f, time: %.2f seconds\n",
                epoch + 1, avg_loss, epoch_time_sec);
        fflush(log_gpu);
    }

    printf("[MAIN] Training finished\n");
    fflush(stdout);

    // ---- SUMMARY trên màn hình ----
    printf("\n*** Training Summary (GPU) ***\n");
    printf("Total training time: %.2f seconds.\n", total_gpu_time_ms / 1000.0);
    printf("Final reconstruction loss: %f\n", final_loss);
    print_gpu_memory();
    fflush(stdout);

    // ---- SUMMARY ghi xuống training_gpu.txt ----
    fprintf(log_gpu, "\n*** Training Summary ***\n");
    fprintf(log_gpu, "Total training time: %.2f seconds.\n",
            total_gpu_time_ms / 1000.0);
    fprintf(log_gpu, "Final reconstruction loss: %f\n", final_loss);
    fclose(log_gpu);

    // save weights
    gpu_autoencoder_save_weights(&ae, "ae_weights_gpu_naive.bin");

    // ---- cleanup ----
    gpu_autoencoder_free(&ae);
    free(h_batch);
    free(h_output);
    free_cifar10(&data);

    printf("[MAIN] Program finished\n");
    fflush(stdout);

    return 0;
}


Writing main_gpu.cu


In [8]:
!wget https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
!tar -xzvf cifar-10-binary.tar.gz

--2025-12-25 14:11:27--  https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170052171 (162M) [application/x-gzip]
Saving to: ‘cifar-10-binary.tar.gz’


2025-12-25 14:11:31 (42.0 MB/s) - ‘cifar-10-binary.tar.gz’ saved [170052171/170052171]

cifar-10-batches-bin/
cifar-10-batches-bin/data_batch_1.bin
cifar-10-batches-bin/batches.meta.txt
cifar-10-batches-bin/data_batch_3.bin
cifar-10-batches-bin/data_batch_4.bin
cifar-10-batches-bin/test_batch.bin
cifar-10-batches-bin/readme.html
cifar-10-batches-bin/data_batch_5.bin
cifar-10-batches-bin/data_batch_2.bin


In [9]:
!nvcc -arch=sm_75 -O2 main_gpu.cu gpu_autoencoder.cu gpu_layers.cu load_data.cu -o autoencoder_gpu

      const int H = ae->H;
                ^


      const int W = ae->W;
                ^

      const int H0 = ae->H;
                ^

      const int W0 = ae->W;
                ^



In [10]:
!./autoencoder_gpu

[MAIN] Start program
[MAIN] Num CUDA devices = 1
CIFAR-10 loaded successfully
[MAIN] After load_cifar10
[MAIN] After normalize_cifar10
[MAIN] Start training loop (epochs=2, num_batches=781)
Epoch 1/2
[TRAIN] Epoch 1, batch 100/781, loss = 0.069835
[TRAIN] Epoch 1, batch 200/781, loss = 0.062484
[TRAIN] Epoch 1, batch 300/781, loss = 0.058678
[TRAIN] Epoch 1, batch 400/781, loss = 0.058653
[TRAIN] Epoch 1, batch 500/781, loss = 0.048739
[TRAIN] Epoch 1, batch 600/781, loss = 0.054894
[TRAIN] Epoch 1, batch 700/781, loss = 0.045116
Epoch 1 finished. Avg Loss: 0.067721, time: 295.44 seconds
Epoch 2/2
[TRAIN] Epoch 2, batch 100/781, loss = 0.050996
[TRAIN] Epoch 2, batch 200/781, loss = 0.045700
[TRAIN] Epoch 2, batch 300/781, loss = 0.050239
[TRAIN] Epoch 2, batch 400/781, loss = 0.051497
[TRAIN] Epoch 2, batch 500/781, loss = 0.044980
[TRAIN] Epoch 2, batch 600/781, loss = 0.043353
[TRAIN] Epoch 2, batch 700/781, loss = 0.040243
Epoch 2 finished. Avg Loss: 0.047354, time: 303.42 seconds


In [11]:
%%writefile extract_svm_features.cu
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>

#include "load_data.h"
#include "gpu_autoencoder.h"

// ghi 1 dòng theo format LIBSVM: label index:val ...
void write_svm_line(FILE* f, int label,
                    const float* feat, int dim)
{
    fprintf(f, "%d", label);

    // In TOÀN BỘ feature, không bỏ qua zero
    for (int j = 0; j < dim; ++j) {
        float v = feat[j];
        fprintf(f, " %d:%g", j + 1, v);
    }
    fprintf(f, "\n");
}


int main(int argc, char** argv)
{
    if (argc < 2) {
        fprintf(stderr,
                "Usage: %s <ae_weights.bin>\n",
                argv[0]);
        return 1;
    }
    const char* weight_file = argv[1];

    printf("[SVM] Loading CIFAR-10...\n");
    Cifar10 data;
    load_cifar10(&data);
    normalize_cifar10(&data);

    // batch_size cho encoder khi extract feature
    int batch_size = 64;
    GPUAutoencoder ae;
    gpu_autoencoder_init(&ae, batch_size);
    gpu_autoencoder_load_weights(&ae, weight_file);


    float* h_batch  = (float*)malloc(batch_size * IMG_SIZE * sizeof(float));
    float* h_latent = (float*)malloc(batch_size * AE_LATENT_DIM * sizeof(float));
    if (!h_batch || !h_latent) {
        fprintf(stderr, "Host malloc failed\n");
        return 1;
    }

    // ====== TRAIN: 50k ảnh -> train_svm.txt ======
    FILE* f_train = fopen("train_svm.txt", "w");
    if (!f_train) {
        perror("train_svm.txt");
        return 1;
    }

    int N_train           = TRAIN_NUM; // 50000
    int num_batches_train = (N_train + batch_size - 1) / batch_size;

    printf("[SVM] Extracting train features...\n");
    for (int b = 0; b < num_batches_train; ++b) {
        int start = b * batch_size;
        int cur_bs = batch_size;
        if (start + cur_bs > N_train) {
            cur_bs = N_train - start;
        }

        // copy ảnh [start, start+cur_bs) vào h_batch
        for (int i = 0; i < cur_bs; ++i) {
            int idx = start + i;
            memcpy(h_batch + i * IMG_SIZE,
                   data.train_images + idx * IMG_SIZE,
                   IMG_SIZE * sizeof(float));
        }

        // encoder-only
        gpu_autoencoder_encode_batch(&ae, h_batch, h_latent, cur_bs);

        // ghi ra file theo format LIBSVM
        for (int i = 0; i < cur_bs; ++i) {
            int idx = start + i;
            int label = data.train_labels[idx];
            const float* feat = h_latent + i * AE_LATENT_DIM;
            write_svm_line(f_train, label, feat, AE_LATENT_DIM);
        }

        printf("[SVM][TRAIN] Batch %d/%d done\n",
               b + 1, num_batches_train);
        fflush(stdout);
    }
    fclose(f_train);
    printf("[SVM] Saved train_svm.txt\n");

    // ====== TEST: 10k ảnh -> test_svm.txt ======
    FILE* f_test = fopen("test_svm.txt", "w");
    if (!f_test) {
        perror("test_svm.txt");
        return 1;
    }

    int N_test           = TEST_NUM; // 10000
    int num_batches_test = (N_test + batch_size - 1) / batch_size;

    printf("[SVM] Extracting test features...\n");
    for (int b = 0; b < num_batches_test; ++b) {
        int start = b * batch_size;
        int cur_bs = batch_size;
        if (start + cur_bs > N_test) {
            cur_bs = N_test - start;
        }

        for (int i = 0; i < cur_bs; ++i) {
            int idx = start + i;
            memcpy(h_batch + i * IMG_SIZE,
                   data.test_images + idx * IMG_SIZE,
                   IMG_SIZE * sizeof(float));
        }

        // **Không còn debug cudaMemcpy w1/b1, không in input nữa**

        gpu_autoencoder_encode_batch(&ae, h_batch, h_latent, cur_bs);

        for (int i = 0; i < cur_bs; ++i) {
            int idx = start + i;
            int label = data.test_labels[idx];
            const float* feat = h_latent + i * AE_LATENT_DIM;
            write_svm_line(f_test, label, feat, AE_LATENT_DIM);
        }

        printf("[SVM][TEST] Batch %d/%d done\n",
               b + 1, num_batches_test);
        fflush(stdout);
    }
    fclose(f_test);
    printf("[SVM] Saved test_svm.txt\n");

    // cleanup
    gpu_autoencoder_free(&ae);
    free(h_batch);
    free(h_latent);
    free_cifar10(&data);

    printf("[SVM] Done.\n");
    return 0;
}


Writing extract_svm_features.cu


In [12]:
!nvcc  -arch=sm_75 -O2 -o extract_svm_features \
    extract_svm_features.cu gpu_autoencoder.cu gpu_layers.cu load_data.cu \
    -lcudart

      const int H = ae->H;
                ^


      const int W = ae->W;
                ^

      const int H0 = ae->H;
                ^

      const int W0 = ae->W;
                ^



In [13]:
!./extract_svm_features "ae_weights_gpu_naive.bin"

[SVM] Loading CIFAR-10...
CIFAR-10 loaded successfully
Loaded weights from ae_weights_gpu_naive.bin
[SVM] Extracting train features...
[SVM][TRAIN] Batch 1/782 done
[SVM][TRAIN] Batch 2/782 done
[SVM][TRAIN] Batch 3/782 done
[SVM][TRAIN] Batch 4/782 done
[SVM][TRAIN] Batch 5/782 done
[SVM][TRAIN] Batch 6/782 done
[SVM][TRAIN] Batch 7/782 done
[SVM][TRAIN] Batch 8/782 done
[SVM][TRAIN] Batch 9/782 done
[SVM][TRAIN] Batch 10/782 done
[SVM][TRAIN] Batch 11/782 done
[SVM][TRAIN] Batch 12/782 done
[SVM][TRAIN] Batch 13/782 done
[SVM][TRAIN] Batch 14/782 done
[SVM][TRAIN] Batch 15/782 done
[SVM][TRAIN] Batch 16/782 done
[SVM][TRAIN] Batch 17/782 done
[SVM][TRAIN] Batch 18/782 done
[SVM][TRAIN] Batch 19/782 done
[SVM][TRAIN] Batch 20/782 done
[SVM][TRAIN] Batch 21/782 done
[SVM][TRAIN] Batch 22/782 done
[SVM][TRAIN] Batch 23/782 done
[SVM][TRAIN] Batch 24/782 done
[SVM][TRAIN] Batch 25/782 done
[SVM][TRAIN] Batch 26/782 done
[SVM][TRAIN] Batch 27/782 done
[SVM][TRAIN] Batch 28/782 done
[SVM][

In [14]:
# Tạo train_svm_small với 1000 dòng đầu
!head -n 1000 train_svm.txt > train_svm_small.txt

# Tạo test_svm_small với 200 dòng đầu
!head -n 200 test_svm.txt > test_svm_small.txt

In [15]:
!git clone https://github.com/cjlin1/libsvm.git
%cd libsvm
!make

Cloning into 'libsvm'...
remote: Enumerating objects: 4201, done.[K
remote: Counting objects: 100% (230/230), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 4201 (delta 140), reused 118 (delta 118), pack-reused 3971 (from 3)[K
Receiving objects: 100% (4201/4201), 9.92 MiB | 14.57 MiB/s, done.
Resolving deltas: 100% (2317/2317), done.
/content/libsvm
g++ -Wall -Wconversion -O3 -fPIC -c svm.cpp
g++ -Wall -Wconversion -O3 -fPIC svm-train.c svm.o -o svm-train -lm
g++ -Wall -Wconversion -O3 -fPIC svm-predict.c svm.o -o svm-predict -lm
g++ -Wall -Wconversion -O3 -fPIC svm-scale.c -o svm-scale


In [16]:
%cd /content/libsvm

!./svm-train -s 0 -t 0 -c 1.0 \
  /content/train_svm_small.txt \
  /content/model_ae_svm

!./svm-predict \
  /content/test_svm_small.txt \
  /content/model_ae_svm \
  /content/pred.txt

/content/libsvm
....*...*
optimization finished, #iter = 1520
nu = 0.237290
obj = -29.230427, rho = -4.034406
nSV = 108, nBSV = 19
......*..*
optimization finished, #iter = 1832
nu = 0.602565
obj = -83.948453, rho = 3.805344
nSV = 168, nBSV = 76
.....*..*
optimization finished, #iter = 1699
nu = 0.252019
obj = -30.450040, rho = -4.090413
nSV = 120, nBSV = 13
......*..*
optimization finished, #iter = 1800
nu = 0.466539
obj = -61.311492, rho = 2.825039
nSV = 141, nBSV = 54
......*...*
optimization finished, #iter = 1932
nu = 0.369825
obj = -46.043250, rho = -2.120927
nSV = 137, nBSV = 35
...*..*
optimization finished, #iter = 1168
nu = 0.152729
obj = -19.359932, rho = -1.255318
nSV = 76, nBSV = 10
.....*..*
optimization finished, #iter = 1557
nu = 0.514323
obj = -67.342534, rho = 2.698691
nSV = 147, nBSV = 65
.....*...*
optimization finished, #iter = 1663
nu = 0.446206
obj = -53.775817, rho = -0.014430
nSV = 136, nBSV = 41
...*..*
optimization finished, #iter = 1111
nu = 0.198382
obj = -