# CPU

## 1) Huấn luyện Autoencoder:

In [None]:
%%writefile cpu_layers.c
#include "cpu_layers.h"

/*
 * Element-wise rectified linear unit.
 *
 * input  : N source values.
 * N      : number of elements to process.
 * output : N destination values; may alias `input` (each element is read
 *          before it is written).
 *
 * Every element that is not strictly greater than zero becomes 0.0f.
 */
void Relu(float* input, int N, float* output) {
    int idx = 0;
    while (idx < N) {
        float v = input[idx];
        if (v > 0.0f) {
            output[idx] = v;
        } else {
            output[idx] = 0.0f;
        }
        ++idx;
    }
}

/*
 * Direct 2-D convolution (cross-correlation) with zero padding.
 *
 * Layouts (all row-major, channel-major):
 *   input  : [input_channels][input_height][input_width]
 *   kernel : [filter_count][input_channels][kernel_height][kernel_width]
 *   biases : [filter_count]
 *   output : [filter_count][output_height][output_width]
 *
 * The caller supplies the output dimensions and a pre-allocated output
 * buffer; nothing is allocated here. Taps that fall outside the input are
 * treated as zeros. The bias is added once per output element, after the
 * weighted sum.
 */
void Conv2D_Forward(float* input, int input_width, int input_height, int input_channels,
    float* kernel, int kernel_width, int kernel_height,
    float* biases, int padding, int stride, int filter_count,
    float* output, int output_height, int output_width)
{
    const int in_plane = input_width * input_height;
    const int out_plane = output_width * output_height;
    const int k_plane = kernel_height * kernel_width;
    const int filter_span = input_channels * k_plane;

    for (int f = 0; f < filter_count; ++f) {
        const int w_base = f * filter_span;
        const int out_base = f * out_plane;

        for (int oy = 0; oy < output_height; ++oy) {
            for (int ox = 0; ox < output_width; ++ox) {
                float acc = 0.0f;

                for (int ch = 0; ch < input_channels; ++ch) {
                    for (int ky = 0; ky < kernel_height; ++ky) {
                        const int iy = oy * stride + ky - padding;
                        const int row_inside = (iy >= 0 && iy < input_height);

                        for (int kx = 0; kx < kernel_width; ++kx) {
                            const int ix = ox * stride + kx - padding;

                            /* Out-of-range taps read as zero (zero padding). */
                            float sample = 0.0f;
                            if (row_inside && ix >= 0 && ix < input_width) {
                                sample = input[ch * in_plane + iy * input_width + ix];
                            }

                            acc += sample * kernel[w_base + ch * k_plane + ky * kernel_width + kx];
                        }
                    }
                }

                acc += biases[f];
                output[out_base + oy * output_width + ox] = acc;
            }
        }
    }
}


/*
 * Max pooling applied independently to each channel.
 *
 * input  : [filter_count][input_height][input_width]
 * output : [filter_count][output_height][output_width], pre-allocated by
 *          the caller, who also supplies the output dimensions.
 *
 * Each output element is the maximum over a filter_height x filter_width
 * window whose top-left corner is at (h_out * stride, w_out * stride).
 * No bounds checking is performed on the window, so the caller must pass
 * output dimensions for which every window lies inside the input.
 */
void MaxPool2D_Forward(float* input, int input_width, int input_height, int filter_width, int filter_height, int stride, int filter_count,
    float* output, int output_height, int output_width) {
    const int in_plane = input_height * input_width;
    const int out_plane = output_height * output_width;

    for (int ch = 0; ch < filter_count; ++ch) {
        const int in_base = ch * in_plane;
        const int out_base = ch * out_plane;

        for (int oy = 0; oy < output_height; ++oy) {
            const int row0 = oy * stride;

            for (int ox = 0; ox < output_width; ++ox) {
                const int col0 = ox * stride;
                float best = -FLT_MAX;

                for (int dy = 0; dy < filter_height; ++dy) {
                    const int row_base = in_base + (row0 + dy) * input_width;
                    for (int dx = 0; dx < filter_width; ++dx) {
                        const float candidate = input[row_base + col0 + dx];
                        if (candidate > best) {
                            best = candidate;
                        }
                    }
                }

                output[out_base + oy * output_width + ox] = best;
            }
        }
    }
}

/*
 * Nearest-neighbour up-sampling applied independently to each channel.
 *
 * input  : [filter_count][input_height][input_width]
 * output : [filter_count][output_height][output_width], pre-allocated by
 *          the caller.
 *
 * Every input pixel is replicated into a scale_factor x scale_factor block
 * whose top-left corner is (h_in * scale_factor, w_in * scale_factor).
 *
 * Replicated positions that fall outside the declared output size are
 * skipped. This mirrors the bounds check in UpSample2D_Backward and keeps
 * the function from writing into a neighbouring row/channel (or past the
 * buffer) when output_height/output_width are smaller than
 * input * scale_factor.
 */
void UpSample2D_Forward(float* input, int input_width, int input_height,
int scale_factor, int filter_count, float* output, int output_height, int output_width) {
    int plane_size_in = input_height * input_width;
    int plane_size_out = output_height * output_width;
    for (int c = 0; c < filter_count; c++) {
        for (int h_in = 0; h_in < input_height; h_in++) {
            for (int w_in = 0; w_in < input_width; w_in++) {
                float val = input[c * plane_size_in + h_in * input_width + w_in];
                for (int sh = 0; sh < scale_factor; sh++) {
                    int h_out = h_in * scale_factor + sh;
                    if (h_out >= output_height) {
                        break; /* remaining rows of this block are out of range too */
                    }
                    for (int sw = 0; sw < scale_factor; sw++) {
                        int w_out = w_in * scale_factor + sw;
                        if (w_out >= output_width) {
                            break; /* remaining columns are out of range too */
                        }
                        output[c * plane_size_out + h_out * output_width + w_out] = val;
                    }
                }
            }
        }
    }
}

/*
 * Mean squared error between two buffers of `size` floats.
 *
 * Returns sum((output[i] - input[i])^2) / size.
 * Returns 0.0f when size <= 0 instead of evaluating 0/0 (NaN).
 *
 * The sum is accumulated in double so that long buffers do not lose
 * precision in a float accumulator; the result is narrowed to float on
 * return.
 */
float MSE(float* input, float* output, int size) {
    if (size <= 0) {
        return 0.0f;
    }
    double sum = 0.0;
    for (int i = 0; i < size; i++) {
        double diff = (double)output[i] - (double)input[i];
        sum += diff * diff;
    }
    return (float)(sum / (double)size);
}

/*
 * Back-propagates a gradient through ReLU, in place.
 *
 * d_output : N gradient values; on return, entries whose corresponding
 *            `input` value is not strictly positive have been set to 0.0f,
 *            the others are left unchanged.
 * input    : N values used only for the `> 0` test.
 * N        : number of elements.
 */
void Relu_Backward(float* d_output, float* input,int N) {
    for (int k = 0; k != N && k < N; ++k) {
        if (!(input[k] > 0.0f)) {
            d_output[k] = 0.0f;
        }
    }
}

/*
 * Gradient of the mean squared error with respect to `output`.
 *
 * d_output[i] = (2 / size) * (output[i] - input[i])   for i in [0, size)
 *
 * Does nothing when size <= 0, which also avoids computing 2.0f / 0.
 */
void MSE_Gradient(float* input, float* output, int size, float* d_output) {
    if (size <= 0) {
        return;
    }
    float factor = 2.0f / size;
    for (int i = 0; i < size; i++) {
        d_output[i] = factor * (output[i] - input[i]);
    }
}

/*
 * Back-propagates a gradient through max pooling.
 *
 * d_output : [filter_count][d_output_height][d_output_width] gradient of
 *            the pooled output.
 * input    : [filter_count][input_height][input_width] values that were
 *            fed to the forward pooling; used to re-locate each window's
 *            maximum.
 * d_input  : [filter_count][input_height][input_width] result. It is
 *            cleared first; then, for every pooling window, the gradient
 *            of that window is added to the position of the first
 *            strictly-largest input value. All other positions stay 0.
 *
 * Windows are not bounds-checked against the input size.
 */
void MaxPool2D_Backward(float* d_output, int d_output_width, int d_output_height, float* input,
    int input_width, int input_height, int filter_width, int filter_height, int stride, int filter_count,
    float* d_input)
{
    const int in_plane = input_height * input_width;
    const int out_plane = d_output_height * d_output_width;
    const int in_total = filter_count * in_plane;

    /* Start from an all-zero input gradient. */
    int z = 0;
    while (z < in_total) {
        d_input[z] = 0.0f;
        ++z;
    }

    for (int ch = 0; ch < filter_count; ++ch) {
        const int in_base = ch * in_plane;
        const int out_base = ch * out_plane;

        for (int oy = 0; oy < d_output_height; ++oy) {
            const int row0 = oy * stride;

            for (int ox = 0; ox < d_output_width; ++ox) {
                const int col0 = ox * stride;

                float best = -FLT_MAX;
                int best_at = -1;

                /* Re-run the arg-max search the forward pass performed. */
                for (int dy = 0; dy < filter_height; ++dy) {
                    for (int dx = 0; dx < filter_width; ++dx) {
                        const int at = in_base + (row0 + dy) * input_width + (col0 + dx);
                        const float candidate = input[at];
                        if (candidate > best) {
                            best = candidate;
                            best_at = at;
                        }
                    }
                }

                /* Route this window's gradient to the winning position. */
                if (best_at == -1) {
                    continue;
                }
                d_input[best_at] += d_output[out_base + oy * d_output_width + ox];
            }
        }
    }
}

/*
 * Back-propagates a gradient through nearest-neighbour up-sampling.
 *
 * d_output : [filter_count][d_output_height][d_output_width] gradient of
 *            the up-sampled output.
 * d_input  : [filter_count][d_input_height][d_input_width] result. Each
 *            element receives the sum of the gradients in the
 *            scale_factor x scale_factor output block it was replicated
 *            into; block positions outside d_output are ignored.
 */
void UpSample2D_Backward(float* d_output, int d_output_width, int d_output_height, int scale_factor, int filter_count,
    float* d_input, int d_input_height, int d_input_width) {

    const int in_plane = d_input_height * d_input_width;
    const int out_plane = d_output_height * d_output_width;
    const int in_total = filter_count * in_plane;

    int z = 0;
    while (z < in_total) {
        d_input[z] = 0.0f;
        ++z;
    }

    for (int ch = 0; ch < filter_count; ++ch) {
        const int in_base = ch * in_plane;
        const int out_base = ch * out_plane;

        for (int iy = 0; iy < d_input_height; ++iy) {
            for (int ix = 0; ix < d_input_width; ++ix) {
                float acc = 0.0f;

                for (int dy = 0; dy < scale_factor; ++dy) {
                    const int oy = iy * scale_factor + dy;
                    if (oy >= d_output_height) {
                        continue;
                    }
                    for (int dx = 0; dx < scale_factor; ++dx) {
                        const int ox = ix * scale_factor + dx;
                        if (ox >= d_output_width) {
                            continue;
                        }
                        acc += d_output[out_base + oy * d_output_width + ox];
                    }
                }

                d_input[in_base + iy * d_input_width + ix] = acc;
            }
        }
    }
}

/*
 * Gradient of the loss with respect to the input of a Conv2D layer
 * (the adjoint of Conv2D_Forward).
 *
 * d_output : [filter_count][d_output_height][d_output_width] gradient of
 *            the layer output.
 * kernel   : [filter_count][input_channels][kernel_height][kernel_width].
 * d_input  : [input_channels][input_height][input_width] result; every
 *            element is overwritten.
 *
 * Conv2D_Forward reads input row  h_in = h_out * stride + kh - padding  and
 * multiplies it by kernel tap (kh, kw). The gradient reaching input
 * position (h_in, w_in) is therefore the sum, over all filters and all
 * taps (kh, kw), of
 *     d_output[c_out][h_out][w_out] * kernel[c_out][c_in][kh][kw]
 * where h_out = (h_in + padding - kh) / stride (and likewise for w_out),
 * restricted to taps for which that division is exact and the result lies
 * inside d_output.
 *
 * Two defects of the previous implementation are fixed here:
 *   - it paired output position (h_in - kh + padding) with the 180-degree
 *     flipped tap (K-1-kh, K-1-kw) instead of (kh, kw), which gives wrong
 *     gradients for any kernel that is not point-symmetric;
 *   - it ignored `stride`.
 * For stride == 1 and a point-symmetric kernel the result is unchanged.
 *
 * A non-positive stride is not meaningful; in that case d_input is zeroed.
 */
void Conv2D_Backward_Input(float* d_output, int d_output_width, int d_output_height, float* kernel, int kernel_width, int kernel_height,
    int input_width, int input_height, int input_channels, int padding, int stride, int filter_count, float* d_input) {
    int plane_size_in = input_height * input_width;
    int plane_size_out = d_output_height * d_output_width;
    int kernel_plane = kernel_height * kernel_width;

    if (stride <= 0) {
        int total = input_channels * plane_size_in;
        for (int i = 0; i < total; i++) {
            d_input[i] = 0.0f;
        }
        return;
    }

    for (int c_in = 0; c_in < input_channels; c_in++) {
        for (int h_in = 0; h_in < input_height; h_in++) {
            for (int w_in = 0; w_in < input_width; w_in++) {
                float sum_gradient = 0.0f;

                for (int c_out = 0; c_out < filter_count; c_out++) {
                    int kernel_base = c_out * input_channels * kernel_plane + c_in * kernel_plane;
                    int d_output_base = c_out * plane_size_out;

                    for (int kh = 0; kh < kernel_height; kh++) {
                        /* Output row that read input row h_in through tap kh. */
                        int h_num = h_in + padding - kh;
                        if (h_num < 0 || h_num % stride != 0) {
                            continue;
                        }
                        int h_out = h_num / stride;
                        if (h_out >= d_output_height) {
                            continue;
                        }

                        for (int kw = 0; kw < kernel_width; kw++) {
                            /* Output column that read input column w_in through tap kw. */
                            int w_num = w_in + padding - kw;
                            if (w_num < 0 || w_num % stride != 0) {
                                continue;
                            }
                            int w_out = w_num / stride;
                            if (w_out >= d_output_width) {
                                continue;
                            }

                            sum_gradient += d_output[d_output_base + h_out * d_output_width + w_out]
                                          * kernel[kernel_base + kh * kernel_width + kw];
                        }
                    }
                }

                d_input[c_in * plane_size_in + h_in * input_width + w_in] = sum_gradient;
            }
        }
    }
}

/*
 * Gradient of the loss with respect to the weights of a Conv2D layer.
 *
 * d_output  : [filter_count][d_output_height][d_output_width] gradient of
 *             the layer output.
 * input     : [input_channels][input_height][input_width] activations that
 *             were fed to the layer; positions outside the input count as
 *             zero (zero padding).
 * d_weights : [filter_count][input_channels][kernel_height][kernel_width].
 *             The computed gradient is ADDED to the existing contents, so
 *             the caller decides when to reset it (e.g. once per batch).
 *
 * For each weight (c_out, c_in, k_h, k_w) the contribution is the sum over
 * all output positions of
 *     input[c_in][h_out*stride + k_h - padding][w_out*stride + k_w - padding]
 *       * d_output[c_out][h_out][w_out].
 */
void Conv2D_Backward_Kernel(float* d_output, int d_output_width, int d_output_height, float* input,
int input_width, int input_height, int input_channels, int kernel_width, int kernel_height, int padding, int stride, int filter_count, float* d_weights) {
    const int in_plane = input_height * input_width;
    const int out_plane = d_output_height * d_output_width;
    const int k_plane = kernel_height * kernel_width;

    for (int f = 0; f < filter_count; ++f) {
        const int grad_base = f * out_plane;

        for (int ch = 0; ch < input_channels; ++ch) {
            const int in_base = ch * in_plane;
            const int w_base = f * input_channels * k_plane + ch * k_plane;

            for (int ky = 0; ky < kernel_height; ++ky) {
                for (int kx = 0; kx < kernel_width; ++kx) {
                    float acc = 0.0f;

                    for (int oy = 0; oy < d_output_height; ++oy) {
                        const int iy = oy * stride + ky - padding;
                        const int row_inside = (iy >= 0 && iy < input_height);

                        for (int ox = 0; ox < d_output_width; ++ox) {
                            const int ix = ox * stride + kx - padding;

                            float sample = 0.0f;
                            if (row_inside && ix >= 0 && ix < input_width) {
                                sample = input[in_base + iy * input_width + ix];
                            }

                            acc += sample * d_output[grad_base + oy * d_output_width + ox];
                        }
                    }

                    /* Accumulate: callers sum gradients over several images. */
                    d_weights[w_base + ky * kernel_width + kx] += acc;
                }
            }
        }
    }
}

/*
 * Gradient of the loss with respect to the biases of a Conv2D layer.
 *
 * d_output : [filter_count][d_output_height][d_output_width] gradient of
 *            the layer output.
 * d_biases : [filter_count]. For each filter, the sum of its d_output
 *            plane is ADDED to the existing value, so the caller controls
 *            when the accumulator is reset.
 */
void Conv2D_Backward_Biases(float* d_output, int d_output_width, int d_output_height,
    int filter_count, float* d_biases) {
    const int plane = d_output_height * d_output_width;

    for (int f = 0; f < filter_count; ++f) {
        const int base = f * plane;
        float acc = 0.0f;

        for (int row = 0; row < d_output_height; ++row) {
            const int row_base = base + row * d_output_width;
            for (int col = 0; col < d_output_width; ++col) {
                acc += d_output[row_base + col];
            }
        }

        d_biases[f] += acc;
    }
}

/*
 * Plain stochastic-gradient-descent step, in place:
 *     weights[i] -= learning_rate * d_weights[i]   for i in [0, N_params)
 *
 * The product is formed in double (learning_rate is a double) and the
 * result is stored back as float.
 */
void SGD_Update(float* weights, float* d_weights, double learning_rate, int N_params) {
    int k = 0;
    while (k < N_params) {
        const double step = learning_rate * d_weights[k];
        weights[k] -= step;
        ++k;
    }
}

Writing cpu_layers.c


In [None]:
%%writefile cpu_layers.h
#pragma once
#include <stdio.h>
#include <float.h>

/*
 * Layer primitives for the CPU autoencoder.
 * All image-like buffers are flat float arrays indexed channel-major:
 *     index = channel * (height * width) + row * width + column
 * Output buffers are always allocated by the caller.
 */

/* Element-wise ReLU: output[i] = input[i] > 0 ? input[i] : 0. */
void Relu(float* input, int N, float* output);
/* Zero-padded 2-D convolution; kernel is [filter][in_channel][kh][kw], one bias per filter. */
void Conv2D_Forward(float* input, int input_width, int input_height, int input_channels,
    float* kernel, int kernel_width, int kernel_height,
    float* biases, int padding, int stride, int filter_count,
    float* output, int output_height, int output_width);
/* Per-channel max pooling over filter_height x filter_width windows stepped by `stride`. */
void MaxPool2D_Forward(float* input, int input_width, int input_height,
    int filter_width, int filter_height, int stride, int filter_count,
    float* output, int output_height, int output_width);
/* Per-channel nearest-neighbour up-sampling by `scale_factor` in both dimensions. */
void UpSample2D_Forward(float* input, int input_width, int input_height,
    int scale_factor, int filter_count,
    float* output, int output_height, int output_width);
/* Mean of squared differences between `output` and `input` over `size` elements. */
float MSE(float* input, float* output, int size);
/* In place: zeroes d_output[i] wherever input[i] is not strictly positive. */
void Relu_Backward(float* d_output, float* input,int N);
/* d_output[i] = (2 / size) * (output[i] - input[i]). */
void MSE_Gradient(float* input, float* output, int size, float* d_output);
/* d_input element = sum of the d_output block it was replicated into; d_input is overwritten. */
void UpSample2D_Backward(float* d_output, int d_output_width, int d_output_height, int scale_factor, int filter_count,
    float* d_input, int d_input_height, int d_input_width);
/* Clears d_input, then routes each window's gradient to that window's max position in `input`. */
void MaxPool2D_Backward(float* d_output, int d_output_width, int d_output_height, float* input,
    int input_width, int input_height, int filter_width, int filter_height, int stride, int filter_count, float* d_input);
/* Gradient w.r.t. the convolution input; d_input is overwritten. */
void Conv2D_Backward_Input(float* d_output, int d_output_width, int d_output_height, float* kernel, int kernel_width, int kernel_height,
    int input_width, int input_height, int input_channels, int padding, int stride, int filter_count, float* d_input);
/* Gradient w.r.t. the convolution weights; ADDED to d_weights (caller resets it). */
void Conv2D_Backward_Kernel(float* d_output, int d_output_width, int d_output_height, float* input,
    int input_width, int input_height, int input_channels, int kernel_width, int kernel_height, int padding, int stride, int filter_count, float* d_weights);
/* Gradient w.r.t. the biases (sum of each d_output plane); ADDED to d_biases. */
void Conv2D_Backward_Biases(float* d_output, int d_output_width, int d_output_height, int filter_count, float* d_biases);
/* In place SGD step: weights[i] -= learning_rate * d_weights[i]. */
void SGD_Update(float* weights, float* d_weights, double learning_rate, int N_params);


Writing cpu_layers.h


In [None]:
%%writefile cpu_autoencoder.h
#pragma once
#include "cpu_layers.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
// Kernel / stride / padding sizes shared by the layers of the autoencoder.
// KERNEL_SIZE is used as both kernel width and height of every Conv2D layer;
// POOL_SIZE / POOL_STRIDE configure the max-pooling layers and UPSAMPLE_SIZE
// is the scale factor of the up-sampling layers.
#define KERNEL_SIZE 3
#define POOL_SIZE 2
#define UPSAMPLE_SIZE 2
#define CONV_PADDING 1
#define CONV_STRIDE 1
#define POOL_STRIDE 2

// State of the CPU autoencoder: hyper-parameters, the parameters of the five
// Conv2D layers with their gradient accumulators, and one activation buffer
// plus one gradient buffer per intermediate layer. Every pointer is allocated
// by initialize_autoencoder and released by free_autoencoder; activation and
// gradient buffers hold batch_size images back to back.
typedef struct {
    int batch_size;
    double learning_rate;
    // input dimensions
    int input_height;       // 32
    int input_width;        // 32
    int input_channels;     // 3
    // weights, biases and their gradients for each Conv2D layer
    float* w1; float* b1; float* d_w1; float* d_b1;
    float* w2; float* b2; float* d_w2; float* d_b2;
    float* w3; float* b3; float* d_w3; float* d_b3;
    float* w4; float* b4; float* d_w4; float* d_b4;
    float* w5; float* b5; float* d_w5; float* d_b5;

    // batch_input: images fed to the network (also the reconstruction target);
    // final_output: reconstruction produced by the last Conv2D layer;
    // loss_gradient: gradient of the MSE loss w.r.t. final_output.
    float* batch_input;
    float* final_output;
    float* loss_gradient;
    // output and gradient of each Conv2D/MaxPool/UpSample layer
    float* conv1_output;   float* d_conv1_output;
    float* pool1_output;   float* d_pool1_output;
    float* conv2_output;   float* d_conv2_output;
    float* pool2_output;   float* d_pool2_output; // LATENT SPACE
    float* conv3_output;   float* d_conv3_output;
    float* upsample1_output; float* d_upsample1_output;
    float* conv4_output;   float* d_conv4_output;
    float* upsample2_output; float* d_upsample2_output;
} CPUAutoEncoder;

// Fills `array` with `size` pseudo-random values in [min, max] using rand().
void random_initialize(float* array, int size, float min, float max);
// Stores the hyper-parameters and allocates/initialises every buffer of `autoencoder`.
void initialize_autoencoder(CPUAutoEncoder* autoencoder, int batch_size, double learning_rate);
// Frees every buffer owned by `autoencoder` (not the struct itself).
void free_autoencoder(CPUAutoEncoder* autoencoder);
// Runs the network on batch_input and writes the reconstruction to final_output.
void forward_autoencoder(CPUAutoEncoder* autoencoder);
// Computes the MSE loss gradient and the weight/bias gradients for the current batch.
void backward_autoencoder(CPUAutoEncoder* autoencoder);
// Applies one SGD step to all weights and biases using the stored gradients.
void update_autoencoder_parameters(CPUAutoEncoder* autoencoder);
// Writes the weights and biases of the Conv2D layers to a binary file.
void save_weights(CPUAutoEncoder* autoencoder, const char* filename);
// NOTE(review): definition not visible here — presumably runs the encoder on
// `num_images` images and stores the latent features in `features_output`; confirm.
void cpu_extract_features(CPUAutoEncoder* autoencoder, float* input_data, int num_images, float* features_output);
// NOTE(review): definition not visible here — presumably the inverse of save_weights; confirm.
void cpu_load_weights(CPUAutoEncoder* autoencoder, const char* filename);

Writing cpu_autoencoder.h


In [None]:
%%writefile cpu_autoencoder.c
#include "cpu_autoencoder.h"

// Fills `array` with `size` pseudo-random values drawn uniformly from
// [min, max]. Values come from rand(), so the sequence depends on the seed
// set with srand() by the caller.
void random_initialize(float* array, int size, float min, float max) {
    float* cursor = array;
    for (int remaining = size; remaining > 0; --remaining) {
        // Map rand() from [0, RAND_MAX] onto [0, 1], then onto [min, max].
        const float unit = (float)rand() / (float)RAND_MAX;
        *cursor = min + unit * (max - min);
        ++cursor;
    }
}

// Sets the first `size` elements of `array` to 0.0f (no-op when size <= 0).
void zero_initialize(float* array, int size) {
    float* cursor = array;
    int remaining = size;
    while (remaining > 0) {
        *cursor = 0.0f;
        ++cursor;
        --remaining;
    }
}


// Allocates and initialises the parameters of one Conv2D layer.
//
//   w  / dw : C_out * C_in * KERNEL_SIZE * KERNEL_SIZE floats
//   b  / db : C_out floats
//
// Weights and biases are drawn uniformly from [-0.05, 0.05]; both gradient
// buffers start at zero.
//
// If any allocation fails, an error is printed to stderr, whatever was
// allocated is released and all four output pointers are set to NULL, so the
// caller can detect the failure (and free() on them stays safe). Previously
// the NULL pointers were passed straight to the initialisers.
void initialize_conv_layer(float** w, float** b, float** dw, float** db, int C_in, int C_out) {
    int size_W = C_out * C_in * KERNEL_SIZE * KERNEL_SIZE;

    float* weights = (float*)malloc(size_W * sizeof(float));
    float* biases = (float*)malloc(C_out * sizeof(float));
    float* weight_grads = (float*)malloc(size_W * sizeof(float));
    float* bias_grads = (float*)malloc(C_out * sizeof(float));

    // malloc(0) may legitimately return NULL, so only a NULL result for a
    // non-empty request counts as a failure.
    int failed = (size_W > 0 && (weights == NULL || weight_grads == NULL)) ||
                 (C_out > 0 && (biases == NULL || bias_grads == NULL));
    if (failed) {
        fprintf(stderr, "ERROR: Memory allocation failed for Conv2D layer (C_in=%d, C_out=%d)\n", C_in, C_out);
        free(weights);
        free(biases);
        free(weight_grads);
        free(bias_grads);
        *w = NULL;
        *b = NULL;
        *dw = NULL;
        *db = NULL;
        return;
    }

    *w = weights;
    *b = biases;
    *dw = weight_grads;
    *db = bias_grads;

    random_initialize(*w, size_W, -0.05f, 0.05f);
    random_initialize(*b, C_out, -0.05f, 0.05f);
    zero_initialize(*dw, size_W);
    zero_initialize(*db, C_out);
}

// Allocates an uninitialised buffer of batch_size * H * W * C floats.
//
// Returns NULL if any dimension is negative, if the byte count would not fit
// in size_t, or if malloc fails. The element count is computed in size_t
// rather than int, so large batches no longer overflow before the
// allocation. The caller owns the returned memory and releases it with
// free().
float* allocate_buffer(int batch_size, int H, int W, int C) {
    if (batch_size < 0 || H < 0 || W < 0 || C < 0) {
        return NULL;
    }

    const size_t dims[4] = { (size_t)batch_size, (size_t)H, (size_t)W, (size_t)C };
    const size_t max_count = SIZE_MAX / sizeof(float);
    size_t count = 1;
    for (int i = 0; i < 4; i++) {
        // Refuse products whose byte size would wrap around size_t.
        if (dims[i] != 0 && count > max_count / dims[i]) {
            return NULL;
        }
        count *= dims[i];
    }

    return (float*)malloc(count * sizeof(float));
}


// Prepares `autoencoder` for training/inference on 32x32x3 images.
//
// Stores the hyper-parameters, creates the parameters of the five Conv2D
// layers (3 -> 256 -> 128 -> 128 -> 256 -> 3 channels) and allocates, for a
// batch of `batch_size` images, one activation buffer and one gradient buffer
// per layer. Spatial sizes follow the pooling/up-sampling chain:
// 32x32 -> 16x16 -> 8x8 (latent) -> 16x16 -> 32x32.
//
// Allocation results are not checked here; release everything with
// free_autoencoder().
void initialize_autoencoder(CPUAutoEncoder* autoencoder, int batch_size, double learning_rate) {
    // General parameters
    autoencoder->batch_size = batch_size;
    autoencoder->learning_rate = learning_rate;
    autoencoder->input_height = 32;
    autoencoder->input_width = 32;
    autoencoder->input_channels = 3;

    // Output channels of each Conv2D layer
    int C_in = 3, C1 = 256, C2 = 128, C3 = 128, C4 = 256, C5 = 3;
    // Spatial size at full, half and quarter resolution
    int H1 = 32, W1 = 32;
    int H2 = 16, W2 = 16;
    int H3 = 8, W3 = 8;

    // Weights, biases and gradient accumulators of each Conv2D layer
    initialize_conv_layer(&autoencoder->w1, &autoencoder->b1, &autoencoder->d_w1, &autoencoder->d_b1, C_in, C1);
    initialize_conv_layer(&autoencoder->w2, &autoencoder->b2, &autoencoder->d_w2, &autoencoder->d_b2, C1, C2);
    initialize_conv_layer(&autoencoder->w3, &autoencoder->b3, &autoencoder->d_w3, &autoencoder->d_b3, C2, C3);
    initialize_conv_layer(&autoencoder->w4, &autoencoder->b4, &autoencoder->d_w4, &autoencoder->d_b4, C3, C4);
    initialize_conv_layer(&autoencoder->w5, &autoencoder->b5, &autoencoder->d_w5, &autoencoder->d_b5, C4, C5);

    // Network input, reconstruction and loss gradient: 32x32x3
    autoencoder->batch_input = allocate_buffer(batch_size, H1, W1, C_in);
    autoencoder->final_output = allocate_buffer(batch_size, H1, W1, C5);
    autoencoder->loss_gradient = allocate_buffer(batch_size, H1, W1, C5);
    // Layer 1 (Conv1): 32x32x256
    autoencoder->conv1_output = allocate_buffer(batch_size, H1, W1, C1);
    autoencoder->d_conv1_output = allocate_buffer(batch_size, H1, W1, C1);
    // Layer 2 (Pool1): 16x16x256
    autoencoder->pool1_output = allocate_buffer(batch_size, H2, W2, C1);
    autoencoder->d_pool1_output = allocate_buffer(batch_size, H2, W2, C1);
    // Layer 3 (Conv2): 16x16x128
    autoencoder->conv2_output = allocate_buffer(batch_size, H2, W2, C2);
    autoencoder->d_conv2_output = allocate_buffer(batch_size, H2, W2, C2);
    // Layer 4 (Pool2 - latent): 8x8x128
    autoencoder->pool2_output = allocate_buffer(batch_size, H3, W3, C2);
    autoencoder->d_pool2_output = allocate_buffer(batch_size, H3, W3, C2);
    // Layer 5 (Conv3): 8x8x128
    autoencoder->conv3_output = allocate_buffer(batch_size, H3, W3, C3);
    autoencoder->d_conv3_output = allocate_buffer(batch_size, H3, W3, C3);
    // Layer 6 (UpSample1): 16x16x128
    autoencoder->upsample1_output = allocate_buffer(batch_size, H2, W2, C3);
    autoencoder->d_upsample1_output = allocate_buffer(batch_size, H2, W2, C3);
    // Layer 7 (Conv4): 16x16x256
    autoencoder->conv4_output = allocate_buffer(batch_size, H2, W2, C4);
    autoencoder->d_conv4_output = allocate_buffer(batch_size, H2, W2, C4);
    // Layer 8 (UpSample2): 32x32x256
    autoencoder->upsample2_output = allocate_buffer(batch_size, H1, W1, C4);
    autoencoder->d_upsample2_output = allocate_buffer(batch_size, H1, W1, C4);
}

// Frees *slot and resets it to NULL so that a later free is harmless.
static void release_float_buffer(float** slot) {
    free(*slot);
    *slot = NULL;
}

// Releases every buffer owned by `autoencoder` (the struct itself is not
// freed). Each pointer is reset to NULL afterwards, so calling this function
// twice on the same object no longer double-frees. A NULL `autoencoder` is
// ignored.
void free_autoencoder(CPUAutoEncoder* autoencoder) {
    if (autoencoder == NULL) {
        return;
    }

    // Conv2D weights, biases and their gradients
    float** parameters[] = {
        &autoencoder->w1, &autoencoder->b1, &autoencoder->d_w1, &autoencoder->d_b1,
        &autoencoder->w2, &autoencoder->b2, &autoencoder->d_w2, &autoencoder->d_b2,
        &autoencoder->w3, &autoencoder->b3, &autoencoder->d_w3, &autoencoder->d_b3,
        &autoencoder->w4, &autoencoder->b4, &autoencoder->d_w4, &autoencoder->d_b4,
        &autoencoder->w5, &autoencoder->b5, &autoencoder->d_w5, &autoencoder->d_b5
    };
    // Activation and gradient buffers
    float** buffers[] = {
        &autoencoder->batch_input,
        &autoencoder->final_output,
        &autoencoder->loss_gradient,
        &autoencoder->conv1_output, &autoencoder->d_conv1_output,
        &autoencoder->pool1_output, &autoencoder->d_pool1_output,
        &autoencoder->conv2_output, &autoencoder->d_conv2_output,
        &autoencoder->pool2_output, &autoencoder->d_pool2_output,
        &autoencoder->conv3_output, &autoencoder->d_conv3_output,
        &autoencoder->upsample1_output, &autoencoder->d_upsample1_output,
        &autoencoder->conv4_output, &autoencoder->d_conv4_output,
        &autoencoder->upsample2_output, &autoencoder->d_upsample2_output
    };

    for (size_t i = 0; i < sizeof(parameters) / sizeof(parameters[0]); i++) {
        release_float_buffer(parameters[i]);
    }
    for (size_t i = 0; i < sizeof(buffers) / sizeof(buffers[0]); i++) {
        release_float_buffer(buffers[i]);
    }
}

// Forward pass.
// Runs the whole network on every image of autoencoder->batch_input, one
// image at a time, and writes the reconstruction to
// autoencoder->final_output. Intermediate activations are stored in the
// per-layer buffers of the struct. ReLU is applied in place, so each
// convN_output buffer holds post-ReLU values once this function returns.
// All layer dimensions are hard-coded for 32x32x3 inputs.
void forward_autoencoder(CPUAutoEncoder* autoencoder) {
    int bs = autoencoder->batch_size;

    // Number of floats in one image's activation at each layer
    int size_input = 32 * 32 * 3;
    int size_L1 = 32 * 32 * 256;
    int size_L2 = 16 * 16 * 256;
    int size_L3 = 16 * 16 * 128;
    int size_L4 = 8 * 8 * 128; // Latent
    // Decoder sizes
    int size_L5 = 8 * 8 * 128;
    int size_L6 = 16 * 16 * 128;
    int size_L7 = 16 * 16 * 256;
    int size_L8 = 32 * 32 * 256;
    int size_Out = 32 * 32 * 3;
    for (int b = 0; b < bs; b++) {
        // Pointers to the slice of each buffer that belongs to image b
        float* ptr_input = autoencoder->batch_input + b * size_input;
        float* ptr_L1 = autoencoder->conv1_output + b * size_L1;
        float* ptr_L2 = autoencoder->pool1_output + b * size_L2;
        float* ptr_L3 = autoencoder->conv2_output + b * size_L3;
        float* ptr_L4 = autoencoder->pool2_output + b * size_L4;
        float* ptr_L5 = autoencoder->conv3_output + b * size_L5;
        float* ptr_L6 = autoencoder->upsample1_output + b * size_L6;
        float* ptr_L7 = autoencoder->conv4_output + b * size_L7;
        float* ptr_L8 = autoencoder->upsample2_output + b * size_L8;
        float* ptr_Out = autoencoder->final_output + b * size_Out;
        // --- ENCODER ---
        // L1: Conv1 + ReLU (3 -> 256 channels, 32x32)
        Conv2D_Forward(ptr_input, 32, 32, 3, autoencoder->w1, KERNEL_SIZE, KERNEL_SIZE, autoencoder->b1, CONV_PADDING, CONV_STRIDE, 256, ptr_L1, 32, 32);
        Relu(ptr_L1, size_L1, ptr_L1);

        // L2: Pool1 (32x32 -> 16x16)
        MaxPool2D_Forward(ptr_L1, 32, 32, POOL_SIZE, POOL_SIZE, POOL_STRIDE, 256, ptr_L2, 16, 16);

        // L3: Conv2 + ReLU (256 -> 128 channels, 16x16)
        Conv2D_Forward(ptr_L2, 16, 16, 256, autoencoder->w2, KERNEL_SIZE, KERNEL_SIZE, autoencoder->b2, CONV_PADDING, CONV_STRIDE, 128, ptr_L3, 16, 16);
        Relu(ptr_L3, size_L3, ptr_L3);

        // L4: Pool2 (Latent, 16x16 -> 8x8)
        MaxPool2D_Forward(ptr_L3, 16, 16, POOL_SIZE, POOL_SIZE, POOL_STRIDE, 128, ptr_L4, 8, 8);

        // --- DECODER ---
        // L5: Conv3 + ReLU (128 -> 128 channels, 8x8)
        Conv2D_Forward(ptr_L4, 8, 8, 128, autoencoder->w3, KERNEL_SIZE, KERNEL_SIZE, autoencoder->b3, CONV_PADDING, CONV_STRIDE, 128, ptr_L5, 8, 8);
        Relu(ptr_L5, size_L5, ptr_L5);

        // L6: UpSample1 (8x8 -> 16x16)
        UpSample2D_Forward(ptr_L5, 8, 8, UPSAMPLE_SIZE, 128, ptr_L6, 16, 16);

        // L7: Conv4 + ReLU (128 -> 256 channels, 16x16)
        Conv2D_Forward(ptr_L6, 16, 16, 128, autoencoder->w4, KERNEL_SIZE, KERNEL_SIZE, autoencoder->b4, CONV_PADDING, CONV_STRIDE, 256, ptr_L7, 16, 16);
        Relu(ptr_L7, size_L7, ptr_L7);

        // L8: UpSample2 (16x16 -> 32x32)
        UpSample2D_Forward(ptr_L7, 16, 16, UPSAMPLE_SIZE, 256, ptr_L8, 32, 32);

        // L9: Conv5 (Output, 256 -> 3 channels, no activation)
        Conv2D_Forward(ptr_L8, 32, 32, 256, autoencoder->w5, KERNEL_SIZE, KERNEL_SIZE, autoencoder->b5, CONV_PADDING, CONV_STRIDE, 3, ptr_Out, 32, 32);
    }
}


// Backward pass.
// Computes the gradient of the MSE reconstruction loss (final_output vs.
// batch_input) with respect to every Conv2D weight and bias. It expects the
// activation buffers to hold the values written by forward_autoencoder for
// the same batch.
//
// The loss gradient is computed once over the whole batch (so it already
// carries the 2 / (batch_size * 32*32*3) factor); the parameter gradients
// are reset to zero and then accumulated image by image. Layers are visited
// in reverse order and the d_* buffers are filled/modified in place, so the
// order of the calls below matters.
void backward_autoencoder(CPUAutoEncoder* autoencoder) {
    int bs = autoencoder->batch_size;
    int total_elements = bs * 32 * 32 * 3;
    MSE_Gradient(autoencoder->batch_input, autoencoder->final_output, total_elements, autoencoder->loss_gradient);
    // Reset the parameter gradients before accumulating over the batch
    zero_initialize(autoencoder->d_w1, 256*3*3*3); zero_initialize(autoencoder->d_b1, 256);
    zero_initialize(autoencoder->d_w2, 128*256*3*3); zero_initialize(autoencoder->d_b2, 128);
    zero_initialize(autoencoder->d_w3, 128*128*3*3); zero_initialize(autoencoder->d_b3, 128);
    zero_initialize(autoencoder->d_w4, 256*128*3*3); zero_initialize(autoencoder->d_b4, 256);
    zero_initialize(autoencoder->d_w5, 3*256*3*3); zero_initialize(autoencoder->d_b5, 3);
    // Number of floats in one image's activation at each layer (same as in the forward pass)
    int size_Out = 32*32*3;
    int size_L8 = 32*32*256;
    int size_L7 = 16*16*256;
    int size_L6 = 16*16*128;
    int size_L5 = 8*8*128;
    int size_L4 = 8*8*128;
    int size_L3 = 16*16*128;
    int size_L2 = 16*16*256;
    int size_L1 = 32*32*256;
    int size_In = 32*32*3;

    for (int b = 0; b < bs; b++) {
        // Pointers to the slice of each buffer that belongs to image b
        float* ptr_dOut = autoencoder->loss_gradient + b * size_Out;
        float* ptr_Upsample2_Out = autoencoder->upsample2_output + b * size_L8;
        float* ptr_d_Upsample2_Out = autoencoder->d_upsample2_output + b * size_L8;
        float* ptr_d_Conv4_Out = autoencoder->d_conv4_output + b * size_L7;
        float* ptr_Upsample1_Out = autoencoder->upsample1_output + b * size_L6;
        float* ptr_d_Upsample1_Out = autoencoder->d_upsample1_output + b * size_L6;
        float* ptr_d_Conv3_Out = autoencoder->d_conv3_output + b * size_L5;
        float* ptr_Pool2_Out = autoencoder->pool2_output + b * size_L4;
        float* ptr_d_Pool2_Out = autoencoder->d_pool2_output + b * size_L4;
        float* ptr_Conv2_Out = autoencoder->conv2_output + b * size_L3;
        float* ptr_d_Conv2_Out = autoencoder->d_conv2_output + b * size_L3;
        float* ptr_Pool1_Out = autoencoder->pool1_output + b * size_L2;
        float* ptr_d_Pool1_Out = autoencoder->d_pool1_output + b * size_L2;
        float* ptr_Conv1_Out = autoencoder->conv1_output + b * size_L1;
        float* ptr_d_Conv1_Out = autoencoder->d_conv1_output + b * size_L1;
        float* ptr_Input = autoencoder->batch_input + b * size_In;

        // === L9 (Conv5) ===
        // dW5, dB5 (accumulated over the batch)
        Conv2D_Backward_Kernel(ptr_dOut, 32, 32, ptr_Upsample2_Out, 32, 32, 256, 3, 3, 1, 1, 3, autoencoder->d_w5);
        Conv2D_Backward_Biases(ptr_dOut, 32, 32, 3, autoencoder->d_b5);
        // Gradient w.r.t. the input of Conv5, i.e. the output of L8
        Conv2D_Backward_Input(ptr_dOut, 32, 32, autoencoder->w5, 3, 3, 32, 32, 256, 1, 1, 3, ptr_d_Upsample2_Out);

        // === L8 (Upsample2) ===
        // The destination expression is the same address as ptr_d_Conv4_Out
        UpSample2D_Backward(ptr_d_Upsample2_Out, 32, 32, UPSAMPLE_SIZE, 256, autoencoder->d_conv4_output + b * size_L7, 16, 16);

        // === L7 (Conv4) ===
        // ReLU backward, masked with the stored Conv4 activations
        Relu_Backward(ptr_d_Conv4_Out, autoencoder->conv4_output + b * size_L7, 16*16*256);
        Conv2D_Backward_Kernel(ptr_d_Conv4_Out, 16, 16, ptr_Upsample1_Out, 16, 16, 128, 3, 3, 1, 1, 256, autoencoder->d_w4);
        Conv2D_Backward_Biases(ptr_d_Conv4_Out, 16, 16, 256, autoencoder->d_b4);
        Conv2D_Backward_Input(ptr_d_Conv4_Out, 16, 16, autoencoder->w4, 3, 3, 16, 16, 128, 1, 1, 256, ptr_d_Upsample1_Out);

        // === L6 (Upsample1) ===
        UpSample2D_Backward(ptr_d_Upsample1_Out, 16, 16, UPSAMPLE_SIZE, 128, ptr_d_Conv3_Out, 8, 8);

        // === L5 (Conv3) ===
        Relu_Backward(ptr_d_Conv3_Out, autoencoder->conv3_output + b * size_L5, 8*8*128);
        Conv2D_Backward_Kernel(ptr_d_Conv3_Out, 8, 8, ptr_Pool2_Out, 8, 8, 128, 3, 3, 1, 1, 128, autoencoder->d_w3);
        Conv2D_Backward_Biases(ptr_d_Conv3_Out, 8, 8, 128, autoencoder->d_b3);
        Conv2D_Backward_Input(ptr_d_Conv3_Out, 8, 8, autoencoder->w3, 3, 3, 8, 8, 128, 1, 1, 128, ptr_d_Pool2_Out);

        // === L4 (Pool2) ===
        // The pooling input was the Conv2 activation
        MaxPool2D_Backward(ptr_d_Pool2_Out, 8, 8, ptr_Conv2_Out, 16, 16, 2, 2, 2, 128, ptr_d_Conv2_Out);

        // === L3 (Conv2) ===
        Relu_Backward(ptr_d_Conv2_Out, ptr_Conv2_Out, 16*16*128);
        Conv2D_Backward_Kernel(ptr_d_Conv2_Out, 16, 16, ptr_Pool1_Out, 16, 16, 256, 3, 3, 1, 1, 128, autoencoder->d_w2);
        Conv2D_Backward_Biases(ptr_d_Conv2_Out, 16, 16, 128, autoencoder->d_b2);
        Conv2D_Backward_Input(ptr_d_Conv2_Out, 16, 16, autoencoder->w2, 3, 3, 16, 16, 256, 1, 1, 128, ptr_d_Pool1_Out);

        // === L2 (Pool1) ===
        // The pooling input was the Conv1 activation
        MaxPool2D_Backward(ptr_d_Pool1_Out, 16, 16, ptr_Conv1_Out, 32, 32, 2, 2, 2, 256, ptr_d_Conv1_Out);

        // === L1 (Conv1) ===
        // First layer: only parameter gradients are computed, no input gradient
        Relu_Backward(ptr_d_Conv1_Out, ptr_Conv1_Out, 32*32*256);
        Conv2D_Backward_Kernel(ptr_d_Conv1_Out, 32, 32, ptr_Input, 32, 32, 3, 3, 3, 1, 1, 256, autoencoder->d_w1);
        Conv2D_Backward_Biases(ptr_d_Conv1_Out, 32, 32, 256, autoencoder->d_b1);
    }
}

void update_autoencoder_parameters(CPUAutoEncoder* autoencoder) {
    // Applies one SGD_Update call to the kernel and to the bias of each of the
    // five convolution layers, using the gradients stored in d_w* / d_b*.
    // Per the original note, the intended rule is W += -learning_rate * dW.

    // Element counts of the five kernel tensors.
    const int kernel1_count = 256 * 3 * 3 * 3;
    const int kernel2_count = 128 * 256 * 3 * 3;
    const int kernel3_count = 128 * 128 * 3 * 3;
    const int kernel4_count = 256 * 128 * 3 * 3;
    const int kernel5_count = 3 * 256 * 3 * 3;

    // Conv1
    SGD_Update(autoencoder->w1, autoencoder->d_w1, autoencoder->learning_rate, kernel1_count);
    SGD_Update(autoencoder->b1, autoencoder->d_b1, autoencoder->learning_rate, 256);

    // Conv2
    SGD_Update(autoencoder->w2, autoencoder->d_w2, autoencoder->learning_rate, kernel2_count);
    SGD_Update(autoencoder->b2, autoencoder->d_b2, autoencoder->learning_rate, 128);

    // Conv3
    SGD_Update(autoencoder->w3, autoencoder->d_w3, autoencoder->learning_rate, kernel3_count);
    SGD_Update(autoencoder->b3, autoencoder->d_b3, autoencoder->learning_rate, 128);

    // Conv4
    SGD_Update(autoencoder->w4, autoencoder->d_w4, autoencoder->learning_rate, kernel4_count);
    SGD_Update(autoencoder->b4, autoencoder->d_b4, autoencoder->learning_rate, 256);

    // Conv5
    SGD_Update(autoencoder->w5, autoencoder->d_w5, autoencoder->learning_rate, kernel5_count);
    SGD_Update(autoencoder->b5, autoencoder->d_b5, autoencoder->learning_rate, 3);
}

// Writes `count` floats starting at `src` to `file`.
// Returns 1 when every element was written, 0 on a short write.
static int save_weights_block(FILE* file, const void* src, size_t count) {
    return fwrite(src, sizeof(float), count, file) == count;
}

// Saves the kernels and biases of the five Conv2D layers to `filename` as raw
// floats, in the order w1, b1, w2, b2, ..., w5, b5 (the order cpu_load_weights
// reads them back). Failures are reported on stdout; nothing is returned.
void save_weights(CPUAutoEncoder* autoencoder, const char* filename) {
    FILE* file = fopen(filename, "wb");
    if (file == NULL) {
        printf("Error opening file for writing weights.\n");
        return;
    }

    // Stop at the first failing block: later blocks would land at wrong offsets anyway.
    int ok = save_weights_block(file, autoencoder->w1, 256*3*3*3)
          && save_weights_block(file, autoencoder->b1, 256)
          && save_weights_block(file, autoencoder->w2, 128*256*3*3)
          && save_weights_block(file, autoencoder->b2, 128)
          && save_weights_block(file, autoencoder->w3, 128*128*3*3)
          && save_weights_block(file, autoencoder->b3, 128)
          && save_weights_block(file, autoencoder->w4, 256*128*3*3)
          && save_weights_block(file, autoencoder->b4, 256)
          && save_weights_block(file, autoencoder->w5, 3*256*3*3)
          && save_weights_block(file, autoencoder->b5, 3);

    // fclose flushes buffered data, so its failure also means an incomplete file.
    if (fclose(file) != 0) {
        ok = 0;
    }
    if (!ok) {
        printf("Error writing weights to file: %s is incomplete.\n", filename);
    }
}


void cpu_extract_features(CPUAutoEncoder* autoencoder, float* input_data, int num_images, float* features_output) {
    // Encoder-only forward pass: Conv1+ReLU -> Pool1 -> Conv2+ReLU -> Pool2.
    // Images are processed one at a time; the autoencoder's own activation
    // buffers are reused as scratch space for every image, and the Pool2
    // result (8*8*128 floats per image) is copied into features_output.
    const int input_elems  = 32 * 32 * 3;
    const int conv1_elems  = 32 * 32 * 256;
    const int conv2_elems  = 16 * 16 * 128;
    const int latent_elems = 8 * 8 * 128;

    // Scratch activations (same storage for every image).
    float* conv1 = autoencoder->conv1_output;
    float* pool1 = autoencoder->pool1_output;
    float* conv2 = autoencoder->conv2_output;
    float* pool2 = autoencoder->pool2_output;

    for (int img = 0; img < num_images; img++) {
        float* image  = input_data + img * input_elems;
        float* latent = features_output + img * latent_elems;

        // Conv1 + ReLU (in place): 32x32x3 -> 32x32x256
        Conv2D_Forward(image, 32, 32, 3, autoencoder->w1, KERNEL_SIZE, KERNEL_SIZE, autoencoder->b1, CONV_PADDING, CONV_STRIDE, 256, conv1, 32, 32);
        Relu(conv1, conv1_elems, conv1);

        // Pool1: 32x32x256 -> 16x16x256
        MaxPool2D_Forward(conv1, 32, 32, POOL_SIZE, POOL_SIZE, POOL_STRIDE, 256, pool1, 16, 16);

        // Conv2 + ReLU (in place): 16x16x256 -> 16x16x128
        Conv2D_Forward(pool1, 16, 16, 256, autoencoder->w2, KERNEL_SIZE, KERNEL_SIZE, autoencoder->b2, CONV_PADDING, CONV_STRIDE, 128, conv2, 16, 16);
        Relu(conv2, conv2_elems, conv2);

        // Pool2 (latent space): 16x16x128 -> 8x8x128
        MaxPool2D_Forward(conv2, 16, 16, POOL_SIZE, POOL_SIZE, POOL_STRIDE, 128, pool2, 8, 8);

        // Append this image's latent vector to the caller's output array.
        memcpy(latent, pool2, latent_elems * sizeof(float));
    }
}

// Reads `count` floats from `file` into `dst`.
// Returns 1 when every element was read, 0 on a short read.
static int load_weights_block(FILE* file, void* dst, size_t count) {
    return fread(dst, sizeof(float), count, file) == count;
}

// Loads the kernels and biases of the five Conv2D layers from `filename`,
// expecting raw floats in the order w1, b1, w2, b2, ..., w5, b5 (the order
// save_weights writes them). Failures are reported on stdout; on a short read
// the parameters after the failing block keep whatever values they had.
void cpu_load_weights(CPUAutoEncoder* autoencoder, const char* filename) {
    FILE* file = fopen(filename, "rb");
    if (file == NULL) {
        printf("Error opening file for reading weights.\n");
        return;
    }

    // Stop at the first short read: the remaining data cannot be trusted.
    int ok = load_weights_block(file, autoencoder->w1, 256*3*3*3)
          && load_weights_block(file, autoencoder->b1, 256)
          && load_weights_block(file, autoencoder->w2, 128*256*3*3)
          && load_weights_block(file, autoencoder->b2, 128)
          && load_weights_block(file, autoencoder->w3, 128*128*3*3)
          && load_weights_block(file, autoencoder->b3, 128)
          && load_weights_block(file, autoencoder->w4, 256*128*3*3)
          && load_weights_block(file, autoencoder->b4, 256)
          && load_weights_block(file, autoencoder->w5, 3*256*3*3)
          && load_weights_block(file, autoencoder->b5, 3);

    if (!ok) {
        printf("Error reading weights: %s is truncated or unreadable.\n", filename);
    }
    fclose(file);
}

Writing cpu_autoencoder.c


In [None]:
%%writefile load_data.h
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TRAIN_NUM    50000
#define TEST_NUM     10000
#define IMG_SIZE     (32*32*3)     // 3072

// In-memory CIFAR-10 dataset. All buffers are allocated by load_cifar10()
// and released by free_cifar10().
typedef struct {
    float* train_images;   // [50000 * 3072]
    float* test_images;    // [10000 * 3072]
    uint8_t* train_labels; // [50000]
    uint8_t* test_labels;  // [10000]
    int* train_indices;    // [50000] identity order after loading; permuted by shuffle_cifar10()
} Cifar10;

// Allocates every buffer and reads the five training batches plus the test
// batch from cifar-10-batches-bin/; exits the process on failure.
void load_cifar10(Cifar10* data);
// Divides every training and test image value by 255.
void normalize_cifar10(Cifar10* data);
// Randomly permutes train_indices using rand().
void shuffle_cifar10(Cifar10* data);
// Copies batch_size training images, selected through train_indices starting
// at position batch_id * batch_size, into batch_images.
void get_next_batch(Cifar10* data, size_t batch_size, size_t batch_id, float* batch_images);
// Prints the label and all values of the first two training images.
void print_cifar10(Cifar10* data);
// Frees every buffer and resets the pointers to NULL.
void free_cifar10(Cifar10* data);

Writing load_data.h


In [None]:
%%writefile load_data.c
#include "load_data.h"

// Reads one CIFAR-10 binary batch file: 10000 records of 3073 bytes each,
// where byte 0 is the label and the remaining 3072 bytes are pixel values.
// Labels go to labels[0..9999]; pixels are converted to float and stored
// contiguously from images_start. Exits the process if the file cannot be
// opened or a record is incomplete.
static void read_batch(const char* filename, float* images_start, uint8_t* labels) {
    enum { RECORDS = 10000, PIXELS = 3072, RECORD_BYTES = PIXELS + 1 };

    FILE* fp = fopen(filename, "rb");
    if (fp == NULL) {
        perror(filename);
        exit(EXIT_FAILURE);
    }

    uint8_t record[RECORD_BYTES];
    float* dst = images_start;
    for (int rec = 0; rec < RECORDS; ++rec) {
        size_t got = fread(record, 1, RECORD_BYTES, fp);
        if (got != RECORD_BYTES) {
            fprintf(stderr, "Error: incomplete read in %s at image %d\n", filename, rec);
            fclose(fp);
            exit(EXIT_FAILURE);
        }

        labels[rec] = record[0];

        // Convert the uint8 pixel bytes to float, appending to the output.
        const uint8_t* pixels = record + 1;
        for (int k = 0; k < PIXELS; ++k) {
            *dst++ = (float)pixels[k];
        }
    }
    fclose(fp);
}

// Allocates all dataset buffers, initialises train_indices to the identity
// order, and loads the five training batches plus the test batch from the
// cifar-10-batches-bin/ directory. Pixel values are stored as floats in the
// raw 0..255 range (see normalize_cifar10). Exits the process on allocation
// failure; read_batch exits on file errors.
void load_cifar10(Cifar10* data) {
    data->train_images  = (float*)malloc((size_t)TRAIN_NUM * IMG_SIZE * sizeof(float));
    data->test_images   = (float*)malloc((size_t)TEST_NUM  * IMG_SIZE * sizeof(float));
    data->train_labels  = (uint8_t*)malloc((size_t)TRAIN_NUM * sizeof(uint8_t));
    data->test_labels   = (uint8_t*)malloc((size_t)TEST_NUM  * sizeof(uint8_t));
    data->train_indices = (int*)malloc((size_t)TRAIN_NUM * sizeof(int));

    // train_indices is checked together with the other buffers so it is never
    // written through a NULL pointer below.
    if (!data->train_images || !data->test_images ||
        !data->train_labels || !data->test_labels ||
        !data->train_indices) {
        fprintf(stderr, "ERROR: Memory allocation failed!\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < TRAIN_NUM; i++) {
        data->train_indices[i] = i;
    }

    // Load training data: data_batch_1.bin .. data_batch_5.bin, 10000 images each.
    for (int i = 1; i <= 5; i++) {
        char filename[100];
        snprintf(filename, sizeof(filename), "cifar-10-batches-bin/data_batch_%d.bin", i);
        read_batch(filename,
                   data->train_images + (size_t)(i - 1) * 10000 * IMG_SIZE,
                   data->train_labels + (size_t)(i - 1) * 10000);
    }

    // Load test data.
    read_batch("cifar-10-batches-bin/test_batch.bin",
               data->test_images, data->test_labels);

    printf("CIFAR-10 loaded successfully\n");
}

// Scales every training and test image value by dividing it by 255.
void normalize_cifar10(Cifar10* data) {
    float* value = data->train_images;
    float* stop  = value + TRAIN_NUM * IMG_SIZE;
    while (value != stop) {
        *value /= 255.0f;
        ++value;
    }

    value = data->test_images;
    stop  = value + TEST_NUM * IMG_SIZE;
    while (value != stop) {
        *value /= 255.0f;
        ++value;
    }
}

// Randomly permutes train_indices in place: walking from the last slot down
// to the second, each slot is swapped with a rand()-chosen slot at or before it.
void shuffle_cifar10(Cifar10* data) {
    int* order = data->train_indices;
    int last = TRAIN_NUM - 1;
    while (last > 0) {
        int pick = rand() % (last + 1);
        int held = order[last];
        order[last] = order[pick];
        order[pick] = held;
        --last;
    }
}

// Copies batch_size training images into batch_images. Images are selected
// through train_indices, starting at position batch_id * batch_size.
// If the requested range runs past the end of the training set, positions
// wrap around to the start of train_indices, so the batch is always filled
// with valid images instead of reading past the index array.
void get_next_batch(Cifar10* data, size_t batch_size, size_t batch_id, float* batch_images) {
    size_t start = batch_id * batch_size;
    for (size_t i = 0; i < batch_size; i++) {
        size_t slot = (start + i) % (size_t)TRAIN_NUM;
        size_t idx = (size_t)data->train_indices[slot];

        memcpy(batch_images + i * IMG_SIZE,
               data->train_images + idx * IMG_SIZE,
               IMG_SIZE * sizeof(float));
    }
}

// Debug dump: prints the label and every stored value of the first two
// training images, one image per output line.
void print_cifar10(Cifar10* data){
    const int shown = 2;
    for (int img = 0; img < shown; img++) {
        const float* values = data->train_images + img * IMG_SIZE;
        printf("Label: %d\n", data->train_labels[img]);
        for (int k = 0; k < IMG_SIZE; k++) {
            printf("%f ", values[k]);
        }
        printf("\n");
    }
    // To dump test samples as well, repeat the loop above over
    // data->test_labels and data->test_images.
}

// Releases every buffer owned by the dataset and clears each pointer so the
// struct cannot be used to reach freed memory afterwards.
void free_cifar10(Cifar10* data) {
    free(data->train_images);
    data->train_images = NULL;

    free(data->test_images);
    data->test_images = NULL;

    free(data->train_labels);
    data->train_labels = NULL;

    free(data->test_labels);
    data->test_labels = NULL;

    free(data->train_indices);
    data->train_indices = NULL;
}

Writing load_data.c


In [None]:
%%writefile main.c
#include "load_data.h"
#include "cpu_autoencoder.h"
#include <time.h>
#include <sys/resource.h>
#include <stdint.h>

// Prints the process's ru_maxrss value from getrusage(), converted by two
// successive divisions by 1024 and labelled as MB and GB.
// NOTE(review): this assumes ru_maxrss is reported in kilobytes, as on Linux.
void print_memory_usage() {
    struct rusage ru;
    if (getrusage(RUSAGE_SELF, &ru) != 0) {
        printf("[SYSTEM] Error checking memory usage.\n");
        return;
    }

    double as_mb = ru.ru_maxrss / 1024.0;
    double as_gb = as_mb / 1024.0;
    printf("[SYSTEM] Memory Usage: %.2f MB (%.4f GB)\n", as_mb, as_gb);
}

// void save_summary(double total_time, double final_loss, FILE* file) {
//     fprintf(file, "\n*** Training Summary ***\n");
//     fprintf(file, "Total training time: %.2f seconds.\n", total_time);
//     fprintf(file, "Final reconstruction loss: %f\n", final_loss);
//     fclose(file);
// }

// Converts a normalised intensity to an 8-bit pixel value.
// Values are clamped to [0, 1] and then rounded to the nearest of the 256
// levels, so k / 255.0f maps back to exactly k for every k in 0..255
// (plain truncation can come out one level too low). NaN maps to 0.
uint8_t float_to_pixel(float val) {
    // Written as !(val > 0) so that NaN, which fails every comparison, is
    // caught here rather than reaching the float-to-integer cast.
    if (!(val > 0.0f)) return 0;
    if (val >= 1.0f) return 255;
    return (uint8_t)(val * 255.0f + 0.5f);
}

// Writes a planar float image as a binary P6 file with maxval 255.
// planar_data holds three consecutive planes of width*height values each,
// written as the R, G and B components of each pixel in that order; every
// value is converted with float_to_pixel(). The pixel buffer is built before
// the file is opened so an allocation failure never leaves a header-only
// file behind. Errors are reported on stdout.
void save_image_pnm(const char* filename, float* planar_data, int width, int height) {
    if (width <= 0 || height <= 0) {
        printf("Error: invalid image size %dx%d for %s\n", width, height, filename);
        return;
    }

    size_t plane_size = (size_t)width * (size_t)height;
    size_t byte_count = plane_size * 3;

    uint8_t* pixel_buffer = (uint8_t*)malloc(byte_count);
    if (!pixel_buffer) {
        printf("Error allocating pixel buffer\n");
        return;
    }

    // Convert float data to uint8_t and interleave the three planes per pixel.
    for (size_t p = 0; p < plane_size; p++) {
        pixel_buffer[3 * p]     = float_to_pixel(planar_data[p]);
        pixel_buffer[3 * p + 1] = float_to_pixel(planar_data[plane_size + p]);
        pixel_buffer[3 * p + 2] = float_to_pixel(planar_data[2 * plane_size + p]);
    }

    FILE* f = fopen(filename, "wb");
    if (!f) {
        printf("Error opening file %s for writing\n", filename);
        free(pixel_buffer);
        return;
    }

    // Header (P6, binary) followed by all pixel data in one write.
    int ok = fprintf(f, "P6\n%d %d\n255\n", width, height) > 0
          && fwrite(pixel_buffer, 1, byte_count, f) == byte_count;

    // fclose flushes buffered output, so its failure also means a bad file.
    if (fclose(f) != 0) {
        ok = 0;
    }
    if (!ok) {
        printf("Error writing image data to %s\n", filename);
    }

    free(pixel_buffer);
}

// Runs the autoencoder on the first batch of test images and saves up to
// num_samples original/reconstructed image pairs as sample_<i>_original.pnm
// and sample_<i>_reconstructed.pnm. Only one batch is processed, so
// num_samples is capped at the autoencoder's batch size.
void sample_reconstructions(CPUAutoEncoder* ae, Cifar10* data, int num_samples) {
    printf("\n*** Sampling Reconstructed Images ***\n");
    int batch_size = ae->batch_size;
    int img_size = 32 * 32 * 3;

    // batch_input / final_output hold batch_size images; reading more would
    // run past the end of those buffers.
    if (num_samples > batch_size) {
        num_samples = batch_size;
    }

    // Load the first batch_size test images straight into the network input.
    memcpy(ae->batch_input, data->test_images, (size_t)batch_size * img_size * sizeof(float));
    forward_autoencoder(ae);

    char filename[64];
    for (int i = 0; i < num_samples; i++) {
        // Save original image
        snprintf(filename, sizeof(filename), "sample_%d_original.pnm", i);
        save_image_pnm(filename, ae->batch_input + i * img_size, 32, 32);

        // Save reconstructed image
        snprintf(filename, sizeof(filename), "sample_%d_reconstructed.pnm", i);
        save_image_pnm(filename, ae->final_output + i * img_size, 32, 32);

        printf("Saved pair %d: %s vs %s\n", i, "original", "reconstructed");
    }
}

// Training entry point: loads and normalises CIFAR-10, trains the CPU
// autoencoder with per-batch SGD on the first train_subset_size positions of a
// freshly shuffled index order each epoch, prints a summary, and saves the
// weights to autoencoder_weights_cpu.bin.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;
    srand((unsigned int)time(NULL));

    // Load data
    Cifar10 data;
    load_cifar10(&data);
    normalize_cifar10(&data);
    printf("Data loaded and normalized.\n");

    // Hyperparameters
    int train_subset_size = 1000;
    int batch_size = 32; // Can be changed
    int num_epochs = 20; // Can be changed
    int num_batches = train_subset_size / batch_size;
    float learning_rate = 0.001;
    double total_time = 0.0;
    double final_loss = 0.0;

    float* batch_images = (float*)malloc((size_t)batch_size * IMG_SIZE * sizeof(float));
    if (!batch_images) {
        fprintf(stderr, "ERROR: Memory allocation failed!\n");
        free_cifar10(&data);
        return 1;
    }

    // Initialize AutoEncoder
    CPUAutoEncoder autoencoder;
    initialize_autoencoder(&autoencoder, batch_size, learning_rate);
    printf("Autoencoder initialized (batch_size=%d, learning_rate=%f)\n", batch_size, learning_rate);
    printf("Start training...\n");

    // Training loop
    for (int epoch = 0; epoch < num_epochs; epoch++) {
        clock_t start_time = clock();
        // Shuffle the training indices at the beginning of each epoch
        shuffle_cifar10(&data);
        double epoch_loss = 0.0;
        printf("Epoch %d/%d\n", epoch + 1, num_epochs);

        for (int batch_id = 0; batch_id < num_batches; batch_id++) {
            // Gather the current batch through the shuffled indices, then
            // copy it into the autoencoder input buffer.
            get_next_batch(&data, batch_size, batch_id, batch_images);
            memcpy(autoencoder.batch_input, batch_images,
                   (size_t)batch_size * IMG_SIZE * sizeof(float));

            forward_autoencoder(&autoencoder);
            // Loss is computed for display only.
            float current_loss = MSE(autoencoder.batch_input, autoencoder.final_output, batch_size * IMG_SIZE);
            epoch_loss += current_loss;
            backward_autoencoder(&autoencoder);
            update_autoencoder_parameters(&autoencoder);

            if ((batch_id + 1) % 100 == 0) {
                printf("[TRAIN] Epoch %d, batch %d/%d, loss = %f\n", epoch + 1, batch_id + 1, num_batches, current_loss);
            }
        }

        clock_t end_time = clock();
        double epoch_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
        total_time += epoch_time;
        printf("==> Epoch %d finished. Avg Loss: %f, time: %.2f seconds\n", epoch + 1, epoch_loss / num_batches, epoch_time);

        // Overwritten every epoch, so after the loop it holds the last
        // epoch's average whatever num_epochs is set to.
        final_loss = epoch_loss / num_batches;
    }

    printf("\n*** Training Summary ***\n");
    printf("Total training time: %.2f seconds.\n", total_time);
    printf("Final reconstruction loss: %f\n", final_loss);
    print_memory_usage();

    // Save weights after training
    save_weights(&autoencoder, "autoencoder_weights_cpu.bin");
    // Save 5 pairs of original and reconstructed images
    //sample_reconstructions(&autoencoder, &data, 5);

    // Free memory
    free_autoencoder(&autoencoder);
    free(batch_images);
    free_cifar10(&data);

    return 0;
}

Writing main.c


In [None]:
%%writefile extract_svm_features_cpu.c
// # include <cstdio>
// # include <cstdlib>
// # include <cstring>
// # include <cuda_runtime.h>


  #include "load_data.h"
  //#include "gpu_autoencoder.h"
  #include "cpu_autoencoder.h"


// Number of floats in one latent vector (128 * 8 * 8). Parenthesised so the
// macro expands as a single value inside any surrounding expression
// (e.g. x / AE_LATENT_DIM).
#define AE_LATENT_DIM (128 * 8 * 8)

// Writes one sample in LIBSVM text format: "label index:value ..." followed
// by a newline. Indices are 1-based and every feature is written, including
// zeros (no sparse skipping).
void write_svm_line(FILE* f, int label,
                    const float* feat, int dim)
{
    fprintf(f, "%d", label);

    int index = 1;
    while (index <= dim) {
        fprintf(f, " %d:%g", index, feat[index - 1]);
        index++;
    }

    fputs("\n", f);
}


// Encodes the first `count` images of one dataset split in batches of at most
// `batch_size` and writes one LIBSVM line per image to `out_path`.
// `h_batch` and `h_latent` are caller-owned scratch buffers sized for
// `batch_size` images and latent vectors respectively. `lower_tag` and
// `upper_tag` only select the wording of the progress messages.
// Returns 1 on success, 0 if the output file cannot be opened.
static int extract_split_to_file(CPUAutoEncoder* autoencoder,
                                 const float* images, const uint8_t* labels, int count,
                                 int batch_size, float* h_batch, float* h_latent,
                                 const char* out_path,
                                 const char* lower_tag, const char* upper_tag)
{
    FILE* out = fopen(out_path, "w");
    if (!out) {
        perror(out_path);
        return 0;
    }

    int num_batches = (count + batch_size - 1) / batch_size;

    printf("[SVM] Extracting %s features...\n", lower_tag);
    for (int b = 0; b < num_batches; ++b) {
        int start = b * batch_size;
        int cur_bs = batch_size;
        if (start + cur_bs > count) {
            cur_bs = count - start;  // last, partial batch
        }

        // Images [start, start + cur_bs) are contiguous, so one copy suffices.
        memcpy(h_batch,
               images + (size_t)start * IMG_SIZE,
               (size_t)cur_bs * IMG_SIZE * sizeof(float));

        // Encoder-only forward pass.
        cpu_extract_features(autoencoder, h_batch, cur_bs, h_latent);

        // Write the latent vectors in LIBSVM format.
        for (int i = 0; i < cur_bs; ++i) {
            int label = labels[start + i];
            const float* feat = h_latent + (size_t)i * AE_LATENT_DIM;
            write_svm_line(out, label, feat, AE_LATENT_DIM);
        }

        printf("[SVM][%s] Batch %d/%d done\n", upper_tag, b + 1, num_batches);
        fflush(stdout);
    }

    fclose(out);
    printf("[SVM] Saved %s\n", out_path);
    return 1;
}

// Feature-extraction entry point: loads CIFAR-10 and the trained CPU
// autoencoder weights, then writes encoder features for a subset of the
// training and test images to train_svm.txt and test_svm.txt (LIBSVM format).
// The weight file path is hard-coded; command-line arguments are unused.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const char* weight_file_cpu = "autoencoder_weights_cpu.bin";
    float learning_rate = 0.001;

    printf("[SVM] Loading CIFAR-10...\n");
    Cifar10 data;
    load_cifar10(&data);
    normalize_cifar10(&data);

    // Batch size used by the encoder while extracting features.
    int batch_size = 64;

    CPUAutoEncoder autoencoder;
    initialize_autoencoder(&autoencoder, batch_size, learning_rate);
    cpu_load_weights(&autoencoder, weight_file_cpu);

    float* h_batch  = (float*)malloc((size_t)batch_size * IMG_SIZE * sizeof(float));
    float* h_latent = (float*)malloc((size_t)batch_size * AE_LATENT_DIM * sizeof(float));

    int status = 0;
    if (!h_batch || !h_latent) {
        fprintf(stderr, "Host malloc failed\n");
        status = 1;
    }

    // Subsets currently used; set to TRAIN_NUM / TEST_NUM for the full dataset.
    int N_train = 10000;
    int N_test  = 2000;

    // ====== TRAIN: first N_train images -> train_svm.txt ======
    if (status == 0 &&
        !extract_split_to_file(&autoencoder, data.train_images, data.train_labels, N_train,
                               batch_size, h_batch, h_latent,
                               "train_svm.txt", "train", "TRAIN")) {
        status = 1;
    }

    // ====== TEST: first N_test images -> test_svm.txt ======
    if (status == 0 &&
        !extract_split_to_file(&autoencoder, data.test_images, data.test_labels, N_test,
                               batch_size, h_batch, h_latent,
                               "test_svm.txt", "test", "TEST")) {
        status = 1;
    }

    // Cleanup (also reached on failure so nothing is leaked).
    free_autoencoder(&autoencoder);
    free(h_batch);
    free(h_latent);
    free_cifar10(&data);

    if (status == 0) {
        printf("[SVM] Done.\n");
    }
    return status;
}

Writing extract_svm_features_cpu.c


In [None]:
!gcc main.c load_data.c cpu_layers.c cpu_autoencoder.c -o run_model -lm

In [None]:
!./run_model

CIFAR-10 loaded successfully
Data loaded and normalized.
Autoencoder initialized (batch_size=32, learning_rate=0.001000)
Start training...
Epoch 1/20
==> Epoch 1 finished. Avg Loss: 0.279725, time: 2200.85 seconds
Epoch 2/20
==> Epoch 2 finished. Avg Loss: 0.212086, time: 2150.78 seconds
Epoch 3/20
==> Epoch 3 finished. Avg Loss: 0.159211, time: 2146.80 seconds
Epoch 4/20
==> Epoch 4 finished. Avg Loss: 0.110326, time: 2170.06 seconds
Epoch 5/20
==> Epoch 5 finished. Avg Loss: 0.079942, time: 2154.88 seconds
Epoch 6/20
==> Epoch 6 finished. Avg Loss: 0.065304, time: 2164.51 seconds
Epoch 7/20
==> Epoch 7 finished. Avg Loss: 0.060133, time: 2167.37 seconds
Epoch 8/20
==> Epoch 8 finished. Avg Loss: 0.057733, time: 2181.04 seconds
Epoch 9/20
==> Epoch 9 finished. Avg Loss: 0.056533, time: 2160.87 seconds
Epoch 10/20
==> Epoch 10 finished. Avg Loss: 0.056988, time: 2162.17 seconds
Epoch 11/20
==> Epoch 11 finished. Avg Loss: 0.055740, time: 2166.18 seconds
Epoch 12/20
==> Epoch 12 finishe

## 2) Trích xuất đặc trưng:

In [None]:
!gcc extract_svm_features_cpu.c load_data.c cpu_layers.c cpu_autoencoder.c -o run_cpu -lm

In [None]:
!./run_cpu

[SVM] Loading CIFAR-10...
CIFAR-10 loaded successfully
[SVM] Extracting train features...
[SVM][TRAIN] Batch 1/157 done
[SVM][TRAIN] Batch 2/157 done
[SVM][TRAIN] Batch 3/157 done
[SVM][TRAIN] Batch 4/157 done
[SVM][TRAIN] Batch 5/157 done
[SVM][TRAIN] Batch 6/157 done
[SVM][TRAIN] Batch 7/157 done
[SVM][TRAIN] Batch 8/157 done
[SVM][TRAIN] Batch 9/157 done
[SVM][TRAIN] Batch 10/157 done
[SVM][TRAIN] Batch 11/157 done
[SVM][TRAIN] Batch 12/157 done
[SVM][TRAIN] Batch 13/157 done
[SVM][TRAIN] Batch 14/157 done
[SVM][TRAIN] Batch 15/157 done
[SVM][TRAIN] Batch 16/157 done
[SVM][TRAIN] Batch 17/157 done
[SVM][TRAIN] Batch 18/157 done
[SVM][TRAIN] Batch 19/157 done
[SVM][TRAIN] Batch 20/157 done
[SVM][TRAIN] Batch 21/157 done
[SVM][TRAIN] Batch 22/157 done
[SVM][TRAIN] Batch 23/157 done
[SVM][TRAIN] Batch 24/157 done
[SVM][TRAIN] Batch 25/157 done
[SVM][TRAIN] Batch 26/157 done
[SVM][TRAIN] Batch 27/157 done
[SVM][TRAIN] Batch 28/157 done
[SVM][TRAIN] Batch 29/157 done
[SVM][TRAIN] Batch 3

In [None]:
!head -1 train_svm.txt

6 1:0.0758321 2:0.0997156 3:0.101006 4:0.104555 5:0.0996163 6:0.105693 7:0.115024 8:0.135557 9:0.0746598 10:0.0581133 11:0.073651 12:0.0568446 13:0.0886734 14:0.0670331 15:0.0723226 16:0.0824314 17:0.0888202 18:0.0697416 19:0.0611363 20:0.143325 21:0.139629 22:0.1219 23:0.0904831 24:0.0907371 25:0.0822361 26:0.0578712 27:0.145694 28:0.184246 29:0.185922 30:0.210267 31:0.160294 32:0.0933218 33:0.0639415 34:0.177734 35:0.194098 36:0.139713 37:0.10564 38:0.175907 39:0.104334 40:0.105946 41:0.0629558 42:0.0996863 43:0.090156 44:0.113609 45:0.0742682 46:0.0619334 47:0.0254723 48:0.11474 49:0.0910471 50:0.066766 51:0.0664103 52:0.0678658 53:0.0645771 54:0.0592995 55:0.0716544 56:0.113293 57:0.0856906 58:0.103317 59:0.0793105 60:0.0740893 61:0.0749404 62:0.0585442 63:0.0584144 64:0.106119 65:0.119777 66:0.148933 67:0.153089 68:0.148225 69:0.14583 70:0.159303 71:0.16475 72:0.14545 73:0.140957 74:0.150326 75:0.149316 76:0.134958 77:0.178343 78:0.156475 79:0.160586 80:0.133052 81:0.176826 82:0.1