In [None]:
%%writefile ImageLib.h

#pragma once
using byte = unsigned char;

// Dynamic allocation and release of 1-D/2-D/3-D/4-D arrays of double.
//
// Each dmatrixND() returns a newly allocated, zero-initialized array that is
// indexed as [nH], [nH][nW], [nH][nW][nC] or [nH][nW][nC][nNum] respectively.
double *dmatrix1D(int nH);
double **dmatrix2D(int nH, int nW);
double ***dmatrix3D(int nH, int nW, int nC);
double ****dmatrix4D(int nH, int nW, int nC, int nNum);

// Each free_dmatrixND() releases an array obtained from the matching
// dmatrixND(). The outer dimensions must be the ones used at allocation time
// because they drive the traversal; the innermost dimension is not needed to
// release the storage.
void free_dmatrix1D(double *Image, int nH);
void free_dmatrix2D(double **Image, int nH, int nW);
void free_dmatrix3D(double ***Image, int nH, int nW, int nC);
void free_dmatrix4D(double ****Image, int nH, int nW, int nC, int nNum);

Writing ImageLib.h


In [None]:
%%writefile ImageLib.cpp

#include "ImageLib.h"

// Allocates an array of nH doubles, every element value-initialized to 0.0.
// Release the result with free_dmatrix1D().
double *dmatrix1D(int nH) {
    double *buffer = new double[nH]();
    return buffer;
}

// Allocates an nH x nW array as nH independently allocated rows of nW
// zero-initialized doubles. Release the result with free_dmatrix2D().
double **dmatrix2D(int nH, int nW) {
    double **rows = new double*[nH];
    double **const rows_end = rows + nH;
    for (double **row = rows; row != rows_end; ++row) {
        *row = new double[nW]();
    }
    return rows;
}

// Allocates an nH x nW x nC array of zero-initialized doubles, indexed as
// result[h][w][c]. Every level is a separate allocation; release the result
// with free_dmatrix3D().
double ***dmatrix3D(int nH, int nW, int nC) {
    double ***volume = new double**[nH];
    for (int row = 0; row < nH; ++row) {
        double **columns = new double*[nW];
        volume[row] = columns;
        for (int col = 0; col < nW; ++col) {
            columns[col] = new double[nC]();
        }
    }
    return volume;
}

// Allocates an nH x nW x nC x nNum array of zero-initialized doubles, indexed
// as result[h][w][c][n]. Every level is a separate allocation; release the
// result with free_dmatrix4D().
double ****dmatrix4D(int nH, int nW, int nC, int nNum) {
    double ****hyper = new double***[nH];
    for (int row = 0; row < nH; ++row) {
        double ***plane = new double**[nW];
        hyper[row] = plane;
        for (int col = 0; col < nW; ++col) {
            double **line = new double*[nC];
            plane[col] = line;
            for (int ch = 0; ch < nC; ++ch) {
                line[ch] = new double[nNum]();
            }
        }
    }
    return hyper;
}

// Releases an array obtained from dmatrix1D(). The length argument is accepted
// for symmetry with the other free_dmatrixND() functions and is not used.
void free_dmatrix1D(double *Image, int /*nH*/) {
    delete[] Image;
}

// Releases an array obtained from dmatrix2D(nH, nW).
//
// Image: array to release; a null pointer is accepted and ignored, matching
//        the behaviour of free_dmatrix1D().
// nH:    number of rows that were allocated.
// The column count is not needed to release the storage.
void free_dmatrix2D(double **Image, int nH, int) {
    if (!Image) return;  // nothing to release; avoids dereferencing null below
    for (int y = 0; y < nH; ++y) delete[] Image[y];
    delete[] Image;
}

// Releases an array obtained from dmatrix3D(nH, nW, nC).
//
// Image:  array to release; a null pointer is accepted and ignored, matching
//         the behaviour of free_dmatrix1D().
// nH, nW: outer dimensions that were allocated.
// The innermost dimension is not needed to release the storage.
void free_dmatrix3D(double ***Image, int nH, int nW, int) {
    if (!Image) return;  // nothing to release; avoids dereferencing null below
    for (int y = 0; y < nH; ++y) {
        for (int x = 0; x < nW; ++x) delete[] Image[y][x];
        delete[] Image[y];
    }
    delete[] Image;
}

// Releases an array obtained from dmatrix4D(nH, nW, nC, nNum).
//
// Image:      array to release; a null pointer is accepted and ignored,
//             matching the behaviour of free_dmatrix1D().
// nH, nW, nC: outer dimensions that were allocated.
// The innermost dimension is not needed to release the storage.
void free_dmatrix4D(double ****Image, int nH, int nW, int nC, int) {
    if (!Image) return;  // nothing to release; avoids dereferencing null below
    for (int y = 0; y < nH; ++y) {
        for (int x = 0; x < nW; ++x) {
            for (int c = 0; c < nC; ++c) delete[] Image[y][x][c];
            delete[] Image[y][x];
        }
        delete[] Image[y];
    }
    delete[] Image;
}

Writing ImageLib.cpp


In [None]:
%%writefile CTensor.h

#pragma once
#include "ImageLib.h"
#include <iostream>

/// Tensor3D manages a 3-D tensor of size (nH x nW x nC), stored as a
/// pointer-to-pointer array indexed as tensor[h][w][c].
///
/// Ownership: the destructor releases whatever buffer `tensor` currently
/// points to via free_dmatrix3D(). Copying makes a deep copy and moving
/// transfers the buffer, so two objects never release the same storage.
class Tensor3D {
private:
    double*** tensor;
    int nH; // height
    int nW; // width
    int nC; // channel

    // Releases the held buffer (if any) and leaves the pointer null.
    void release() {
        if (tensor) {
            free_dmatrix3D(tensor, nH, nW, nC);
            tensor = nullptr;
        }
    }

    // Returns a newly allocated element-wise copy of `src` with the given
    // dimensions, or nullptr when `src` is null.
    static double*** clone(double*** src, int h, int w, int c) {
        if (!src) return nullptr;
        double*** dst = dmatrix3D(h, w, c);
        for (int i = 0; i < h; ++i)
            for (int j = 0; j < w; ++j)
                for (int k = 0; k < c; ++k)
                    dst[i][j][k] = src[i][j][k];
        return dst;
    }

public:
    // Behaviour: 1) dynamically allocates the 3-D array and stores its address in `tensor`
    //            2) initializes every element to 0
    // Uses: dmatrix3D()
    Tensor3D(int _nH, int _nW, int _nC) : tensor(nullptr), nH(_nH), nW(_nW), nC(_nC) {
        tensor = dmatrix3D(nH, nW, nC);
        // Explicit zero fill so the "all zeros" guarantee is enforced here
        // regardless of how the allocator initializes the storage.
        for (int h = 0; h < nH; ++h) {
            for (int w = 0; w < nW; ++w) {
                for (int c = 0; c < nC; ++c) {
                    tensor[h][w][c] = 0.0;
                }
            }
        }
    }

    // Deep copy: the new object gets its own buffer with the same contents.
    Tensor3D(const Tensor3D& other)
        : tensor(clone(other.tensor, other.nH, other.nW, other.nC)),
          nH(other.nH), nW(other.nW), nC(other.nC) {}

    // Move: takes over `other`'s buffer. `other` keeps its dimensions but no
    // longer holds storage, so it may only be destroyed or assigned to.
    Tensor3D(Tensor3D&& other) noexcept
        : tensor(other.tensor), nH(other.nH), nW(other.nW), nC(other.nC) {
        other.tensor = nullptr;
    }

    // Deep-copy assignment. The copy is built before the old buffer is
    // released, so a failed allocation leaves *this untouched.
    Tensor3D& operator=(const Tensor3D& other) {
        if (this != &other) {
            double*** fresh = clone(other.tensor, other.nH, other.nW, other.nC);
            release();  // must run while nH/nW/nC still describe the old buffer
            tensor = fresh;
            nH = other.nH;
            nW = other.nW;
            nC = other.nC;
        }
        return *this;
    }

    // Move assignment: releases the current buffer and takes over `other`'s.
    Tensor3D& operator=(Tensor3D&& other) noexcept {
        if (this != &other) {
            release();  // must run while nH/nW/nC still describe the old buffer
            tensor = other.tensor;
            nH = other.nH;
            nW = other.nW;
            nC = other.nC;
            other.tensor = nullptr;
        }
        return *this;
    }

    // Behaviour: releases the dynamically allocated 3-D array
    // Uses: free_dmatrix3D()
    ~Tensor3D() { release(); }

    // Sets the element at row=_h, column=_w, channel=_c. No bounds checking.
    inline void set_elem(int _h, int _w, int _c, double _val) { tensor[_h][_w][_c] = _val; }

    // Behaviour: returns the element at row=_h, column=_w, channel=_c. No bounds checking.
    inline double get_elem(int _h, int _w, int _c) const {
        return tensor[_h][_w][_c];
    }

    // Behaviour: returns the dimensions (nH, nW, nC) through the reference parameters.
    inline void get_info(int& _nH, int& _nW, int& _nC) const {
        _nH = nH; _nW = nW; _nC = nC;
    }

    // Replaces the stored pointer. The previous buffer is NOT released here,
    // and the stored dimensions are not changed.
    // NOTE(review): the destructor will release the new pointer using the
    // stored dimensions, so it presumably has to come from
    // dmatrix3D(nH, nW, nC) - confirm against callers.
    inline void set_tensor(double*** _tensor) { tensor = _tensor; }

    // Returns the raw buffer pointer; ownership stays with this object.
    inline double*** get_tensor() const { return tensor; }

    // Behaviour: prints the tensor dimensions and element count (nH*nW*nC).
    void print() const {
        std::cout << "Tensor3D size: "
                  << nH << " x " << nW << " x " << nC
                  << " (" << static_cast<long long>(nH) * nW * nC << " elements)"
                  << std::endl;
    }
};

Writing CTensor.h


In [None]:
%%writefile CLayer.h
#pragma once
#include <iostream>
#include <fstream>
#include <stdexcept>
#include <string>
#include "ImageLib.h"
#include "CTensor.h"

#define MEAN_INIT 0
#define LOAD_INIT 1

using std::cout;
using std::endl;
using std::string;

/// Abstract interface shared by all layers in this file.
///
/// A layer carries a display name plus three integers (fK, fC_in, fC_out);
/// Layer_Conv uses them as kernel size and input/output channel counts.
class Layer {
protected:
    int fK;
    int fC_in;
    int fC_out;
    string name;

public:
    // Initializers are listed in member declaration order, which is the order
    // in which the members are actually initialized.
    Layer(string _name, int _fK, int _fC_in, int _fC_out)
        : fK(_fK),
          fC_in(_fC_in),
          fC_out(_fC_out),
          name(_name)
    {
    }

    virtual ~Layer()
    {
    }

    // Computes the layer output for `input`. The implementations in this file
    // return a tensor allocated with `new`.
    virtual Tensor3D* forward(const Tensor3D* input) = 0;

    // Writes a one-line description of the layer to standard output.
    virtual void print() const = 0;

    // Copies the layer's name and parameters into the reference arguments.
    virtual void get_info(string& _name, int& _fK, int& _fC_in, int& _fC_out) const = 0;

    // Parallel-execution switch; the base implementation ignores the request.
    virtual void set_parallel(bool)
    {
    }
};

class Layer_ReLU : public Layer {
    bool use_parallel = false;
public:
    Layer_ReLU(string _name, int _fK, int _fC_in, int _fC_out)
        : Layer(_name, _fK, _fC_in, _fC_out) {}
    ~Layer_ReLU() {}

    void set_parallel(bool on) override { use_parallel = on; }

    Tensor3D* forward(const Tensor3D* input) override {
        int nH, nW, nC;
        input->get_info(nH, nW, nC);
        Tensor3D* output = new Tensor3D(nH, nW, nC);

        if (!use_parallel) {
            for (int h = 0; h < nH; ++h)
                for (int w = 0; w < nW; ++w)
                    for (int c = 0; c < nC; ++c) {
                        double x = input->get_elem(h, w, c);
                        output->set_elem(h, w, c, x > 0.0 ? x : 0.0);
                    }
        } else {
            #ifdef _OPENMP
            #pragma omp parallel for collapse(3) schedule(static)
            #endif
            for (int c = 0; c < nC; ++c)
                for (int h = 0; h < nH; ++h)
                    for (int w = 0; w < nW; ++w) {
                        double x = input->get_elem(h, w, c);
                        output->set_elem(h, w, c, x > 0.0 ? x : 0.0);
                    }
        }

        cout << name << " is finished" << endl;
        return output;
    }

    void get_info(string& _name, int& _fK, int& _fC_in, int& _fC_out) const override {
        _name = name; _fK = fK; _fC_in = fC_in; _fC_out = fC_out;
    }

    void print() const override {
        cout << "[Layer_ReLU] " << name
             << " | fK=" << fK
             << " | C_in=" << fC_in
             << " | C_out=" << fC_out
             << endl;
    }
};

class Layer_Conv : public Layer {
private:
    string filename_weight;
    string filename_bias;
    double**** weight_tensor; // [fK][fK][fC_in][fC_out]
    double*   bias_tensor;    // [fC_out]
    bool use_parallel = false;

    void init(int init_type) {
        weight_tensor = dmatrix4D(fK, fK, fC_in, fC_out);
        bias_tensor   = dmatrix1D(fC_out);

        if (init_type == MEAN_INIT) {
            const double val = 1.0 / static_cast<double>(fK * fK * fC_in);
            for (int kh=0; kh<fK; ++kh)
                for (int kw=0; kw<fK; ++kw)
                    for (int ic=0; ic<fC_in; ++ic)
                        for (int oc=0; oc<fC_out; ++oc)
                            weight_tensor[kh][kw][ic][oc] = val;
            for (int oc=0; oc<fC_out; ++oc) bias_tensor[oc] = 0.0;
        } else if (init_type == LOAD_INIT) {
            if (filename_weight.empty() || filename_bias.empty())
                throw std::runtime_error("LOAD_INIT requires valid weight/bias filenames.");

            std::ifstream fw(filename_weight);
            if (!fw.is_open()) throw std::runtime_error("Failed to open weight file: " + filename_weight);
            for (int kh=0; kh<fK; ++kh)
                for (int kw=0; kw<fK; ++kw)
                    for (int ic=0; ic<fC_in; ++ic)
                        for (int oc=0; oc<fC_out; ++oc) {
                            double v;
                            if (!(fw >> v)) throw std::runtime_error("Invalid weight data in " + filename_weight);
                            weight_tensor[kh][kw][ic][oc] = v;
                        }
            fw.close();

            std::ifstream fb(filename_bias);
            if (!fb.is_open()) throw std::runtime_error("Failed to open bias file: " + filename_bias);
            for (int oc=0; oc<fC_out; ++oc) {
                double b;
                if (!(fb >> b)) throw std::runtime_error("Invalid bias data in " + filename_bias);
                bias_tensor[oc] = b;
            }
            fb.close();
        } else {
            throw std::runtime_error("Unknown init_type for Layer_Conv");
        }
    }

public:
    Layer_Conv(string _name, int _fK, int _fC_in, int _fC_out, int init_type,
               string _filename_weight = "", string _filename_bias = "")
        : Layer(_name, _fK, _fC_in, _fC_out),
          filename_weight(_filename_weight),
          filename_bias(_filename_bias),
          weight_tensor(nullptr),
          bias_tensor(nullptr)
    {
        init(init_type);
    }

    ~Layer_Conv() override {
        if (weight_tensor) { free_dmatrix4D(weight_tensor, fK, fK, fC_in, fC_out); weight_tensor = nullptr; }
        if (bias_tensor)   { free_dmatrix1D(bias_tensor, fC_out); bias_tensor = nullptr; }
    }

    void set_parallel(bool on) override { use_parallel = on; }

    Tensor3D* forward(const Tensor3D* input) override {
        int inH, inW, inC;
        input->get_info(inH, inW, inC);
        if (inC != fC_in) throw std::runtime_error("Conv: input channel mismatch.");
        if (inH < fK || inW < fK) throw std::runtime_error("Conv: input smaller than kernel.");

        const int outH = inH - fK + 1;
        const int outW = inW - fK + 1;
        Tensor3D* output = new Tensor3D(outH, outW, fC_out);

        if (!use_parallel) {
            for (int h=0; h<outH; ++h){
                for (int w=0; w<outW; ++w){
                    for (int oc=0; oc<fC_out; ++oc){
                        double acc = 0.0;
                        for (int kh=0; kh<fK; ++kh)
                            for (int kw=0; kw<fK; ++kw)
                                for (int ic=0; ic<fC_in; ++ic){
                                    double x = input->get_elem(h+kh, w+kw, ic);
                                    acc += weight_tensor[kh][kw][ic][oc] * x;
                                }
                        acc += bias_tensor[oc];
                        output->set_elem(h, w, oc, acc);
                    }
                }
            }
        } else {
            #ifdef _OPENMP
            #pragma omp parallel for collapse(3) schedule(static)
            #endif
            for (int oc=0; oc<fC_out; ++oc){
                for (int h=0; h<outH; ++h){
                    for (int w=0; w<outW; ++w){
                        double acc = 0.0;
                        for (int ic=0; ic<fC_in; ++ic)
                            for (int kh=0; kh<fK; ++kh)
                                for (int kw=0; kw<fK; ++kw){
                                    double x = input->get_elem(h+kh, w+kw, ic);
                                    acc += weight_tensor[kh][kw][ic][oc] * x;
                                }
                        acc += bias_tensor[oc];
                        output->set_elem(h, w, oc, acc);
                    }
                }
            }
        }

        cout << name << " is finished" << endl;
        return output;
    }

    void get_info(string& _name, int& _fK, int& _fC_in, int& _fC_out) const override {
        _name = name; _fK = fK; _fC_in = fC_in; _fC_out = fC_out;
    }

    void print() const override {
        cout << "[Layer_Conv] " << name
             << " | fK=" << fK
             << " | C_in=" << fC_in
             << " | C_out=" << fC_out
             << " | stride=1, padding=0" << endl;
    }
};

Overwriting CLayer.h


In [None]:
from google.colab import files
uploaded = files.upload()

Saving tensor_5x5x2.txt to tensor_5x5x2.txt


In [None]:
%%writefile main.cpp
#include <chrono>
#include <exception>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "ImageLib.h"
#include "CLayer.h"
using namespace std;

// Prints every element of `tensor` to standard output, one channel at a time,
// as a grid of rows and columns with each value right-aligned in 8 characters.
// Leaves the stream precision of std::cout set to 3.
void print_all_elements(const Tensor3D& tensor) {
    int rows = 0;
    int cols = 0;
    int channels = 0;
    tensor.get_info(rows, cols, channels);

    std::cout.precision(3);

    int ch = 0;
    while (ch < channels) {
        std::cout << ch << "-th channel:" << std::endl;
        for (int r = 0; r < rows; ++r) {
            for (int q = 0; q < cols; ++q) {
                const double value = tensor.get_elem(r, q, ch);
                std::cout << std::setw(8) << value;
            }
            std::cout << std::endl;
        }
        ++ch;
    }
}

// Reads a 5 x 5 x 2 tensor of whitespace-separated values from `filename`.
//
// Values are consumed channel by channel; within a channel the inner loop
// advances h, so consecutive file values fill one column (fixed w) from top
// to bottom.
// NOTE(review): confirm that this traversal matches the layout of the data file.
//
// Returns a tensor allocated with `new` (the caller deletes it), or nullptr
// after printing a message to stderr when the file cannot be opened or does
// not contain 50 readable numbers.
Tensor3D* read_tensor(string filename) {
    ifstream fin(filename);
    if (!fin.is_open()) {
        cerr << "Failed to open: " << filename << endl;
        return nullptr;
    }
    const int fK = 5;
    const int fC = 2;
    Tensor3D* temp = new Tensor3D(fK, fK, fC);
    for (int c = 0; c < fC; c++) {
        for (int w = 0; w < fK; w++) {
            for (int h = 0; h < fK; h++) {
                double val = 0.0;
                if (!(fin >> val)) {
                    // Short or malformed file: report it instead of silently
                    // returning a partially filled tensor, and do not leak.
                    cerr << "Invalid or incomplete tensor data in: " << filename << endl;
                    delete temp;
                    return nullptr;
                }
                temp->set_elem(h, w, c, val);
            }
        }
    }
    return temp;  // fin is closed by its destructor
}

// Prints the hardware thread count reported by the standard library and, when
// compiled with OpenMP, the OpenMP thread and processor counts.
void print_hw_info() {
    const unsigned int hw_threads = std::thread::hardware_concurrency();

    std::cout << "===== Hardware Info =====" << std::endl;
    std::cout << "std::thread::hardware_concurrency(): " << hw_threads << std::endl;
#ifdef _OPENMP
    std::cout << "OpenMP available: yes" << std::endl;
    const int omp_threads = omp_get_max_threads();
    std::cout << "omp_get_max_threads(): " << omp_threads << std::endl;
    const int omp_procs = omp_get_num_procs();
    std::cout << "omp_get_num_procs(): " << omp_procs << std::endl;
#else
    std::cout << "OpenMP available: no (compile with -fopenmp)" << std::endl;
#endif
    std::cout << "=========================" << std::endl;
    std::cout << std::endl;
}

// Invokes `fn` once and returns the elapsed wall-clock time in milliseconds,
// measured with std::chrono::high_resolution_clock.
template<typename Fn>
double time_ms(Fn&& fn) {
    using timer_clock = std::chrono::high_resolution_clock;
    using millis = std::chrono::duration<double, std::milli>;

    const timer_clock::time_point started = timer_clock::now();
    fn();
    const timer_clock::time_point finished = timer_clock::now();

    const millis elapsed = finished - started;
    return elapsed.count();
}

// Single-threaded benchmark kernel: writes a position-dependent value into
// every element of `t`.
void bench_fill_serial(Tensor3D& t) {
    int rows = 0;
    int cols = 0;
    int chans = 0;
    t.get_info(rows, cols, chans);

    for (int r = 0; r < rows; ++r) {
        for (int q = 0; q < cols; ++q) {
            for (int k = 0; k < chans; ++k) {
                const double value = (r+1)*0.001 + (q+1)*0.002 + (k+1)*0.003;
                t.set_elem(r, q, k, value);
            }
        }
    }
}

// OpenMP benchmark kernel: writes the same position-dependent values as the
// serial fill, with the three loops collapsed and statically scheduled when
// OpenMP is enabled.
void bench_fill_parallel(Tensor3D& t) {
    int rows = 0;
    int cols = 0;
    int chans = 0;
    t.get_info(rows, cols, chans);

#ifdef _OPENMP
    #pragma omp parallel for collapse(3) schedule(static)
#endif
    for (int r = 0; r < rows; ++r) {
        for (int q = 0; q < cols; ++q) {
            for (int k = 0; k < chans; ++k) {
                t.set_elem(r, q, k, (r+1)*0.001 + (q+1)*0.002 + (k+1)*0.003);
            }
        }
    }
}

// Single-threaded benchmark kernel: returns the sum of all elements of `t`,
// accumulated in h -> w -> c order.
double bench_sum_serial(const Tensor3D& t) {
    int rows = 0;
    int cols = 0;
    int chans = 0;
    t.get_info(rows, cols, chans);

    double total = 0.0;
    for (int r = 0; r < rows; ++r) {
        for (int q = 0; q < cols; ++q) {
            for (int k = 0; k < chans; ++k) {
                total += t.get_elem(r, q, k);
            }
        }
    }
    return total;
}

// OpenMP benchmark kernel: returns the sum of all elements of `t` using a
// `+` reduction over the collapsed, statically scheduled loop nest when
// OpenMP is enabled.
double bench_sum_parallel(const Tensor3D& t) {
    int rows = 0;
    int cols = 0;
    int chans = 0;
    t.get_info(rows, cols, chans);

    double total = 0.0;
#ifdef _OPENMP
    #pragma omp parallel for collapse(3) reduction(+:total) schedule(static)
#endif
    for (int r = 0; r < rows; ++r) {
        for (int q = 0; q < cols; ++q) {
            for (int k = 0; k < chans; ++k) {
                total += t.get_elem(r, q, k);
            }
        }
    }
    return total;
}

int main() {
    print_hw_info();

    Layer* layer1 = new Layer_Conv("Conv1", 3, 2, 1, MEAN_INIT);
    Layer* layer2 = new Layer_ReLU("Relu1", 1, 1, 1);

    cout << "(Layer information)________________" << endl;
    layer1->print();
    layer2->print();
    cout << endl;

    Tensor3D* tensor1 = read_tensor("tensor_5x5x2.txt");
    if (!tensor1) return 1;

    layer1->set_parallel(false);
    layer2->set_parallel(false);
    Tensor3D *tensor2_s = nullptr, *tensor3_s = nullptr;
    (void)time_ms([&](){
        tensor2_s = layer1->forward(tensor1);
        tensor3_s = layer2->forward(tensor2_s);
    });

    layer1->set_parallel(true);
    layer2->set_parallel(true);
    Tensor3D *tensor2_p = nullptr, *tensor3_p = nullptr;
    (void)time_ms([&](){
        tensor2_p = layer1->forward(tensor1);
        tensor3_p = layer2->forward(tensor2_p);
    });

    cout << "\n(Tensor information)________________" << endl;

    cout << "[tensor1]:" << endl;
    tensor1->print();
    print_all_elements((*tensor1));
    cout << endl;

    cout << "[tensor2]:" << endl;
    tensor2_s->print();
    print_all_elements((*tensor2_s));
    cout << endl;

    cout << "[tensor3]:" << endl;
    tensor3_s->print();
    print_all_elements((*tensor3_s));
    cout << endl;

    const int BH=1024, BW=2048, BC=16;
    Tensor3D bench(BH,BW,BC);

    auto ms_fill_s = time_ms([&]{ bench_fill_serial(bench); });
    double sum_s   = 0.0;
    auto ms_sum_s  = time_ms([&]{ sum_s = bench_sum_serial(bench); });

    auto ms_fill_p = time_ms([&]{ bench_fill_parallel(bench); });
    double sum_p   = 0.0;
    auto ms_sum_p  = time_ms([&]{ sum_p = bench_sum_parallel(bench); });

    cout.setf(std::ios::fixed);
    cout << "===== Benchmark Tensor =====" << endl;
    cout << "Size: " << BH << " x " << BW << " x " << BC << endl;
    cout << setprecision(3);
    cout << "Serial  : fill " << ms_fill_s << " ms, sum " << ms_sum_s
         << " ms, total " << (ms_fill_s + ms_sum_s) << " ms" << endl;
    cout << "Parallel: fill " << ms_fill_p << " ms, sum " << ms_sum_p
         << " ms, total " << (ms_fill_p + ms_sum_p) << " ms" << endl;
    cout << setprecision(6);
    cout << "sum(serial) = "   << sum_s << ", sum(parallel) = " << sum_p << endl;
    cout << setprecision(3);
    double speedup = (ms_fill_s + ms_sum_s) / (ms_fill_p + ms_sum_p);
    cout << "Speedup (Serial/Parallel): " << speedup << "x" << endl;

    delete tensor1;
    delete tensor2_s; delete tensor3_s;
    delete tensor2_p; delete tensor3_p;
    delete layer1; delete layer2;
    return 0;
}

Overwriting main.cpp


In [None]:
%%bash
g++ -O3 -march=native -fopenmp main.cpp ImageLib.cpp -o run
./run

===== Hardware Info =====
std::thread::hardware_concurrency(): 2
OpenMP available: yes
omp_get_max_threads(): 2
omp_get_num_procs(): 2

(Layer information)________________
[Layer_Conv] Conv1 | fK=3 | C_in=2 | C_out=1 | stride=1, padding=0
[Layer_ReLU] Relu1 | fK=1 | C_in=1 | C_out=1

Conv1 is finished
Relu1 is finished
Conv1 is finished
Relu1 is finished

(Tensor information)________________
[tensor1]:
Tensor3D size: 5 x 5 x 2 (50 elements)
0-th channel:
   0.694   0.609  -0.151  -0.252 -0.0679
   0.505   0.538    0.64   0.506    0.64
   0.267  0.0882  -0.457   0.197   0.255
    0.26  -0.654  -0.505  -0.671  -0.333
 -0.0268  -0.148   0.301-0.00506  0.0339
1-th channel:
   0.307   0.615  -0.336   0.215     0.2
    0.25  -0.097  -0.403   0.279  -0.298
  -0.502  -0.232 -0.0643  -0.657   0.158
    0.23   0.299   -0.63  -0.651  -0.412
   0.579   -0.61  -0.309  -0.245   -0.11

[tensor2]:
Tensor3D size: 3 x 3 x 1 (9 elements)
0-th channel:
   0.126  0.0577  0.0225
  -0.026  -0.137  -0.134
  -