In [None]:
%%writefile ImageLib.h

#pragma once
// Alias for unsigned char.
using byte = unsigned char;

// Dynamic allocation and release helpers for 1D/2D/3D/4D arrays of double.
//
// Each allocator returns a nested pointer structure (one level of pointers per
// leading dimension) whose innermost arrays are zero-initialized. The result
// is indexed as Image[h], Image[h][w], Image[h][w][c] or Image[h][w][c][n].
double *dmatrix1D(int nH);
double **dmatrix2D(int nH, int nW);
double ***dmatrix3D(int nH, int nW, int nC);
double ****dmatrix4D(int nH, int nW, int nC, int nNum);

// Release helpers. Pass the same dimensions that were used for allocation:
// the leading dimensions drive the traversal of the pointer levels, while the
// innermost dimension is accepted but not needed by the implementation.
void free_dmatrix1D(double *Image, int nH);
void free_dmatrix2D(double **Image, int nH, int nW);
void free_dmatrix3D(double ***Image, int nH, int nW, int nC);
void free_dmatrix4D(double ****Image, int nH, int nW, int nC, int nNum);

Writing ImageLib.h


In [None]:
%%writefile ImageLib.cpp

#include "ImageLib.h"

// Allocates an array of nH doubles on the heap with every element set to 0.0.
// The returned buffer is released with delete[] (see free_dmatrix1D).
double *dmatrix1D(int nH) {
    double *Buffer = new double[nH]();
    return Buffer;
}

// Builds an nH x nW matrix as an array of nH row pointers, each pointing to
// its own heap array of nW doubles initialized to 0.0.
double **dmatrix2D(int nH, int nW) {
    double **Rows = new double*[nH];
    int row = 0;
    while (row < nH) {
        Rows[row] = new double[nW]();  // trailing () zero-fills the row
        ++row;
    }
    return Rows;
}

// Builds an nH x nW x nC tensor as nested pointer arrays: nH row tables, each
// holding nW pointers to separate heap arrays of nC doubles initialized to 0.0.
double ***dmatrix3D(int nH, int nW, int nC) {
    double ***Planes = new double**[nH];
    for (int row = 0; row < nH; ++row) {
        double **Columns = new double*[nW];
        for (int col = 0; col < nW; ++col) {
            Columns[col] = new double[nC]();  // trailing () zero-fills the channel vector
        }
        Planes[row] = Columns;
    }
    return Planes;
}

// Builds an nH x nW x nC x nNum tensor as four levels of nested pointer
// arrays; the innermost arrays hold nNum doubles initialized to 0.0.
double ****dmatrix4D(int nH, int nW, int nC, int nNum) {
    double ****Volumes = new double***[nH];
    for (int row = 0; row < nH; ++row) {
        double ***Planes = new double**[nW];
        for (int col = 0; col < nW; ++col) {
            double **Channels = new double*[nC];
            for (int ch = 0; ch < nC; ++ch) {
                Channels[ch] = new double[nNum]();  // trailing () zero-fills
            }
            Planes[col] = Channels;
        }
        Volumes[row] = Planes;
    }
    return Volumes;
}

// Releases a buffer allocated with new double[] (e.g. by dmatrix1D).
// The length argument is unused; a null Image is a no-op, as with delete[].
void free_dmatrix1D(double *Image, int /*nH*/) {
    delete[] Image;
}

// Releases a matrix laid out as nH row pointers, each a separate heap array.
//
// Image : array of nH row pointers; may be null, in which case nothing is done
//         (matching delete[] and free_dmatrix1D).
// nH    : number of rows to release.
// (int) : row width; unused because delete[] does not need it.
void free_dmatrix2D(double **Image, int nH, int) {
    if (Image == nullptr) return;  // nothing to walk; avoids dereferencing null
    for (int y = 0; y < nH; ++y) delete[] Image[y];
    delete[] Image;
}

// Releases a tensor laid out as nH tables of nW pointers to heap arrays.
//
// Image : top-level pointer table; may be null, in which case nothing is done
//         (matching delete[] and free_dmatrix1D).
// nH/nW : extents of the two pointer levels to traverse.
// (int) : innermost length; unused because delete[] does not need it.
void free_dmatrix3D(double ***Image, int nH, int nW, int) {
    if (Image == nullptr) return;  // nothing to walk; avoids dereferencing null
    for (int y = 0; y < nH; ++y) {
        for (int x = 0; x < nW; ++x) delete[] Image[y][x];
        delete[] Image[y];
    }
    delete[] Image;
}

// Releases a tensor laid out as three nested pointer levels (nH, nW, nC) over
// heap arrays of doubles.
//
// Image    : top-level pointer table; may be null, in which case nothing is
//            done (matching delete[] and free_dmatrix1D).
// nH/nW/nC : extents of the three pointer levels to traverse.
// (int)    : innermost length; unused because delete[] does not need it.
void free_dmatrix4D(double ****Image, int nH, int nW, int nC, int) {
    if (Image == nullptr) return;  // nothing to walk; avoids dereferencing null
    for (int y = 0; y < nH; ++y) {
        for (int x = 0; x < nW; ++x) {
            for (int c = 0; c < nC; ++c) delete[] Image[y][x][c];
            delete[] Image[y][x];
        }
        delete[] Image[y];
    }
    delete[] Image;
}

Writing ImageLib.cpp


In [None]:
%%writefile CTensor.h

#pragma once
#include "ImageLib.h"
#include <iostream>

/// Tensor3D manages a 3-D tensor of doubles with shape (nH x nW x nC).
///
/// Storage is obtained from dmatrix3D() and released with free_dmatrix3D() in
/// the destructor. Because the object owns a raw allocation, copying is
/// disabled: an implicit member-wise copy would leave two objects freeing the
/// same buffer. Ownership can be transferred with move construction/assignment.
class Tensor3D {
private:
    double*** tensor; // storage indexed as tensor[h][w][c]; null after being moved from
    int nH; // height
    int nW; // width
    int nC; // channel

public:
    // Behavior: 1) dynamically allocate the 3-D array and store its address in tensor
    //           2) initialize every element to 0
    // Uses: dmatrix3D()
    Tensor3D(int _nH, int _nW, int _nC) : tensor(nullptr), nH(_nH), nW(_nW), nC(_nC) {
        tensor = dmatrix3D(nH, nW, nC);
        // Explicit zero fill so the "all elements are 0" guarantee is stated
        // by the class itself rather than depending on the allocator.
        for (int h = 0; h < nH; ++h) {
            for (int w = 0; w < nW; ++w) {
                for (int c = 0; c < nC; ++c) {
                    tensor[h][w][c] = 0.0;
                }
            }
        }
    }

    // Copying would duplicate the owning pointer and double-free in the destructors.
    Tensor3D(const Tensor3D&) = delete;
    Tensor3D& operator=(const Tensor3D&) = delete;

    // Move construction: takes over other's buffer and leaves other empty
    // (null storage, 0 x 0 x 0), so other's destructor has nothing to free.
    Tensor3D(Tensor3D&& other) noexcept
        : tensor(other.tensor), nH(other.nH), nW(other.nW), nC(other.nC) {
        other.tensor = nullptr;
        other.nH = 0;
        other.nW = 0;
        other.nC = 0;
    }

    // Move assignment: releases the current buffer, then takes over other's.
    Tensor3D& operator=(Tensor3D&& other) noexcept {
        if (this != &other) {
            if (tensor) {
                free_dmatrix3D(tensor, nH, nW, nC);
            }
            tensor = other.tensor;
            nH = other.nH;
            nW = other.nW;
            nC = other.nC;
            other.tensor = nullptr;
            other.nH = 0;
            other.nW = 0;
            other.nC = 0;
        }
        return *this;
    }

    // Behavior: release the dynamically allocated 3-D array
    // Uses: free_dmatrix3D()
    ~Tensor3D() {
        if (tensor) {
            free_dmatrix3D(tensor, nH, nW, nC);
            tensor = nullptr;
        }
    }

    // Sets the element at row=_h, column=_w, channel=_c. No bounds checking.
    inline void set_elem(int _h, int _w, int _c, double _val) { tensor[_h][_w][_c] = _val; }

    // Behavior: return the element at row=_h, column=_w, channel=_c. No bounds checking.
    inline double get_elem(int _h, int _w, int _c) const {
        return tensor[_h][_w][_c];
    }

    // Behavior: return the dimensions (nH, nW, nC) through the reference parameters
    inline void get_info(int& _nH, int& _nW, int& _nC) const {
        _nH = nH; _nW = nW; _nC = nC;
    }

    // Replaces the stored pointer. The previous buffer is NOT freed here, and
    // the destructor will free the new pointer using this object's nH/nW/nC.
    // NOTE(review): the caller appears responsible for releasing the old buffer
    // and for passing storage with matching dimensions - confirm intended ownership.
    inline void set_tensor(double*** _tensor) { tensor = _tensor; }

    // Returns the raw storage pointer; ownership stays with this object.
    inline double*** get_tensor() const { return tensor; }

    // Behavior: print the tensor dimensions and the element count (nH*nW*nC)
    void print() const {
        std::cout << "Tensor3D size: "
                  << nH << " x " << nW << " x " << nC
                  << " (" << static_cast<long long>(nH) * nW * nC << " elements)"
                  << std::endl;
    }
};

Writing CTensor.h


In [None]:
%%writefile main.cpp
#include "ImageLib.h"
#include "CTensor.h"
#include <iostream>
#include <iomanip>
#include <thread>
#include <chrono>
#ifdef _OPENMP
#include <omp.h>
#endif
using namespace std;

// Prints every element of `tensor`, grouped by channel: a "<c>-th channel:"
// header followed by one output line per row, each value in a width-4 field.
void print_all_elements(const Tensor3D& tensor) {
    int height = 0;
    int width = 0;
    int channels = 0;
    tensor.get_info(height, width, channels);

    for (int ch = 0; ch < channels; ++ch) {
        cout << ch << "-th channel:" << endl;
        for (int row = 0; row < height; ++row) {
            for (int col = 0; col < width; ++col) {
                const double value = tensor.get_elem(row, col, ch);
                cout << setw(4) << value;
            }
            cout << endl;
        }
    }
}

// Fills every element of `t` on the calling thread with a value derived from
// its 1-based coordinates: (h+1)*0.1 + (w+1)*0.01 + (c+1)*0.001.
void fill_serial(Tensor3D& t) {
    int height = 0;
    int width = 0;
    int channels = 0;
    t.get_info(height, width, channels);

    for (int row = 0; row < height; ++row) {
        for (int col = 0; col < width; ++col) {
            for (int ch = 0; ch < channels; ++ch) {
                // Kept as one expression so the floating-point evaluation is unchanged.
                const double value = (row+1)*0.1+(col+1)*0.01+(ch+1)*0.001;
                t.set_elem(row, col, ch, value);
            }
        }
    }
}

// Same fill as fill_serial: element (h, w, c) receives
// (h+1)*0.1 + (w+1)*0.01 + (c+1)*0.001. When compiled with OpenMP, the three
// loops are collapsed into a single iteration space that is divided among the
// threads; each iteration writes a distinct element. Without OpenMP this is a
// plain serial loop nest.
void fill_parallel(Tensor3D& t) {
    int height = 0;
    int width = 0;
    int channels = 0;
    t.get_info(height, width, channels);
#ifdef _OPENMP
    #pragma omp parallel for collapse(3)
#endif
    for (int row = 0; row < height; ++row)
        for (int col = 0; col < width; ++col)
            for (int ch = 0; ch < channels; ++ch) {
                // Kept as one expression so the floating-point evaluation is unchanged.
                const double value = (row+1)*0.1+(col+1)*0.01+(ch+1)*0.001;
                t.set_elem(row, col, ch, value);
            }
}

// Demo driver:
//   1) builds a small 2 x 3 x 2 tensor, sets a few elements and prints it;
//   2) reports the hardware/OpenMP thread counts;
//   3) times fill_serial() against fill_parallel() on a 512 x 512 x 64 tensor.
// Returns 0.
int main() {

    // --- Small tensor demo ---
    int nH = 2, nW = 3, nC = 2;
    Tensor3D tensor(nH, nW, nC);

    tensor.set_elem(0, 0, 0, 1);
    tensor.set_elem(1, 1, 0, 2);
    tensor.set_elem(0, 0, 1, 3);
    tensor.set_elem(1, 2, 1, 4);

    print_all_elements(tensor);
    tensor.print();

    // --- Hardware / OpenMP information ---
    cout << "\n===== Hardware Info =====\n";
    cout << "CPU threads: " << thread::hardware_concurrency() << "\n";
#ifdef _OPENMP
    cout << "OpenMP max threads: " << omp_get_max_threads() << "\n";
#else
    cout << "OpenMP not available (compile with -fopenmp)\n";
#endif
    cout << "=========================\n";

    // --- Benchmark: serial vs. parallel fill of the same tensor ---
    int BH=512, BW=512, BC=64;
    Tensor3D big(BH,BW,BC);

    // steady_clock is monotonic, so the measured intervals cannot be skewed by
    // wall-clock adjustments (high_resolution_clock may alias system_clock).
    auto t1 = chrono::steady_clock::now();
    fill_serial(big);
    auto t2 = chrono::steady_clock::now();
    double serial_ms = chrono::duration<double,milli>(t2-t1).count();

    auto t3 = chrono::steady_clock::now();
    fill_parallel(big);
    auto t4 = chrono::steady_clock::now();
    double parallel_ms = chrono::duration<double,milli>(t4-t3).count();

    cout << "\n===== Benchmark Result =====\n";
    cout << "Serial   : " << serial_ms   << " ms\n";
    cout << "Parallel : " << parallel_ms << " ms\n";
    cout << "============================\n";

    return 0;
}

Overwriting main.cpp


In [None]:
%%bash
# Build main.cpp and ImageLib.cpp into ./run with full optimization, host-CPU
# tuning, and OpenMP enabled (defines _OPENMP so the parallel code paths compile).
g++ -O3 -march=native -fopenmp main.cpp ImageLib.cpp -o run

In [None]:
%%bash
# Let OpenMP use as many threads as nproc reports, then run the ./run binary.
export OMP_NUM_THREADS=$(nproc)
./run

===== Hardware Info =====
std::thread::hardware_concurrency(): 2
OpenMP available: yes
omp_get_max_threads(): 2
omp_get_num_procs(): 2
CPU: Intel(R) Xeon(R) CPU @ 2.20GHz
MemTotal:       13289424 kB

0-th channel:
   1   0   0
   0   2   0
1-th channel:
   3   0   0
   0   0   4
Tensor3D size: 2 x 3 x 2 (12 elements)

===== Benchmark Tensor =====
Size: 1024 x 2048 x 16
Serial  : fill 34.010 ms, sum 47.994 ms, total 82.003 ms
Parallel: fill 71.069 ms, sum 29.203 ms, total 100.272 ms
sum(serial) = 86805315.584000, sum(parallel) = 86805315.584000
Speedup (Serial/Parallel): 0.818x
