In [None]:
%%writefile matrixMult.cu

#include <bits/stdc++.h>
#include <cuda.h>

using namespace std;

/**
 * Batched matrix multiply: each thread computes one complete product
 * C[k] = A[k] * B[k].
 *
 * Data layout: the matrices of a batch are stored back to back, each one
 * row-major. A[k] is M x N, B[k] is N x P and C[k] is M x P.
 *
 * Launch layout: only threadIdx.x is used, so the kernel expects a single
 * 1-D block. Thread t handles matrix index k = t + matricePosition.
 * There is no bounds check on k: the caller must not launch more threads
 * than there are matrices left starting at matricePosition.
 *
 * C is overwritten, not accumulated into, so it does not need to be
 * zero-initialised before the launch.
 *
 * @param A               device pointer to the batch of left operands
 * @param B               device pointer to the batch of right operands
 * @param C               device pointer to the batch of results
 * @param M               rows of A[k] and C[k]
 * @param N               columns of A[k] / rows of B[k]
 * @param P               columns of B[k] and C[k]
 * @param matricePosition index of the first matrix handled by this launch
 */
__global__ void matrixMultKernel(float* A, float* B, float* C, int M, int N, int P, int matricePosition) {

    int k = threadIdx.x + matricePosition;

    // Widen before multiplying: k * M * N can exceed INT_MAX for large batches.
    const size_t batch = static_cast<size_t>(k);
    const float* a = A + batch * M * N;
    const float* b = B + batch * N * P;
    float* c = C + batch * M * P;

    for (int i = 0; i < M; i++) {
        const float* aRow = a + static_cast<size_t>(i) * N;
        float* cRow = c + static_cast<size_t>(i) * P;

        for (int l = 0; l < P; l++) {
            // Sum in a register and store once; the previous contents of C
            // (uninitialised after cudaMalloc) never enter the result.
            float acc = 0.0f;
            for (int j = 0; j < N; j++) {
                acc += aRow[j] * b[static_cast<size_t>(j) * P + l];
            }
            cRow[l] = acc;
        }
    }
}

// Abort with a readable message when a CUDA runtime call reports an error.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

/**
 * Multiplies K random matrix pairs on the GPU and prints every product.
 *
 * Usage: <program> T K M N P
 *   T  maximum number of threads (= matrices) handled per kernel launch
 *   K  number of matrix pairs
 *   M  rows of A and C
 *   N  columns of A / rows of B
 *   P  columns of B and C
 *
 * Returns 0 on success, 1 on invalid arguments; exits with EXIT_FAILURE on
 * any CUDA error.
 */
int main(int argc, char* argv[]) {

    if (argc < 6) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "matrixMult")
                  << " T K M N P" << std::endl;
        return 1;
    }

    int T = std::atoi(argv[1]); // Number of threads
    int K = std::atoi(argv[2]); // Number of matrices

    // Dimensions of the matrices
    int M = std::atoi(argv[3]);
    int N = std::atoi(argv[4]);
    int P = std::atoi(argv[5]);

    // atoi yields 0 for non-numeric input, so this also rejects garbage.
    // A non-positive T would make the batching loop below never terminate.
    if (T <= 0 || K <= 0 || M <= 0 || N <= 0 || P <= 0) {
        std::cerr << "All of T, K, M, N and P must be positive integers" << std::endl;
        return 1;
    }

    // Compute element counts in size_t so the products cannot overflow int.
    const size_t sizeA = static_cast<size_t>(M) * N * K;
    const size_t sizeB = static_cast<size_t>(N) * P * K;
    const size_t sizeC = static_cast<size_t>(M) * P * K;

    // The kernel receives its dimensions and batch position as int, so keep
    // every buffer addressable with int arithmetic.
    const size_t intLimit = static_cast<size_t>(INT_MAX);
    if (sizeA > intLimit || sizeB > intLimit || sizeC > intLimit) {
        std::cerr << "Requested batch is too large" << std::endl;
        return 1;
    }

    // A block cannot hold more threads than the device allows; a larger
    // request would make every launch fail.
    int device = 0;
    checkCuda(cudaGetDevice(&device), "cudaGetDevice");
    int maxThreadsPerBlock = 0;
    checkCuda(cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device),
              "cudaDeviceGetAttribute");
    if (T > maxThreadsPerBlock) {
        std::cerr << "T exceeds the device limit, using " << maxThreadsPerBlock
                  << " threads per launch" << std::endl;
        T = maxThreadsPerBlock;
    }

    // Allocate memory on the CPU
    std::vector<float> h_A(sizeA);
    std::vector<float> h_B(sizeB);
    std::vector<float> h_C(sizeC);

    // Initialize matrices A and B
    for (size_t i = 0; i < sizeA; i++) {
        h_A[i] = rand() % 10;
    }
    for (size_t i = 0; i < sizeB; i++) {
        h_B[i] = rand() % 10;
    }

    // Allocate memory on the GPU
    float *d_A = nullptr;
    float *d_B = nullptr;
    float *d_C = nullptr;
    checkCuda(cudaMalloc(&d_A, sizeA * sizeof(float)), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, sizeB * sizeof(float)), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, sizeC * sizeof(float)), "cudaMalloc(d_C)");

    // cudaMalloc does not clear memory. Zero C so the products are well
    // defined even if the kernel accumulates into it (all-zero bytes == 0.0f).
    checkCuda(cudaMemset(d_C, 0, sizeC * sizeof(float)), "cudaMemset(d_C)");

    // Copy from host to device
    checkCuda(cudaMemcpy(d_A, h_A.data(), sizeA * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy(d_A)");
    checkCuda(cudaMemcpy(d_B, h_B.data(), sizeB * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy(d_B)");

    // Start the CUDA processing: handle the matrices in batches of at most T,
    // one thread per matrix.
    for (int matricePosition = 0; matricePosition < K; ) {

        const int currentBatch = std::min(K - matricePosition, T);

        // kernel_name <<< numBlocks, threadsPerBlock >>> (kernel_arguments...);
        matrixMultKernel <<<1, currentBatch>>> (d_A, d_B, d_C, M, N, P, matricePosition);
        // Launch-configuration errors are only visible through cudaGetLastError.
        checkCuda(cudaGetLastError(), "matrixMultKernel launch");

        matricePosition += currentBatch;
    }

    // Launches on the same stream already run in order, so one sync after the
    // loop is enough; it also surfaces faults raised while the kernels ran.
    checkCuda(cudaDeviceSynchronize(), "matrixMultKernel execution");

    // Copy the results back to the CPU
    checkCuda(cudaMemcpy(h_C.data(), d_C, sizeC * sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy(h_C)");

    // Print result matrix C for each batch
    for (int k = 0; k < K; ++k) {
        const size_t base = static_cast<size_t>(k) * M * P;
        std::cout << "Matrix C[" << k << "]:" << std::endl;
        for (int i = 0; i < M; ++i) {
            for (int j = 0; j < P; ++j) {
                std::cout << h_C[base + static_cast<size_t>(i) * P + j] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << "-----------------------------" << std::endl;
    }

    // Free GPU memory (host buffers are released by std::vector)
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    std::cout << "All operation done" << std::endl;

    return 0;
}