In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-8upp7jll
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-8upp7jll
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=ac63672d3df2292759195e5e9c14f0c0be0e3b0914cc9a0a6279f4faf4f8991c
  Stored in directory: /tmp/pip-ephem-wheel-cache-0e2hidow/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [5]:
!nvidia-smi

Sun Oct 22 14:19:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
%%cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>


__global__ void matrix_multiplication(int *A, int *B, int *C, int N) {
  int row = blockIdx.x * blockDim.x + threadIdx.x;
  int col = blockIdx.y * blockDim.y + threadIdx.y;

  // Check if the row and column indices are within the bounds of the matrix.
  if (row < N && col < N) {
    int sum = 0;
    for (int k = 0; k < N; k++) {
      sum += A[row * N + k] * B[k * N + col];
    }
    C[row * N + col] = sum;
  }
}

int main() {
  int N = 1000;
  int *h_A, *h_B, *h_C;

  // Allocate memory for the matrices on the host.
  h_A = (int *)malloc(N * N * sizeof(int));
  h_B = (int *)malloc(N * N * sizeof(int));
  h_C = (int *)malloc(N * N * sizeof(int));

  // Initialize the matrices.
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      h_A[i * N + j] = i;
      h_B[i * N + j] = j;
    }
  }

  // Allocate memory for the matrices on the device.
  int *d_A, *d_B, *d_C;
  cudaMalloc((void**)&d_A, N * N * sizeof(int));
  cudaMalloc((void**)&d_B, N * N * sizeof(int));
  cudaMalloc((void**)&d_C, N * N * sizeof(int));

  // Copy the matrices from the host to the device.
  cudaMemcpy(d_A, h_A, N * N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, N * N * sizeof(int), cudaMemcpyHostToDevice);

  // Launch the kernel.
  dim3 blocksPerGrid(N, N);
  dim3 threadsPerBlock(1, 1);
  struct timespec start, end;
  clock_gettime(CLOCK_MONOTONIC, &start);
  matrix_multiplication<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
  cudaDeviceSynchronize();
  clock_gettime(CLOCK_MONOTONIC, &end);

  double time_spent = (end.tv_nsec - start.tv_nsec) / 1000000000.0 + (end.tv_sec - start.tv_sec);
  printf("Time taken: %f seconds\n", time_spent);

  // Copy the results from the device to the host.
  cudaMemcpy(h_C, d_C, N * N * sizeof(int), cudaMemcpyDeviceToHost);

  // Free the memory on the device.
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  // Write the result to a file.
  FILE *fp;
  fp = fopen("Cuda.txt", "w");
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      fprintf(fp, "%d ", h_C[i * N + j]);
    }
    fprintf(fp, "\n");
  }
  fclose(fp);

  // Free host memory.
  free(h_A);
  free(h_B);
  free(h_C);

  return 0;
}

Time taken: 0.215299 seconds

