# Multiplying Matrices

Multiplying Matrices

## Setup

- Check CUDA version

In [4]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


- Install C-language support

In [5]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-6g1ggvb6
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-6g1ggvb6
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone


- Load plugin

In [6]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


## Multiply matrices

- Multiply matrices

In [10]:
%%cu
// Using CUDA libraries in CoLab
#include <stdio.h>

// Device function that runs on the GPU - indicated by `__global__`
__global__ void matrixMultiply(float *a, float *b, float *c, int n) {
  // Calculate the row and column index of the element to compute
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  // Check if the index is within the bounds of the matrices
  if (row >= n || col >= n) return;

  // Initialize the sum
  float sum = 0.0f;

  // Loop over the common dimension of a and b
  for (int i = 0; i < n; i++)
  {
    // Accumulate the product of the corresponding elements of a and b
    sum += a[row * n + i] * b[i * n + col];
  }

  // Store the result in c
  c[row * n + col] = sum;
}

// Host function runs on the CPU
int main() {
  // Define a timer variable
  cudaEvent_t start, stop;
  float time;

  // Define the matrix size
  int n = 1024;

  // Allocate host memory for the matrices
  float *h_A = (float *)malloc(n * n * sizeof(float));
  float *h_B = (float *)malloc(n * n * sizeof(float));
  float *h_C = (float *)malloc(n * n * sizeof(float));

  // Initialize the matrices with random values
  for (int i = 0; i < n * n; i++)
  {
    h_A[i] = rand() / (float)RAND_MAX;
    h_B[i] = rand() / (float)RAND_MAX;
  }

  // Allocate device memory for the matrices
  float *d_A, *d_B, *d_C;
  cudaMalloc(&d_A, n * n * sizeof(float));
  cudaMalloc(&d_B, n * n * sizeof(float));
  cudaMalloc(&d_C, n * n * sizeof(float));

  // Create and start the timer
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

  // Copy the matrices from host to device
  cudaMemcpy(d_A, h_A, n * n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, n * n * sizeof(float), cudaMemcpyHostToDevice);

  // Define the block and grid dimensions
  int blockSize = 32;
  dim3 dimBlock(blockSize, blockSize);
  dim3 dimGrid((n + blockSize - 1) / blockSize, (n + blockSize - 1) / blockSize);

  // Launch the kernel
  matrixMultiply<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);

  // Copy the result from device to host
  cudaMemcpy(h_C, d_C, n * n * sizeof(float), cudaMemcpyDeviceToHost);

  // Stop and destroy the timer
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&time, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  // Free the device memory
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  // Free the host memory
  free(h_A);
  free(h_B);
  free(h_C);

  printf("Matrix multiplication took %f milliseconds\n", time);

  // Return 0 to indicate success
  return 0;
}

Matrix multiplication took 0.000000 milliseconds

