In [2]:
# Verificar a presença de uma GPU
!nvidia-smi

Fri Jan 17 21:09:47 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# Instalar o compilador CUDA
!apt-get install -q nvidia-cuda-toolkit g++ freeglut3-dev libx11-dev libxmu-dev libxi-dev libglu1-mesa-dev


Reading package lists...
Building dependency tree...
Reading state information...
g++ is already the newest version (4:11.2.0-1ubuntu1).
g++ set to manually installed.
libx11-dev is already the newest version (2:1.7.5-1ubuntu0.3).
libx11-dev set to manually installed.
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra freeglut3 libaccinj64-11.5 libatk-wrapper-java
  libatk-wrapper-java-jni libbabeltrace1 libcub-dev libcublas11 libcublaslt11 libcudart11.0
  libcufft10 libcufftw10 libcuinj64-11.5 libcupti-dev libcupti-doc libcupti11.5 libcurand10
  libcusolver11 libcusolvermg11 libcusparse11 libdebuginfod-common libdebuginfod1 libegl-dev
  libfontenc1 libgail-common libgail18 libgl-dev libgl1-mesa-dev libgles-dev libgles1 libglu1-mesa
  libglvnd-core-dev libglvnd-dev libglx-dev libgtk2.0-0 libgtk2.0-bin libgtk2.0-common libice-dev
  libipt2 libnppc11 libnppial11 libnppicc11 libnppidei11 libnppif11 libnppig11 libnppim11
  libnppist11 libnppisu11 li

In [4]:
# Verificar a instalação CUDA
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [5]:
%%writefile diffusion.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Simulation parameters, substituted textually wherever they are used below.
#define N 2000      // Grid size: the field is stored as an N x N row-major array
#define T 1000      // Number of time-step iterations
#define D 0.1       // Diffusion coefficient
#define DELTA_T 0.01  // Time-step size used in the explicit update
#define DELTA_X 1.0   // Grid spacing; the stencil divides by DELTA_X * DELTA_X

// Advances the diffusion field by one explicit time step using a 5-point
// stencil on an N x N row-major grid.
//
// Parameters:
//   C     - device pointer to the current field (read only here).
//   C_new - device pointer to the field for the next step (written here).
//
// Launch layout: a 2D grid of 2D blocks whose x and y extents each cover at
// least N threads. Only interior cells are updated; the outermost rows and
// columns of C_new are never written by this kernel.
__global__ void diff_eq(double *C, double *C_new) {
    // threadIdx.x drives the column, which is the fastest-varying index of the
    // row-major layout, so consecutive threads touch consecutive addresses
    // (coalesced access) instead of addresses N elements apart.
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row

    int index = i * N + j;

    // Skip the boundary cells and any threads that fall outside the grid.
    if (i > 0 && i < N-1 && j > 0 && j < N-1) {
        C_new[index] = C[index] + D * DELTA_T * (
            (C[index + N] + C[index - N] + C[index + 1] + C[index - 1] - 4 * C[index]) / (DELTA_X * DELTA_X)
        );
    }
}

// Atomically adds `val` to the double stored at `address` using a
// compare-and-swap retry loop on the value's 64-bit integer representation.
// Returns the value that was stored at `address` before the addition.
static __device__ double atomicAddDouble(double* address, double val) {
    unsigned long long int* word = (unsigned long long int*)address;
    unsigned long long int observed = *word;
    unsigned long long int expected;

    for (;;) {
        expected = observed;
        // Try to publish expected + val; atomicCAS hands back what was
        // actually in memory at the time of the attempt.
        observed = atomicCAS(word, expected, __double_as_longlong(val + __longlong_as_double(expected)));
        // The comparison is done on the integer bit patterns: equal means
        // nobody changed the word in between and the swap took effect.
        if (observed == expected) {
            break;
        }
    }

    return __longlong_as_double(observed);
}


// Accumulates the sum of |C_new - C| over the whole N x N grid into *dif.
//
// Parameters:
//   C, C_new - device pointers to the two N x N row-major fields to compare.
//   dif      - device pointer to a single double accumulator. This kernel only
//              adds to it, so the caller must zero it before the launch.
//
// Launch layout: a 2D grid of 2D blocks whose x and y extents each cover at
// least N threads. Each block uses a 256-entry shared array, so a block must
// not contain more than 256 threads, and the halving reduction below only
// sums every element when the thread count per block is a power of two.
//
// Because per-block partial sums are combined with atomics, the floating-point
// summation order (and therefore the last bits of the result) is not fixed.
__global__ void compute_difference(double *C, double *C_new, double *dif) {
    // threadIdx.x drives the column, which is the fastest-varying index of the
    // row-major layout, so consecutive threads read consecutive addresses
    // (coalesced access) instead of addresses N elements apart.
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row

    int index = i * N + j;

    __shared__ double dif_per_block[256];
    int t_idx = threadIdx.x + threadIdx.y * blockDim.x;

    // Threads outside the grid contribute zero so that every slot used by the
    // reduction holds a defined value.
    if (i < N && j < N) {
        dif_per_block[t_idx] = fabs(C_new[index] - C[index]);
    } else {
        dif_per_block[t_idx] = 0.0;
    }

    __syncthreads();

    // Tree reduction in shared memory: each pass folds the upper half of the
    // active range onto the lower half. The barrier stays outside the `if` so
    // that every thread of the block reaches it.
    for (int s = blockDim.x * blockDim.y / 2; s > 0; s >>= 1) {
        if (t_idx < s) {
            dif_per_block[t_idx] += dif_per_block[t_idx + s];
        }
        __syncthreads();
    }

    // One atomic per block publishes the block's partial sum.
    if (t_idx == 0) {
        atomicAddDouble(dif, dif_per_block[0]);
    }
}



// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, error string) if the call does not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Runs T explicit diffusion steps on an N x N grid on the GPU, printing the
// mean absolute change every 100 iterations and the final concentration at
// the grid centre. Returns 0 on success; exits with a non-zero status if a
// host allocation or any CUDA call fails.
int main() {
    // Size of one full grid in bytes, computed in size_t to avoid int overflow.
    const size_t grid_bytes = (size_t)N * N * sizeof(double);

    // Host memory allocation
    double *C = (double *)malloc(grid_bytes);
    double *C_new = (double *)malloc(grid_bytes);
    if (C == NULL || C_new == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", grid_bytes);
        free(C);
        free(C_new);
        return EXIT_FAILURE;
    }

    // Zero both host grids
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            C[i * N + j] = 0.0;
            C_new[i * N + j] = 0.0;
        }
    }
    C[(N/2) * N + (N/2)] = 1.0; // High initial concentration at the centre

    // Device memory allocation
    double *d_C, *d_C_new, *d_dif;
    CUDA_CHECK(cudaMalloc((void **)&d_C, grid_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_C_new, grid_bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_dif, sizeof(double)));

    // Copy both grids to the device. d_C_new is uploaded as well because the
    // update kernel never writes boundary cells, so they must start defined.
    CUDA_CHECK(cudaMemcpy(d_C, C, grid_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_C_new, C_new, grid_bytes, cudaMemcpyHostToDevice));

    // Block and grid dimensions (ceil-div so the grid covers all N x N cells)
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x, (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Time-stepping loop
    for (int t = 0; t < T; t++) {
        diff_eq<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_C_new);
        CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

        if ((t % 100) == 0) {
            // The reduction kernel only accumulates, so reset the sum first.
            CUDA_CHECK(cudaMemset(d_dif, 0, sizeof(double)));
            compute_difference<<<blocksPerGrid, threadsPerBlock>>>(d_C, d_C_new, d_dif);
            CUDA_CHECK(cudaGetLastError());

            double difmedio;
            // Blocking copy: also reports errors raised while the kernels ran.
            CUDA_CHECK(cudaMemcpy(&difmedio, d_dif, sizeof(double), cudaMemcpyDeviceToHost));
            difmedio /= ((N-2)*(N-2));
            printf("Iteração %d - Diferença média=%g\n", t, difmedio);
        }

        // Swap the buffers so the freshly computed field becomes the input
        double *temp = d_C;
        d_C = d_C_new;
        d_C_new = temp;
    }

    // Copy the final field back to the host (after the last swap it is in d_C)
    CUDA_CHECK(cudaMemcpy(C, d_C, grid_bytes, cudaMemcpyDeviceToHost));

    // Print the final value at the centre of the grid
    printf("Concentração final no centro: %f\n", C[(N/2) * N + (N/2)]);

    // Release device and host memory
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaFree(d_C_new));
    CUDA_CHECK(cudaFree(d_dif));
    free(C);
    free(C_new);

    return 0;
}


Writing diffusion.cu


In [7]:
!nvcc -o diffusion diffusion.cu -lm
!time ./diffusion

Iteração 0 - Diferença média=2.00401e-09
Iteração 100 - Diferença média=1.23248e-09
Iteração 200 - Diferença média=7.81794e-10
Iteração 300 - Diferença média=5.11528e-10
Iteração 400 - Diferença média=4.21632e-10
Iteração 500 - Diferença média=3.62223e-10
Iteração 600 - Diferença média=3.05976e-10
Iteração 700 - Diferença média=2.57135e-10
Iteração 800 - Diferença média=2.21174e-10
Iteração 900 - Diferença média=2.00244e-10
Concentração final no centro: 0.095045

real	0m1.536s
user	0m1.073s
sys	0m0.297s
