In [1]:
%%writefile Layer_Norm.cu
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

/**
 * Row-wise layer normalization (no learned scale/shift):
 *     B[r][c] = (A[r][c] - mean_r) / sqrtf(var_r + 1e-7f)
 * where mean_r and var_r are the mean and (biased) variance of row r of A.
 *
 * Launch requirements:
 *   - Grid:  one block per row; blockIdx.x selects the row. Blocks whose
 *            index is >= rows exit immediately.
 *   - Block: any shape. Threads are flattened into a single linear index, so
 *            (1, 32), (32, 1), (4, 8), ... all produce the same result.
 *   - Dynamic shared memory: at least cols * sizeof(float) bytes.
 *
 * @param A    Row-major input matrix of rows * cols floats (device memory).
 * @param B    Row-major output matrix of rows * cols floats (device memory).
 * @param rows Number of rows in A and B.
 * @param cols Number of columns in A and B; nothing is written if cols <= 0.
 */
__global__ void LayerNorm(const float* A, float* B, int rows, int cols) {
    // A single row buffer shared by the whole block. Indexing it by
    // threadIdx.x * cols (as a per-thread slice) would run past the
    // cols-float allocation as soon as blockDim.x > 1.
    extern __shared__ float row_data[];

    const int row = blockIdx.x;
    // This condition is identical for every thread of the block, so returning
    // here cannot strand other threads at the barrier below.
    if (row >= rows || cols <= 0) {
        return;
    }

    // Flatten the block so the column loops work for any block shape.
    const int tid = static_cast<int>(
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x);
    const int nthreads = static_cast<int>(blockDim.x * blockDim.y * blockDim.z);

    // 64-bit row offset: row * cols can exceed INT_MAX for large matrices.
    const size_t base = static_cast<size_t>(row) * cols;

    // Cooperative load of the row into shared memory.
    for (int col = tid; col < cols; col += nthreads) {
        row_data[col] = A[base + col];
    }
    __syncthreads();  // the whole row must be staged before anyone reads it

    // Every thread redundantly accumulates the same statistics in the same
    // order, so all threads of the block hold bit-identical mean/stddev.
    float mean = 0.0f;
    for (int col = 0; col < cols; col++) {
        mean += row_data[col];
    }
    mean /= cols;

    float variance = 0.0f;
    for (int col = 0; col < cols; col++) {
        const float diff = row_data[col] - mean;
        variance += diff * diff;
    }
    variance /= cols;

    // Small epsilon keeps the division finite for constant rows.
    const float stddev = sqrtf(variance + 1e-7f);

    // Cooperative write of the normalized row.
    for (int col = tid; col < cols; col += nthreads) {
        B[base + col] = (row_data[col] - mean) / stddev;
    }
}

// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Demo driver: fills a 10x10 matrix with random values in [0, 1], runs the
 * LayerNorm kernel on it (one block per row), and prints the input matrix A
 * followed by the normalized matrix B.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main() {
    const int rows = 10, cols = 10;
    const size_t bytes = static_cast<size_t>(rows) * cols * sizeof(float);

    float* A = static_cast<float*>(malloc(bytes));
    float* B = static_cast<float*>(malloc(bytes));
    if (A == nullptr || B == nullptr) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            A[i * cols + j] = static_cast<float>(rand()) / RAND_MAX;
        }
    }

    float *d_a = nullptr, *d_b = nullptr;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");

    checkCuda(cudaMemcpy(d_a, A, bytes, cudaMemcpyHostToDevice), "copy A to device");

    // One block per row; 32 threads along y cooperate on the columns of a row.
    dim3 blockDim(1, 32);
    dim3 gridDim(rows);
    // The kernel stages one row in dynamic shared memory.
    size_t shared_memory_size = cols * sizeof(float);
    LayerNorm<<<gridDim, blockDim, shared_memory_size>>>(d_a, d_b, rows, cols);
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "LayerNorm launch");
    // Faults raised while the kernel runs surface at the next synchronization.
    checkCuda(cudaDeviceSynchronize(), "LayerNorm execution");

    checkCuda(cudaMemcpy(B, d_b, bytes, cudaMemcpyDeviceToHost), "copy B to host");

    printf("A:\n");
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%.2f ", A[i * cols + j]);
        }
        printf("\n");
    }

    printf("\nB:\n");
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            printf("%.2f ", B[i * cols + j]);
        }
        printf("\n");
    }

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    free(A);
    free(B);

    return 0;
}


Writing Layer_Norm.cu


In [2]:
# Compile for compute capability 7.5 (sm_75); adjust the -gencode values to match the target GPU
!nvcc Layer_Norm.cu -o Layer_Norm -gencode arch=compute_75,code=sm_75

# Run the executable; it prints the input matrix A followed by the normalized matrix B
!./Layer_Norm


A:
0.84 0.39 0.78 0.80 0.91 0.20 0.34 0.77 0.28 0.55 
0.48 0.63 0.36 0.51 0.95 0.92 0.64 0.72 0.14 0.61 
0.02 0.24 0.14 0.80 0.16 0.40 0.13 0.11 1.00 0.22 
0.51 0.84 0.61 0.30 0.64 0.52 0.49 0.97 0.29 0.77 
0.53 0.77 0.40 0.89 0.28 0.35 0.81 0.92 0.07 0.95 
0.53 0.09 0.19 0.66 0.89 0.35 0.06 0.02 0.46 0.06 
0.24 0.97 0.90 0.85 0.27 0.54 0.38 0.76 0.51 0.67 
0.53 0.04 0.44 0.93 0.93 0.72 0.28 0.74 0.64 0.35 
0.69 0.17 0.44 0.88 0.83 0.33 0.23 0.89 0.35 0.69 
0.96 0.59 0.66 0.86 0.44 0.92 0.40 0.81 0.68 0.91 

B:
1.01 -0.76 0.78 0.84 1.29 -1.54 -1.00 0.72 -1.22 -0.13 
-0.51 0.15 -1.00 -0.36 1.55 1.40 0.18 0.53 -1.98 0.05 
-0.99 -0.25 -0.60 1.57 -0.53 0.26 -0.62 -0.69 2.20 -0.33 
-0.39 1.16 0.08 -1.43 0.20 -0.34 -0.49 1.80 -1.45 0.84 
-0.24 0.59 -0.67 1.00 -1.06 -0.83 0.71 1.09 -1.79 1.20 
0.69 -0.87 -0.49 1.18 1.98 0.06 -0.95 -1.10 0.45 -0.95 
-1.48 1.45 1.18 0.97 -1.37 -0.27 -0.93 0.61 -0.38 0.24 
-0.11 -1.91 -0.45 1.36 1.35 0.59 -1.01 0.65 0.29 -0.76 
0.53 -1.46 -0.41 1.26 1.06 -0.83 -