In [1]:
!nvidia-smi

Mon Nov 24 07:44:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [5]:
%%writefile brute_cuda.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <openssl/sha.h>
#include <sys/time.h>

// Constant K table for SHA-256
__constant__ unsigned int Kc[64] = {
  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};

__device__ unsigned int ROTR32(unsigned int x, unsigned int n){
    return (x >> n) | (x << (32-n));
}

// ------------------------------------------------------------------
// A simplified GPU SHA256 for short messages
//
// The SHA-256 logic below (bitwise rotations, majority/choice functions)
// is based on the standard FIPS 180-2 specification.
// I referenced the C implementation by Brad Conte (crypto-algorithms)
// and adapted it to run as a single CUDA device function.
// ------------------------------------------------------------------
__device__ void sha256_cuda(const unsigned char* msg, int len, unsigned char* out) {

    unsigned int H[8] = {
      0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,
      0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
    };

    unsigned char block[64];
    // Initialize block to zero
    for (int i = 0; i < 64; i++) block[i] = 0;
    // Copy message to block
    for (int i = 0; i < len; i++) block[i] = msg[i];
    // Add padding (1 bit followed by zeros)
    block[len] = 0x80;

    // Append length in bits at the end of the block (Big Endian)
    unsigned long long bits = len * 8ULL;
    for (int i = 0; i < 8; i++)
        block[63 - i] = (bits >> (i*8)) & 0xff;

    unsigned int W[64];

    // Message schedule loop 1
    for (int i = 0; i < 16; i++) {
        W[i] = (block[i*4]<<24) | (block[i*4+1]<<16)
             | (block[i*4+2]<<8) | block[i*4+3];
    }

    // Message schedule loop 2
    for (int i = 16; i < 64; i++) {
        unsigned int s0 = ROTR32(W[i-15],7) ^ ROTR32(W[i-15],18) ^ (W[i-15]>>3);
        unsigned int s1 = ROTR32(W[i-2],17) ^ ROTR32(W[i-2],19) ^ (W[i-2]>>10);
        W[i] = W[i-16] + s0 + W[i-7] + s1;
    }

    unsigned int a=H[0],b=H[1],c=H[2],d=H[3],e=H[4],f=H[5],g=H[6],h=H[7];

    // Compression loop
    for (int i=0;i<64;i++){
        unsigned int S1 = ROTR32(e,6) ^ ROTR32(e,11) ^ ROTR32(e,25);
        unsigned int ch = (e & f) ^ (~e & g);
        unsigned int temp1 = h + S1 + ch + Kc[i] + W[i];
        unsigned int S0 = ROTR32(a,2) ^ ROTR32(a,13) ^ ROTR32(a,22);
        unsigned int maj = (a & b) ^ (a & c) ^ (b & c);
        unsigned int temp2 = S0 + maj;

        h=g; g=f; f=e;
        e = d + temp1;
        d=c; c=b; b=a;
        a = temp1 + temp2;
    }

    H[0]+=a; H[1]+=b; H[2]+=c; H[3]+=d;
    H[4]+=e; H[5]+=f; H[6]+=g; H[7]+=h;

    // Write hash to output
    for (int i=0;i<8;i++){
        out[i*4]   = (H[i]>>24)&0xff;
        out[i*4+1] = (H[i]>>16)&0xff;
        out[i*4+2] = (H[i]>>8)&0xff;
        out[i*4+3] = H[i]&0xff;
    }
}

// Kernel Code: Each thread checks a range of PINs

__global__ void kernel_crack(const unsigned char* target, int pinLen, unsigned long long total,
                             int* foundFlag, char* result, unsigned long long* threadID)
{
    // Calculate global thread ID
    unsigned long long id = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned long long jump = gridDim.x * blockDim.x;

    unsigned char msg[16];
    unsigned char hh[32];

    // Strided loop to cover all keys
    while (id < total) {

        // Check if another thread already found the password
        if (atomicAdd(foundFlag,0) == 1)
            return;

        // Convert current ID to PIN string (reverse order loop)
        unsigned long long tmp = id;
        for (int i = pinLen-1; i >= 0; i--) {
            msg[i] = '0' + (tmp % 10);
            tmp /= 10;
        }

        // Compute SHA-256 for this PIN
        sha256_cuda(msg, pinLen, hh);

        // Check if hash matches target
        int match = 1;
        for (int j=0;j<32;j++)
            if (hh[j] != target[j]) { match = 0; break; }

        if (match) {
            // AtomicCAS ensures only one thread writes the result
            if (atomicCAS(foundFlag,0,1)==0){
                for (int k=0;k<pinLen;k++)
                    result[k] = msg[k];
                result[pinLen] = 0;
                *threadID = id; // Store ID of successful thread
            }
            return;
        }

        // Jump to next candidate
        id += jump;
    }
}

int main(int argc, char** argv){

    if (argc < 4){
        printf("Usage: %s <length> <pin> <threads>\n", argv[0]);
        return 0;
    }

    int pinLen = atoi(argv[1]);
    const char* PIN = argv[2];
    int threadsPerBlock = atoi(argv[3]); // Read threads from command line

    // Calculate target hash on Host (CPU)
    unsigned char hostHash[32];
    SHA256((unsigned char*)PIN, strlen(PIN), hostHash);

    unsigned long long totalKeys = 1;
    for (int i=0;i<pinLen;i++) totalKeys *= 10ULL;

    printf("Target PIN: %s\n", PIN);
    printf("Hash: ");
    for (int i=0;i<32;i++) printf("%02x", hostHash[i]);
    printf("\nTotal Keys: %llu\n", totalKeys);

    // Allocate memory on Device (GPU)
    unsigned char *d_hash;
    int *d_found;
    char *d_pin;
    unsigned long long *d_tid;

    int h_found = 0;
    unsigned long long h_tid = 0;

    cudaMalloc(&d_hash, 32);
    cudaMalloc(&d_found, sizeof(int));
    cudaMalloc(&d_pin, 16);
    cudaMalloc(&d_tid, sizeof(unsigned long long));

    // Copy data from Host to Device
    cudaMemcpy(d_hash, hostHash, 32, cudaMemcpyHostToDevice);
    cudaMemcpy(d_found, &h_found, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_tid, &h_tid, sizeof(unsigned long long), cudaMemcpyHostToDevice);

    struct timeval t1,t2;
    gettimeofday(&t1,NULL);

    printf("Starting CUDA crack with %d threads...\n", threadsPerBlock);

    // Configure Kernel parameters (Grid = 1, Block = User Input)
    dim3 block(threadsPerBlock);
    dim3 grid(1);

    // Launch Kernel
    kernel_crack<<<grid,block>>>(d_hash, pinLen, totalKeys, d_found, d_pin, d_tid);

    // Wait for GPU to finish
    cudaDeviceSynchronize();

    gettimeofday(&t2,NULL);
    double timeSpent = (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec)/1000000.0;

    // Copy results back from Device to Host
    cudaMemcpy(&h_found, d_found, sizeof(int), cudaMemcpyDeviceToHost);

    if (h_found){
        char result[20];
        cudaMemcpy(result, d_pin, pinLen+1, cudaMemcpyDeviceToHost);
        cudaMemcpy(&h_tid, d_tid, sizeof(unsigned long long), cudaMemcpyDeviceToHost);

        printf("\nFOUND PIN: %s\n", result);
        printf("Thread ID: %llu\n", h_tid);
        printf("Time: %.5f sec\n", timeSpent);
    } else {
        printf("PIN NOT FOUND\n");
        printf("Time: %.5f sec\n", timeSpent);
    }

    // Free GPU memory
    cudaFree(d_hash);
    cudaFree(d_found);
    cudaFree(d_pin);
    cudaFree(d_tid);

    return 0;
}

Overwriting brute_cuda.cu


#Compile the code

In [None]:
!nvcc brute_cuda.cu -o brute_cuda -lssl -lcrypto -gencode arch=compute_75,code=sm_75

# Run the code with different no of threads (1, 2, 4, 8, 16)

In [6]:
!./brute_cuda 7 7654322 1

Target PIN: 7654322
Hash: 2fea2f55c12e9bb62c3bff7117cf218414f97ad13fd9c95cc6fd25915fd79a44
Total Keys: 10000000
Starting CUDA crack with 1 threads...

FOUND PIN: 7654322
Thread ID: 7654322
Time: 26.07839 sec


In [7]:
!./brute_cuda 7 7654322 2

Target PIN: 7654322
Hash: 2fea2f55c12e9bb62c3bff7117cf218414f97ad13fd9c95cc6fd25915fd79a44
Total Keys: 10000000
Starting CUDA crack with 2 threads...

FOUND PIN: 7654322
Thread ID: 7654322
Time: 13.17536 sec


In [8]:
!./brute_cuda 7 7654322 4

Target PIN: 7654322
Hash: 2fea2f55c12e9bb62c3bff7117cf218414f97ad13fd9c95cc6fd25915fd79a44
Total Keys: 10000000
Starting CUDA crack with 4 threads...

FOUND PIN: 7654322
Thread ID: 7654322
Time: 6.70041 sec


In [9]:
!./brute_cuda 7 7654322 8

Target PIN: 7654322
Hash: 2fea2f55c12e9bb62c3bff7117cf218414f97ad13fd9c95cc6fd25915fd79a44
Total Keys: 10000000
Starting CUDA crack with 8 threads...

FOUND PIN: 7654322
Thread ID: 7654322
Time: 3.44850 sec


In [10]:
!./brute_cuda 7 7654322 16

Target PIN: 7654322
Hash: 2fea2f55c12e9bb62c3bff7117cf218414f97ad13fd9c95cc6fd25915fd79a44
Total Keys: 10000000
Starting CUDA crack with 16 threads...

FOUND PIN: 7654322
Thread ID: 7654322
Time: 1.78277 sec
