In [1]:
!nvidia-smi

Mon Jan 19 04:03:49 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


# Stack

In [7]:
%%writefile practice_work_5_stack.cu
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cstdlib>
#include <ctime>
#include <random>

using namespace std;

// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the CUDA error string, the source location and the text
// of the failing expression to stderr, then terminates with exit code 1.
// The temporary has a deliberately unusual name so that an argument expression
// mentioning a variable such as `e` is not captured by the macro's own local,
// and std:: is spelled out so the macro does not depend on a using-directive
// at the expansion site.
#define CUDA_CHECK(err) do { \
    const cudaError_t cuda_check_status_ = (err); \
    if (cuda_check_status_ != cudaSuccess) { \
        std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_check_status_) \
                  << " at line " << __LINE__ << " of " << __FILE__ \
                  << " (" << #err << ")" << std::endl; \
        std::exit(1); \
    } \
} while(0)

// ==============================
// Compile-time parameters
// ==============================
constexpr int BLOCK_SIZE = 256;              // threads per block used for the launches
constexpr int INITIAL_STACK_SIZE = 1000000;  // element count passed to the init kernel
constexpr int MAX_STACK_SIZE = 10000000;     // capacity of the device-side stack array

// ==============================
// Global stack on the GPU
// ==============================
// Backing storage of the stack: MAX_STACK_SIZE ints declared with __device__.
__device__ int d_stack[MAX_STACK_SIZE];
// Current element count, i.e. the index of the next free slot in d_stack.
// The push/pop kernels modify it with atomicAdd / atomicSub.
__device__ int d_top = 0;

// ==============================
// Stack initialisation
// ==============================
// Fills the first n slots of d_stack with their own index and sets d_top to n.
//
// n is clamped to [0, MAX_STACK_SIZE], so an oversized request can neither
// write past the end of d_stack nor leave d_top beyond the array, and a
// negative request yields an empty stack.
//
// Launch layout: 1-D grid, one element per thread; the grid must contain at
// least n threads for every slot to be written. d_top is written with a plain
// (non-atomic) store by global thread 0.
__global__ void init_stack_kernel(int n) {
    int count = n;
    if (count < 0) count = 0;
    if (count > MAX_STACK_SIZE) count = MAX_STACK_SIZE;

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < count) {
        d_stack[idx] = idx;  // deterministic values
    }
    if (idx == 0) {
        d_top = count;  // a single thread publishes the new size
    }
}

// ==============================
// Push
// ==============================
// Each of the first n threads of a 1-D launch reserves one slot by atomically
// incrementing d_top and stores values[thread index] into it. When the
// reserved index is at or beyond MAX_STACK_SIZE, the increment is undone and
// the value is not stored.
__global__ void push_kernel(int* values, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < n) {
        const int slot = atomicAdd(&d_top, 1);  // returns the previous top
        if (slot >= MAX_STACK_SIZE) {
            atomicSub(&d_top, 1);  // stack full: roll the reservation back
        } else {
            d_stack[slot] = values[tid];
        }
    }
}

// ==============================
// Pop
// ==============================
// Each of the first n threads of a 1-D launch atomically decrements d_top and
// writes the element that was on top into output[thread index]. When the
// value of d_top before the decrement was not positive, the decrement is
// undone and -1 is written instead.
__global__ void pop_kernel(int* output, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < n) {
        const int old_top = atomicSub(&d_top, 1);  // returns the previous top
        if (old_top <= 0) {
            atomicAdd(&d_top, 1);  // nothing to pop: roll the decrement back
            output[tid] = -1;
        } else {
            output[tid] = d_stack[old_top - 1];
        }
    }
}

// ==============================
// Stack printing (CPU)
// ==============================
// Prints the stack size followed by up to ten elements from the bottom
// (data[0..]) and up to ten elements ending at data[top - 1].
//   data - host copy of the stack storage; indices below top are read
//   top  - number of elements on the stack
void print_stack(const vector<int>& data, int top) {
    // Writes data[from..to), each element followed by one space.
    auto dump_range = [&data](int from, int to) {
        for (int k = from; k < to; ++k) {
            cout << data[k] << " ";
        }
    };

    cout << "Stack size = " << top << "\n";

    cout << "First 10: ";
    dump_range(0, min(10, top));
    cout << "\n";

    cout << "Last 10: ";
    dump_range(max(0, top - 10), top);
    cout << "\n\n";
}

// ==============================
// MAIN
// ==============================
// Reads d_top from the device and copies only the live prefix d_stack[0..top)
// into `host` (resized to top elements). Returns top.
// Exits the process if a CUDA call fails or if top is outside
// [0, MAX_STACK_SIZE].
static int snapshot_stack(vector<int>& host) {
    int top = 0;
    CUDA_CHECK(cudaMemcpyFromSymbol(&top, d_top, sizeof(int)));
    if (top < 0 || top > MAX_STACK_SIZE) {
        cerr << "Corrupted stack size: " << top << endl;
        exit(1);
    }
    host.resize(top);
    if (top > 0) {
        CUDA_CHECK(cudaMemcpyFromSymbol(host.data(), d_stack,
                                       static_cast<size_t>(top) * sizeof(int)));
    }
    return top;
}

// Demo driver: initialises the device stack, pushes N random values, pops N
// values, prints the stack after every stage, verifies the popped values and
// reports the kernel timings. Returns 0 on success, 1 if the pop verification
// fails (CUDA failures terminate through CUDA_CHECK).
int main() {
    cout << "=== Parallel Stack (LIFO) on GPU ===\n";

    const int N = 10000;

    // Values to push
    vector<int> h_values(N);
    mt19937 gen(time(nullptr));
    uniform_int_distribution<int> dist(1, 1000000);
    for (int i = 0; i < N; ++i)
        h_values[i] = dist(gen);

    int *d_values = nullptr, *d_output = nullptr;
    CUDA_CHECK(cudaMalloc(&d_values, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_output, N * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_values, h_values.data(),
                          N * sizeof(int), cudaMemcpyHostToDevice));

    dim3 block(BLOCK_SIZE);
    dim3 grid_init((INITIAL_STACK_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // ==============================
    // STACK INITIALISATION
    // ==============================
    init_stack_kernel<<<grid_init, block>>>(INITIAL_STACK_SIZE);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    vector<int> h_stack;
    int h_top = snapshot_stack(h_stack);

    cout << "--- Before push ---\n";
    print_stack(h_stack, h_top);

    // ==============================
    // PUSH
    // ==============================
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    push_kernel<<<grid, block>>>(d_values, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaDeviceSynchronize());

    float push_time = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&push_time, start, stop));

    h_top = snapshot_stack(h_stack);

    cout << "--- After push ---\n";
    print_stack(h_stack, h_top);

    cout << "Expected stack size: " << INITIAL_STACK_SIZE + N << "\n";
    cout << "Actual stack size:   " << h_top << "\n\n";

    // The N pops must return, in some order, the elements currently occupying
    // the top slots. Record their count and sum before the stack is modified.
    const int expected_pops = min(N, h_top);
    long long expected_sum = 0;
    for (int i = h_top - expected_pops; i < h_top; ++i)
        expected_sum += h_stack[i];

    // ==============================
    // POP
    // ==============================
    CUDA_CHECK(cudaEventRecord(start));
    pop_kernel<<<grid, block>>>(d_output, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaDeviceSynchronize());

    float pop_time = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&pop_time, start, stop));

    // Fetch what the pop kernel actually produced and compare with the
    // expectation. -1 is the kernel's "stack was empty" marker; every value
    // stored by this program is non-negative, so the marker is unambiguous.
    vector<int> h_output(N);
    CUDA_CHECK(cudaMemcpy(h_output.data(), d_output,
                          N * sizeof(int), cudaMemcpyDeviceToHost));
    long long popped_sum = 0;
    int failed_pops = 0;
    for (int i = 0; i < N; ++i) {
        if (h_output[i] == -1)
            ++failed_pops;
        else
            popped_sum += h_output[i];
    }
    const bool pop_ok = (N - failed_pops == expected_pops) &&
                        (popped_sum == expected_sum);

    h_top = snapshot_stack(h_stack);

    cout << "--- After pop ---\n";
    print_stack(h_stack, h_top);

    cout << "Pop check: " << (pop_ok ? "OK" : "MISMATCH")
         << " (" << (N - failed_pops) << " values popped, "
         << failed_pops << " empty-stack results)\n";

    cout << "Push time: " << push_time << " ms\n";
    cout << "Pop time:  " << pop_time << " ms\n";

    CUDA_CHECK(cudaFree(d_values));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return pop_ok ? 0 : 1;
}

Overwriting practice_work_5_stack.cu


In [8]:
!nvcc practice_work_5_stack.cu -o practice_work_5_stack -arch=sm_75
!./practice_work_5_stack

=== Parallel Stack (LIFO) on GPU ===
--- Before push ---
Stack size = 1000000
First 10: 0 1 2 3 4 5 6 7 8 9 
Last 10: 999990 999991 999992 999993 999994 999995 999996 999997 999998 999999 

--- After push ---
Stack size = 1010000
First 10: 0 1 2 3 4 5 6 7 8 9 
Last 10: 728980 339935 106896 207316 899240 900786 877071 874990 874340 61444 

Expected stack size: 1010000
Actual stack size:   1010000

--- After pop ---
Stack size = 1000000
First 10: 0 1 2 3 4 5 6 7 8 9 
Last 10: 999990 999991 999992 999993 999994 999995 999996 999997 999998 999999 

Push time: 0.041376 ms
Pop time:  0.042688 ms


# Queue


In [9]:
%%writefile practice_work_5_queue.cu
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cstdlib>
#include <ctime>
#include <random>

using namespace std;

// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the CUDA error string, the source location and the text
// of the failing expression to stderr, then terminates with exit code 1.
// The temporary has a deliberately unusual name so that an argument expression
// mentioning a variable such as `e` is not captured by the macro's own local,
// and std:: is spelled out so the macro does not depend on a using-directive
// at the expansion site.
#define CUDA_CHECK(err) do { \
    const cudaError_t cuda_check_status_ = (err); \
    if (cuda_check_status_ != cudaSuccess) { \
        std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_check_status_) \
                  << " at line " << __LINE__ << " of " << __FILE__ \
                  << " (" << #err << ")" << std::endl; \
        std::exit(1); \
    } \
} while(0)

// ==============================
// Compile-time parameters
// ==============================
constexpr int BLOCK_SIZE = 256;              // threads per block used for the launches
constexpr int INITIAL_QUEUE_SIZE = 1000000;  // element count passed to the init kernel
constexpr int MAX_QUEUE_SIZE = 10000000;     // capacity of the device-side queue array

// ==============================
// Global queue on the GPU
// ==============================
// Backing storage of the queue: MAX_QUEUE_SIZE ints declared with __device__.
__device__ int d_queue[MAX_QUEUE_SIZE];
// Index of the next element to dequeue; advanced with atomicAdd by the
// dequeue kernel.
__device__ int d_head = 0;
// Index of the next free slot; advanced with atomicAdd by the enqueue kernel.
// The live elements are d_queue[d_head .. d_tail).
__device__ int d_tail = 0;

// ==============================
// Queue initialisation
// ==============================
// Fills the first n slots of d_queue with their own index, then sets d_head to
// 0 and d_tail to n.
//
// n is clamped to [0, MAX_QUEUE_SIZE], so an oversized request can neither
// write past the end of d_queue nor leave d_tail beyond the array, and a
// negative request yields an empty queue.
//
// Launch layout: 1-D grid, one element per thread; the grid must contain at
// least n threads for every slot to be written. d_head and d_tail are written
// with plain (non-atomic) stores by global thread 0.
__global__ void init_queue_kernel(int n) {
    int count = n;
    if (count < 0) count = 0;
    if (count > MAX_QUEUE_SIZE) count = MAX_QUEUE_SIZE;

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < count) {
        d_queue[idx] = idx;  // deterministic values
    }
    if (idx == 0) {
        // a single thread publishes the new bounds
        d_head = 0;
        d_tail = count;
    }
}

// ==============================
// Enqueue (append)
// ==============================
// Each of the first n threads of a 1-D launch reserves one slot by atomically
// incrementing d_tail and stores values[thread index] into it. When the
// reserved index is at or beyond MAX_QUEUE_SIZE, the increment is undone and
// the value is not stored.
__global__ void enqueue_kernel(int* values, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < n) {
        const int slot = atomicAdd(&d_tail, 1);  // returns the previous tail
        if (slot >= MAX_QUEUE_SIZE) {
            atomicSub(&d_tail, 1);  // queue full: roll the reservation back
        } else {
            d_queue[slot] = values[tid];
        }
    }
}

// ==============================
// Dequeue (remove)
// ==============================
// Each of the first n threads of a 1-D launch claims one element by atomically
// incrementing d_head and copies that element into output[thread index]. When
// the claimed index is not below d_tail (read with a plain load), the
// increment is undone and -1 is written instead.
__global__ void dequeue_kernel(int* output, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < n) {
        const int slot = atomicAdd(&d_head, 1);  // returns the previous head
        if (slot >= d_tail) {
            atomicSub(&d_head, 1);  // queue empty: roll the claim back
            output[tid] = -1;
        } else {
            output[tid] = d_queue[slot];
        }
    }
}

// ==============================
// Queue printing (CPU)
// ==============================
// Prints the queue size (tail - head) followed by up to ten elements starting
// at data[head] and up to ten elements ending at data[tail - 1].
//   data - host copy of the queue storage; indices in [head, tail) are read
//   head - index of the oldest element
//   tail - index one past the newest element
void print_queue(const vector<int>& data, int head, int tail) {
    // Writes data[from..to), each element followed by one space.
    auto dump_range = [&data](int from, int to) {
        for (int k = from; k < to; ++k) {
            cout << data[k] << " ";
        }
    };

    const int count = tail - head;
    cout << "Queue size = " << count << "\n";

    cout << "First 10: ";
    dump_range(head, min(head + 10, tail));
    cout << "\n";

    cout << "Last 10: ";
    dump_range(max(head, tail - 10), tail);
    cout << "\n\n";
}

// ==============================
// MAIN
// ==============================
// Reads d_head and d_tail from the device and copies only the live range
// d_queue[head..tail) into the same index range of `host` (resized to tail
// elements; entries below head are not refreshed).
// Exits the process if a CUDA call fails or if the indices do not satisfy
// 0 <= head <= tail <= MAX_QUEUE_SIZE.
static void snapshot_queue(vector<int>& host, int& head, int& tail) {
    CUDA_CHECK(cudaMemcpyFromSymbol(&head, d_head, sizeof(int)));
    CUDA_CHECK(cudaMemcpyFromSymbol(&tail, d_tail, sizeof(int)));
    if (head < 0 || head > tail || tail > MAX_QUEUE_SIZE) {
        cerr << "Corrupted queue indices: head=" << head
             << " tail=" << tail << endl;
        exit(1);
    }
    host.resize(tail);
    if (tail > head) {
        // The 4th argument is a byte offset into the symbol.
        CUDA_CHECK(cudaMemcpyFromSymbol(host.data() + head, d_queue,
                                       static_cast<size_t>(tail - head) * sizeof(int),
                                       static_cast<size_t>(head) * sizeof(int),
                                       cudaMemcpyDeviceToHost));
    }
}

// Demo driver: initialises the device queue, enqueues N random values,
// dequeues N values, prints the queue after every stage, verifies the dequeued
// values and reports the kernel timings. Returns 0 on success, 1 if the
// dequeue verification fails (CUDA failures terminate through CUDA_CHECK).
int main() {
    cout << "=== Parallel Queue (FIFO) on GPU ===\n";

    const int N = 10000;

    // Values to enqueue
    vector<int> h_values(N);
    mt19937 gen(time(nullptr));
    uniform_int_distribution<int> dist(1, 1000000);
    for (int i = 0; i < N; ++i)
        h_values[i] = dist(gen);

    int *d_values = nullptr, *d_output = nullptr;
    CUDA_CHECK(cudaMalloc(&d_values, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_output, N * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_values, h_values.data(),
                          N * sizeof(int), cudaMemcpyHostToDevice));

    dim3 block(BLOCK_SIZE);
    dim3 grid_init((INITIAL_QUEUE_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // ==============================
    // INITIALISATION
    // ==============================
    init_queue_kernel<<<grid_init, block>>>(INITIAL_QUEUE_SIZE);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    vector<int> h_queue;
    int h_head = 0, h_tail = 0;
    snapshot_queue(h_queue, h_head, h_tail);

    cout << "--- Before enqueue ---\n";
    print_queue(h_queue, h_head, h_tail);

    // ==============================
    // ENQUEUE
    // ==============================
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    enqueue_kernel<<<grid, block>>>(d_values, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaDeviceSynchronize());

    float enqueue_time = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&enqueue_time, start, stop));

    snapshot_queue(h_queue, h_head, h_tail);

    cout << "--- After enqueue ---\n";
    print_queue(h_queue, h_head, h_tail);

    cout << "Expected queue size: " << INITIAL_QUEUE_SIZE + N << "\n";
    cout << "Actual queue size:   " << (h_tail - h_head) << "\n\n";

    // The N dequeues must return, in some order, the elements currently at the
    // front of the queue. Record their count and sum before the queue changes.
    const int expected_count = min(N, h_tail - h_head);
    long long expected_sum = 0;
    for (int i = h_head; i < h_head + expected_count; ++i)
        expected_sum += h_queue[i];

    // ==============================
    // DEQUEUE
    // ==============================
    CUDA_CHECK(cudaEventRecord(start));
    dequeue_kernel<<<grid, block>>>(d_output, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaDeviceSynchronize());

    float dequeue_time = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&dequeue_time, start, stop));

    // Fetch what the dequeue kernel actually produced and compare with the
    // expectation. -1 is the kernel's "queue was empty" marker; every value
    // stored by this program is non-negative, so the marker is unambiguous.
    vector<int> h_output(N);
    CUDA_CHECK(cudaMemcpy(h_output.data(), d_output,
                          N * sizeof(int), cudaMemcpyDeviceToHost));
    long long dequeued_sum = 0;
    int failed_dequeues = 0;
    for (int i = 0; i < N; ++i) {
        if (h_output[i] == -1)
            ++failed_dequeues;
        else
            dequeued_sum += h_output[i];
    }
    const bool dequeue_ok = (N - failed_dequeues == expected_count) &&
                            (dequeued_sum == expected_sum);

    snapshot_queue(h_queue, h_head, h_tail);

    cout << "--- After dequeue ---\n";
    print_queue(h_queue, h_head, h_tail);

    cout << "Dequeue check: " << (dequeue_ok ? "OK" : "MISMATCH")
         << " (" << (N - failed_dequeues) << " values dequeued, "
         << failed_dequeues << " empty-queue results)\n";

    cout << "Enqueue time: " << enqueue_time << " ms\n";
    cout << "Dequeue time: " << dequeue_time << " ms\n";

    CUDA_CHECK(cudaFree(d_values));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return dequeue_ok ? 0 : 1;
}

Writing practice_work_5_queue.cu


In [10]:
!nvcc practice_work_5_queue.cu -o practice_work_5_queue -arch=sm_75
!./practice_work_5_queue

=== Parallel Queue (FIFO) on GPU ===
--- Before enqueue ---
Queue size = 1000000
First 10: 0 1 2 3 4 5 6 7 8 9 
Last 10: 999990 999991 999992 999993 999994 999995 999996 999997 999998 999999 

--- After enqueue ---
Queue size = 1010000
First 10: 0 1 2 3 4 5 6 7 8 9 
Last 10: 238521 970246 35117 46561 133912 368268 923614 151159 652000 633506 

Expected queue size: 1010000
Actual queue size:   1010000

--- After dequeue ---
Queue size = 1000000
First 10: 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 
Last 10: 238521 970246 35117 46561 133912 368268 923614 151159 652000 633506 

Enqueue time: 0.068864 ms
Dequeue time: 0.040256 ms
