## Load necessary libraries

In [1]:
import arrayfire as af
import librosa
import subprocess
import os
import time
import signal

## Set arrayfire backend and global variables

In [2]:
# ArrayFire compute backend; the benchmark cells also use this value to pick
# the log directory and the memory monitor.
backend = "opencl"
af.set_backend(backend)

# Seed for ArrayFire's random number generator
seed = 42
af.set_seed(seed)

# Sampling interval (in milliseconds) handed to the memory monitor process
memory_log_interval_ms = 10
# Number of increasing problem sizes each benchmark is run with
part_count = 10

# Report the active backend and device
print(af.info_str())

ArrayFire v3.9.0 (OpenCL 64bit)
[0] NVIDIA CUDA: NVIDIA_GeForce_RTX_2060 (Compute 3.0)



## Matrix multiplication

### Check needed memory for data
* Two 2D arrays for matrix multiplication

In [3]:
# Both operands are square with k * 10**pwr elements per side.
k, pwr = 2, 4
N = M = int(k * 10**pwr)

# Footprint of one matrix in MB, assuming 32-bit elements:
# bits -> bytes (/8) -> megabytes (/1e6).
needed_memory_matrix = (N * M * 32) / (8 * 1e6)

print(
    f"Size of matrix A: {N} x {M} -> {needed_memory_matrix} MB\n"
    f"Size of matrix B: {N} x {M} -> {needed_memory_matrix} MB"
)

Size of matrix A: 20000 x 20000 -> 1600.0 MB
Size of matrix B: 20000 x 20000 -> 1600.0 MB


### Perform matrix multiplication and log memory usage

Split the data into 10 parts and perform matrix multiplications for matrix sizes from 1/10 to 10/10 of the full size.

In [4]:
# Timing results for every matrix size are collected in one CSV per backend.
file_path_time_matmul_res_log = os.path.join("logs","matmul",backend,"_results.csv")
os.makedirs(os.path.dirname(file_path_time_matmul_res_log), exist_ok=True)


def matmul_test():
    """Multiply the current benchmark matrices.

    Reads the module-level A_matmul and B_matmul at call time, so it always
    operates on the matrices created for the current loop iteration.
    """
    return af.matmul(A_matmul, B_matmul)


# The with-block guarantees the results file is flushed and closed even if a
# benchmark step raises.
with open(file_path_time_matmul_res_log, "w") as log_file_matmul_res:
    log_file_matmul_res.write("size,time_s,size_MB")

    for i in range(part_count):
        N_prim = int(N / part_count * (i+1))
        M_prim = int(M / part_count * (i+1))

        # Choose the memory monitor for this backend. The command is given as
        # an argument list (not one string) so Popen can launch it without a
        # shell on POSIX as well as on Windows.
        if backend == "cpu":
            log_file_name = f"ram_monitor_log_matmul_{N_prim}x{M_prim}.txt"
            monitor_cmd = ["python", "ram_monitor.py", "--interval", str(memory_log_interval_ms)]
        else:
            log_file_name = f"nvidia_smi_log_matmul_{N_prim}x{M_prim}.txt"
            monitor_cmd = [
                "nvidia-smi",
                "--query-gpu", "memory.used,memory.total",
                "--format", "csv",
                "-lms", str(memory_log_interval_ms),
            ]
        file_path_matmul = os.path.join("logs", "matmul", backend, log_file_name)
        os.makedirs(os.path.dirname(file_path_matmul), exist_ok=True)

        with open(file_path_matmul, "w") as log_file_matmul:
            # start memory usage logging
            process = subprocess.Popen(monitor_cmd, cwd=os.getcwd(), stdout=log_file_matmul)
            try:
                # so memory usage before matrix init is logged
                time.sleep(1)

                A_matmul = af.randu(N_prim, M_prim)
                B_matmul = af.randu(M_prim, N_prim)

                af.sync()

                res_matmul = af.timeit(matmul_test)

                # Free GPU memory: drop the Python references, then ask
                # ArrayFire to run its device garbage collector.
                del A_matmul
                del B_matmul
                af.device.device_gc()

                af.sync()

                # so memory usage after matrix multiplication is logged
                time.sleep(1)
            finally:
                # End memory logging. This runs even when the benchmark fails,
                # so no monitor process is left running in the background.
                if process.poll() is None:
                    os.kill(process.pid, signal.SIGINT)
                try:
                    # Wait for the monitor to exit so its log is complete
                    # before the file is closed.
                    process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    process.kill()
                    process.wait()

        arr_size_MB = (N_prim*M_prim * 32) / (8*1e6)
        print("\n================================")
        print(f"""Matrix multiplication for
        Matrix A of size {N_prim} x {M_prim} -> {arr_size_MB} MB;
        Matrix B of size {M_prim} x {N_prim} -> {arr_size_MB} MB;
    took {res_matmul:.10f} seconds""")

        log_file_matmul_res.write(f"\n{N_prim}x{M_prim},{res_matmul},{arr_size_MB}")



Matrix multiplication for
        Matrix A of size 2000 x 2000 -> 16.0 MB;
        Matrix B of size 2000 x 2000 -> 16.0 MB;
    took 0.0058556487 seconds

Matrix multiplication for
        Matrix A of size 4000 x 4000 -> 64.0 MB;
        Matrix B of size 4000 x 4000 -> 64.0 MB;
    took 0.0346072401 seconds

Matrix multiplication for
        Matrix A of size 6000 x 6000 -> 144.0 MB;
        Matrix B of size 6000 x 6000 -> 144.0 MB;
    took 0.1012217522 seconds

Matrix multiplication for
        Matrix A of size 8000 x 8000 -> 256.0 MB;
        Matrix B of size 8000 x 8000 -> 256.0 MB;
    took 0.2296109200 seconds

Matrix multiplication for
        Matrix A of size 10000 x 10000 -> 400.0 MB;
        Matrix B of size 10000 x 10000 -> 400.0 MB;
    took 0.4433233738 seconds

Matrix multiplication for
        Matrix A of size 12000 x 12000 -> 576.0 MB;
        Matrix B of size 12000 x 12000 -> 576.0 MB;
    took 0.7410244942 seconds

Matrix multiplication for
        Matrix A of size 14

## Vector sorting

### Check needed memory for data
* One 1D vector for sorting

In [5]:
# Number of elements in the vector to be sorted
L = 250_000_000

# Footprint in MB, assuming 32-bit elements: bits -> bytes (/8) -> megabytes (/1e6)
needed_memory_vector = (L * 32) / (8 * 1e6)

print(f"\nLength of vector A: {L} -> {needed_memory_vector} MB\n    ")


Length of vector A: 250000000 -> 1000.0 MB
    


### Perform vector sorting and log memory usage

In [6]:
# Timing results for every vector length are collected in one CSV per backend.
file_path_time_vec_sort_res_log = os.path.join("logs","vec_sort",backend,"_results.csv")
os.makedirs(os.path.dirname(file_path_time_vec_sort_res_log), exist_ok=True)


def vector_sort_test():
    """Sort the current benchmark vector.

    Reads the module-level A_vector at call time, so it always operates on the
    vector created for the current loop iteration.
    """
    return af.sort(A_vector)


# The with-block guarantees the results file is flushed and closed even if a
# benchmark step raises.
with open(file_path_time_vec_sort_res_log, "w") as log_file_vec_sort_res:
    log_file_vec_sort_res.write("size,time_s,size_MB")

    for i in range(part_count):
        L_prim = int(L / part_count * (i+1))

        # Choose the memory monitor for this backend. The command is given as
        # an argument list (not one string) so Popen can launch it without a
        # shell on POSIX as well as on Windows.
        if backend == "cpu":
            log_file_name = f"ram_monitor_log_vector_sort_{L_prim}.txt"
            monitor_cmd = ["python", "ram_monitor.py", "--interval", str(memory_log_interval_ms)]
        else:
            log_file_name = f"nvidia_smi_log_vector_sort_{L_prim}.txt"
            monitor_cmd = [
                "nvidia-smi",
                "--query-gpu", "memory.used,memory.total",
                "--format", "csv",
                "-lms", str(memory_log_interval_ms),
            ]
        file_path_vector = os.path.join("logs", "vec_sort", backend, log_file_name)
        os.makedirs(os.path.dirname(file_path_vector), exist_ok=True)

        with open(file_path_vector, "w") as log_file_vector:
            # start memory usage logging
            process = subprocess.Popen(monitor_cmd, cwd=os.getcwd(), stdout=log_file_vector)
            try:
                # so memory usage before vector init is logged
                time.sleep(1)

                A_vector = af.randu(L_prim)

                af.sync()

                res_vector = af.timeit(vector_sort_test)

                # Free GPU memory: drop the Python reference, then ask
                # ArrayFire to run its device garbage collector.
                del A_vector
                af.device.device_gc()

                af.sync()

                # so memory usage after sorting is logged
                time.sleep(1)
            finally:
                # End memory logging. This runs even when the benchmark fails,
                # so no monitor process is left running in the background.
                if process.poll() is None:
                    os.kill(process.pid, signal.SIGINT)
                try:
                    # Wait for the monitor to exit so its log is complete
                    # before the file is closed.
                    process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    process.kill()
                    process.wait()

        vec_size_MB = (L_prim * 32) / (8*1e6)
        print("\n================================")
        print(f"""Sorting
        Vector with length {L_prim} -> {vec_size_MB} MB
    took {res_vector:.10f} seconds""")

        log_file_vec_sort_res.write(f"\n{L_prim},{res_vector},{vec_size_MB}")


Sorting
        Vector with length 25000000 -> 100.0 MB
    took 4.0411674976 seconds

Sorting
        Vector with length 50000000 -> 200.0 MB
    took 8.2858226299 seconds

Sorting
        Vector with length 75000000 -> 300.0 MB
    took 12.5783755779 seconds

Sorting
        Vector with length 100000000 -> 400.0 MB
    took 16.8379557133 seconds

Sorting
        Vector with length 125000000 -> 500.0 MB
    took 21.1275432110 seconds

Sorting
        Vector with length 150000000 -> 600.0 MB
    took 25.2554528713 seconds

Sorting
        Vector with length 175000000 -> 700.0 MB
    took 29.3585093021 seconds

Sorting
        Vector with length 200000000 -> 800.0 MB
    took 33.5722186565 seconds

Sorting
        Vector with length 225000000 -> 900.0 MB
    took 37.8209168911 seconds

Sorting
        Vector with length 250000000 -> 1000.0 MB
    took 41.7929964066 seconds


## Fast Fourier Transform

### Load data
* Audiobook - The Wonderful Wizard of Oz\*

\*Retrieved from https://librivox.org/the-wonderful-wizard-of-oz/

In [7]:
# Location of the audiobook recording
audio_path = os.path.join(
    "data",
    "Wizard_of_Oz_mp3",
    "wizardofoz_full.mp3",
)

# Only the first 3000 seconds (50 minutes) are loaded because of VRAM limits.
# sr=None is passed so librosa does not resample (see the librosa.load docs).
audio_data, sample_rate = librosa.load(audio_path, duration=3000, sr=None)

# Duration in seconds and number of samples of the loaded excerpt
full_duration = librosa.get_duration(sr=sample_rate, y=audio_data)
audio_data_length = len(audio_data)

In [8]:
# Timing results for every excerpt length are collected in one CSV per backend.
file_path_time_fft_res_log = os.path.join("logs","fft",backend,"_results.csv")
os.makedirs(os.path.dirname(file_path_time_fft_res_log), exist_ok=True)


def fft_test():
    """Run an FFT over the current audio vector.

    Reads the module-level audio_vector at call time, so it always operates on
    the excerpt uploaded for the current loop iteration.
    """
    return af.fft(audio_vector)


# The with-block guarantees the results file is flushed and closed even if a
# benchmark step raises.
with open(file_path_time_fft_res_log, "w") as log_file_fft_res:
    log_file_fft_res.write("size,dur_s,time_s,size_MB")

    for i in range(part_count):
        len_prim = int(audio_data_length / part_count * (i+1))

        # Leading excerpt of the recording used for this run
        audio_data_prim = audio_data[:len_prim]

        duration_prim = librosa.get_duration(y=audio_data_prim, sr=sample_rate)

        # Choose the memory monitor for this backend. The command is given as
        # an argument list (not one string) so Popen can launch it without a
        # shell on POSIX as well as on Windows.
        if backend == "cpu":
            log_file_name = f"ram_monitor_log_fft_{len_prim}.txt"
            monitor_cmd = ["python", "ram_monitor.py", "--interval", str(memory_log_interval_ms)]
        else:
            log_file_name = f"nvidia_smi_log_fft_{len_prim}.txt"
            monitor_cmd = [
                "nvidia-smi",
                "--query-gpu", "memory.used,memory.total",
                "--format", "csv",
                "-lms", str(memory_log_interval_ms),
            ]
        file_path_fft = os.path.join("logs", "fft", backend, log_file_name)
        os.makedirs(os.path.dirname(file_path_fft), exist_ok=True)

        with open(file_path_fft, "w") as log_file_fft:
            # start memory usage logging
            process = subprocess.Popen(monitor_cmd, cwd=os.getcwd(), stdout=log_file_fft)
            try:
                # so memory usage before the audio vector is uploaded is logged
                time.sleep(1)

                audio_vector = af.from_ndarray(audio_data_prim)

                af.sync()

                res_fft_time = af.timeit(fft_test)
                af.sync()

                # Free GPU memory: drop the Python reference, then ask
                # ArrayFire to run its device garbage collector.
                del audio_vector
                af.device.device_gc()

                # so memory usage after the FFT is logged
                time.sleep(1)
            finally:
                # End memory logging. This runs even when the benchmark fails,
                # so no monitor process is left running in the background.
                if process.poll() is None:
                    os.kill(process.pid, signal.SIGINT)
                try:
                    # Wait for the monitor to exit so its log is complete
                    # before the file is closed.
                    process.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    process.kill()
                    process.wait()

        audio_size_MB = (len_prim * 32) / (8*1e6)

        print("\n================================")
        print(f"""FFT for
        audio vector with length {len_prim} -> {audio_size_MB} MB
    took {res_fft_time:.10f} seconds""")

        log_file_fft_res.write(f"\n{len_prim},{duration_prim:.2f},{res_fft_time},{audio_size_MB}")


FFT for
        audio vector with length 6615000 -> 26.46 MB
    took 0.1885008017 seconds

FFT for
        audio vector with length 13230000 -> 52.92 MB
    took 0.3969383240 seconds

FFT for
        audio vector with length 19845000 -> 79.38 MB
    took 0.6382944584 seconds

FFT for
        audio vector with length 26460000 -> 105.84 MB
    took 0.8616967201 seconds

FFT for
        audio vector with length 33075000 -> 132.3 MB
    took 1.1439409256 seconds

FFT for
        audio vector with length 39690000 -> 158.76 MB
    took 0.7410175800 seconds

FFT for
        audio vector with length 46305000 -> 185.22 MB
    took 0.8986248970 seconds

FFT for
        audio vector with length 52920000 -> 211.68 MB
    took 0.9763882160 seconds

FFT for
        audio vector with length 59535000 -> 238.14 MB
    took 1.1200046539 seconds

FFT for
        audio vector with length 66150000 -> 264.6 MB
    took 1.2516534328 seconds
