# **Cosine similarity example**

In [None]:
import numpy as np
from sklearn.metrics import pairwise_distances

import torch
import cupy as cp

from sys import getsizeof
import time

In [None]:
np.show_config()  # check that numpy uses openblas for multithreading vectorized operations

blas_mkl_info:
  NOT AVAILABLE
blis_info:
  NOT AVAILABLE
openblas_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
    runtime_library_dirs = ['/usr/local/lib']
blas_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
    runtime_library_dirs = ['/usr/local/lib']
lapack_mkl_info:
  NOT AVAILABLE
openblas_lapack_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
    runtime_library_dirs = ['/usr/local/lib']
lapack_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
    runtime_library_dirs = ['/usr/local/lib']
Supported SIMD extensions in this NumPy install:
    baseline = SSE,SSE2,SSE3
    found = SSSE3,SSE4

In [None]:
torch.cuda.is_available()  # check gpu is available

True

In [None]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:          12991        1126        8630           1        3234       11636
Swap:             0           0           0


In [None]:
!nvidia-smi

Sat Feb 26 22:02:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    34W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
warmup = (torch.from_numpy(np.array([1,2,3])).to('cuda'))**2  # warmup the gpu as first call is slow

# **Arrays and Operations fit in memory**

In [None]:
vecs_1M = np.random.rand(1000000, 300)
vecs_1K = np.random.rand(100, 300)
print(f'vecs_1M: {getsizeof(vecs_1K)/1024/1024} MB')
print(f'vecs_1K: {getsizeof(vecs_1M)/1024/1024} MB')

vecs_1M: 0.22899627685546875 MB
vecs_1K: 2288.818473815918 MB


### **CPU: sklearn and vectorized numpy**

**Cosine Distance**

In [None]:
def run_numpy(A, B):
  start = time.time()
  t = []
  for i in range(5):
    t0 = time.time()
    sim = np.inner(A, B)/np.outer(np.linalg.norm(A, ord=2, axis=1), np.linalg.norm(B, ord=2, axis=1))
    t1 = time.time()-t0
    print(t1)
    t.append(t1)

  print("average {} seconds per loop".format(round(np.mean(t), 2)))
  print("total running time: {}".format(time.time()-start))
  return np.mean(sim)

def run_sklearn(A, B):
  start = time.time()
  t = []
  for i in range(5):
    t0 = time.time()
    sim = 1-pairwise_distances(A, B, metric="cosine")
    t1 = time.time()-t0
    print(t1)
    t.append(t1)

  print("average {} seconds per loop".format(round(np.mean(t), 2)))
  print("total running time: {}".format(time.time()-start))
  return np.mean(sim)

In [None]:
sim_sklearn = run_sklearn(vecs_1K, vecs_1K)
print(sim_sklearn)

0.01272892951965332
0.005011558532714844
0.0010542869567871094
0.0012395381927490234
0.00424504280090332
average 0.0 seconds per loop
total running time: 0.028494834899902344
0.753365071247367


In [None]:
sim_numpy = run_numpy(vecs_1K, vecs_1K)
print(sim_numpy)

0.00519108772277832
0.0005660057067871094
0.0005333423614501953
0.005492210388183594
0.0005049705505371094
average 0.0 seconds per loop
total running time: 0.01632523536682129
0.753365071247367


In [None]:
sim_sklearn = run_sklearn(vecs_1M, vecs_1K)
print(sim_sklearn)

7.0094006061553955
7.134569406509399
5.298669815063477
6.805742263793945
5.834937572479248
average 6.42 seconds per loop
total running time: 32.085944175720215
0.7505618006601283


In [None]:
sim_numpy = run_numpy(vecs_1M, vecs_1K)
print(sim_numpy)

4.438237905502319
4.655865907669067
4.366970062255859
4.407408952713013
4.416436433792114
average 4.46 seconds per loop
total running time: 22.289359092712402
0.7505618006601283


In [None]:
def run_cupy(A, B):
  start = time.time()
  A = cp.asarray(A)
  B = cp.asarray(B)
  t = []
  for i in range(5):
    t0 = time.time()
    sim = cp.inner(A, B)/cp.outer(cp.linalg.norm(A, ord=2, axis=1), cp.linalg.norm(B, ord=2, axis=1))
    t1 = time.time()-t0
    print(t1)
    t.append(t1)

  print("average {} seconds per loop".format(round(np.mean(t), 2)))
  print("total running time: {}".format(time.time()-start))
  return cp.mean(sim)


def run_torch(A, B):
  start = time.time()
  A = torch.from_numpy(A).to('cuda')
  B = torch.from_numpy(B).to('cuda')
  t_load = time.time()-start
  t = []
  for i in range(5):
    t0 = time.time()
    sim = torch.inner(A, B)/torch.outer(torch.linalg.norm(A, ord=2, dim=1), torch.linalg.norm(B, ord=2,dim=1))
    t1 = time.time()-t0
    print(t1)
    t.append(t1)

  print("loading time CPU -> GPU {}".format(t_load))
  print("average {} seconds per loop".format(round(np.mean(t), 2)))
  print("total running time: {}".format(time.time()-start))
  return torch.mean(sim)

In [None]:
sim_torch = run_torch(vecs_1M, vecs_1K)
print(sim_torch)

0.007815361022949219
0.002759695053100586
0.0007944107055664062
0.0008935928344726562
0.0008795261383056641
loading time CPU -> GPU 0.35512828826904297
average 0.0 seconds per loop
total running time: 0.36974406242370605
tensor(0.7506, device='cuda:0', dtype=torch.float64)


In [None]:
sim_cupy = run_cupy(vecs_1M, vecs_1K)
print(sim_cupy)

0.5818169116973877
0.004094362258911133
0.001142740249633789
0.0004963874816894531
0.001016855239868164
average 0.12 seconds per loop
total running time: 3.442216396331787
0.7505618006601301


**Standard Deviation**

In [None]:
t0 = time.time()
np.std(vecs_1M, axis=1)
print("cpu: {} seconds".format(time.time()-t0))

cpu: 2.983079671859741 seconds


In [None]:
vecs_1M_gpu = torch.from_numpy(vecs_1M).to('cuda')
t0 = time.time()
torch.std(vecs_1M_gpu, axis=1)
print("gpu: {} seconds".format(time.time()-t0))

gpu: 0.014722347259521484 seconds


# **Operations do not fit in memory**

In [None]:
torch.cuda.empty_cache()

In [None]:
import dask.array as da

In [None]:
vecs_1M_dask_gpu = da.from_array(cp.asarray(vecs_1M), chunks=(1000, 300)) # split the array in chunks of size (1000, 300)
vecs_1M_dask_cpu = da.from_array(vecs_1M, chunks=(1000, 300)) 

In [None]:
def cp_cdist(A, B):
  return cp.dot(A, B.T)/cp.outer(cp.linalg.norm(A, ord=2, axis=1), cp.linalg.norm(B, ord=2, axis=1))

def np_cdist(A, B):
  return np.dot(A, B.T)/np.outer(np.linalg.norm(A, ord=2, axis=1), np.linalg.norm(B, ord=2, axis=1))

def run_cupy_dask(A, B):
  start = time.time()
  t = []
  for i in range(5):
    t0 = time.time()
    sim = da.map_blocks(cp_cdist, A, B, dtype=cp.float32)
    sim = sim[:100000,:100000].compute()
    t1 = time.time()-t0
    print(t1)
    t.append(t1)

  print("average {} seconds per loop".format(round(np.mean(t), 2)))
  print("total running time: {}".format(time.time()-start))
  return sim

def run_numpy_dask(A, B):
  start = time.time()
  t = []
  for i in range(5):
    t0 = time.time()
    sim = da.map_blocks(np_cdist, A, B, dtype=np.float32)
    sim = sim[:100000, :100000].compute()
    t1 = time.time()-t0
    print(t1)
    t.append(t1)

  print("average {} seconds per loop".format(round(np.mean(t), 2)))
  print("total running time: {}".format(time.time()-start))
  return sim

In [None]:
sim_dask_cpu = run_numpy_dask(vecs_1M_dask_cpu, vecs_1M_dask_cpu)
print(sim_dask_cpu)

4.6633546352386475
4.572694301605225
4.875424146652222
4.839929580688477
4.660008192062378
average 4.72 seconds per loop
total running time: 23.619914531707764
[[1.         0.75994951 0.76283928 ... 0.73395492 0.74478955 0.72235199]
 [0.75994951 1.         0.75144537 ... 0.7554739  0.74143814 0.75645132]
 [0.76283928 0.75144537 1.         ... 0.73379885 0.77120154 0.75987018]
 ...
 [0.73462304 0.72124727 0.77225656 ... 1.         0.75498603 0.74495495]
 [0.73336735 0.72781681 0.76226259 ... 0.75498603 1.         0.74946872]
 [0.77486488 0.74794437 0.74772344 ... 0.74495495 0.74946872 1.        ]]


In [None]:
sim_dask = run_cupy_dask(vecs_1M_dask_gpu, vecs_1M_dask_gpu)
print(sim_dask)

0.19527626037597656
0.18866705894470215
0.1677556037902832
0.18540024757385254
0.17505121231079102
average 0.18 seconds per loop
total running time: 0.9199235439300537
[[1.         0.75994951 0.76283928 ... 0.73395492 0.74478955 0.72235199]
 [0.75994951 1.         0.75144537 ... 0.7554739  0.74143814 0.75645132]
 [0.76283928 0.75144537 1.         ... 0.73379885 0.77120154 0.75987018]
 ...
 [0.73462304 0.72124727 0.77225656 ... 1.         0.75498603 0.74495495]
 [0.73336735 0.72781681 0.76226259 ... 0.75498603 1.         0.74946872]
 [0.77486488 0.74794437 0.74772344 ... 0.74495495 0.74946872 1.        ]]
