In [2]:
import numba
from numba import jit, int32, prange, vectorize, float64
import numpy as np
import math

In [None]:
# Function that calculates the value of Pi using monte-carlo method
# Minimal use of Numpy
def native_python_monte_carlo(n):
    within_circle = 0
    
    for i in range(int(n)):
        x = np.random.random()
        y = np.random.random()
        
        if x**2 + y**2 <= 1.0:
            within_circle += 1
        
    return 4.0 * within_circle / n

In [44]:
%%time
print(native_python_monte_carlo(10000000))

3.1415
CPU times: user 13.9 s, sys: 35 ms, total: 14 s
Wall time: 14 s


In [None]:
# Function that calculates the value of Pi using monte-carlo method
# Leveraged numpy as far as possible

def numpy_python_monte_carlo(n):
    within_circle = 0
    
    x = np.random.random(int(n))
    y = np.random.random(int(n))
        
    within_circle = np.sum( (x**2 + y**2) <= 1.0 )
        
    return 4.0 * within_circle / n

In [None]:
%%time
print(numpy_python_monte_carlo(10000000))

### Use @jit on both functions and see what is the speedup acheived (ignore the first run)
* Native python :
* Numpy :

* Natve python with Numba :
* Numpy wuth Numba: 

In [42]:
@jit
def native_python_monte_carlo(n):
    within_circle = 0
    
    for i in range(int(n)):
        x = np.random.random()
        y = np.random.random()
        
        if x**2 + y**2 <= 1.0:
           within_circle += 1
        
    return 4.0 * within_circle / n



In [None]:
@jit
def numpy_python_monte_carlo(n):
    within_circle = 0
    
    x = np.random.random(int(n))
    y = np.random.random(int(n))
        
    within_circle = np.sum( (x**2 + y**2) <= 1.0 )
        
    return 4.0 * within_circle / n

### What is the difference If we use eager compilation (ignore first run)?

* Natve python with Numba (Lazy):
* Natve python with Numba (Eager):

In [5]:


@jit(numba.float32(int32))
def native_python_monte_carlo_eager(n):
    within_circle = 0
    
    for i in range(int(n)):
        x = np.random.random()
        y = np.random.random()
        
        if x**2 + y**2 <= 1.0:
            within_circle += 1
        
    return 4.0 * within_circle / n

CPU times: user 142 ms, sys: 6.53 ms, total: 148 ms
Wall time: 161 ms


In [8]:
%%time
print(native_python_monte_carlo_eager(10000000))

3.1419079303741455
CPU times: user 125 ms, sys: 3.77 ms, total: 129 ms
Wall time: 135 ms


In [6]:
%%time
@jit
def native_python_monte_carlo_lazy(n):
    within_circle = 0
    
    for i in range(int(n)):
        x = np.random.random()
        y = np.random.random()
        
        if x**2 + y**2 <= 1.0:
            within_circle += 1
        
    return 4.0 * within_circle / n

CPU times: user 322 µs, sys: 15 µs, total: 337 µs
Wall time: 416 µs


In [9]:
%%time
print(native_python_monte_carlo_lazy(10000000))

3.1412092
CPU times: user 208 ms, sys: 5.46 ms, total: 213 ms
Wall time: 220 ms


### What is the difference with and without automatic parallelization (ignore the first run)?
hint: use *numba.prange* instead of *range*
* With parallel:
* Without parallel:

In [20]:
@jit(parallel=True)
def native_python_monte_carlo_parallel(n):
    within_circle = 0
    
    for i in numba.prange(int(n)):
        x = np.random.random()
        y = np.random.random()
        
        if x**2 + y**2 <= 1.0:
            within_circle += 1
        
    return 4.0 * within_circle / n

In [23]:
%%time
print(native_python_monte_carlo_parallel(10000000))

3.141594
CPU times: user 221 ms, sys: 4.38 ms, total: 226 ms
Wall time: 92.5 ms


### Offload the computation to a GPU
* Assumptions:
    * N <= 512
* Hints:
    * Launch one block with threads <= 512

In [29]:
@cuda.jit
def native_python_monte_kernel(array_a, array_b, array_c):
    # Thread id in a 1D block
    tx = cuda.threadIdx.x
 
    if tx < an_array.size:  # Check array boundaries
        array_c[tx] = 1 if array_a[tx]**2 + array_b[tx]**2 <= 1.0  else 0

NameError: name 'cuda' is not defined

In [30]:
n = 512
array_a = np.random.random(n)
array_b = np.random.random(n)
array_c = np.zeros(n)


native_python_monte_kernel[1, n](array_a, array_b, array_c)
print(4.0 * np.sum(array_c) / n)

NameError: name 'native_python_monte_kernel' is not defined