# High-performance computing in Python

In [5]:
from functools import partial
import multiprocessing as mp
import itertools
import sys
import os

sys.path.insert(0, "../02-sensitivity-analysis/python")
sys.path.insert(0, "src")

import matplotlib.pyplot as plt
from numpy import f2py
import numpy as np

from ishigami import compute_simulation_total_effect
from ishigami import compute_simulation_main_effect
from ishigami import evaluate_ishigami_readable
from ishigami import evaluate_ishigami

from auxiliary import evaluate_ishigami_readable_loop
from auxiliary import evaluate_ishigami_numba_loop
from auxiliary import task_mp_no_communication
from auxiliary import task_mp_management
from auxiliary import task_mp_queue
from functools import partial




ModuleNotFoundError: No module named 'ishigami_f2py'

We can compare the implementation based on a for loop with its vectorized counterpart for a set of random input parameters.

In [None]:
# Seed NumPy's global generator so that the sampled inputs -- and every
# comparison below that uses them -- are identical on each fresh run.
np.random.seed(123)

# Draw `num_draws` points with three coordinates each, sampled uniformly
# from the interval [-pi, pi).
num_draws = 1000
inputs = np.random.uniform(low=-np.pi, high=np.pi, size=(num_draws, 3))

## Pure Python and performant scientific libraries

In [None]:
np.testing.assert_almost_equal(evaluate_ishigami_readable_loop(inputs), evaluate_ishigami(inputs))

%timeit evaluate_ishigami_readable_loop(inputs)
%timeit evaluate_ishigami(inputs)

## Compilation

* just-in-time 

In [10]:
np.testing.assert_almost_equal(evaluate_ishigami_numba_loop(inputs), evaluate_ishigami(inputs))

%timeit evaluate_ishigami_readable_loop(inputs)
%timeit evaluate_ishigami_numba_loop(inputs)

7.44 ms ± 492 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
41.8 µs ± 321 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


* ahead-of-time

In [7]:
# Read the Fortran source as bytes. The context manager closes the file
# handle right away instead of leaving that to the garbage collector.
with open('src/ishigami.f90', 'rb') as fortran_file:
    src = fortran_file.read()

# Build the extension module `ishigami_f2py` from the source. The call is the
# last expression of the cell so that its return code is displayed: 0 signals
# a successful build, any other value means the compilation failed and the
# module cannot be imported afterwards.
f2py.compile(src, 'ishigami_f2py', "", extension='.f90')
#assert f2py.compile(src, 'ishigami_f2py', "", extension='.f90') == 0




1

In [23]:
from auxiliary import evaluate_ishigami_f2py_loop

In [24]:
np.testing.assert_almost_equal(evaluate_ishigami_f2py_loop(inputs), evaluate_ishigami(inputs))

%timeit evaluate_ishigami_readable_loop(inputs)
%timeit evaluate_ishigami_f2py_loop(inputs)

7.39 ms ± 621 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
32.3 µs ± 727 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Parallel processing

We first want to get a sense of how many CPUs we have available.

In [None]:
# Report the CPU count as seen by `multiprocessing`.
# NOTE(review): per the Python documentation this is the number of CPUs in the
# system, which may be larger than the number the current process is allowed
# to use -- confirm whether that distinction matters on the target machine.
print(f"Number of cpu : {mp.cpu_count()}")

### mp.Process

* without communication

In [None]:
# Both size arguments are fixed at 100 and bound up front; the one remaining
# positional argument is supplied later, separately for each process.
num_outer, num_inner = 100, 100
task_partial = partial(task_mp_no_communication, num_outer, num_inner)

In [None]:
# Prepare one process per task index; nothing runs until start() is called.
processes = []
for which in range(3):
    processes.append(mp.Process(target=task_partial, args=(which, )))

# Launch all processes first ...
for p in processes:
    p.start()

# ... and then block until every one of them has finished.
for p in processes:
    p.join()

* with communication

In [None]:
# We need to prepare a container for communication between the processes.
# Here we use a queue, which the parent reads from once the tasks have been
# started; one alternative would be to define a shared variable.
qout = mp.Queue()

# We can partial out most function arguments, including the queue, so that
# only the task index remains to be supplied for each process.
task_partial = partial(task_mp_queue, num_outer, num_inner, qout)

In [None]:
# We can prepare our work.
processes = []
for which in range(3):
    p = mp.Process(target=task_partial, args=(which, ))
    processes.append(p)

# We can execute our work.
for p in processes:
    p.start()

# Collect the results *before* joining. A child process that has put items on
# a queue may not terminate until those items have been consumed, so joining
# first can deadlock as soon as the results are large enough.
# This assumes, as the original code did, that each task puts exactly one
# item on the queue.
unsorted_result = [qout.get() for _ in processes]

# With the queue drained, waiting for the processes to exit is safe.
for p in processes:
    p.join()

# The processes finish in arbitrary order, so we sort the collected items to
# ensure a unique order of the results and then keep the second element of
# each item (presumably (task index, value) tuples -- verify against the task).
result = [t[1] for t in sorted(unsorted_result)]
print(result)

### mp.Pool

In [None]:
# Bind the two size arguments; the pool supplies the task index.
task_partial = partial(task_mp_no_communication, num_outer, num_inner)

# The context manager makes sure the worker processes are cleaned up even if
# one of the tasks raises and `map` re-raises that exception in the parent.
with mp.Pool(processes=3) as pool:
    # `map` blocks until all three tasks have completed.
    pool.map(task_partial, range(3))

    # On the normal path we still shut the pool down gracefully: stop
    # accepting new work and wait for the workers to exit.
    pool.close()
    pool.join()

In [None]:
# Bind the two size arguments; the pool supplies one task description per call.
task_partial = partial(task_mp_management, num_outer, num_inner)

# Every combination of a label ("main" or "total") and an index 0, 1, 2,
# i.e. six tasks that are distributed across the three workers.
tasks = list(itertools.product(["main", "total"], range(3)))

# The context manager makes sure the worker processes are cleaned up even if
# one of the tasks raises and `map` re-raises that exception in the parent.
with mp.Pool(processes=3) as pool:
    # `map` blocks until all tasks are done and returns the results in the
    # same order as `tasks`.
    rslt = pool.map(task_partial, tasks)

    # On the normal path we still shut the pool down gracefully: stop
    # accepting new work and wait for the workers to exit.
    pool.close()
    pool.join()

## Distributed computing