In [1]:
# Just to know last time this was run:
import time
print time.ctime()

Wed Oct 31 17:17:43 2018


# I Calling Fortran from Python

This is part of the Python lecture given by Christophe Morisset at IA-UNAM. More informations at: http://python-astro.blogspot.mx/

In [2]:
import numpy as np

The following is part of this excellent web page: http://nbviewer.ipython.org/github/jrjohansson/scientific-python-lectures/blob/master/Lecture-6A-Fortran-and-C.ipynb

In [3]:
# simple python algorithm: example of a SLOW implementation
# Why? Because the loop is implemented in python.
def py_dcumsum(a):
    b = np.empty_like(a)
    b[0] = a[0]
    for n in range(1,len(a)):
        b[n] = b[n-1]+a[n]
    return b

In [4]:
# The numpy version of the cumsum
def numpy_cumsum(a):
    return np.cumsum(a)

We write here a fortran function with some special code to interact with python

In [6]:
%%writefile dcumsum.f
c File dcumsum.f
       subroutine dcumsum(a, b, n)
       double precision a(n)
       double precision b(n)
       integer n
cf2py  intent(in) :: a
cf2py  intent(out) :: b
cf2py  intent(hide) :: n

       b(1) = a(1)
       do 100 i=2, n
           b(i) = b(i-1) + a(i)
100    continue
       end

Overwriting dcumsum.f


In [7]:
# Compiling. On my OSX, gfortran is used
!f2py --f77exec=gfortran -c dcumsum.f -m dcumsum

[39mrunning build[0m
[39mrunning config_cc[0m
[39munifing config_cc, config, build_clib, build_ext, build commands --compiler options[0m
[39mrunning config_fc[0m
[39munifing config_fc, config, build_clib, build_ext, build commands --fcompiler options[0m
[39mrunning build_src[0m
[39mbuild_src[0m
[39mbuilding extension "dcumsum" sources[0m
[39mf2py options: [][0m
[39mf2py:> /var/folders/bb/jg97y_ln7cn8wbgbl8zs8rvr0000gn/T/tmpcaefpp/src.macosx-10.5-x86_64-2.7/dcumsummodule.c[0m
[39mcreating /var/folders/bb/jg97y_ln7cn8wbgbl8zs8rvr0000gn/T/tmpcaefpp/src.macosx-10.5-x86_64-2.7[0m
Reading fortran codes...
	Reading file 'dcumsum.f' (format:fix,strict)
Post-processing...
	Block: dcumsum
			Block: dcumsum
Post-processing (stage 2)...
Building modules...
	Building module "dcumsum"...
		Constructing wrapper function "dcumsum"...
		  b = dcumsum(a)
	Wrote C/API module "dcumsum" to file "/var/folders/bb/jg97y_ln7cn8wbgbl8zs8rvr0000gn/T/tmpcaefpp/src.macosx-10.5-x86_64-2.7/dcum

In [8]:
# Importing the function as if it where a python package
import dcumsum

In [10]:
a = np.linspace(10,100, 1000)

In [11]:
%timeit py_dcumsum(a)

1000 loops, best of 3: 278 µs per loop


In [12]:
%timeit numpy_cumsum(a)

The slowest run took 9.87 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 3.86 µs per loop


In [13]:
%timeit a.cumsum()

The slowest run took 28.97 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 3.31 µs per loop


In [14]:
%timeit dcumsum.dcumsum(a)

The slowest run took 10.30 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 1.25 µs per loop


The Fortran call is still 2 times faster than the numpy object method, and 10 times faster than the loop.

### cython

In [3]:
# Integration of a function by summing values
def f(x):    
   return x**2 - x
def integrate_f(a, b, N):
    s  = 0    
    dx = float(b - a) / N
    for i in range(N):   
       s += f(a + i*dx)
    return s*dx

In [4]:
# To allow the use of %%cython
%load_ext Cython

In [5]:
%%cython
cdef double cy_f(x):    
   return x**2 - x
def cy_integrate_f(double a, double b, int N):
    cdef int i
    cdef double s, dx
    s  = 0
    dx = (b - a) / N
    for i in range(N):
        s += cy_f(a + i*dx)
    return s*dx

In [26]:
%timeit integrate_f(0,3,10**3)

1000 loops, best of 3: 269 µs per loop


In [27]:
# Really faster!!!
%timeit cy_integrate_f(0,3,10**3)

10000 loops, best of 3: 65.8 µs per loop


In [28]:
# Same values are obtain (hopefully!)
print integrate_f(0,3,10**3), cy_integrate_f(0,3,10**3)

4.4910045 4.4910045


In [29]:
a = 0.
b = 3.
N= 1e3
xs = np.linspace(a, b, N+1)



In [30]:
%timeit s = f(xs).sum()

The slowest run took 66.38 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 5.35 µs per loop


Let's now compare when doing havy matrix operations, taken from http://technicaldiscovery.blogspot.mx/2011/06/speeding-up-python-numpy-cython-and.html

In [31]:
dx = 0.1
dy = 0.1
dx2 = dx*dx
dy2 = dy*dy

# The looping way
def py_update(u):
    nx, ny = u.shape
    for i in xrange(1,nx-1):
        for j in xrange(1, ny-1):
            u[i,j] = ((u[i+1, j] + u[i-1, j]) * dy2 +
                      (u[i, j+1] + u[i, j-1]) * dx2) / (2*(dx2+dy2))

def calc(N, Niter=100, func=py_update, args=()):
    u = np.zeros([N, N])
    u[0] = 1
    for i in range(Niter):
        func(u,*args)
    return u

In [32]:
%timeit calc(20)

10 loops, best of 3: 43.1 ms per loop


In [33]:
# The numpy way
def num_update(u):
    u[1:-1,1:-1] = ((u[2:,1:-1]+u[:-2,1:-1])*dy2 + 
                    (u[1:-1,2:] + u[1:-1,:-2])*dx2) / (2*(dx2+dy2))

In [34]:
%timeit calc(20, func=num_update)

1000 loops, best of 3: 1.17 ms per loop


In [35]:
%%cython
cimport numpy as np

def cy_update(np.ndarray[double, ndim=2] u, double dx2, double dy2):
    cdef unsigned int i, j
    for i in xrange(1,u.shape[0]-1):
        for j in xrange(1, u.shape[1]-1):
            u[i,j] = ((u[i+1, j] + u[i-1, j]) * dy2 +
                      (u[i, j+1] + u[i, j-1]) * dx2) / (2*(dx2+dy2))


In [36]:
%timeit calc(20, func=cy_update, args=(dx2, dy2))

1000 loops, best of 3: 329 µs per loop
