In [1]:
v = [.5, .75, 1., 1.5, 2.0]
v

[0.5, 0.75, 1.0, 1.5, 2.0]

In [2]:
m = [v, v, v]
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [3]:
m[1]

[0.5, 0.75, 1.0, 1.5, 2.0]

In [4]:
m[1][0]

0.5

In [5]:
v1 = [.5, 1.5]
v2 = [1, 2]

In [6]:
m = [v1, v2]
c = [m, m]
c

[[[0.5, 1.5], [1, 2]], [[0.5, 1.5], [1, 2]]]

In [7]:
c[1]

[[0.5, 1.5], [1, 2]]

In [8]:
c[1][1]

[1, 2]

In [9]:
c[1][1][0]

1

Note that combining objects in the way just presented generally works with reference
pointers to the original objects. What does that mean in practice? Have a look at the
following operations:

In [10]:
m = [v, v, v]
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [11]:
v[0] = 'Python'
m

[['Python', 0.75, 1.0, 1.5, 2.0],
 ['Python', 0.75, 1.0, 1.5, 2.0],
 ['Python', 0.75, 1.0, 1.5, 2.0]]

This can be avoided by using the deepcopy() function of the copy module:

In [12]:
from copy import deepcopy

In [13]:
v = [.5, .75, 1., 1.5, 2.0]
# Build every row from its own deepcopy() call. Writing `3 * [deepcopy(v)]`
# would copy v only once and then repeat that single reference three times,
# so the rows would be independent of v but still alias each other.
m = [deepcopy(v) for _ in range(3)]
m
# Instead of reference pointers, physical copies are used: each row is an
# independent list, so changing v (or any one row) leaves the others untouched.

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [14]:
v[0] = 'Python'
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [15]:
v = [.5, .75, 1., 1.5, 2.0]

In [16]:
import array

In [17]:
a = array.array('f', v)
a

array('f', [0.5, 0.75, 1.0, 1.5, 2.0])

In [18]:
a.append(.5)
a

array('f', [0.5, 0.75, 1.0, 1.5, 2.0, 0.5])

In [19]:
a.extend([5.0, 6.75])
a

array('f', [0.5, 0.75, 1.0, 1.5, 2.0, 0.5, 5.0, 6.75])

In [20]:
2 * a

array('f', [0.5, 0.75, 1.0, 1.5, 2.0, 0.5, 5.0, 6.75, 0.5, 0.75, 1.0, 1.5, 2.0, 0.5, 5.0, 6.75])

In [21]:
a.append('string')

TypeError: must be real number, not str

In [None]:
a.tolist()

In [None]:
# Manual open/close variant: write the raw contents of the array `a` to a
# binary file. If tofile() raised, close() would never run and the handle
# would stay open; the next cell shows the same write using a with block.
f = open('array.apy', 'wb')
a.tofile(f)
f.close()

In [None]:
with open('array.apy', 'wb') as f:
    a.tofile(f)

In [None]:
%ls -n arr*

In [None]:
b = array.array('f')
with open('array.apy', 'rb') as f:
    b.fromfile(f, 5)
b

In [None]:
# Read the file back into an array with type code 'd', although the data was
# written from an array created with type code 'f'.
b = array.array('d')
with open('array.apy', 'rb') as f:
    b.fromfile(f, 2)
b
# The difference in type codes means the stored bytes are reinterpreted,
# which leads to "wrong" numbers.

In [22]:
import numpy as np

In [23]:
a = np.array(['a', 'b', 'c'])
a

array(['a', 'b', 'c'], dtype='<U1')

In [24]:
a = np.array([0, .5, 1.0, 1.5, 2.0])
a

array([0. , 0.5, 1. , 1.5, 2. ])

In [25]:
type(a)

numpy.ndarray

In [26]:
a = np.arange(2, 20, 2)
a

array([ 2,  4,  6,  8, 10, 12, 14, 16, 18])

In [27]:
a = np.arange(8, dtype=np.float64)
a

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [28]:
a[5: ]

array([5., 6., 7.])

In [29]:
a[: 2]

array([0., 1.])

In [30]:
a.sum()

28.0

In [31]:
a.std()

2.29128784747792

In [32]:
a.cumsum()

array([ 0.,  1.,  3.,  6., 10., 15., 21., 28.])

In [33]:
l = [0, .5, 1.5, 3, 5]

In [34]:
l * 2

[0, 0.5, 1.5, 3, 5, 0, 0.5, 1.5, 3, 5]

In [35]:
a

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [36]:
a * 2

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [37]:
a ** 2

array([ 0.,  1.,  4.,  9., 16., 25., 36., 49.])

In [38]:
a * a

array([ 0.,  1.,  4.,  9., 16., 25., 36., 49.])

In [39]:
np.exp(a)

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03])

In [40]:
np.sqrt(a)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131])

In [41]:
np.sqrt(2.5)

1.5811388300841898

In [42]:
import math

In [43]:
math.sqrt(2.5)

1.5811388300841898

In [44]:
math.sqrt(a)

TypeError: only size-1 arrays can be converted to Python scalars

In [45]:
%timeit np.sqrt(2.5)

802 ns ± 45.6 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [46]:
%timeit math.sqrt(2.5)

119 ns ± 7.42 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [47]:
b = np.array([a, a * 2])
b

array([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
       [ 0.,  2.,  4.,  6.,  8., 10., 12., 14.]])

In [48]:
b[0]

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [49]:
b[0, 2]

2.0

In [50]:
b[:, 1]

array([1., 2.])

In [51]:
b.sum()

84.0

In [52]:
b.sum(axis=0)

array([ 0.,  3.,  6.,  9., 12., 15., 18., 21.])

In [53]:
b.sum(axis=1)

array([28., 56.])

In [54]:
c = np.zeros((2, 3), dtype='i', order='C')
c

array([[0, 0, 0],
       [0, 0, 0]], dtype=int32)

In [55]:
c = np.ones((2, 3, 4), dtype='i', order='C')
c

array([[[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]],

       [[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]]], dtype=int32)

In [56]:
d = np.zeros_like(c, dtype='f', order='C')
d

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]], dtype=float32)

In [59]:
d = np.ones_like(c, dtype='f', order='C')
d

array([[[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]],

       [[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]], dtype=float32)

In [71]:
# Creates an ndarray object not prepopulated with anything (numbers depend on
# the bits present in the memory).
e = np.empty((2, 3, 2))
e

array([[[0.0078125, 0.0078125],
        [0.0078125, 0.0078125],
        [0.0078125, 0.0078125]],

       [[0.0078125, 0.0078125],
        [0.0078125, 0.0078125],
        [0.0078125, 0.0078125]]])

In [72]:
f = np.empty_like(c)
f

array([[[1065353216, 1065353216, 1065353216, 1065353216],
        [1065353216, 1065353216, 1065353216, 1065353216],
        [1065353216, 1065353216, 1065353216, 1065353216]],

       [[1065353216, 1065353216, 1065353216, 1065353216],
        [1065353216, 1065353216, 1065353216, 1065353216],
        [1065353216, 1065353216, 1065353216, 1065353216]]], dtype=int32)

In [73]:
np.eye(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [79]:
g = np.linspace(5, 15, 12)
g

array([ 5.        ,  5.90909091,  6.81818182,  7.72727273,  8.63636364,
        9.54545455, 10.45454545, 11.36363636, 12.27272727, 13.18181818,
       14.09090909, 15.        ])

## Metainformation

In [81]:
g.size, g.itemsize, g.ndim, g.shape, g.dtype, g.nbytes
# itemsize: the number of bytes used to represent one element.
# nbytes: the total number of bytes used in memory.

(12, 8, 1, (12,), dtype('float64'), 96)

In [82]:
g = np.arange(15)
g

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [83]:
g.shape

(15,)

In [84]:
np.shape(g)

(15,)

While reshaping in general just provides another view on the same data, resizing in general creates a new (temporary) object.

In [86]:
# While reshaping in general just provides another view on the same data, 
# resizing in general creates a new (temporary) object.
g.reshape((3, 5))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [90]:
h = g.reshape((5, 3))
h

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [88]:
h.T

array([[ 0,  3,  6,  9, 12],
       [ 1,  4,  7, 10, 13],
       [ 2,  5,  8, 11, 14]])

In [89]:
h.transpose()

array([[ 0,  3,  6,  9, 12],
       [ 1,  4,  7, 10, 13],
       [ 2,  5,  8, 11, 14]])

In [91]:
g

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [92]:
np.resize(g, (3, 1))

array([[0],
       [1],
       [2]])

In [94]:
np.resize(g, (1, 5))

array([[0, 1, 2, 3, 4]])

In [95]:
np.resize(g, (2, 5))

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [96]:
n = np.resize(g, (5, 4))
n

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14,  0],
       [ 1,  2,  3,  4]])

In [97]:
h

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [102]:
np.stack((h, 2 * h))

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11],
        [12, 13, 14]],

       [[ 0,  2,  4],
        [ 6,  8, 10],
        [12, 14, 16],
        [18, 20, 22],
        [24, 26, 28]]])

In [100]:
np.hstack((h, 2 * h))

array([[ 0,  1,  2,  0,  2,  4],
       [ 3,  4,  5,  6,  8, 10],
       [ 6,  7,  8, 12, 14, 16],
       [ 9, 10, 11, 18, 20, 22],
       [12, 13, 14, 24, 26, 28]])

In [101]:
np.vstack((h, 2 * h))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14],
       [ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16],
       [18, 20, 22],
       [24, 26, 28]])

In [103]:
h

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [104]:
h.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [105]:
h.flatten(order='C')

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [106]:
h.flatten(order='F')

array([ 0,  3,  6,  9, 12,  1,  4,  7, 10, 13,  2,  5,  8, 11, 14])

In [107]:
for i in h.flat:
    print(i, end=',')

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,

In [108]:
# The ravel() method is an alternative to flatten().
for i in h.ravel(order='C'):
    print(i, end=',')

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,

In [109]:
for i in h.ravel(order='F'):
    print(i, end=',')

0,3,6,9,12,1,4,7,10,13,2,5,8,11,14,

## Boolean Arrays

In [113]:
h

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [114]:
h > 8

array([[False, False, False],
       [False, False, False],
       [False, False, False],
       [ True,  True,  True],
       [ True,  True,  True]])

In [115]:
h <= 7

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True, False],
       [False, False, False],
       [False, False, False]])

In [116]:
h == 5

array([[False, False, False],
       [False, False,  True],
       [False, False, False],
       [False, False, False],
       [False, False, False]])

In [117]:
(h == 5).astype(int)

array([[0, 0, 0],
       [0, 0, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [118]:
(h > 4) & (h <= 12)

array([[False, False, False],
       [False, False,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True, False, False]])

In [119]:
h[h > 8]

array([ 9, 10, 11, 12, 13, 14])

In [120]:
h[(h > 4) & (h <= 12)]

array([ 5,  6,  7,  8,  9, 10, 11, 12])

In [123]:
h[(h < 4) | (h >= 12)]

array([ 0,  1,  2,  3, 12, 13, 14])

A powerful tool in this regard is the np.where() function, which allows the definition of actions/operations depending on whether a condition is True or False. The result of applying np.where() is a new ndarray object of the same shape as the original one:

In [126]:
np.where(h > 7, 1, 0)

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 1],
       [1, 1, 1],
       [1, 1, 1]])

In [127]:
np.where(h % 2 == 0, 'even', 'odd')

array([['even', 'odd', 'even'],
       ['odd', 'even', 'odd'],
       ['even', 'odd', 'even'],
       ['odd', 'even', 'odd'],
       ['even', 'odd', 'even']], dtype='<U4')

In [129]:
np.where(h <= 7, h * 2, h / 2)

array([[ 0. ,  2. ,  4. ],
       [ 6. ,  8. , 10. ],
       [12. , 14. ,  4. ],
       [ 4.5,  5. ,  5.5],
       [ 6. ,  6.5,  7. ]])

## Speed Comparison

In [130]:
import random
I = 5000

In [131]:
%time mat = [[random.gauss(0, 1) for j in range(I)] for i in range(I)]

CPU times: total: 13.6 s
Wall time: 13.6 s


In [133]:
mat[0][: 5]

[-0.03192545935952081,
 0.25585002011720787,
 -0.9946192956138111,
 -1.678106408420488,
 1.063769867691863]

In [135]:
%time sum([sum(l) for l in mat])

CPU times: total: 219 ms
Wall time: 236 ms


-4699.621014436749

In [136]:
import sys

In [137]:
# Add up the size reported by sys.getsizeof() for every row list in mat.
sum(sys.getsizeof(row) for row in mat)

209400000

In [142]:
%time mat = np.random.standard_normal((I, I))

CPU times: total: 453 ms
Wall time: 463 ms


In [143]:
%time mat.sum()

CPU times: total: 31.2 ms
Wall time: 40 ms


-5141.296974701469

In [144]:
mat.nbytes

200000000

In [145]:
sys.getsizeof(mat)

200000128

## Structured NumPy Arrays

The specialization of the ndarray class obviously brings a number of valuable benefits with it. However, too narrow a specialization might turn out to be too large a burden to carry for the majority of array-based algorithms and applications. Therefore, NumPy provides structured ndarray and record recarray objects that allow you to have a different dtype per column.

In a sense, this construction comes quite close to the operation for initializing tables in a SQL database: one has column names and column data types, with maybe some additional information (e.g., maximum number of characters per str object).

In [146]:
dt = np.dtype([('Name', 'S10'), ('Age', 'i4'), ('Height', 'f'), ('Children/Pets', 'i4', 2)])
dt

dtype([('Name', 'S10'), ('Age', '<i4'), ('Height', '<f4'), ('Children/Pets', '<i4', (2,))])

In [152]:
# Dictionary form of the dtype definition: field names and their formats are
# given as two parallel lists, one entry per column.
field_spec = dict(
    names=['Name', 'Age', 'Height', 'Children/Pets'],
    formats=['O', 'int', 'float', 'int,int'],
)
dt = np.dtype(field_spec)
dt

dtype([('Name', 'O'), ('Age', '<i4'), ('Height', '<f8'), ('Children/Pets', [('f0', '<i4'), ('f1', '<i4')])])

In [154]:
# One tuple per record, with values in the field order of dt; the last entry
# is itself a pair that fills the two-integer 'Children/Pets' field.
s = np.array(
    [
        ('Smith', 45, 1.83, (0, 1)),
        ('Jones', 53, 1.72, (2, 2)),
    ],
    dtype=dt,
)
s

array([('Smith', 45, 1.83, (0, 1)), ('Jones', 53, 1.72, (2, 2))],
      dtype=[('Name', 'O'), ('Age', '<i4'), ('Height', '<f8'), ('Children/Pets', [('f0', '<i4'), ('f1', '<i4')])])

The single columns can now be easily accessed by their names and the rows by their index values:

In [155]:
s['Name']

array(['Smith', 'Jones'], dtype=object)

In [156]:
s['Height'].mean()

1.775

In [157]:
s[0]

('Smith', 45, 1.83, (0, 1))

In [158]:
s[1]['Age']

53

In summary, structured arrays are a generalization of the regular ndarray object type in that the data type only has to be the same per column, like in tables in SQL databases. One advantage of structured arrays is that a single element of a column can be another multidimensional object and does not have to conform to the basic NumPy data types.

They bring SQL table–like data structures to Python, with most of the benefits of regular ndarray objects (syntax, methods, performance).

# Vectorization of Code
Vectorization is a strategy to get more compact code that is possibly executed faster. The fundamental idea is to conduct an operation on or to apply a function to a complex object “at once” and not by looping over the single elements of the object. In Python, functional programming tools such as map() and filter() provide some basic means for vectorization. However, NumPy has vectorization built in deep down in its core.

## Basic Vectorization

In [159]:
np.random.seed(39)

In [161]:
r = np.arange(12).reshape((4, 3))
s = np.arange(12).reshape((4, 3)) * .5

In [162]:
r

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [163]:
s

array([[0. , 0.5, 1. ],
       [1.5, 2. , 2.5],
       [3. , 3.5, 4. ],
       [4.5, 5. , 5.5]])

In [164]:
r + s

array([[ 0. ,  1.5,  3. ],
       [ 4.5,  6. ,  7.5],
       [ 9. , 10.5, 12. ],
       [13.5, 15. , 16.5]])

In [165]:
r + 3

array([[ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [166]:
2 * r

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16],
       [18, 20, 22]])

In [167]:
2 * r + 3

array([[ 3,  5,  7],
       [ 9, 11, 13],
       [15, 17, 19],
       [21, 23, 25]])

In [168]:
r, r.shape

(array([[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]]),
 (4, 3))

In [170]:
s = np.arange(0, 12, 4)
s

array([0, 4, 8])

In [171]:
r + s

array([[ 0,  5, 10],
       [ 3,  8, 13],
       [ 6, 11, 16],
       [ 9, 14, 19]])

In [172]:
s = np.arange(0, 12, 3)
s

array([0, 3, 6, 9])

In [173]:
r + s

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [174]:
r.transpose() + s

array([[ 0,  6, 12, 18],
       [ 1,  7, 13, 19],
       [ 2,  8, 14, 20]])

In [177]:
sr = s.reshape(-1, 1)
sr, sr.shape

(array([[0],
        [3],
        [6],
        [9]]),
 (4, 1))

In [178]:
r + s.reshape(-1, 1)

array([[ 0,  1,  2],
       [ 6,  7,  8],
       [12, 13, 14],
       [18, 19, 20]])

In [195]:
def f(x):
    """Return the value of the linear function 3 * x + 5.

    Only multiplication and addition are applied to x, so the same
    definition works for a plain number and, element-wise, for an ndarray.
    """
    scaled = 3 * x
    return scaled + 5

In [196]:
f(0.5)

6.5

In [197]:
f(r)

array([[ 5,  8, 11],
       [14, 17, 20],
       [23, 26, 29],
       [32, 35, 38]])

## Memory Layout

In [217]:
x = np.random.standard_normal((1000000, 5))

In [218]:
y = 2 * x + 3

In [219]:
C = np.array((x, y), order='C')

In [220]:
F = np.array((x, y), order='F')

In [221]:
# Rebind both names to a scalar so they no longer reference the large arrays
# created in the cells above.
x = y = 0.0

In [222]:
C[:2].round(2)

array([[[-0.89,  0.62, -1.83,  0.88, -0.9 ],
        [-1.66,  0.11, -0.32,  0.34,  0.03],
        [-0.56, -0.22, -0.95,  1.2 ,  0.18],
        ...,
        [-1.61, -1.13, -1.04,  0.49, -1.74],
        [-0.69, -0.69, -1.99, -1.04,  0.84],
        [ 0.35, -0.91,  0.5 , -0.6 , -0.74]],

       [[ 1.22,  4.23, -0.65,  4.75,  1.19],
        [-0.32,  3.22,  2.36,  3.68,  3.05],
        [ 1.87,  2.57,  1.11,  5.4 ,  3.37],
        ...,
        [-0.22,  0.73,  0.92,  3.97, -0.48],
        [ 1.63,  1.62, -0.99,  0.92,  4.68],
        [ 3.71,  1.18,  4.  ,  1.79,  1.52]]])

In [223]:
%timeit C.sum()

10.2 ms ± 255 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [224]:
%timeit F.sum()

10.3 ms ± 94.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [225]:
%timeit C.sum(axis=0)

20.1 ms ± 776 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [226]:
%timeit C.sum(axis=1)

32.7 ms ± 319 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [227]:
%timeit F.sum(axis=0)

54.1 ms ± 317 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [228]:
%timeit F.sum(axis=1)

73.7 ms ± 733 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [229]:
# Rebind both names to a scalar so they no longer reference the two large
# ndarray objects used for the timing comparison.
F = C = 0.0

We can summarize the performance results as follows:
- When calculating the sum of all elements, the memory layout does not really matter.
- The summing up over the C-ordered ndarray objects is faster both over rows and over columns (an absolute speed advantage).
- With the C-ordered (row-major) ndarray object, summing up over rows is relatively faster compared to summing up over columns.
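- With the F-ordered (column-major) ndarray object, the timings above do not show the mirrored pattern: summing over `axis=0` is still faster than summing over `axis=1`, although the relative gap between the two is smaller than for the C-ordered object.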