In [1]:
import numpy as np

### Array-Oriented Programming with Arrays

Vectorization: using concise array expressions instead of writing explicit loops.

In [3]:
# Goal: evaluate the function sqrt(x^2 + y^2) across a regular grid of values.
# Build the shared 1D axis first: 1000 equally spaced samples covering [-5, 5).
points = np.arange(start=-5, stop=5, step=0.01)

In [5]:
# np.meshgrid expands two 1D arrays into a pair of 2D matrices that together
# enumerate every (x, y) pair drawn from the two inputs ("xy" is the default
# indexing mode, spelled out here for clarity).
xs, ys = np.meshgrid(points, points, indexing="xy")

In [6]:
ys

array([[-5.  , -5.  , -5.  , ..., -5.  , -5.  , -5.  ],
       [-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99],
       [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98],
       ..., 
       [ 4.97,  4.97,  4.97, ...,  4.97,  4.97,  4.97],
       [ 4.98,  4.98,  4.98, ...,  4.98,  4.98,  4.98],
       [ 4.99,  4.99,  4.99, ...,  4.99,  4.99,  4.99]])

In [7]:
# With the two coordinate grids in hand, the function can be written as one
# vectorized expression -- no explicit Python loop over the grid is needed
z = np.sqrt(xs**2+ys**2)

In [8]:
z

array([[ 7.07106781,  7.06400028,  7.05693985, ...,  7.04988652,
         7.05693985,  7.06400028],
       [ 7.06400028,  7.05692568,  7.04985815, ...,  7.04279774,
         7.04985815,  7.05692568],
       [ 7.05693985,  7.04985815,  7.04278354, ...,  7.03571603,
         7.04278354,  7.04985815],
       ..., 
       [ 7.04988652,  7.04279774,  7.03571603, ...,  7.0286414 ,
         7.03571603,  7.04279774],
       [ 7.05693985,  7.04985815,  7.04278354, ...,  7.03571603,
         7.04278354,  7.04985815],
       [ 7.06400028,  7.05692568,  7.04985815, ...,  7.04279774,
         7.04985815,  7.05692568]])

#### Expressing Conditional Logic as Array Operations

The numpy.where function is a vectorized version of the ternary expression `x if condition else y`.

In [9]:
# Demo inputs: one boolean mask and two equally long arrays of values.
cond = np.array([True, False, True, True, False])
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])

In [10]:
# Goal: take the value from xarr wherever the matching entry of cond is True,
# and the value from yarr otherwise.
# Pure-Python version: walk the three arrays in lockstep and choose per element.
result = [
    (x_val if flag else y_val)
    for x_val, y_val, flag in zip(xarr, yarr, cond)
]

In [11]:
result

[1.1000000000000001, 2.2000000000000002, 1.3, 1.3999999999999999, 2.5]

In [12]:
# With np.where you can write this very concisely, as a single vectorized call.
# The second and third arguments to np.where don't need to be arrays;
# one or both of them can be scalars.
result = np.where(cond, xarr, yarr)

In [13]:
result

array([ 1.1,  2.2,  1.3,  1.4,  2.5])

A typical use of where in data analysis is to produce a new array of values based on another array

In [14]:
# Example task: given a matrix of randomly generated data, replace all positive
# values with 2 and all negative values with -2.
# Start by drawing (and displaying) a 4x4 sample; the values differ on every
# run because no random seed is set.
arr = np.random.randn(4, 4)
arr

array([[-0.96071252,  0.99611295, -0.2748049 , -0.19456268],
       [-0.96254158, -0.79940964, -1.58482718,  0.84283093],
       [ 1.00612867, -0.7377589 , -0.20834224, -1.05964486],
       [ 1.52067195, -0.56823106,  0.05892374,  0.69970932]])

In [15]:
# entries where arr > 0 become 2; every other entry (negative or zero) becomes -2
np.where(arr>0, 2, -2)

array([[-2,  2, -2, -2],
       [-2, -2, -2,  2],
       [ 2, -2, -2, -2],
       [ 2, -2,  2,  2]])

In [16]:
np.where(arr>0, 2, arr) # set only the positive values to 2; all other entries keep their original value

array([[-0.96071252,  2.        , -0.2748049 , -0.19456268],
       [-0.96254158, -0.79940964, -1.58482718,  2.        ],
       [ 2.        , -0.7377589 , -0.20834224, -1.05964486],
       [ 2.        , -0.56823106,  2.        ,  2.        ]])

#### Mathematical and Statistical Methods

Aggregations (often called reductions) like sum, mean, and std (standard deviation) can be computed either by calling the array instance method or by using the top-level NumPy function.

In [17]:
arr = np.random.randn(5, 4)

In [18]:
arr

array([[ 0.95165954, -0.15301995,  0.75389643, -1.56297837],
       [ 0.95988329,  1.2693263 ,  0.7989305 ,  0.0711624 ],
       [ 0.54987067, -0.93577098, -0.46680358,  1.06199422],
       [-1.20256258, -1.76577738, -0.97211552, -0.58623694],
       [-0.50955231,  0.09720992,  0.71041818, -1.53686228]])

In [21]:
arr.mean()

-0.12336642250495888

In [22]:
np.mean(arr)

-0.12336642250495888

Functions like mean and sum take an optional axis argument that computes the statistic over the given axis, resulting in an array with one fewer dimension

In [25]:
arr.mean(axis=1) # compute mean across the columns (axis 1): one mean per row, so 5 values for this 5x4 array

array([-0.00261059,  0.77482562,  0.05232258, -1.1316731 , -0.30969662])

In [26]:
arr.sum(axis=0) # compute sum down the rows (axis 0): one total per column, so 4 values for this 5x4 array

array([ 0.7492986 , -1.48803208,  0.82432601, -2.55292098])

cumsum and cumprod do not aggregate, instead producing an array of the intermediate results

In [27]:
# the integers 0 through 7 as a 1D array
arr = np.arange(8)

In [28]:
arr.cumsum()

array([ 0,  1,  3,  6, 10, 15, 21, 28], dtype=int32)

In multidimensional arrays, accumulation functions like cumsum return an array of the same size, but with the partial aggregates computed along the indicated axis according to each lower dimensional slice

In [29]:
# 3x3 integer matrix holding 0..8 in row-major order
arr = np.arange(9).reshape(3, 3)

In [30]:
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [31]:
arr.cumsum(axis=0)

array([[ 0,  1,  2],
       [ 3,  5,  7],
       [ 9, 12, 15]], dtype=int32)

In [32]:
arr.cumprod(axis=1)

array([[  0,   0,   0],
       [  3,  12,  60],
       [  6,  42, 336]], dtype=int32)

#### Methods for Boolean Arrays

sum is often used as a means of counting True values in a boolean array

In [34]:
arr = np.random.randn(100)

In [36]:
(arr>0).sum() # Number of positive values

43

The any() method tests whether one or more values in an array are True, while the all() method checks whether every value is True.

In [37]:
# any() and all() also work with non-boolean arrays, where non-zero elements
# evaluate to True; a plain boolean array is used for this example
bools = np.array([False, False, True, False])

In [38]:
bools.any()

True

In [39]:
bools.all()

False

#### Sorting

Like Python’s built-in list type, NumPy arrays can be sorted in-place with the sort method

In [41]:
arr = np.random.randn(6)

In [42]:
arr

array([-0.61473376,  0.89782678,  1.67936797, -0.00431791,  0.23816103,
        0.90301274])

In [43]:
arr.sort()

In [44]:
arr

array([-0.61473376, -0.00431791,  0.23816103,  0.89782678,  0.90301274,
        1.67936797])

We can sort each one-dimensional section of values in a multidimensional array in-place along an axis by passing the axis number to sort

In [45]:
arr = np.random.randn(5, 3)

In [46]:
arr

array([[-1.17897077, -0.78150587, -0.219813  ],
       [-0.46903511, -0.52636125, -1.07658704],
       [ 0.27929165, -0.04528121, -0.53780722],
       [-0.26520536,  1.2099513 , -0.44695186],
       [-0.08857196,  0.08825693,  1.00795253]])

In [47]:
# arr.sort(1) sorts the values within each row (along axis 1), modifying arr in-place.
# By contrast, the top-level function np.sort returns a sorted copy of an array
# instead of modifying the array in-place.
arr.sort(1)

In [48]:
arr 

array([[-1.17897077, -0.78150587, -0.219813  ],
       [-1.07658704, -0.52636125, -0.46903511],
       [-0.53780722, -0.04528121,  0.27929165],
       [-0.44695186, -0.26520536,  1.2099513 ],
       [-0.08857196,  0.08825693,  1.00795253]])

A quick-and-dirty way to compute the quantiles of an array is to sort it and select the value at a particular rank

In [49]:
large_arr = np.random.randn(1000)

In [50]:
large_arr.sort()

In [51]:
large_arr[int(0.05*len(large_arr))] # 5% quantile

-1.7186270663715184

#### Unique and Other Set Logic

A commonly used function is np.unique(), which returns the sorted unique values in an array.

In [52]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])

In [55]:
np.unique(names)

array(['Bob', 'Joe', 'Will'],
      dtype='<U4')

In [56]:
ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])

In [57]:
np.unique(ints)

array([1, 2, 3, 4])

In [59]:
# Contrast np.unique with the pure Python alternative, which produces a sorted
# Python list rather than a NumPy array
sorted(set(names))

['Bob', 'Joe', 'Will']

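Another function, np.in1d(), tests membership of the values in one array in another, returning a boolean array. (np.isin() is the recommended equivalent in newer NumPy releases; np.in1d() is deprecated as of NumPy 2.0.)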

In [64]:
values = np.array([6, 0, 0, 3, 2, 5, 6])

In [65]:
# Membership test: True wherever an element of `values` appears in [2, 3, 6].
# np.isin is the supported replacement for np.in1d, which is deprecated as of
# NumPy 2.0; for a 1D input such as `values` both return the same boolean mask.
np.isin(values, [2, 3, 6])

array([ True, False, False,  True,  True, False,  True], dtype=bool)