In [1]:
#### Chapter 4: NumPy Basics ####

### NumPy Arrays

In [2]:
# One of the key features of NumPy is its n-dimensional array, or ndarray,
# which is a fast, flexible container for large datasets. Arrays enable you
# to perform math operations on whole blocks of data using similar syntax to
# the equivalent operations between scalar elements.

In [3]:
import numpy as np

In [4]:
# Generate a 2x3 array of random samples. No seed is set, so the values
# displayed below will differ each time the notebook is re-run.
data = np.random.randn(2, 3)

In [5]:
data

array([[-1.35698486, -0.21936342,  0.76934461],
       [ 0.45442913, -0.78467991,  0.52922043]])

In [6]:
# scalar multiplication
data * 10

array([[-13.56984865,  -2.19363418,   7.69344613],
       [  4.54429126,  -7.84679915,   5.29220433]])

In [7]:
# Element-wise addition
data + (data * 2)

array([[-4.07095459, -0.65809025,  2.30803384],
       [ 1.36328738, -2.35403974,  1.5876613 ]])

In [8]:
# Note that an ndarray holds homogeneous data; that is, all of its
# elements must be of the same type. Every array has a shape, a
# tuple indicating the size of each dimension, and a dtype, an
# object describing the data type of the array.

In [9]:
data.shape

(2, 3)

In [10]:
data.dtype

dtype('float64')

### Creating Arrays

In [11]:
# The easiest way to create an array is to use the array() function, which
# accepts any sequence-like object and produces an ndarray

In [12]:
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1

array([6. , 7.5, 8. , 0. , 1. ])

In [13]:
arr1.shape

(5,)

In [14]:
arr1.dtype

dtype('float64')

In [15]:
# Nested sequences, like a list of lists, will be converted into a 
# multi-dimensional array

In [16]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2 = np.array(data2)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [17]:
arr2.shape

(2, 4)

In [18]:
arr2.ndim

2

In [19]:
# Unless specified, np.array() tries to infer a good data type for the 
# array. The data type is stored in a special dtype metadata object. 

In [20]:
arr2.dtype

dtype('int32')

In [21]:
# In addition to np.array(), np.zeros() creates an array of all zeros while
# np.ones() creates an array of all ones. There's also np.empty(), which
# creates an array without initializing its values (it may contain arbitrary
# leftover data). You may pass a length or a shape (tuple) as an argument.

In [22]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [23]:
np.zeros((3, 6))

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [24]:
np.empty((2, 3, 2))

array([[[1.16882241e-311, 3.16202013e-322],
        [0.00000000e+000, 0.00000000e+000],
        [1.69119330e-306, 1.42671594e-071]],

       [[1.00606990e-046, 1.65888234e-076],
        [9.59226010e-043, 1.91892275e-076],
        [6.05201999e-066, 4.81607404e-038]]])

In [25]:
# np.arange() is an array-valued version of the built-in Python range() function:
np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [26]:
# You can also produce an array of the given shape and dtype with all values set
# to the indicated fill value:
np.full((5, 5), 7)

array([[7, 7, 7, 7, 7],
       [7, 7, 7, 7, 7],
       [7, 7, 7, 7, 7],
       [7, 7, 7, 7, 7],
       [7, 7, 7, 7, 7]])

In [27]:
# Create an nxn identity matrix:
np.identity(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

### Data Types

In [28]:
# The data type or dtype is a special object containing the metadata
# the ndarray needs to interpret a chunk of memory as a particular
# type of data

In [29]:
arr1 = np.array([1, 2, 3], dtype=np.float64)

In [30]:
arr2 = np.array([1, 2, 3], dtype=np.int32)

In [31]:
arr1.dtype

dtype('float64')

In [32]:
arr2.dtype

dtype('int32')

In [33]:
# You can explicitly convert or cast an array from one dtype to another:
arr = np.array([1, 2, 3, 4, 5])
arr.dtype

dtype('int32')

In [34]:
float_arr = arr.astype(np.float64)
float_arr.dtype

dtype('float64')

In [35]:
# If you cast floats to ints the decimal part will be truncated:
arr = np.array([1.2, 2.5, 3.7, 4.1])
arr.astype(np.int32)

array([1, 2, 3, 4])

In [36]:
# You can also use another array's dtype attribute as the target type.
# astype() always returns a new array and leaves the original untouched,
# so keep the result in order to inspect it:
int_array = np.arange(10)
calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)
int_as_float = int_array.astype(calibers.dtype)
int_as_float.dtype

dtype('float64')

### Arithmetic

In [37]:
# Vectorization enables you to express batch operations on data without using
# any for loops. Any arithmetic operation between equal-size arrays is applied
# element-wise:
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [38]:
arr * arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [39]:
arr - arr

array([[0., 0., 0.],
       [0., 0., 0.]])

In [40]:
# Arithmetic operations with scalars propagate the scalar argument to each element
# in the array:
1 / arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [41]:
arr ** 0.5

array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])

In [42]:
# Comparisons between arrays of the same size yield boolean arrays:
arr2 = np.array([[0.0, 4.0, 1.0], [7.0, 2.0, 12.0]])
arr2

array([[ 0.,  4.,  1.],
       [ 7.,  2., 12.]])

In [43]:
arr2 > arr

array([[False,  True, False],
       [ True, False,  True]])

### Indexing and Slicing

In [44]:
# One-dimensional arrays are simple; they can be indexed and sliced much like lists:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [45]:
arr[5]

5

In [46]:
arr[5:8]

array([5, 6, 7])

In [47]:
arr[5:8] = 12

In [48]:
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

In [49]:
# An important distinction from Python's built-in lists is that array slices
# are views on the original array. This means that the data is not copied, 
# and any modifications to the view will be reflected in the source array. 

In [50]:
arr_slice = arr[5:8]
arr_slice

array([12, 12, 12])

In [51]:
# Now, when I change values in arr_slice, the mutations are reflected in
# the original array:
arr_slice[1] = 12345
arr

array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,
           9])

In [52]:
# If you want a copy of a slice of an ndarray instead of a view, you will
# need to copy the array:
arr[5:8].copy()

array([   12, 12345,    12])

In [53]:
# In a two-dimensional array, the elements at each single index are 
# one-dimensional arrays:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d[2]

array([7, 8, 9])

In [54]:
# Individual elements can be accessed recursively, or you can pass a
# comma-separated list of indices:

In [55]:
arr2d[0][2]

3

In [56]:
arr2d[0, 2]

3

In [57]:
# NumPy uses axes to specify a dimension. Axis 0 refers to the rows, whereas
# axis 1 refers to the columns for two-dimensional arrays 

In [58]:
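# In multidimensional arrays, if you omit later indices, the returned object
# will be a lower dimensional ndarray consisting of all the data along the higher
# dimensions. For example, indexing the 2 x 2 x 3 array below with a single
# index (arr3d[0]) returns a 2 x 3 array: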
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [59]:
# Similarly, arr3d[1, 0] gives you all of the values whose indices start with (1, 0),
# forming a 1-dimensional array:
arr3d[1, 0]

array([7, 8, 9])

In [60]:
# Like one-dimensional objects such as lists, ndarrays can be sliced with similar syntax:
arr[1:6]

array([ 1,  2,  3,  4, 12])

In [61]:
# Two-dimensional arrays slice a bit differently. A single slice such as arr2d[:2]
# slices along axis 0, selecting a range of elements (rows) along that axis.
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [62]:
arr2d[:2]

array([[1, 2, 3],
       [4, 5, 6]])

In [63]:
# Note that a colon by itself means to take the entire axis, so you can slice only
# higher dimensional axes by doing so:
arr2d[:2, :]

array([[1, 2, 3],
       [4, 5, 6]])

In [64]:
arr2d[:, :1]

array([[1],
       [4],
       [7]])

In [65]:
# You can pass multiple slices just like you can pass multiple indexes. When slicing 
# like this, you always obtain array views of the same number of dimensions:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

In [66]:
# By mixing integer indexes and slices, you get lower dimensional slices:
arr2d[:2, 2]

array([3, 6])

In [67]:
arr2d[1, :2]

array([4, 5])

### Boolean Indexing

In [68]:
# Let's consider an example where we have some data in an array and an array
# of names with duplicates:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
# 7 rows (as many as there are entries in names) by 4 columns of random values.
# No seed is set, so the numbers displayed below will differ from run to run.
data = np.random.randn(7, 4)

In [69]:
names

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4')

In [70]:
data

array([[-0.06602746, -0.94197342,  1.19977644, -0.45894599],
       [ 2.04654199,  0.13573696,  0.32625558, -1.02302004],
       [-0.69739355, -1.04537176,  0.5678742 ,  0.04393262],
       [-0.64797905, -0.10413356,  1.71519034,  0.47715131],
       [ 0.40121343, -0.81309303, -0.84033566,  0.28725344],
       [-1.09483839, -1.2083969 , -0.70257571,  0.48374532],
       [ 1.18600188,  0.61335192, -0.00792292, -1.6642279 ]])

In [71]:
# Now suppose each name corresponds to a row in the data array and we wanted to
# select all the rows with corresponding name 'Bob'. Like arithmetic, comparisons
# with arrays are also vectorized:
names == 'Bob'

array([ True, False, False,  True, False, False, False])

In [72]:
# This boolean array can be passed when indexing the array:
data[names == 'Bob']

array([[-0.06602746, -0.94197342,  1.19977644, -0.45894599],
       [-0.64797905, -0.10413356,  1.71519034,  0.47715131]])

In [73]:
# The boolean array must be of the same length as the array axis it's indexing.

In [74]:
# You can also mix and match boolean arrays with slices or integers:
data[names == 'Bob', 2:]

array([[ 1.19977644, -0.45894599],
       [ 1.71519034,  0.47715131]])

In [75]:
data[names == 'Bob', 3:]

array([[-0.45894599],
       [ 0.47715131]])

In [76]:
# To select everything but 'Bob' you can use either != or negation with ~
data[~(names == 'Bob')]

array([[ 2.04654199,  0.13573696,  0.32625558, -1.02302004],
       [-0.69739355, -1.04537176,  0.5678742 ,  0.04393262],
       [ 0.40121343, -0.81309303, -0.84033566,  0.28725344],
       [-1.09483839, -1.2083969 , -0.70257571,  0.48374532],
       [ 1.18600188,  0.61335192, -0.00792292, -1.6642279 ]])

In [77]:
data[names != 'Bob']

array([[ 2.04654199,  0.13573696,  0.32625558, -1.02302004],
       [-0.69739355, -1.04537176,  0.5678742 ,  0.04393262],
       [ 0.40121343, -0.81309303, -0.84033566,  0.28725344],
       [-1.09483839, -1.2083969 , -0.70257571,  0.48374532],
       [ 1.18600188,  0.61335192, -0.00792292, -1.6642279 ]])

In [78]:
# The ~ operator can be useful when you want to invert a general condition:
cond = names == 'Bob'
data[~cond]

array([[ 2.04654199,  0.13573696,  0.32625558, -1.02302004],
       [-0.69739355, -1.04537176,  0.5678742 ,  0.04393262],
       [ 0.40121343, -0.81309303, -0.84033566,  0.28725344],
       [-1.09483839, -1.2083969 , -0.70257571,  0.48374532],
       [ 1.18600188,  0.61335192, -0.00792292, -1.6642279 ]])

In [79]:
# To select two of the three names, combine multiple boolean conditions using
# the & (and) or | (or) operators:
mask = (names == 'Bob') | (names == 'Will')
mask

array([ True, False,  True,  True,  True, False, False])

In [80]:
# The Python keywords 'and' and 'or' don't work with boolean arrays. Use & and |
data[mask]

array([[-0.06602746, -0.94197342,  1.19977644, -0.45894599],
       [-0.69739355, -1.04537176,  0.5678742 ,  0.04393262],
       [-0.64797905, -0.10413356,  1.71519034,  0.47715131],
       [ 0.40121343, -0.81309303, -0.84033566,  0.28725344]])

In [81]:
# Note that selecting data from an array by boolean indexing always creates a copy
# of the data, even if the returned array is unchanged.

In [82]:
# Setting values with boolean arrays works in a common-sense way. To set all of the 
# negative values in data to 0 we need only do:
data[data < 0] = 0
data

array([[0.        , 0.        , 1.19977644, 0.        ],
       [2.04654199, 0.13573696, 0.32625558, 0.        ],
       [0.        , 0.        , 0.5678742 , 0.04393262],
       [0.        , 0.        , 1.71519034, 0.47715131],
       [0.40121343, 0.        , 0.        , 0.28725344],
       [0.        , 0.        , 0.        , 0.48374532],
       [1.18600188, 0.61335192, 0.        , 0.        ]])

In [83]:
# Setting whole rows or columns using a one-dimensional boolean array:
data[names != 'Joe'] = 7
data

array([[7.        , 7.        , 7.        , 7.        ],
       [2.04654199, 0.13573696, 0.32625558, 0.        ],
       [7.        , 7.        , 7.        , 7.        ],
       [7.        , 7.        , 7.        , 7.        ],
       [7.        , 7.        , 7.        , 7.        ],
       [0.        , 0.        , 0.        , 0.48374532],
       [1.18600188, 0.61335192, 0.        , 0.        ]])

### Fancy Indexing

In [84]:
# Fancy indexing means indexing with arrays (or lists) of integers.
# First build an 8x4 float array in which every entry of row k equals k:
arr = np.empty((8, 4))
for row_number, row in enumerate(arr):
    row[:] = row_number  # each row is a view, so this writes through to arr
arr

array([[0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [2., 2., 2., 2.],
       [3., 3., 3., 3.],
       [4., 4., 4., 4.],
       [5., 5., 5., 5.],
       [6., 6., 6., 6.],
       [7., 7., 7., 7.]])

In [85]:
# To select out a subset of the rows in a particular order, you can simply pass
# a list or ndarray of integers specifying the desired order:
arr[[4, 3, 0, 6]]

array([[4., 4., 4., 4.],
       [3., 3., 3., 3.],
       [0., 0., 0., 0.],
       [6., 6., 6., 6.]])

In [86]:
# Using negative indices selects rows from the end:
arr[[-3, -5, -7]]

array([[5., 5., 5., 5.],
       [3., 3., 3., 3.],
       [1., 1., 1., 1.]])

In [87]:
# Passing multiple index arrays selects a one-dimensional array of elements 
# corresponding to each tuple of indices:
arr = np.arange(32).reshape((8, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [88]:
arr[[1, 5, 7, 2], [0, 3, 1, 2]]

array([ 4, 23, 29, 10])

In [89]:
# To retrieve a rectangular region formed by selecting a subset of the matrix's
# rows and columns, index in two steps: the first fancy index picks the rows,
# and [:, cols] then picks (and reorders) the columns of that intermediate result:
arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

In [90]:
# Thus, to rearrange columns:
arr[:, [0, 3, 1, 2]]

array([[ 0,  3,  1,  2],
       [ 4,  7,  5,  6],
       [ 8, 11,  9, 10],
       [12, 15, 13, 14],
       [16, 19, 17, 18],
       [20, 23, 21, 22],
       [24, 27, 25, 26],
       [28, 31, 29, 30]])

In [91]:
# Note that fancy indexing (unlike slicing) always copies the data into a new array

### Transposition

In [92]:
# Transposing is a special form of reshaping that similarly returns a view on the
# underlying data without copying anything

In [93]:
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [94]:
# Using the transpose attribute:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [95]:
# Using the transpose method:
arr.transpose()

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [96]:
# Transposing is often used when computing the inner matrix product with np.dot():
arr = np.random.randn(6, 3)
arr

array([[-0.02677916,  1.66449312,  0.13763121],
       [ 1.78034088, -1.80502272, -0.86201638],
       [ 0.2423237 , -0.48677189,  0.72110489],
       [ 0.18616008,  0.068406  , -0.61475286],
       [-0.74369482,  1.20736791, -0.9855229 ],
       [ 1.24957706, -1.53059673, -0.04350083]])

In [98]:
np.dot(arr.T, arr)

array([[ 5.37823193, -6.17386318, -0.79949964],
       [-6.17386318, 10.07073421,  0.26867247],
       [-0.79949964,  0.26867247,  2.63307563]])

In [99]:
# For higher dimensional arrays, transpose() will accept a tuple of axis numbers 
# to permute the axes:
arr = np.arange(16).reshape((2, 2, 4))
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [101]:
# Here, the axes have been reordered with the second axis first, the first axis second,
# and the last axis unchanged:
arr.transpose((1, 0, 2))

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

In [102]:
# Simple transposing with array.T is a special case of swapping axes. There is a method
# swapaxes() that takes a pair of axis numbers and switches the indicated axes to rearrange
# the data:
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [104]:
# This will return a view on the data without making a copy
arr.swapaxes(1, 2)

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])

### Universal Functions

In [105]:
# A universal function performs element-wise operations on data in ndarrays. You can
# think of them as fast vectorized wrappers for simple functions that take one or more
# scalar values and produce one or more scalar results. 

In [106]:
arr = np.arange(10)
arr 

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [107]:
np.sqrt(arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [108]:
np.exp(arr)

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
       2.98095799e+03, 8.10308393e+03])

In [110]:
# These are referred to as unary functions. Others, such as add() or maximum() take
# two arrays, so they are binary functions:
x = np.random.randn(8)
y = np.random.randn(8)
# Compute element-wise maximum between x and y
np.maximum(x, y)

array([ 2.52452194, -0.72826441,  2.12049382, -0.3105261 ,  0.18390447,
        0.59069665, -0.70515317,  0.11717659])

In [111]:
# While not common, a universal function can return multiple arrays. For example,
# modf() returns the fractional and integral parts of a floating-point array:
arr = np.random.randn(7) * 5
arr

array([-3.16375972,  0.88459416, -1.37405903,  1.58504543,  1.73400229,
       -2.67131551, -2.82500242])

In [112]:
remainder, whole_part = np.modf(arr)

In [113]:
remainder

array([-0.16375972,  0.88459416, -0.37405903,  0.58504543,  0.73400229,
       -0.67131551, -0.82500242])

In [114]:
whole_part

array([-3.,  0., -1.,  1.,  1., -2., -2.])