# NumPy Array Basics

This notebook demonstrates the basics of NumPy arrays: how to create them and how to perform math on them.

In [2]:
import numpy as np

## NumPy arrays and Python lists



In [3]:
# Python lists are designed to be as flexible as possible. They can contain
# different types of values, and it is possible to dynamically add and
# remove elements from them.

my_lst = []
my_lst.append(3)
my_lst.append(4.9)
my_lst.append("Hello")
my_lst.remove("Hello")
my_lst

[3, 4.9]

In [4]:
# NumPy arrays are not designed to be flexible like Python lists. Once you create
# a NumPy array, it cannot be resized.
my_arr = np.array([1, 2, 3])
my_arr

array([1, 2, 3])

In [5]:
# NumPy arrays must contain only one type of value. If types in the input list are
# inconsistent, NumPy will attempt to select a safe type that can be used for all
# values in the list, and convert them all.
#
# For example, the presence of a float in the input list results in an array
# where all values are floats.
np.array([1, 2, 3.1])

array([1. , 2. , 3.1])

In [6]:
# As a side note, NumPy has an append() function.
# (https://numpy.org/doc/stable/reference/generated/numpy.append.html)
# But unlike Python lists, it does not change the content of the array. Instead,
# it creates a new array. In this example, we append 4 to my_arr, and see that
# my_arr remains unchanged:
my_arr_2 = np.append(my_arr, 4)
print(my_arr, my_arr_2)

[1 2 3] [1 2 3 4]


In [7]:
# We can also create multidimensional arrays by giving np.array() lists of lists.
# The dimensions of the input lists must be consistent.
my_arr_2d = np.array([
    [3, 5, 10, 99],
    [9, 21, 3, 4]
])
my_arr_2d

array([[ 3,  5, 10, 99],
       [ 9, 21,  3,  4]])

In [8]:
# NumPy arrays have some useful attributes. The "shape" attribute gives us the
# dimensions of the array. Here, we see that my_arr_2d is a 2x4 array.
my_arr_2d.shape

(2, 4)

In [9]:
# We can also access the "dtype" attribute to see the type of the array's
# elements. Note that NumPy's default integer type is platform-dependent: the
# output below shows int32, but the same code may report int64 on other
# systems or NumPy versions.
my_arr_2d.dtype

dtype('int32')

## Creating NumPy arrays programmatically

In [10]:
# NumPy's arange() function works like Python's built-in range() function.
# (https://numpy.org/doc/stable/reference/generated/numpy.arange.html).
# Here, we request a NumPy array with integer values from 0 up to, but not
# including, 10.
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
# Here, we request a NumPy array with integer values from 5 up to, but not
# including, 11.
np.arange(5, 11)

array([ 5,  6,  7,  8,  9, 10])

In [12]:
# Here, we request a NumPy array with integer values from 5 up to, but not
# including, 11, using a step size of 2 between each value.
np.arange(5, 11, 2)

array([5, 7, 9])

In [13]:
# NumPy's linspace() function allows us to create an array of a fixed number of
# values evenly spaced between a start and end point.
# (https://numpy.org/doc/stable/reference/generated/numpy.linspace.html)
# Unlike arange(), the end point is included by default. Here, we request a
# NumPy array containing 10 values from 5 to 50, inclusive.
np.linspace(5, 50, 10)

array([ 5., 10., 15., 20., 25., 30., 35., 40., 45., 50.])

In [14]:
# Here, we request a NumPy array containing 11 values between 5 and 50.
np.linspace(5, 50, 11)

array([ 5. ,  9.5, 14. , 18.5, 23. , 27.5, 32. , 36.5, 41. , 45.5, 50. ])

In [15]:
# Sometimes, it is useful to create an array of a fixed size initialized to some
# value. There are several functions we can use:
#
# ones(): Fill the array with all ones (https://numpy.org/doc/stable/reference/generated/numpy.ones.html)
# zeros(): Fill the array with all zeros (https://numpy.org/doc/stable/reference/generated/numpy.zeros.html)
# empty(): Allocate but do not initialize the array. It will contain random uninitialized values
#          (https://numpy.org/doc/stable/reference/generated/numpy.empty.html)
#
# All of these functions accept a tuple describing the desired shape of the array.
# Here, we ask for a 3x3 array containing all ones:
np.ones((3, 3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [16]:
# Here, we ask for an uninitialized 3x2 array. The values shown are whatever
# happened to be in memory; they may appear as zeros (as below), but that is
# not guaranteed. Always assign values before reading from an empty() array.
np.empty((3, 2))

array([[0., 0.],
       [0., 0.],
       [0., 0.]])

In [17]:
# Here, we ask for a 2x3x1 array containing all zeros:
np.zeros((2,3,1))

array([[[0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.]]])

In [18]:
# Here, we ask for a 10x10 array containing random floating-point values between
# 0.0 (inclusive) and 1.0 (exclusive). No random seed is set, so the values
# will differ every time this cell is run:
np.random.random((10,10))

array([[0.62082672, 0.76968012, 0.80052772, 0.31517545, 0.43757039,
        0.0302309 , 0.57126924, 0.30711223, 0.70919207, 0.73164364],
       [0.2158012 , 0.96756168, 0.28919571, 0.4973461 , 0.75810241,
        0.26370614, 0.05545817, 0.48403772, 0.62916714, 0.5376728 ],
       [0.68983538, 0.08033691, 0.51412087, 0.59732415, 0.17463539,
        0.76457282, 0.75547304, 0.4246529 , 0.86491239, 0.53361901],
       [0.58331412, 0.72712779, 0.92998695, 0.57925131, 0.74982189,
        0.97530878, 0.47824464, 0.76272281, 0.0629487 , 0.11690405],
       [0.29785653, 0.4823994 , 0.95994804, 0.6129815 , 0.59111683,
        0.4462911 , 0.41191572, 0.57023229, 0.92613226, 0.13169809],
       [0.23772512, 0.6344949 , 0.39219997, 0.1394328 , 0.9290494 ,
        0.01611539, 0.60975606, 0.3335388 , 0.03307847, 0.07980548],
       [0.00101473, 0.89291461, 0.24208574, 0.41050106, 0.0296905 ,
        0.65007406, 0.87630791, 0.88255764, 0.79818117, 0.72545438],
       [0.16054131, 0.19934878, 0.0695398

In [19]:
# Here, we ask for a 4x4 array of random integers from -10 (inclusive) to 10
# (exclusive), so the largest value that can appear is 9. randint(), unlike
# random(), takes the bounds of the random values as arguments.
np.random.randint(-10, 10, (4,4))

array([[-6,  3,  0,  9],
       [-6,  3, -2,  2],
       [-8,  2,  3, -9],
       [-1, -5, -8, -1]])

## Accessing values in NumPy arrays

In [20]:
# Just like Python lists, you can access elements by index:
my_list = [5, 2, 7, 4, 3, 6, 9, 10]
my_arr = np.array(my_list)

print(my_list[1], my_arr[1])

2 2


In [21]:
# There are other list behaviors we frequently use in Python. Specifically,
# slicing and replacing list elements with new values.
my_list = [5, 2, 7, 4, 3, 6, 9, 10]
my_list_2 = my_list[3:5]
my_list_2[0] = 10
print(my_list, my_list_2)

[5, 2, 7, 4, 3, 6, 9, 10] [10, 3]


In [22]:
# Indexing and element assignment use the same syntax in NumPy. To get the same
# result as the list example (a slice that is independent of the original), we
# call copy() on the slice before modifying it. my_arr is left unchanged:
my_list = [5, 2, 7, 4, 3, 6, 9, 10]
my_arr = np.array(my_list)
my_arr_2 = my_arr[3:5].copy()
my_arr_2[0] = 10
print(my_arr, my_arr_2)

[ 5  2  7  4  3  6  9 10] [10  3]


In [23]:
# Without copy(), the behavior differs from Python lists: slicing a NumPy
# array returns a view onto the original data rather than a new array. Assigning
# to my_arr_2[0] therefore also changes my_arr[3], as the output shows:
my_list = [5, 2, 7, 4, 3, 6, 9, 10]
my_arr = np.array(my_list)
my_arr_2 = my_arr[3:5]
my_arr_2[0] = 10
print(my_arr, my_arr_2)

[ 5  2  7 10  3  6  9 10] [10  3]


## Multidimensional Data

In [24]:
# Multidimensional lists (or lists of lists) in Python are possible too, since
# a list can contain any value at any element. Here, we create a 2-dimensional
# list (or a list containing lists, each of which contain values).
my_matrix = [
    [1, 2, 3],
    [4, 5, 6]
]

# The 2D list has rows and columns. Here, we access the first row, then the
# second element of the first row:
print(my_matrix[0], my_matrix[0][1])

[1, 2, 3] 2


In [25]:
# NumPy also supports multidimensional arrays:
my_matrix_arr = np.array([
    [1, 2, 3, 10],
    [4, 5, 6, 100],
    [9, 21, 3, 4]
])

# Printing the array shows its layout: a 2-dimensional array with 3 rows and
# 4 columns. The `shape` attribute available on all arrays reports the same
# information, i.e. how many dimensions the array has and how many elements
# each dimension contains; my_matrix_arr.shape is (3, 4).
print(my_matrix_arr)

[[  1   2   3  10]
 [  4   5   6 100]
 [  9  21   3   4]]


In [26]:
# In fact, we can create arrays with arbitrarily many dimensions, so long as
# they are consistent:
my_matrix_arr_3d = np.array([
    [[1], [2], [3], [10]],
    [[4], [5], [6], [100]],
    [[9], [21], [3], [4]]
])

my_matrix_arr_3d.shape

(3, 4, 1)

In [27]:
# NumPy gives us a useful shorthand when accessing elements of multidimensional
# arrays. Here, I am asking for the 0th row and 1st column of my_matrix_arr.
#
# Note that the second form, my_matrix_arr[0, 1], is preferred over the chained
# form, my_matrix_arr[0][1]. Chained indexing performs two separate indexing
# operations (first extracting row 0 as an intermediate array, then indexing
# into it), while the comma form retrieves the element in a single operation.
print(my_matrix_arr[0][1], my_matrix_arr[0, 1])

2 2


In [28]:
# We can extend this shorthand to slicing. Here, I am asking for the 0th column
# from rows 0 and 1 (alternatively, 0 to 2, exclusive of 2).
my_matrix_arr[0:2,0]

array([1, 4])

## Changing the Shape of NumPy Arrays

In [29]:
# NumPy offers many features for manipulating the contents and shape of arrays,
# far more than what Python offers for lists. One of these features involves
# changing the number of dimensions of an array, and how many elements are
# contained within each of these dimensions.
#
# For example, let's use the arange() function to create an array containing
# a sequence of integer values from 1 to 10, exclusive of 10:
np.arange(1, 10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [30]:
# We see this array is one-dimensional with 9 elements:
np.arange(1, 10).shape

(9,)

In [31]:
# Using the reshape() method, we can change it from a 9-element 1D array to a
# 3x3 2D array:
np.arange(1, 10).reshape(3,3)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [32]:
# We can see how its shape has changed:
np.arange(1, 10).reshape(3,3).shape

(3, 3)

In [33]:
# Note that reshaping is only possible if the new shape holds exactly as many
# elements as the existing array! We cannot transform a 1D 9-element array into
# a 4x4 array, because 9 elements cannot fill 16 slots. NumPy raises a
# ValueError in that case.
#
# We catch the exception and print its message so the error is still shown,
# but an uncaught exception no longer stops "Restart Kernel -> Run All" from
# executing the cells that follow.
try:
    np.arange(1, 10).reshape(4, 4)
except ValueError as err:
    print("ValueError:", err)

ValueError: cannot reshape array of size 9 into shape (4,4)

In [None]:
# However, if all we want to do is add new dimensions to the array without
# changing the number of elements, we can. Here, we are adding an extra dimension
# to our 1D array, which has the effect of transforming it to a 2D array with one
# row and as many columns as the original (9):
np.arange(1, 10).reshape(1, 9)

array([[1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [None]:
# We can keep adding length-1 dimensions almost without limit (NumPy does cap
# the total number of dimensions; the cap is 32 or 64 depending on the NumPy
# version, far more than is normally needed):
np.arange(1, 10).reshape(1, 1, 1, 1, 1, 9, 1, 1, 1)

array([[[[[[[[[1]]],


            [[[2]]],


            [[[3]]],


            [[[4]]],


            [[[5]]],


            [[[6]]],


            [[[7]]],


            [[[8]]],


            [[[9]]]]]]]]])

In [None]:
# We can also use -1 for at most one of the new dimensions to mean "work out
# this length automatically from the total number of elements and the other
# dimensions". The statement below results in a 1x9 array:
np.arange(1, 10).reshape(1, -1)

array([[1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [None]:
# We can also use the subscript operator to create new dimensions by using the
# "np.newaxis" convenience object to indicate where we want them. The code
# below is equivalent to calling np.arange(1, 10).reshape(1, -1) or
# np.arange(1, 10).reshape(1, 9).
np.arange(1, 10)[np.newaxis, :]

array([[1, 2, 3, 4, 5, 6, 7, 8, 9]])

## Changing the contents of NumPy arrays


In [None]:
# In Python, if we want to add a new element to the end of a list, we can call
# the list's append() method. Note that performing this action modifies the
# original list. That is, the list contained in the variable a has been altered
# in-place to contain a new value which was not present when it was originally
# initialized:
a = [1, 2, 3]
a.append(4)
a

[1, 2, 3, 4]

In [None]:
# NumPy has a similar append() function, which at first glance behaves like the
# Python list's append() method. However, it is important to note that this
# function does not modify the original array. Instead, it returns a new array
# containing the contents of the original plus the new value:
a = [1, 2, 3]
a_arr = np.array(a)
a_new = np.append(a_arr, 4)
print(a_arr, a_new)

[1 2 3] [1 2 3 4]


In [None]:
# The + operator is the first occurrence of a major difference between Python
# list functionality and NumPy array functionality. In Python, + can be used to
# concatenate two lists together:

a = [1, 2, 3]
b = [4, 5, 6]

a + b

[1, 2, 3, 4, 5, 6]

In [None]:
# To concatenate NumPy arrays, we have to use NumPy's concatenate() function:

a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
np.concatenate([a, b])

array([1, 2, 3, 4, 5, 6])

In [None]:
# We can also concatenate multiple NumPy arrays at once:
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
c = np.array([7, 8, 9])
np.concatenate([a, b, c])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [None]:
# NumPy also allows more complex concatenation over multidimensional
# arrays with the `axis` parameter. With axis=0 the rows of b are stacked
# below the rows of a; with axis=1 the columns of b are placed to the right
# of the columns of a.
a = np.zeros((3,3))
b = np.ones((3,3))

print("Row-wise concatenation:\n", np.concatenate([a, b], axis=0))
print("Column-wise concatenation:\n", np.concatenate([a, b], axis=1))

Row-wise concatenation:
 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
Column-wise concatenation:
 [[0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 1. 1. 1.]]


## NumPy and arithmetic

In [None]:
# A major difference between NumPy arrays and Python lists is the functionality
# of the + operator. As we saw above, + indicates concatenation when used with
# Python lists. However, with NumPy arrays, + indicates addition:

a = np.array([1, 2, 3])
a + 10

array([11, 12, 13])

In [None]:
# We can also add two arrays together, so long as one of the following conditions
# is true:
#
#   - The arrays are the same shape
#   - The arrays are not the same shape, but they are broadcastable
#
# We will look at the simple case first. These arrays are the same shape,
# so NumPy will add corresponding elements together. We see that the final
# output array is equal to [1+2, 2+3, 3+4]:

a = np.array([1, 2, 3])
b = np.array([2, 3, 4])
a + b

array([3, 5, 7])

In [None]:
# The + symbol is a kind of useful shorthand. However, we can also use the
# NumPy function add() to add the arrays instead:
np.add(a, b)

array([3, 5, 7])

In [None]:
# All other arithmetic operators in Python work with NumPy, including multiplication,
# subtraction, division, power (**), and modulo (%). For example, this expression
# produces output equivalent to [1*2, 2*3, 3*4]:
a * b

array([ 2,  6, 12])

In [None]:
# What if the arrays aren't the same size? There are some circumstances where
# there is no clear way to perform the addition, and in those situations, NumPy
# will raise an exception. For example, there is no way to add a 3-element array
# to a 4-element array.
#
# We catch the ValueError and print its message so the error is still shown,
# but an uncaught exception does not stop "Restart Kernel -> Run All" from
# executing the cells that follow.

a = np.array([1, 2, 3])
b = np.array([2, 3, 4, 5])
try:
    a + b
except ValueError as err:
    print("ValueError:", err)

ValueError: operands could not be broadcast together with shapes (3,) (4,) 

In [None]:
# But there are some situations where this works. Here are two arrays that are
# clearly not the same size, but can be added together anyway.
a = np.array([
    [1, 2, 3],
    [10, 11, 12]
])

b = np.array([2, 3, 4])

print("a.shape:", a.shape, "b.shape:", b.shape)

# We see the output of this addition is equivalent to the following array:
# [
#     [1+2, 2+3, 3+4],
#     [10+2, 11+3, 12+4]
# ]
a+b

a.shape: (2, 3) b.shape: (3,)


array([[ 3,  5,  7],
       [12, 14, 16]])

In [None]:
# The addition in the cell above is an example of broadcasting: under some situations,
# NumPy can manipulate the shape of arrays so they can be added together.
# Broadcasting has 3 rules:
#
# Rule 1: If the two arrays differ in their number of dimensions, the shape of
#         the one with fewer dimensions is padded with ones on its leading (left) side.
# Rule 2: If the shape of the two arrays does not match in any dimension, the
#         array with shape equal to 1 in that dimension is stretched to match
#         the other shape.
# Rule 3: If in any dimension the sizes disagree and neither is equal to 1, an
#         error is raised.
#
# To follow along with arrays a and b:
print(a.shape, b.shape)

(2, 3) (3,)


In [None]:
# Rule 1: a and b differ in their number of dimensions. We will pad another dimension
# on the left side of b.
print(a.shape, b.reshape(1, -1).shape)

(2, 3) (1, 3)


In [None]:
# Rule 2: The shape of a and b does not match in the first (0) dimension. a
# has 2 rows, but b only has 1. Because b only has 1 row, we can duplicate that
# row so it equals the number of rows in a:
print(a.shape, b.reshape(1, -1).repeat(2, axis=0).shape)

(2, 3) (2, 3)


In [None]:
# Now they are the same shape, so they can be added. Here is what they look like
# after broadcasting:
print(a)
print()
print(b.reshape(1, -1).repeat(2, axis=0))

[[ 1  2  3]
 [10 11 12]]

[[2 3 4]
 [2 3 4]]


In [None]:
# We now understand why our earlier example with differently-sized arrays did not
# work: their shapes cannot be broadcast. In this example:
#
# Rule 1 does not apply, since both arrays have an equal number of dimensions.
#
# Rule 2 does not apply. The dimensions do not match, but none of them are 1.
#
# Rule 3 applies: both arrays disagree along the same dimension and neither
# is equal to 1. Consequently, an error is raised.

a = np.array([1, 2, 3])
b = np.array([2, 3, 4, 5])
print(a.shape, b.shape)

(3,) (4,)


## NumPy and aggregations

In [None]:
# We can perform aggregations like sum, mean, median, and standard deviation
# on arrays. By default, these aggregations apply to all values in an array,
# regardless of its structure, and produce a single number.
a = np.array([
    [1, 2, 3],
    [10, 11, 12]
])

print("Sum:", np.sum(a))
print("Mean:", np.mean(a))
print("Median:", np.median(a))
print("Standard deviation:", np.std(a))
print("Minimum:", np.min(a))
print("Maximum:", np.max(a))

Sum: 39
Mean: 6.5
Median: 6.5
Standard deviation: 4.573474244670748
Minimum: 1
Maximum: 12


In [None]:
# However, we can make use of the axis parameter to control how these aggregations
# are performed.
#
# For example, in our 2D NumPy array, giving axis=0 means the aggregation is
# performed across the rows (dimension 0), leaving one result per column
# (dimension 1):
print("Sum:", np.sum(a, axis=0))
print("Mean:", np.mean(a, axis=0))
print("Median:", np.median(a, axis=0))
print("Standard deviation:", np.std(a, axis=0))
print("Minimum:", np.min(a, axis=0))
print("Maximum:", np.max(a, axis=0))

Sum: [11 13 15]
Mean: [5.5 6.5 7.5]
Median: [5.5 6.5 7.5]
Standard deviation: [4.5 4.5 4.5]
Minimum: [1 2 3]
Maximum: [10 11 12]


In [None]:
# Giving axis=1 means the aggregation is performed across the columns
# (dimension 1), leaving one result per row (dimension 0):
print("Sum:", np.sum(a, axis=1))
print("Mean:", np.mean(a, axis=1))
print("Median:", np.median(a, axis=1))
print("Standard deviation:", np.std(a, axis=1))
print("Minimum:", np.min(a, axis=1))
print("Maximum:", np.max(a, axis=1))

Sum: [ 6 33]
Mean: [ 2. 11.]
Median: [ 2. 11.]
Standard deviation: [0.81649658 0.81649658]
Minimum: [ 1 10]
Maximum: [ 3 12]


## Boolean logic and masking

In [None]:
# A final point regarding NumPy arrays is that they work with Python relational
# operators like <, <=, >, >=, ==, and !=.
#
# The comparisons are performed element-wise like other mathematical operations.
# They produce equivalently-sized arrays filled with Boolean values.
#
# Here, we see an example where the resulting NumPy array is True for array
# elements where the condition is true, and False where it is not.
np.array([1, 2, 3]) < 2

array([ True, False, False])

In [None]:
# We can perform array comparisons too. The resulting array is equivalent to the
# following: [1 < 10, 2 < 1, 3 < 8]:

a = np.array([1, 2, 3])
b = np.array([10, 1, 8])

a < b

array([ True, False,  True])

In [None]:
# Broadcasting rules apply here too:
a = np.array([
    [1, 2, 3],
    [2, 3, 4]
])
b = np.array([1, 10, 0])

a < b

array([[False,  True, False],
       [False,  True, False]])

In [None]:
# Related to boolean logic is the concept of masking: using a boolean-valued
# array to select elements of another. Elements are kept wherever the mask is
# True and dropped wherever it is False.
#
# As a simple example, this code will retrieve the element at index 1 of
# array a (its second element), because only position 1 of the mask is True:

a = np.array([1, 2, 10, 50, 72, 4, 100])
b = np.array([False, True, False, False, False, False, False])
a[b]

array([2])

In [None]:
# This code will retrieve several elements from a:
a = np.array([1, 2, 10, 50, 72, 4, 100])
b = np.array([False, True, False, False, True, True, False])
a[b]

array([ 2, 72,  4])

In [None]:
# This code allows us to retrieve specific elements where a given condition
# is true:
a = np.array([1, 2, 10, 50, 72, 4, 100])
a[a < 10]

array([1, 2, 4])