### Load NumPy

In [1]:
import numpy as np

### What is NumPy array?

In [2]:
# Create a 1-d array
a = np.array([1, 2, 3, 4])
a

array([1, 2, 3, 4])

In [3]:
# Number of dimensions
a.ndim

1

In [4]:
# Number of elements
a.size

4

In [5]:
# Number of first layer elements - using Python's native way to get the length of a variable
len(a)

# This only works for other Python native variables e.g. list
# len([1, 2, 3])

4

In [6]:
# Shape of array
a.shape

(4,)

In [7]:
# Create a 2-d array
b = np.array([[1, 2, 3, 4], [4, 3, 2, 1]])
b

array([[1, 2, 3, 4],
       [4, 3, 2, 1]])

In [8]:
b.ndim

2

In [9]:
# Shape - work outside-in
b.shape

# The first layer has 2 elements while the second has 4, so it's 2 x 4

(2, 4)

In [10]:
# Number of elements
b.size

8

In [11]:
# Number of first layer elements (distinguish from b.size)
len(b)

2

In [12]:
# Homogeneity
# Every element in an array must be of the same data type (dtype)
# We specify 3.0 (as float) here, so all the other elements will be force converted to float, which is more precise
b = np.array([[1, 2, 3, 4], [4, 3.0, 2, 1]])
b

array([[1., 2., 3., 4.],
       [4., 3., 2., 1.]])

In [13]:
b.dtype

dtype('float64')

In [14]:
# Distinguish from Python native list, where you can store heterogeneous values
b_list = [1, 'two', 3.0, [1, 1, 1, 1]]
print("This is a Python list, not NumPy array:", b_list)

This is a Python list, not NumPy array: [1, 'two', 3.0, [1, 1, 1, 1]]


### Accessing and modifying elements (work "outside-in")

In [15]:
# Create a 2 x 8 array
b = np.array([[1, 2, 3, 4, 5, 6, 7, 8], [4, 3, 2, 1, 0, -1, -2, -3]])
b

array([[ 1,  2,  3,  4,  5,  6,  7,  8],
       [ 4,  3,  2,  1,  0, -1, -2, -3]])

In [16]:
# Get element [FIRST_LAYER, SECOND_LAYER, ...] (for 2-d they will be row and then column)
# Again, work outside-in
b[0, 4]

# Returns np.int64(5), which means the value is 5 and it is stored as 'np.int64' data type

np.int64(5)

In [17]:
# Out of bound error
b[0, 10]

IndexError: index 10 is out of bounds for axis 1 with size 8

In [18]:
# Negative indices work as well (the last element will be indexed as -1, and the second last as -2, ...)
b[0, -2]

np.int64(7)

In [19]:
# Get the first layer ('row')
b[1]

array([ 4,  3,  2,  1,  0, -1, -2, -3])

In [20]:
# Slicing also works
# https://www.w3schools.com/python/numpy/numpy_array_slicing.asp
b[0, 1:5]

array([2, 3, 4, 5])

In [21]:
b[:, :]

array([[ 1,  2,  3,  4,  5,  6,  7,  8],
       [ 4,  3,  2,  1,  0, -1, -2, -3]])

In [22]:
b[:, 3:5]

array([[4, 5],
       [1, 0]])

In [23]:
b[:, ::2]

array([[ 1,  3,  5,  7],
       [ 4,  2,  0, -2]])

In [24]:
b

array([[ 1,  2,  3,  4,  5,  6,  7,  8],
       [ 4,  3,  2,  1,  0, -1, -2, -3]])

In [25]:
# Update an element
b[0, -2] = 99
b

array([[ 1,  2,  3,  4,  5,  6, 99,  8],
       [ 4,  3,  2,  1,  0, -1, -2, -3]])

In [26]:
# Update elements
b[:, :] = 0
b

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])

In [27]:
# Create an 3 x 2 x 3 array
c = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]], [[13, 14, 15], [16, 17, 18]]])
c

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]],

       [[13, 14, 15],
        [16, 17, 18]]])

In [28]:
# Again, work outside-in
c[1, 0, 1]

np.int64(8)

In [29]:
c[:, 0, :] = 0
c

array([[[ 0,  0,  0],
        [ 4,  5,  6]],

       [[ 0,  0,  0],
        [10, 11, 12]],

       [[ 0,  0,  0],
        [16, 17, 18]]])

### Create (initialise) arrays

In [30]:
np.zeros(2, dtype=int)

array([0, 0])

In [31]:
np.ones([2, 2, 2], dtype=float)

array([[[1., 1.],
        [1., 1.]],

       [[1., 1.],
        [1., 1.]]])

In [32]:
np.full([2, 2], 19)

array([[19, 19],
       [19, 19]])

In [33]:
np.random.random([4, 2, 2])

array([[[0.34657214, 0.74499797],
        [0.33754394, 0.6893097 ]],

       [[0.88754531, 0.54387005],
        [0.81984525, 0.44500488]],

       [[0.25573905, 0.74072321],
        [0.56662586, 0.00766652]],

       [[0.2562143 , 0.04651129],
        [0.20603697, 0.70626553]]])

In [34]:
np.identity(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

How to generate the following numpy array?

In [35]:
# array([[ 1,  1,  1,  1],
#        [ 1,  9, -9,  1],
#        [ 1, -9,  9,  1],
#        [ 1,  1,  1,  1]])

In [36]:
# Answer

d = np.ones([4, 4], dtype=int)
d[1:3, 1:3] = 9
d[1, 2] = -9
d[2, 1] = -9
d

array([[ 1,  1,  1,  1],
       [ 1,  9, -9,  1],
       [ 1, -9,  9,  1],
       [ 1,  1,  1,  1]])

### Arithmetic (element-wise)

In [37]:
a = np.array([1, 2, 3, 4])
a

array([1, 2, 3, 4])

In [38]:
a + 1

array([2, 3, 4, 5])

In [39]:
a * 2

array([2, 4, 6, 8])

In [40]:
a ** 2

array([ 1,  4,  9, 16])

In [41]:
b = np.array([8, 4, 2, 1])
a + b

array([9, 6, 5, 5])

### Copies

Reference:

https://stackoverflow.com/questions/66648023

https://www.w3schools.com/python/numpy/numpy_copy_vs_view.asp

In [42]:
a = np.array([1, 2, 3, 4])

In [43]:
# View
b = a
b[0] = 99
b

array([99,  2,  3,  4])

In [44]:
# The first element of 'a' will be updated to 99
# Why?
a

array([99,  2,  3,  4])

In [45]:
# A 'real' copy
b = a.copy()
b[0] = 100
b

array([100,   2,   3,   4])

In [46]:
# 'a' is not affected
a

array([99,  2,  3,  4])

### Statistics

In [47]:
a = np.array([[1, 2, 3, 4], [4, 3, 2, 1]])
a

array([[1, 2, 3, 4],
       [4, 3, 2, 1]])

In [48]:
np.min(a)

np.int64(1)

In [49]:
a.min()

np.int64(1)

In [50]:
a.max()

np.int64(4)

In [51]:
a.sum()

np.int64(20)

In world of NumPy/pandas, axis=0 (default) usually means 'by row' while axis=1 means 'by column'. Note that this is different from the xyz coordinate system we learned in school.

See also:
"NumPy axes are the directions along the rows and columns" on https://www.sharpsightlabs.com/blog/numpy-axes-explained

In [52]:
a.sum(axis=0)

array([5, 5, 5, 5])

In [53]:
a.sum(axis=1)

array([10, 10])

In [54]:
# This implies axis=-1 (default, which means sorting alone the last axis, and in this case it's axis=1)
np.sort(a)

array([[1, 2, 3, 4],
       [1, 2, 3, 4]])

### Reshape

In [55]:
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
a

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [56]:
# Again, this is just creating a 'view' of the original array
a.reshape([4, 2])

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

### Load from file

In [57]:
census = np.genfromtxt('./data/uk_countries.csv', delimiter=',').astype(int)
census
census[1:, 2:]

# You may see a warning because in this file we still have some text strings e.g. country names which NumPy doesn't know how to convert to numbers
# Remember that a NumPy array can only store one type of data

  census = np.genfromtxt('./data/uk_countries.csv', delimiter=',').astype(int)


array([[   130278,  47055205,  49138831,  53012456,  56490048],
       [    13562,   1577836,   1685267,   1810863,   1903175],
       [    77925,   4998567,   5062011,   5295403,   5418400],
       [    20735,   2835073,   2903085,   3063456,   3107494],
       [        0, 112933362, 117578388, 126364356, 133838234],
       [        0, 169400043, 176367582, 189546534, 200757351]])

In [58]:
# Only keep the area that contains population data
population = census[1:-1, 3:]

# Work out the sum of each column (direction of the 'add' operation: axis=0 i.e. by row)
population_sum = population.sum(axis=0).astype(int)
population_sum

array([169400043, 176367582, 189546534, 200757351])

### Condition-based masking and indexing

In [59]:
# This will return an array of boolean, indicating whether each element on the relevant position meets the condition
population_sum > 60000000

array([ True,  True,  True,  True])

In [60]:
# Passing the boolean array to filter out the elements that have met the condition i.e. True
population_sum[population_sum > 60000000]

array([169400043, 176367582, 189546534, 200757351])