# Introduction to NUMPY

NumPy is the ideal tool for working with datasets. It is much faster than using Python on its own.

In [1]:
import numpy as np

## raw python vs numpy

Let's say I want to multiply every number in my list by 2.

In [2]:
# Number of elements used for the list-vs-array timing comparison.
N = 10_000_000
numbers_list = list(range(N))  # plain Python list holding 0 .. N-1
numbers_list[:10]  # peek at the first ten values

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [3]:
%%time
# Plain-Python baseline: the interpreter doubles the elements one at a time,
# overwriting each slot of the list in place.
for i in range(N):
    numbers_list[i] =  numbers_list[i] * 2

Wall time: 1.52 s


In [4]:
numbers_list[:10]

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [5]:
# np.arange builds the integer range 0 .. N-1 directly as an array, so no
# intermediate N-element Python list has to be created and converted.
numbers_array = np.arange(N)
numbers_array

array([      0,       1,       2, ..., 9999997, 9999998, 9999999])

In [6]:
%%time
# A single array expression doubles every element; there is no Python-level loop.
numbers_array = numbers_array * 2

Wall time: 20 ms


We can see that NumPy is much faster (about 20 ms versus 1.52 s here, roughly 75 times quicker), especially when we have lots of elements in our array/list.

# vectorisation

We saw that operating on a whole NumPy array at once is much faster than looping over a list. Arrays like this can also be referred to as vectors in linear algebra. The term "vectorisation" refers to replacing an explicit element-by-element loop with a single operation on whole arrays/vectors, which is where this speed gain comes from. If you find yourself writing a for loop, always ask yourself whether you can do it as a vector operation instead.

# Creating a numpy array


In [7]:
np.array([1,2,3])

array([1, 2, 3])

In [8]:
np.zeros((2,3))  # the shape is passed as one tuple: 2 rows, 3 columns

array([[0., 0., 0.],
       [0., 0., 0.]])

In [9]:
np.ones((2,3))

array([[1., 1., 1.],
       [1., 1., 1.]])

In [10]:
# Samples from the standard normal distribution (mean 0, variance 1).
# Unlike np.zeros/np.ones, randn takes the shape as separate arguments.
np.random.randn(2,3)

array([[-1.16447101, -0.63977086,  0.6234382 ],
       [-0.14645496, -1.46356885, -0.16716379]])

In [11]:
np.identity(3)  # 3x3 identity matrix: ones on the diagonal, zeros elsewhere

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [12]:
np.arange(100)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

# length & shape

In [13]:
data = np.zeros((100,200))
data.shape

(100, 200)

In [14]:
len(data)  # length of the first axis only, i.e. data.shape[0]

100

# Joining numpy arrays

In [15]:
a = np.arange(10)
b = np.arange(20)
# hstack joins the 1-D arrays end to end, so their lengths may differ (10 + 20 -> 30 elements).
np.hstack((a,b))

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  1,  2,  3,  4,  5,  6,
        7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [16]:
# vstack stacks the 1-D arrays as rows, so they must have the same length; the result here has shape (2, 10).
np.vstack((a,a))

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

# reshape

In [17]:
data = np.arange(9)
data = data.reshape(3,3)
data

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [18]:
data = np.arange(9)
data = data.reshape(3,-1) # -1 tells NumPy to infer that axis's length from the array size (here 9 / 3 = 3)
data

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

# indexing

In [19]:
data = np.random.randn(5)
data

array([-0.77997177,  0.21126988, -1.48522208, -2.8861719 , -0.97668785])

In [20]:
data[1]

0.21126988248543813

In [21]:
data[0:3]

array([-0.77997177,  0.21126988, -1.48522208])

In [22]:
data[:3]  # omitting the start index is the same as data[0:3]

array([-0.77997177,  0.21126988, -1.48522208])

In [23]:
data[-3:]  # a negative start counts from the end: the last three elements

array([-1.48522208, -2.8861719 , -0.97668785])

# multi axis indexing

In [24]:
data = np.random.randn(3,3)
data

array([[ 1.29287986, -1.4242937 , -0.47203414],
       [ 0.22451183, -0.19039382, -0.4620172 ],
       [-0.01199695, -0.52814524, -0.954287  ]])

In [25]:
data[1,2] # 2nd row, 3rd column (indices are zero-based)

-0.46201719645072686

In [26]:
data[:,0] # pick all elements from first column

array([ 1.29287986,  0.22451183, -0.01199695])

In [27]:
data[:2,0] # first two rows (indices 0 and 1) of the first column; the stop index 2 is excluded

array([1.29287986, 0.22451183])

In [28]:
# NOTE(review): this repeats the earlier data[:,0] cell exactly; data[0,:] (all elements of the first row) may have been intended - confirm.
data[:,0] # pick all elements from first column

array([ 1.29287986,  0.22451183, -0.01199695])

# boolean indexing

In [29]:
data = np.random.randn(3,3)
data

array([[ 0.3361404 ,  0.51762291, -1.90481833],
       [ 0.55802837,  1.45067623, -0.68407858],
       [ 0.67059616,  1.81223744, -2.68342406]])

In [30]:
data > 0  # element-wise comparison; yields a boolean array of the same shape

array([[ True,  True, False],
       [ True,  True, False],
       [ True,  True, False]])

In [31]:
data[data > 0]  # keeps only the elements where the mask is True; the result is a flat 1-D array

array([0.3361404 , 0.51762291, 0.55802837, 1.45067623, 0.67059616,
       1.81223744])

# Aggregation

In [32]:
data = np.random.randn(3,3)
data

array([[ 0.70809921,  0.60610563, -0.98465825],
       [ 0.02652401, -0.68677078,  1.17277057],
       [-1.3698359 , -1.34628938,  0.81490239]])

In [33]:
data.sum()

-1.0591524977415445

In [34]:
np.sum(data)  # function form of data.sum(); gives the same result

-1.0591524977415445

In [35]:
data.sum(axis=1)  # sum along axis 1 (across the columns): one total per row

array([ 0.32954659,  0.51252379, -1.90122288])

In [36]:
data.sum(axis=1,keepdims=True)  # row totals again, but the summed axis is kept with length 1, giving shape (3, 1)

array([[ 0.32954659],
       [ 0.51252379],
       [-1.90122288]])

In [37]:
data.sum(axis=0)  # sum along axis 0 (down the rows): one total per column

array([-0.63521267, -1.42695453,  1.00301471])

In [38]:
data.mean()

-0.11768361086017161

In [39]:
data.min()

-1.3698358960846397

In [40]:
data.max()

1.1727705658957737

# Math operations

In [41]:
data = np.random.randn(3,3)
data

array([[ 0.37876057, -0.00258137, -0.79327307],
       [ 0.00151133, -0.43516843, -0.63249574],
       [ 1.24419263, -0.48450109,  0.06697843]])

In [42]:
np.exp(data)  # applied element-wise; the result has the same shape as data

array([[1.46047332, 0.99742196, 0.45236176],
       [1.00151247, 0.64715566, 0.53126425],
       [3.470132  , 0.61600445, 1.06927242]])

In [43]:
np.sin(data)

array([[ 0.36976917, -0.00258137, -0.7126532 ],
       [ 0.00151133, -0.42156314, -0.59115954],
       [ 0.94713743, -0.46576693,  0.06692837]])