# What is NumPy?

NumPy is the fundamental package for scientific computing in Python. 


<!--{
will be talking a bit about how NumPy allows integration with C/C++, and about its linear algebra capabilities
}-->

# NumPy Basics: Arrays and Vectorized Computation

In [1]:
import numpy as np
import matplotlib.pyplot as plt

### Why NumPy?

In [2]:
import numpy as np
my_arr = np.arange(1000000)
my_list = list(range(1000000))

In [3]:
my_arr.dtype

dtype('int32')

In [4]:
my_list.dtype

AttributeError: 'list' object has no attribute 'dtype'

In [5]:
%time for _ in range(10): my_arr2 = my_arr * 2
%time for _ in range(10): my_list2 = [x * 2 for x in my_list]

Wall time: 31.2 ms
Wall time: 1.22 s


 - accelerated computation for numerical data
 - built-in lists and arrays are slow
     - primary reason: lack of type information (dtype)
         - NumPy arrays are containers for homogeneous data

## The NumPy ndarray: A Multidimensional Array Object

N-dimensional array object (`ndarray`):
 - a fast, flexible container for large datasets in Python.
 - lets you perform operations on whole blocks of data using syntax similar to the equivalent operations between scalars.


In [6]:
# Generate some random data
data = np.random.randn(2, 3)
data

array([[ 0.11345834,  0.01378537, -1.1239487 ],
       [ 0.84497045, -1.71334994,  0.41027007]])

In [7]:
data.dtype

dtype('float64')

In [8]:
data.shape

(2, 3)

In [9]:
data * 10

array([[  1.1345834 ,   0.13785371, -11.23948697],
       [  8.44970454, -17.13349939,   4.10270072]])

<!--{talk about what's happening here :- broadcasting , explained later in the notebook}-->

NOTE:  
$$\text{list} * \text{integer}$$

repeats the list (concatenates it with itself) rather than multiplying each element.

In [10]:
a_list = [6,96,6 ,324]
a_list*2

[6, 96, 6, 324, 6, 96, 6, 324]

In [11]:
data**2 - data

array([[-0.10058555, -0.01359533,  2.38720937],
       [-0.13099539,  4.64891795, -0.24194854]])

### Creating ndarrays

From a list, or a list of lists (which is still a list):

In [12]:
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1

array([6. , 7.5, 8. , 0. , 1. ])

In [13]:
arr2 = np.asarray([data1,data1])

In [14]:
arr2

array([[6. , 7.5, 8. , 0. , 1. ],
       [6. , 7.5, 8. , 0. , 1. ]])

In [15]:
arr2.dtype

dtype('float64')

In [16]:
arr3 = np.asarray([data1,data1],dtype = np.int32)

In [17]:
arr3

array([[6, 7, 8, 0, 1],
       [6, 7, 8, 0, 1]])

In [18]:
arr2.ndim

2

In [19]:
arr2.shape

(2, 5)

In [20]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [21]:
np.zeros((2,5))

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [22]:
np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [23]:
np.arange(20).reshape(2,10)

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]])

<!--{transposing further ahead in the notebook}-->

### Data Types for ndarrays

dtype is metadata (data about data)

In [24]:
arr1 = np.array([1, 2, 3], dtype=np.float64)
arr2 = np.array([1, 2, 3], dtype=np.int32)

In [25]:
arr1.dtype

dtype('float64')

In [26]:
arr2.dtype

dtype('int32')

In [27]:
arr = np.array([1., 2, 3, 4, 5])
arr.dtype

dtype('float64')

In [28]:
float_arr = arr.astype(np.float64)
float_arr.dtype

dtype('float64')

In [29]:
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
arr
arr.astype(np.int32)

array([ 3, -1, -2,  0, 12, 10])

<!--(talk about up&down-casting)-->

### Arithmetic with NumPy Arrays

<!--{talk about vectorization}-->

In [30]:
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [31]:
1 / arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [32]:
arr**1.5

array([[ 1.        ,  2.82842712,  5.19615242],
       [ 8.        , 11.18033989, 14.69693846]])

In [33]:
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])
arr2

array([[ 0.,  4.,  1.],
       [ 7.,  2., 12.]])

In [34]:
arr2 > arr1

array([[False,  True, False],
       [ True, False,  True]])

### Basic Indexing and Slicing

In [35]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [36]:
arr[5]

5

In [37]:
arr[5:8]

array([5, 6, 7])

In [38]:
arr[5:8] = 10
arr

array([ 0,  1,  2,  3,  4, 10, 10, 10,  8,  9])

<!--{again,broadcasting coming down the road}-->

In [39]:
arr_sliced = arr[5:8]
arr_sliced

array([10, 10, 10])

Slicing an array creates a view, not a copy, so assigning to the slice modifies the original array.<br>
{note on performance (why)} 

In [40]:
arr_sliced[1] = 12345
arr

array([    0,     1,     2,     3,     4,    10, 12345,    10,     8,
           9])

In [41]:
arr_sliced[:] = 64
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

In [42]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [43]:
arr2d[2]

array([7, 8, 9])

In [44]:
arr2d[0][2]

3

In [45]:
arr2d[0,2]

3

In [46]:
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [47]:
arr3d.shape

(2, 2, 3)

In [48]:
arr3d[0]

array([[1, 2, 3],
       [4, 5, 6]])

In [49]:
old_values = arr3d[0].copy()

In [50]:
arr3d[0] = 42
arr3d

array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [51]:
arr3d[0] = old_values
arr3d

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [52]:
arr3d[1, 0]

array([7, 8, 9])

In [53]:
x = arr3d[1]
x

array([[ 7,  8,  9],
       [10, 11, 12]])

In [54]:
x[0]

array([7, 8, 9])

#### Indexing with slices

In [55]:
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

In [56]:
arr[1:6]

array([ 1,  2,  3,  4, 64])

In [57]:
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [58]:
arr2d[:2]

array([[1, 2, 3],
       [4, 5, 6]])

In [59]:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

In [60]:
arr2d[1, :2]

array([4, 5])

In [61]:
arr2d[:2, 2]

array([3, 6])

In [62]:
arr2d[:, :1]

array([[1],
       [4],
       [7]])

In [63]:
arr2d[:2, 1:] = 0
arr2d

array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])

### Boolean Indexing

In [153]:
data = np.random.randn(3, 3)

In [154]:
data

array([[-0.58145567,  1.33120128, -1.44546298],
       [-0.14441365,  0.0653789 ,  2.21456247],
       [-0.20145466, -0.30084359,  0.87498198]])

In [155]:
arr1 = np.arange(9).reshape(3,3)

In [156]:
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [157]:
arr1.T

array([[0, 3, 6],
       [1, 4, 7],
       [2, 5, 8]])

In [158]:
 arr2 = arr1 == arr1.T

In [159]:
arr2

array([[ True, False, False],
       [False,  True, False],
       [False, False,  True]])

In [160]:
data[arr2]

array([-0.58145567,  0.0653789 ,  0.87498198])

In [161]:
arr2 = arr1<arr1.T

In [162]:
arr2

array([[False,  True,  True],
       [False, False,  True],
       [False, False, False]])

In [163]:
data[arr2]

array([ 1.33120128, -1.44546298,  2.21456247])

In [164]:
data[~arr2]

array([-0.58145567, -0.14441365,  0.0653789 , -0.20145466, -0.30084359,
        0.87498198])

In [165]:
data[~arr2] = 0
data

array([[ 0.        ,  1.33120128, -1.44546298],
       [ 0.        ,  0.        ,  2.21456247],
       [ 0.        ,  0.        ,  0.        ]])

In [167]:
data[~arr2] = np.NaN
data

array([[        nan,  1.33120128, -1.44546298],
       [        nan,         nan,  2.21456247],
       [        nan,         nan,         nan]])

In [76]:
arr2 = (arr1<-0.5) & (arr1>0.5)

In [77]:
arr2

array([[False, False, False],
       [False, False, False],
       [False, False, False]])

In [78]:
arr2 = (arr1<-0.5) | (arr1>0.5)

In [79]:
arr2

array([[False,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [80]:
arr1[arr2] = 4242

In [81]:
arr1

array([[   0, 4242, 4242],
       [4242, 4242, 4242],
       [4242, 4242, 4242]])

### Expressing Conditional Logic as Array Operations

In [82]:
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

In [83]:
result = [(x if c else y)
          for x, y, c in zip(xarr, yarr, cond)]
result

[1.1, 2.2, 1.3, 1.4, 2.5]

In [84]:
result = np.where(cond, xarr, yarr)
result

array([1.1, 2.2, 1.3, 1.4, 2.5])

In [85]:
arr = np.random.randn(4, 4)
arr
arr > 0
np.where(arr > 0, 2, -2)

array([[-2, -2,  2, -2],
       [-2,  2,  2, -2],
       [ 2,  2, -2,  2],
       [-2, -2,  2,  2]])

In [86]:
np.where(arr > 0, 2, arr) # set only positive values to 2

array([[-0.49849026, -1.47807995,  2.        , -0.53758726],
       [-0.44756897,  2.        ,  2.        , -0.79877857],
       [ 2.        ,  2.        , -0.66109102,  2.        ],
       [-0.32071274, -2.54905052,  2.        ,  2.        ]])

### Transposing Arrays and Swapping Axes

In [87]:
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [88]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [89]:
arr = np.random.randn(6, 3)
arr

array([[-1.02094474, -1.65479213, -1.64303101],
       [ 0.4795211 , -2.3034219 ,  0.16379922],
       [-1.71930182, -0.83256985, -2.5589116 ],
       [-1.49441455, -0.83689426,  1.01251113],
       [ 0.53798409, -1.65498866, -0.59476112],
       [ 0.70047257,  1.02379547,  0.16363645]])

In [90]:
np.dot(arr,arr.T)

array([[ 6.48021606,  3.05299268,  7.3374133 ,  1.24701352,  3.16662115,
        -2.67817224],
       [ 3.05299268,  5.56252313,  0.67417041,  1.37696579,  3.97269043,
        -1.995538  ],
       [ 7.3374133 ,  0.67417041, 10.19719986,  0.67519611,  1.97487778,
        -2.47543622],
       [ 1.24701352,  1.37696579,  0.67519611,  3.95884565, -0.021123  ,
        -1.73792122],
       [ 3.16662115,  3.97269043,  1.97487778, -0.021123  ,  3.38215513,
        -1.41485139],
       [-2.67817224, -1.995538  , -2.47543622, -1.73792122, -1.41485139,
         1.56559587]])

In [91]:
arr = np.arange(16).reshape((2, 2, 4))
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [92]:
arr.transpose((1, 0, 2))

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

In [93]:
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [94]:
arr.swapaxes(1,2)

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])

### Mathematical and Statistical Methods

In [95]:
arr = np.random.randn(3, 3)*10
arr

array([[ 0.06862393, 12.44342441, -0.36614093],
       [18.92097721,  5.57799934,  9.5641847 ],
       [ 0.22761782, -1.63772127,  3.96704934]])

In [96]:
arr.mean()

5.418446061934659

In [97]:
np.mean(arr)

5.418446061934659

In [98]:
arr.sum()

48.76601455741193

In [99]:
arr.mean(axis=1)

array([ 4.0486358 , 11.35438708,  0.8523153 ])

In [100]:
arr.sum(axis=0)

array([19.21721896, 16.38370249, 13.16509311])

<!--{explain how}-->

In [101]:
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])
arr.cumsum()

array([ 0,  1,  3,  6, 10, 15, 21, 28], dtype=int32)

In [102]:
arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [103]:
arr.cumsum(axis=0)

array([[ 0,  1,  2],
       [ 3,  5,  7],
       [ 9, 12, 15]], dtype=int32)

In [104]:
arr.cumprod(axis=1)

array([[  0,   0,   0],
       [  3,  12,  60],
       [  6,  42, 336]], dtype=int32)

### Methods for Boolean Arrays

In [105]:
arr = np.random.randn(100)
(arr > 0).sum() # Number of positive values

52

In [106]:
bools = np.array([False, False, True, False])

In [107]:
bools.any()

True

In [108]:
bools.all()

False

### Sorting

In [109]:
arr = np.random.randn(6)
arr

array([ 0.75630355, -1.16304048, -0.42395081, -1.05990009, -0.4220576 ,
        1.09010189])

In [110]:
arr.sort()
arr

array([-1.16304048, -1.05990009, -0.42395081, -0.4220576 ,  0.75630355,
        1.09010189])

In [111]:
arr = np.random.randn(5, 3)
arr

array([[ 0.32474132, -0.6841474 ,  2.4480647 ],
       [-0.86834799,  1.49918595,  0.00887791],
       [-0.07264062,  1.17637938, -0.45319032],
       [-0.25651691, -0.504704  ,  0.16825096],
       [-0.29875141,  1.65992439,  0.60301037]])

In [112]:
arr.sort(1)
arr

array([[-0.6841474 ,  0.32474132,  2.4480647 ],
       [-0.86834799,  0.00887791,  1.49918595],
       [-0.45319032, -0.07264062,  1.17637938],
       [-0.504704  , -0.25651691,  0.16825096],
       [-0.29875141,  0.60301037,  1.65992439]])

## Linear Algebra

In [113]:
x = np.array([[1., 2., 3.], [4., 5., 6.]])
y = np.array([[6., 23.], [-1, 7], [8, 9]])

In [114]:
x

array([[1., 2., 3.],
       [4., 5., 6.]])

In [115]:
y

array([[ 6., 23.],
       [-1.,  7.],
       [ 8.,  9.]])

In [116]:
x.dot(y)

array([[ 28.,  64.],
       [ 67., 181.]])

In [117]:
np.dot(x, y)

array([[ 28.,  64.],
       [ 67., 181.]])

In [118]:
np.dot(x, np.ones(3))

array([ 6., 15.])

In [119]:
x @ np.ones(3)

array([ 6., 15.])

In [120]:
from numpy.linalg import inv, qr
X = np.random.randn(5, 2)
X

array([[ 1.20557175, -0.57682399],
       [-0.41356434,  0.68745118],
       [ 1.98962111,  1.20570104],
       [-0.59759732,  0.65908348],
       [ 2.41644161, -0.39820876]])

In [121]:
mat = X.T.dot(X)
inv(mat)

array([[ 0.08490443, -0.00187748],
       [-0.00187748,  0.35067372]])

In [122]:
mat.dot(inv(mat))

array([[1., 0.],
       [0., 1.]])

## Basic Intro to Broadcasting

In [123]:
arr = np.arange(5)

In [124]:
arr

array([0, 1, 2, 3, 4])

In [125]:
arr*4

array([ 0,  4,  8, 12, 16])

Here we say that the scalar value 4 has been broadcast to all of the other elements in the multiplication operation.

## Sample application
(demeaning: one step of normalizing data)

In [126]:
arr = np.random.randn(4,3)

In [127]:
arr

array([[ 2.68179613e-01,  2.35773980e-01,  7.53345159e-01],
       [ 1.67302844e+00, -2.09358250e-01,  7.39455270e-01],
       [ 1.06465068e+00,  3.78504067e-01, -1.03281437e+00],
       [ 5.53301090e-04,  7.09529540e-01, -6.39357468e-01]])

In [128]:
arr.mean(0)

array([ 0.75160301,  0.27861233, -0.04484285])

In [129]:
arr2 = arr - arr.mean(0)

In [130]:
arr2.mean(0)

array([ 2.77555756e-17, -2.77555756e-17,  0.00000000e+00])

#  <center>Broadcasting</center>
<img src="Annotation 2019-09-22 230342.jpg" alt="Broadcasting illustration">

In [131]:
arr 

array([[ 2.68179613e-01,  2.35773980e-01,  7.53345159e-01],
       [ 1.67302844e+00, -2.09358250e-01,  7.39455270e-01],
       [ 1.06465068e+00,  3.78504067e-01, -1.03281437e+00],
       [ 5.53301090e-04,  7.09529540e-01, -6.39357468e-01]])

In [132]:
row_means = arr.mean(1)

In [133]:
row_means

array([0.41909958, 0.73437515, 0.13678012, 0.02357512])

In [134]:
arr - row_means

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [135]:
arr3 = (arr.T - row_means.T).T

In [136]:
arr3

array([[-0.15091997, -0.1833256 ,  0.33424557],
       [ 0.93865329, -0.9437334 ,  0.00508012],
       [ 0.92787056,  0.24172394, -1.1695945 ],
       [-0.02302182,  0.68595442, -0.66293259]])

In [137]:
arr3.mean(1)

array([1.85037171e-17, 7.40148683e-17, 0.00000000e+00, 0.00000000e+00])

<!--{give some examples of shapes for which broadcasting works and shapes for which it doesn't}-->

In [138]:
arr = np.arange(27).reshape(3,3,3)

In [139]:
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

In [140]:
arr2 = np.array([1,2,3])

In [141]:
arr2.shape

(3,)

In [142]:
arr - arr2

array([[[-1, -1, -1],
        [ 2,  2,  2],
        [ 5,  5,  5]],

       [[ 8,  8,  8],
        [11, 11, 11],
        [14, 14, 14]],

       [[17, 17, 17],
        [20, 20, 20],
        [23, 23, 23]]])

In [143]:
arr2 = arr2.reshape(1,3)

In [144]:
arr - arr2

array([[[-1, -1, -1],
        [ 2,  2,  2],
        [ 5,  5,  5]],

       [[ 8,  8,  8],
        [11, 11, 11],
        [14, 14, 14]],

       [[17, 17, 17],
        [20, 20, 20],
        [23, 23, 23]]])

In [145]:
arr = np.arange(24).reshape(2,4,3)

In [146]:
arr

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]],

       [[12, 13, 14],
        [15, 16, 17],
        [18, 19, 20],
        [21, 22, 23]]])

In [147]:
arr - arr2

array([[[-1, -1, -1],
        [ 2,  2,  2],
        [ 5,  5,  5],
        [ 8,  8,  8]],

       [[11, 11, 11],
        [14, 14, 14],
        [17, 17, 17],
        [20, 20, 20]]])

In [148]:
arr2 = arr2.reshape(1,3)

In [149]:
arr3 = np.concatenate((arr2,arr2),axis=0)

In [150]:
arr3

array([[1, 2, 3],
       [1, 2, 3]])

In [151]:
arr3.shape

(2, 3)

In [152]:
arr- arr3

ValueError: operands could not be broadcast together with shapes (2,4,3) (2,3) 

<!--{discuss why this doesn't work, even though 2 is a factor of 4 and tiling would in principle be possible}-->

### Conclusion: head over to pandas