# NumPy ndarray
### This is the outline:

In [1]:
import numpy as np

In [3]:
data = np.asarray([[0.9526, -0.246 , -0.8856], [ 0.5639, 0.2379, 0.9104]])
print data

[[ 0.9526 -0.246  -0.8856]
 [ 0.5639  0.2379  0.9104]]


In [4]:
data.shape

(2L, 3L)

In [5]:
data.dtype

dtype('float64')

* ### To create an ndarray

In [6]:
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1

array([ 6. ,  7.5,  8. ,  0. ,  1. ])

In [7]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2 = np.array(data2)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [8]:
arr2.shape

(2L, 4L)

In [11]:
arr2.ndim

2

Unless specified, **np.array** tries to infer a good data type for the array that it creates.

In [12]:
arr2.dtype

dtype('int32')

Also, np.zeros and np.empty will create new arrays. However, it's not safe to assume that np.empty will return an array of all zeros. In many cases, as shown below, it will return uninitialized garbage values.

In [13]:
np.zeros(10)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [15]:
np.zeros((2, 3))

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [17]:
np.empty((2, 3, 2))

array([[[  0.00000000e+000,   6.36598740e-314],
        [  0.00000000e+000,   1.27319747e-313],
        [  1.27319747e-313,   1.27319747e-313]],

       [[  6.40774317e+170,   1.99217960e-313],
        [  1.04136250e-071,   2.75859453e-313],
        [  8.61113693e-067,   1.39668731e-075]]])

**arange** is an array-valued version of the built-in Python range function:

In [18]:
np.arange(15)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

* some built-in functions~

**array**: convert input to an ndarray. Copies the input data by default.

**asarray**: convert input to an ndarray, but do not copy if the input is already an ndarray

**ones_like**: take an existing array and produce an array of all 1's with the same shape and dtype. zeros_like and empty_like are similar

**eye, identity**: create a square $N \times N$ identity matrix $\mathbf{I}_{N\times N}$

In [19]:
np.ones_like(arr2)

array([[1, 1, 1, 1],
       [1, 1, 1, 1]])

In [20]:
np.eye(4)

array([[ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.]])

In [21]:
np.identity(4)

array([[ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.]])

* ### data types for ndarrays

In [27]:
arr1 = np.array([1, 2, 3], dtype = np.float64)
arr1

array([ 1.,  2.,  3.])

In [28]:
arr2 = np.array([1, 2, 3], dtype=np.int32)
arr2

array([1, 2, 3])

In [29]:
print arr1.dtype
print arr2.dtype

float64
int32


#### For other dtypes, refer to *Python for Data Analysis*, page 83

### operations between arrays and scalars

In [22]:
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr

array([[1, 2, 3],
       [4, 5, 6]])

In [23]:
arr * arr

array([[ 1,  4,  9],
       [16, 25, 36]])

In [25]:
1.0 / arr

array([[ 1.        ,  0.5       ,  0.33333333],
       [ 0.25      ,  0.2       ,  0.16666667]])

In [26]:
arr ** 0.5

array([[ 1.        ,  1.41421356,  1.73205081],
       [ 2.        ,  2.23606798,  2.44948974]])

### Basic indexing and slicing

In [30]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [31]:
arr[5]

5

In [32]:
arr[5:8]

array([5, 6, 7])

In [33]:
arr[5:8] = 12
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

An important thing is that array slices are views on the original array. This means that the data is not copied, and any modifications to the view will be reflected in the source array:

In [34]:
arr_slice = arr[5:8]
arr_slice[1] = 12345
arr

array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,     9])

In [35]:
arr_slice[:] = 64
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

If you want a copy of a slice of an ndarray instead of a view, you will need to explicitly copy the array; for example arr[5:8].copy().

### multi-dimensional arrays

In [36]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [37]:
arr2d[2]

array([7, 8, 9])

In [38]:
arr2d[0][2]

3

In [39]:
arr2d[0, 2]

3

In [40]:
arr2d.shape

(3L, 3L)

In [41]:
arr2d[:2]

array([[1, 2, 3],
       [4, 5, 6]])

In [42]:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

In [43]:
arr2d[1, :]

array([4, 5, 6])

In [44]:
arr2d[0, 0:]

array([1, 2, 3])

### Boolean indexing

In [47]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = np.random.randn(7, 4)

In [48]:
names

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], 
      dtype='|S4')

In [49]:
data

array([[ 0.64964176,  3.3756658 , -0.44286654, -0.64276075],
       [ 0.70435856,  0.4376423 ,  0.55492949,  1.14089177],
       [-0.54990446, -1.02285075, -0.16334528, -0.28660276],
       [-0.78802567,  0.41961283, -1.42074941,  1.08309653],
       [ 0.18011377,  1.4324546 ,  0.91915594,  0.39519993],
       [ 0.7339676 ,  0.7446452 ,  1.98514782, -1.80363421],
       [-1.13401174,  0.66870088,  0.18653094, -0.59452025]])

In [50]:
names == 'Bob'

array([ True, False, False,  True, False, False, False], dtype=bool)

In [51]:
data[names == 'Bob']

array([[ 0.64964176,  3.3756658 , -0.44286654, -0.64276075],
       [-0.78802567,  0.41961283, -1.42074941,  1.08309653]])

In [52]:
data[names == 'Bob', 2:]

array([[-0.44286654, -0.64276075],
       [-1.42074941,  1.08309653]])

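To select everything but 'Bob', you can either use != or negate the condition using - (note: newer NumPy versions deprecate, and later reject, unary - on boolean arrays; use ~ to negate the mask there):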

In [53]:
data[-(names == 'Bob')]

  if __name__ == '__main__':


array([[ 0.70435856,  0.4376423 ,  0.55492949,  1.14089177],
       [-0.54990446, -1.02285075, -0.16334528, -0.28660276],
       [ 0.18011377,  1.4324546 ,  0.91915594,  0.39519993],
       [ 0.7339676 ,  0.7446452 ,  1.98514782, -1.80363421],
       [-1.13401174,  0.66870088,  0.18653094, -0.59452025]])

In [54]:
data[names != 'Bob']

array([[ 0.70435856,  0.4376423 ,  0.55492949,  1.14089177],
       [-0.54990446, -1.02285075, -0.16334528, -0.28660276],
       [ 0.18011377,  1.4324546 ,  0.91915594,  0.39519993],
       [ 0.7339676 ,  0.7446452 ,  1.98514782, -1.80363421],
       [-1.13401174,  0.66870088,  0.18653094, -0.59452025]])

In [55]:
data[data<0]

array([-0.44286654, -0.64276075, -0.54990446, -1.02285075, -0.16334528,
       -0.28660276, -0.78802567, -1.42074941, -1.80363421, -1.13401174,
       -0.59452025])

In [56]:
data[data<0] = 0

In [57]:
data

array([[ 0.64964176,  3.3756658 ,  0.        ,  0.        ],
       [ 0.70435856,  0.4376423 ,  0.55492949,  1.14089177],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.41961283,  0.        ,  1.08309653],
       [ 0.18011377,  1.4324546 ,  0.91915594,  0.39519993],
       [ 0.7339676 ,  0.7446452 ,  1.98514782,  0.        ],
       [ 0.        ,  0.66870088,  0.18653094,  0.        ]])

### Fancy indexing

In [58]:
arr = np.empty((8, 4))
for i in range(8):
    arr[i] = i
arr

array([[ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 2.,  2.,  2.,  2.],
       [ 3.,  3.,  3.,  3.],
       [ 4.,  4.,  4.,  4.],
       [ 5.,  5.,  5.,  5.],
       [ 6.,  6.,  6.,  6.],
       [ 7.,  7.,  7.,  7.]])

In [59]:
arr[[4, 3, 0, 6]]

array([[ 4.,  4.,  4.,  4.],
       [ 3.,  3.,  3.,  3.],
       [ 0.,  0.,  0.,  0.],
       [ 6.,  6.,  6.,  6.]])

In [60]:
arr[[-3, -5, -7]]

array([[ 5.,  5.,  5.,  5.],
       [ 3.,  3.,  3.,  3.],
       [ 1.,  1.,  1.,  1.]])

In [61]:
arr = np.arange(32).reshape((8, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [62]:
arr[[1, 5, 7, 2], [0, 3, 1, 2]]

array([ 4, 23, 29, 10])

In [63]:
arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

In [64]:
arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])]

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

### Transposing arrays and swapping axes

In [65]:
arr = np.arange(15).reshape((3, 5))

In [66]:
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [67]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [68]:
np.dot(arr.T, arr)

array([[125, 140, 155, 170, 185],
       [140, 158, 176, 194, 212],
       [155, 176, 197, 218, 239],
       [170, 194, 218, 242, 266],
       [185, 212, 239, 266, 293]])

In [69]:
arr = np.arange(16).reshape((2, 2, 4))

In [70]:
arr.T

array([[[ 0,  8],
        [ 4, 12]],

       [[ 1,  9],
        [ 5, 13]],

       [[ 2, 10],
        [ 6, 14]],

       [[ 3, 11],
        [ 7, 15]]])

In [71]:
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [72]:
arr.transpose((1, 0, 2))

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

In [73]:
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [74]:
arr.swapaxes(1, 2)

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])

### Universal Functions: Fast Element-wise Array Functions

In [75]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [76]:
np.sqrt(arr)

array([ 0.        ,  1.        ,  1.41421356,  1.73205081,  2.        ,
        2.23606798,  2.44948974,  2.64575131,  2.82842712,  3.        ])

In [77]:
np.exp(arr)

array([  1.00000000e+00,   2.71828183e+00,   7.38905610e+00,
         2.00855369e+01,   5.45981500e+01,   1.48413159e+02,
         4.03428793e+02,   1.09663316e+03,   2.98095799e+03,
         8.10308393e+03])

In [79]:
x = np.random.randn(8)
y = np.random.randn(8)
print x
print y

[-2.27643535 -0.9038638   0.31835631  0.13552872  0.03620158  0.19654725
  0.39042082 -0.61497693]
[ 0.09162248  1.23845425 -0.4494515  -0.57170906  0.46005724 -1.07537687
  0.83097634  0.62590197]


In [80]:
np.maximum(x, y) # element-wise maximum

array([ 0.09162248,  1.23845425,  0.31835631,  0.13552872,  0.46005724,
        0.19654725,  0.83097634,  0.62590197])

In [81]:
x = np.arange(16).reshape(4, 4)
y = x.copy() + 1
print x
print y
np.maximum(x, y)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]


array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]])

In [86]:
arr = np.random.randn(7)
arr * 5

array([ -5.38528136,  -6.29127283,   1.00915822, -13.15335294,
        -4.16806499,   9.00664132,  -1.26515928])

In [88]:
np.modf(arr * 5)

(array([-0.38528136, -0.29127283,  0.00915822, -0.15335294, -0.16806499,
         0.00664132, -0.26515928]),
 array([ -5.,  -6.,   1., -13.,  -4.,   9.,  -1.]))

For more element-wise functions, turn to page 96

### Expressing Conditional Logic as Array Operations

In [2]:
import numpy as np

xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False]) 
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
result

[1.1000000000000001, 2.2000000000000002, 1.3, 1.3999999999999999, 2.5]

For efficiency and conciseness, we can use **np.where** instead:

In [3]:
result = np.where(cond, xarr, yarr)
result

array([ 1.1,  2.2,  1.3,  1.4,  2.5])

In [4]:
# another example
arr = np.random.randn(4, 4)
print arr
np.where(arr > 0, 2, -2)

[[-1.53800123  0.26760696  1.25466642  0.2119534 ]
 [ 0.89522368 -0.16576288  0.53274767 -0.01746469]
 [ 1.56515061  1.12339838 -0.50516662  0.42088021]
 [ 0.2407559   0.86249955 -0.16324587  0.10280481]]


array([[-2,  2,  2,  2],
       [ 2, -2,  2, -2],
       [ 2,  2, -2,  2],
       [ 2,  2, -2,  2]])

In [5]:
np.where(arr > 0, 2, arr)

array([[-1.53800123,  2.        ,  2.        ,  2.        ],
       [ 2.        , -0.16576288,  2.        , -0.01746469],
       [ 2.        ,  2.        , -0.50516662,  2.        ],
       [ 2.        ,  2.        , -0.16324587,  2.        ]])

### Mathematical and statistical methods

In [6]:
arr = np.random.randn(5, 4)

In [7]:
arr.mean()

0.23601524756433961

In [8]:
np.mean(arr)

0.23601524756433961

In [9]:
arr.sum()

4.7203049512867921

In [10]:
arr

array([[ 0.35096311, -0.10542316,  1.83071422,  0.40635926],
       [ 0.7103652 ,  1.0953786 ,  0.86917113, -1.03399534],
       [ 0.25265982,  1.10763492, -0.4609766 ,  1.65267534],
       [-0.70129397,  0.25504529, -0.80937958,  0.10840354],
       [ 0.71139591,  0.24903188, -0.84765999, -0.92076463]])

In [11]:
arr.mean(axis=1)

array([ 0.62065336,  0.4102299 ,  0.63799837, -0.28680618, -0.20199921])

In [12]:
arr.mean(axis=0)

array([ 0.26481802,  0.5203335 ,  0.11637384,  0.04253563])

In [13]:
arr.sum(0)

array([ 1.32409008,  2.60166752,  0.58186918,  0.21267816])

In [14]:
arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])

In [15]:
arr.cumsum(0)

array([[ 0,  1,  2],
       [ 3,  5,  7],
       [ 9, 12, 15]])

In [16]:
arr.cumprod(1)

array([[  0,   0,   0],
       [  3,  12,  60],
       [  6,  42, 336]])

### Methods for boolean arrays

In [17]:
arr = np.random.randn(100)
(arr > 0).sum()

52

In [18]:
bools = np.array([False, False, True, False])
bools.any() # print True if any of bools is True

True

In [19]:
bools.all() # print True if all are True

False

### Sorting

In [20]:
arr = np.random.randn(8)
arr

array([ 1.1356004 ,  0.54725271, -1.24059708, -0.82078014,  0.55525726,
       -0.84713192,  0.43680132, -0.86732966])

In [21]:
arr.sort()
arr

array([-1.24059708, -0.86732966, -0.84713192, -0.82078014,  0.43680132,
        0.54725271,  0.55525726,  1.1356004 ])

In [22]:
arr = np.random.randn(5, 3)
arr

array([[-0.54440424, -0.9284118 ,  1.46626056],
       [ 0.13543007, -0.8838245 , -1.42725789],
       [-1.05829907,  0.25958211,  0.74644885],
       [-0.21781182, -0.76403345,  0.28227275],
       [ 0.88061611,  0.45962499,  0.61084327]])

In [23]:
arr.sort(1)

In [24]:
arr

array([[-0.9284118 , -0.54440424,  1.46626056],
       [-1.42725789, -0.8838245 ,  0.13543007],
       [-1.05829907,  0.25958211,  0.74644885],
       [-0.76403345, -0.21781182,  0.28227275],
       [ 0.45962499,  0.61084327,  0.88061611]])

In [26]:
large_arr = np.random.randn(1000)
large_arr.sort()
large_arr[int(0.05 * len(large_arr))] # 5% quantile 

-1.5649069019112949

### Unique and other set logic

In [27]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
np.unique(names) 

array(['Bob', 'Joe', 'Will'], 
      dtype='|S4')

In [28]:
sorted(set(names))

['Bob', 'Joe', 'Will']

In [29]:
values = np.array([6, 0, 0, 3, 2, 5, 6])
np.in1d(values, [2, 3, 6]) # compute a boolean array indicating whether each element of x is contained in y

array([ True, False, False,  True,  True, False,  True], dtype=bool)

In [30]:
np.union1d(values, [2, 3, 6]) # compute the sorted union of elements

array([0, 2, 3, 5, 6])

In [31]:
np.setdiff1d(values, [2, 3, 6]) # set difference, elements in x that are not in y

array([0, 5])

In [32]:
np.setxor1d(values, [2, 3, 6]) # Set symmetric differences; elements that are in either of the arrays, but not both

array([0, 5])

### File input and output with arrays

I'll cover this part in more detail later, when learning pandas.

#### Storing arrays on disk in binary format

In [33]:
arr = np.arange(10)
# save
# np.save('some_array', arr)
# load
# np.load('some_array.npy')

#### Saving and Loading text files

In [36]:
arr = np.random.randn(5, 4)
np.savetxt('some_arr.txt', arr)

In [37]:
!cat some_arr.txt

3.644529978632979339e-01 4.426991421571346841e-01 -3.925755571679607203e-01 8.934222716070812087e-01
1.696651593963088012e-01 -1.033564070928786993e+00 6.614187565696395543e-01 -6.675349038169455795e-01
4.727918385289481829e-01 -9.487674154523418890e-01 1.716975453841113497e+00 -1.645924651256385762e-01
-4.273126077008770896e-01 1.293339599407077101e+00 -5.205211058811811536e-02 -1.182836454209269084e+00
8.755134159644130110e-01 1.897294535710043251e+00 6.264300952990280447e-01 -1.638535250436639057e+00


In [38]:
arr = np.loadtxt('some_arr.txt', delimiter=' ')
arr

array([[ 0.364453  ,  0.44269914, -0.39257556,  0.89342227],
       [ 0.16966516, -1.03356407,  0.66141876, -0.6675349 ],
       [ 0.47279184, -0.94876742,  1.71697545, -0.16459247],
       [-0.42731261,  1.2933396 , -0.05205211, -1.18283645],
       [ 0.87551342,  1.89729454,  0.6264301 , -1.63853525]])

# Linear Algebra

### Multiply

In [42]:
arr1 = np.arange(9).reshape(3, 3)
arr2 = arr1.copy()
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [43]:
arr1 * arr2

array([[ 0,  1,  4],
       [ 9, 16, 25],
       [36, 49, 64]])

In [44]:
np.dot(arr1, arr2)

array([[ 15,  18,  21],
       [ 42,  54,  66],
       [ 69,  90, 111]])

In [45]:
from numpy.linalg import inv, qr

In [46]:
X = np.random.randn(5, 5)
mat = X.T.dot(X)
mat.dot(inv(mat))

array([[  1.00000000e+00,  -6.35786734e-17,  -1.41115413e-16,
         -3.10334239e-16,  -2.74716797e-16],
       [ -1.45596426e-16,   1.00000000e+00,   1.17198714e-16,
          2.93001562e-16,   6.85490305e-16],
       [  9.35917870e-17,   4.76408491e-17,   1.00000000e+00,
          6.31033194e-18,   6.25226981e-17],
       [  1.31464940e-16,   9.90310105e-17,   5.65740525e-17,
          1.00000000e+00,   1.34669134e-16],
       [  1.34133152e-16,  -5.29775789e-16,  -1.86438859e-16,
         -1.94338646e-16,   1.00000000e+00]])

In [47]:
q, r = qr(mat)
print q, r

[[-0.88718904  0.27961284 -0.23579991 -0.15063546  0.23752817]
 [-0.17635491 -0.6711166   0.25689994  0.32204346  0.59058605]
 [ 0.11819679 -0.04732268 -0.82056769  0.55607301  0.03523562]
 [ 0.26301901 -0.24800116 -0.42867615 -0.73387893  0.38337306]
 [ 0.31407814  0.63849444  0.14596658  0.16062843  0.66826045]] [[-4.50085158 -1.85007482  0.7146285   1.24304275  2.74214988]
 [ 0.         -3.03140167 -0.44193889 -1.25612563  3.94771636]
 [ 0.          0.         -0.95672707 -0.88637959  0.67891204]
 [ 0.          0.          0.         -0.4064449   0.36518319]
 [ 0.          0.          0.          0.          0.54920207]]


## For more functions, turn to page 106

### Random number generation is covered after page 106.