## NumPy-快速处理数据

In [1]:
import numpy as np

- 创建数组时，NumPy会根据数组内容自动推断出合适的dtype类型

In [2]:
# 3x4 matrix built from integer literals only, so NumPy infers an integer dtype.
a_int = np.array([
    [1, 2, 3, 4],
    [4, 5, 6, 7],
    [7, 8, 9, 10],
])

In [3]:
# Bytes per element, element count, total bytes and per-axis strides of a_int.
# A single-argument print(...) emits the same text on Python 2 and also runs on Python 3.
print("{0} {1} {2} {3}".format(a_int.itemsize, a_int.size, a_int.nbytes, a_int.strides))

4 12 48 (16L, 4L)


In [4]:
# Number of axes, shape and inferred dtype of a_int.
# A single-argument print(...) emits the same text on Python 2 and also runs on Python 3.
print("{0} {1} {2}".format(a_int.ndim, a_int.shape, a_int.dtype))

2 (3L, 4L) int32


In [5]:
# Same values as a_int, but the single float literal (1.0) makes NumPy infer
# a floating-point dtype for the whole array.
a_float = np.array([
    [1.0, 2, 3, 4],
    [4, 5, 6, 7],
    [7, 8, 9, 10],
])

In [6]:
# Bytes per element, element count, total bytes and per-axis strides of a_float.
# A single-argument print(...) emits the same text on Python 2 and also runs on Python 3.
print("{0} {1} {2} {3}".format(a_float.itemsize, a_float.size, a_float.nbytes, a_float.strides))

8 12 96 (32L, 8L)


In [7]:
# Number of axes, shape and inferred dtype of a_float.
# A single-argument print(...) emits the same text on Python 2 and also runs on Python 3.
print("{0} {1} {2}".format(a_float.ndim, a_float.shape, a_float.dtype))

2 (3L, 4L) float64


- 当某个轴的长度指定为 **-1** 时，NumPy将根据数组元素的总个数自动计算此轴的长度

In [8]:
# Assigning to .shape reshapes a_int in place; -1 asks NumPy to infer that axis
# length from the element count (12 elements / 2 rows -> 6 columns).
# NOTE(review): NumPy's documentation discourages assigning to .shape in favour
# of reshape() - confirm against the NumPy version in use.
a_int.shape = 2, -1
a_int

array([[ 1,  2,  3,  4,  4,  5],
       [ 6,  7,  7,  8,  9, 10]])

- reshape方法，可以创建一个改变了尺寸的新数组，原数组的shape保持不变
- 新数组和原数组共享内存区域

In [9]:
# reshape() returns a new array object; here the requested (2, -1) shape matches
# the (2, 6) shape a_int already has after the in-place change in the previous cell.
d = a_int.reshape((2, -1))
d

array([[ 1,  2,  3,  4,  4,  5],
       [ 6,  7,  7,  8,  9, 10]])

- 通过切片获取的新的数组是原始数组的一个视图，与原始数组共享内存区域

In [10]:
# Basic slicing on a = [0, 1, ..., 9].
# print(...) with one argument behaves identically on Python 2 and Python 3.
a = np.arange(10)
print(a[3:-1])    # from index 3 up to, but not including, the last element
print(a[0:-1:2])  # every second element, again excluding the last one

[3 4 5 6 7 8]
[0 2 4 6 8]


- 使用整数序列(**python列表或numpy数组**)作为下标获得的数组，**不**和原始数组共享数据空间

In [11]:
# Descending sequence from 10 down to 2 (the stop value 1 is excluded).
x = np.arange(10, 1, -1)
x

array([10,  9,  8,  7,  6,  5,  4,  3,  2])

In [12]:
# Index with a Python list of integer positions; position 3 is selected twice.
# print(...) with one argument behaves identically on Python 2 and Python 3.
y = x[[3, 3, 1, 8]]
print(y)

[7 7 9 2]


In [13]:
# Writing into y must leave x unchanged, because integer-list indexing
# returned a copy rather than a view.
# print(...) with one argument behaves identically on Python 2 and Python 3.
y[2] = 999
print(y)
print(x)

[  7   7 999   2]
[10  9  8  7  6  5  4  3  2]


In [14]:
# Same kind of indexing with a NumPy integer array; -3 counts from the end of x.
b = x[np.array([3,3,-3,8])]
b

array([7, 7, 4, 2])

In [15]:
# Writing into b must leave x unchanged: the integer-array indexing result is a copy.
# print(...) with one argument behaves identically on Python 2 and Python 3.
b[2] = 10
print(b)
print(x)

[ 7  7 10  2]
[10  9  8  7  6  5  4  3  2]


- 使用布尔**数组**作为下标获得的数组不和原始数组共享数据空间（注意：旧版NumPy会把python的bool列表当作整数下标处理，较新的版本则把bool列表同样视为布尔掩码，请以所用版本的文档为准）
- 布尔数组通常用布尔运算的ufunc函数产生

In [16]:
# x = [0, 1, 2, 3, 4]
x = np.arange(0,5)

In [17]:
# A boolean array used as an index keeps the elements where the mask is True.
y = x[np.array([True, True, False, False, False])]
y

array([0, 1])

In [18]:
# Writing into y must leave x unchanged: the boolean-mask indexing result is a copy.
# print(...) with one argument behaves identically on Python 2 and Python 3.
y[0:2] = 999, 888
print(y)
print(x)

[999 888]
[0 1 2 3 4]


In [19]:
# x < 2 is evaluated element-wise into a boolean array, which then selects from x.
x[x<2]

array([0, 1])

### 多维数组

- NumPy采用元组(tuple)作为数组下标
- 元组的语法定义只需要用逗号隔开即可

In [20]:
# Column of shape (6, 1) holding 0, 10, ..., 50; -1 lets reshape infer the second axis.
np.arange(0, 60, 10).reshape(6, -1)

array([[ 0],
       [10],
       [20],
       [30],
       [40],
       [50]])

In [21]:
# Integers 0..5 as a 1-D array.
np.arange(0, 6)

array([0, 1, 2, 3, 4, 5])

In [22]:
# Broadcasting the (6, 1) column against the 6-element row yields a 6x6 table
# whose entry [i, j] equals 10*i + j.
a = np.arange(0, 60, 10).reshape(6, -1) + np.arange(0, 6)
a

array([[ 0,  1,  2,  3,  4,  5],
       [10, 11, 12, 13, 14, 15],
       [20, 21, 22, 23, 24, 25],
       [30, 31, 32, 33, 34, 35],
       [40, 41, 42, 43, 44, 45],
       [50, 51, 52, 53, 54, 55]])

In [23]:
# The two index sequences are paired element-wise: this picks a[0, 0] and a[1, 1].
a[(0,1),(0,1)]

array([ 0, 11])

In [24]:
# Rows 4 onward (slice) combined with columns 0, 2 and 4 (integer list).
a[4:, [0,2,4]]

array([[40, 42, 44],
       [50, 52, 54]])

In [25]:
# The 1/0 values are converted to booleans: True at positions 0 and 2 only.
mask = np.array([1,0,1,0,0,0], dtype='bool')

In [26]:
# Column 2 of the rows where mask is True (rows 0 and 2).
a[mask,2]

array([ 2, 22])

### 结构数组

In [27]:
# Structured dtype: 'name' is a 32-byte string, 'age' an 8-bit integer and
# 'weight' a 32-bit float. Each tuple below becomes one record.
persontype = np.dtype([('name','S32'), ('age', 'i1'), ('weight', 'f4')])
a = np.array([("Zhang",32,75.5),("Wang",24,65.2)], dtype=persontype)
a

array([('Zhang', 32, 75.5), ('Wang', 24, 65.19999694824219)], 
      dtype=[('name', 'S32'), ('age', 'i1'), ('weight', '<f4')])

In [28]:
a[0]  # a[0] is a single structured element; it shares its memory with the array a

('Zhang', 32, 75.5)

In [29]:
# The element carries the same structured dtype as the array it came from.
a[0].dtype

dtype([('name', 'S32'), ('age', 'i1'), ('weight', '<f4')])

In [30]:
a[0]['name']  # fields are read with a string key, much like a dict

'Zhang'

In [31]:
a[:]['age']  # the 'age' field of every record, returned as an int8 array

array([32, 24], dtype=int8)

In [32]:
np.dtype([('f1',[('f2', 'i2')])])  # a structure with field f1, whose value is itself a structure with a 16-bit integer field f2

dtype([('f1', [('f2', '<i2')])])

In [33]:
# 'f0' is a 32-bit integer; 'f1' is a (2, 3) sub-array of 32-bit integers, with
# the sub-array shape given as the third item of the field tuple.
dt = np.dtype([('f0','i4'), ('f1', 'i4', (2,3))])

In [34]:
nest_struct = np.array((1, [[1,2,3], [4,5,6]]), dtype=dt)  # when a field is itself an array, its shape is the third item of the field tuple
nest_struct

array((1, [[1, 2, 3], [4, 5, 6]]), 
      dtype=[('f0', '<i4'), ('f1', '<i4', (2, 3))])

In [35]:
# Reading the 'f1' field returns its (2, 3) sub-array.
nest_struct['f1']

array([[1, 2, 3],
       [4, 5, 6]])

### 广播

In [36]:
# Column vector 0, 10, ..., 50 with shape (6, 1).
a = np.arange(0, 60, 10).reshape(-1, 1)
a.shape

(6L, 1L)

In [37]:
# 1-D array 0..4 with shape (5,).
b = np.arange(0, 5)
b.shape

(5L,)

In [38]:
# Broadcasting: (6, 1) + (5,) -> (6, 5).
c = a + b

In [39]:
# Give b an explicit leading axis so that it becomes a (1, 5) row.
b2 = b.reshape((1, 5))
b2

array([[0, 1, 2, 3, 4]])

In [40]:
# Copy the single row 6 times along axis 0 -> shape (6, 5). This spells out
# explicitly the expansion that broadcasting performs conceptually on b.
b3 = b2.repeat(6, axis=0)
b3

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [41]:
# Copy the single column 5 times along axis 1 -> shape (6, 5), the explicit
# counterpart of how a is expanded when broadcasting.
a2 = a.repeat(5,axis=1)
a2

array([[ 0,  0,  0,  0,  0],
       [10, 10, 10, 10, 10],
       [20, 20, 20, 20, 20],
       [30, 30, 30, 30, 30],
       [40, 40, 40, 40, 40],
       [50, 50, 50, 50, 50]])

In [42]:
# Element-wise sum of the two explicitly expanded (6, 5) arrays; the values
# are the same as those of the broadcast sum a + b.
a2 + b3

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [20, 21, 22, 23, 24],
       [30, 31, 32, 33, 34],
       [40, 41, 42, 43, 44],
       [50, 51, 52, 53, 54]])

### ogrid

- ogrid用切片元组作为下标进行存取，返回一组可用来广播计算的数组
- ogrid是属性而非函数
- 切片下标有两种形式：
  - 开始值:结束值:步长 (类似np.arange)
  - 开始值:结束值:长度j，当第三个参数为虚数时，它表示返回的数组的长度 (类似np.linspace)

In [43]:
# Integer slices: ogrid returns a (5, 1) column and a (1, 5) row, ready for broadcasting.
x, y = np.ogrid[0:5, 0:5]

In [44]:
# Display both grids as a tuple.
x, y

(array([[0],
        [1],
        [2],
        [3],
        [4]]), array([[0, 1, 2, 3, 4]]))

In [45]:
# An imaginary step (5j) means "number of points": 5 evenly spaced values per
# axis, with both endpoints included.
x, y = np.ogrid[-2:2:5j, 1:5:5j]
x,  y

(array([[-2.],
        [-1.],
        [ 0.],
        [ 1.],
        [ 2.]]), array([[ 1.,  2.,  3.,  4.,  5.]]))

In [46]:
# Squaring is element-wise and keeps each array's shape.
x**2, y**2

(array([[ 4.],
        [ 1.],
        [ 0.],
        [ 1.],
        [ 4.]]), array([[  1.,   4.,   9.,  16.,  25.]]))

In [47]:
# (5, 1) minus (1, 5) broadcasts to a (5, 5) array whose entry [i, j] is x_i**2 - y_j**2.
x**2 - y**2

array([[  3.,   0.,  -5., -12., -21.],
       [  0.,  -3.,  -8., -15., -24.],
       [ -1.,  -4.,  -9., -16., -25.],
       [  0.,  -3.,  -8., -15., -24.],
       [  3.,   0.,  -5., -12., -21.]])

In [48]:
# Integer grids 0..2 as a (3, 1) column and a (1, 3) row.
a, b = np.ogrid[0:3, 0:3]

In [49]:
# Broadcast sum: entry [i, j] is i**2 + j**2.
a**2 + b**2

array([[0, 1, 4],
       [1, 2, 5],
       [4, 5, 8]])

### ufunc的方法

In [50]:
np.add.reduce([1,2,3])  # reduce: folds the ufunc over the input, i.e. 1 + 2 + 3

6

In [51]:
# With axis=1 each row is reduced separately.
np.add.reduce([[1,2,3],[4,5,6]], axis=1)

array([ 6, 15])

In [52]:
np.add.accumulate([1,2,3])  # accumulate: like reduce, but keeps every intermediate result

array([1, 3, 6])

In [53]:
# Running sums along each row (axis=1).
np.add.accumulate([[1,2,3], [4,5,6]], axis=1)

array([[ 1,  3,  6],
       [ 4,  9, 15]])

In [54]:
np.multiply.outer([1,2,3,4,5],[2,3,4])  # outer: <op>.outer(a, b) applies the ufunc to every pair (a_i, b_j)

array([[ 2,  3,  4],
       [ 4,  6,  8],
       [ 6,  9, 12],
       [ 8, 12, 16],
       [10, 15, 20]])

In [55]:
# Operands for rebuilding the outer product by hand: a as a (5, 1) column,
# b as a (1, 3) row.
a = np.array([1,2,3,4,5]).reshape(5,-1)
b = np.array([2,3,4]).reshape(1,-1)

In [56]:
# Display the column and the row side by side.
a,b

(array([[1],
        [2],
        [3],
        [4],
        [5]]), array([[2, 3, 4]]))

In [57]:
# Expand the column to 3 identical columns -> shape (5, 3).
a2 = a.repeat(3, axis=1)
a2

array([[1, 1, 1],
       [2, 2, 2],
       [3, 3, 3],
       [4, 4, 4],
       [5, 5, 5]])

In [58]:
# Expand the row to 5 identical rows -> shape (5, 3).
b2 = b.repeat(5, axis=0)
b2

array([[2, 3, 4],
       [2, 3, 4],
       [2, 3, 4],
       [2, 3, 4],
       [2, 3, 4]])

In [59]:
# Show the expanded left operand again, ahead of the element-wise product.
a2

array([[1, 1, 1],
       [2, 2, 2],
       [3, 3, 3],
       [4, 4, 4],
       [5, 5, 5]])

In [60]:
# Show the expanded right operand again, ahead of the element-wise product.
b2

array([[2, 3, 4],
       [2, 3, 4],
       [2, 3, 4],
       [2, 3, 4],
       [2, 3, 4]])

In [61]:
a2 * b2  # equivalent to np.multiply.outer([1,2,3,4,5],[2,3,4])

array([[ 2,  3,  4],
       [ 4,  6,  8],
       [ 6,  9, 12],
       [ 8, 12, 16],
       [10, 15, 20]])

### 矩阵运算

- dot 对于两个一维的数组，计算的是内积；
- 对于二维数组，计算的是两个数组的矩阵乘积

In [95]:
# For two 1-D arrays, the dot method, np.inner and np.dot all compute the same
# inner product: 1*4 + 2*5 + 3*6 = 32.
# print(...) with one argument behaves identically on Python 2 and Python 3.
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
print(a.dot(b))
print(np.inner(a,b))
print(np.dot(a, b))

32
32
32


In [82]:
# b2 is a2 viewed as a (1, 3) row matrix, c2 is a2 viewed as a (3, 1) column matrix.
a2 = np.array([1,2,3])
b2 = a2.reshape((-1, 3))
c2 = a2.reshape((3, -1))

In [83]:
b2.dot(c2)  # (1, 3) dot (3, 1) -> (1, 1) matrix product: 1*1 + 2*2 + 3*3 = 14

array([[14]])

- inner : 和dot乘积一样，对于两个一维数组，计算的是这两个数组对应下标元素的乘积和
- 对于多维数组，它计算的结果数组中的每个元素都是：数组a和b的最后一维的内积，因此数组a和b的最后一维的长度必须相同

In [88]:
# Two 3x4 matrices holding 0..11 and 12..23 respectively.
a = np.arange(12).reshape((3,4))
b = np.arange(12,24).reshape(3,4)

In [89]:
# Display the left operand.
a

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [90]:
# Display the right operand.
b

array([[12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [91]:
# Transpose of b, shape (4, 3); this is the right-hand factor used in the dot-product form below.
b.T

array([[12, 16, 20],
       [13, 17, 21],
       [14, 18, 22],
       [15, 19, 23]])

In [92]:
np.inner(a,b)  # equivalent to a times the transpose of b, i.e. np.dot(a, b.T)

array([[ 86, 110, 134],
       [302, 390, 478],
       [518, 670, 822]])

In [69]:
np.dot(a, b.T)  # same as a.dot(b.T)

array([[ 86, 110, 134],
       [302, 390, 478],
       [518, 670, 822]])

- outer 只按照一维数组进行计算，如果传入参数是多维数组，则先将此数组展平再进行运算
- outer乘积计算列向量与行向量的矩阵乘积

In [70]:
# 1-D vectors of length 3 and 4.
a = np.array([1, 2, 3])
b = np.array([4, 5, 6, 7])

In [71]:
np.outer(a, b)  # vector outer product: a as a column times b as a row

array([[ 4,  5,  6,  7],
       [ 8, 10, 12, 14],
       [12, 15, 18, 21]])

In [72]:
# Same (3, 4) result via inner: with a reshaped to (3, 1) and b to (4, 1), the
# inner product over the length-1 last axis gives a_i * b_j for every pair.
# Note that .T is a no-op on the 1-D array a; the reshape does the actual work.
np.inner(a.T.reshape((-1,1)), b.reshape((-1,1)))

array([[ 4,  5,  6,  7],
       [ 8, 10, 12, 14],
       [12, 15, 18, 21]])

In [97]:
# Repeat the whole sequence 3 times, end to end.
np.tile([0,1,2], 3)

array([0, 1, 2, 0, 1, 2, 0, 1, 2])