# 4 Numpy基础

In [1]:
import numpy as np
my_arr = np.arange(1000000)
my_list = list(range(1000000))

%time for _ in range(10): my_arr2 = my_arr * 2
%time for _ in range(10): my_list2 = my_list * 2

CPU times: user 12.7 ms, sys: 9.9 ms, total: 22.6 ms
Wall time: 24 ms
CPU times: user 109 ms, sys: 24.8 ms, total: 134 ms
Wall time: 143 ms


从上面的例子可以看到numpy很快。numpy快的原因：

- 数组的数据存放在一块连续的内存中，独立于其他Python内置对象
- 对整个数组的运算由底层编译好的代码循环完成，而不是用Python的for逐个元素遍历

## 4.1 ndarray

### ndarray数据类型

In [2]:
data = np.random.randn(2, 3)
data

array([[-0.47463112,  0.3050825 , -0.64524301],
       [ 1.39613407, -0.78000095,  0.4502015 ]])

In [3]:
data * 10

array([[-4.74631121,  3.05082497, -6.45243005],
       [13.96134069, -7.80000945,  4.50201499]])

In [4]:
data + data

array([[-0.94926224,  0.61016499, -1.29048601],
       [ 2.79226814, -1.56000189,  0.900403  ]])

In [5]:
data.shape

(2, 3)

### 4.1.1 生成ndarray

#### array

In [6]:
data1 = [6, 7, 5, 8, 0, 1]

In [7]:
arr1 = np.array(data1)

In [8]:
arr1

array([6, 7, 5, 8, 0, 1])

除array外，还有asarray, arange, ones, zeros, full, eye, identity等方法来生成数组

### 4.1.2 ndarray的数据类型

array在生成数组的时候会自动推断类型，也可以通过dtype参数显式指定类型

In [9]:
arr = np.array([1,2,3,4,5])
arr.dtype

dtype('int64')

用astype可以得到转换为指定数据类型的新数组。注意原数组不会变

In [10]:
float_arr = arr.astype(np.float64)

In [11]:
float_arr.dtype

dtype('float64')

In [12]:
arr.dtype   # 注意原来的arr没有变。astype是返回了一个新的数组。

dtype('int64')

### 4.1.3 np数组运算

np数组可以直接运算，而无需用for遍历

In [13]:
arr = np.array([[1., 2., 3.], [4., 5., 6]])
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [14]:
arr * arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [15]:
1 / arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [16]:
arr ** 0.5

array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])

In [17]:
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])
arr2

array([[ 0.,  4.,  1.],
       [ 7.,  2., 12.]])

In [18]:
arr < arr2

array([[False,  True, False],
       [ True, False,  True]])

### 4.1.4 基础索引与切片

与Python原生list的切片不同，np的切片不会复制数据，而是返回原数组的一个视图（对视图的修改会反映到原数组上）。要想拷贝，可以显式调用`arr[2:4].copy()`

In [19]:
# Baseline with a built-in list: a slice is an independent copy.
l1 = [1,2,3,4]
a = l1[1:2]     # slicing a list builds a new list object
a[:] = [100]    # replaces the contents of `a` only
l1              # displayed unchanged: the write to `a` did not reach l1

[1, 2, 3, 4]

In [20]:
# Same experiment with an ndarray: take a one-element slice of arr1.
arr1 = np.array([1,2,3,4])
b = arr1[1:2]   # basic slice of an ndarray (no .copy() requested)
b

array([2])

In [21]:
b[:] = 100      # write through the slice `b` taken from arr1
arr1            # arr1[1] shows 100 as well: `b` shares arr1's data (a view)

array([  1, 100,   3,   4])

### 4.1.5 布尔索引

In [22]:
# One name per row of `data`: 7 names, 7 rows.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = np.random.randn(7, 4)   # 7 rows x 4 columns of standard-normal random samples

In [23]:
names

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4')

In [24]:
data

array([[-0.77866453, -0.14363185,  0.45449336,  0.29282518],
       [-0.46098243,  1.16639265,  0.74755218, -1.4150666 ],
       [ 0.291673  ,  0.33597604,  0.59822318, -0.28248214],
       [ 0.97645407,  1.60809084,  0.2549675 , -1.60341974],
       [-1.64342671,  1.6774101 , -0.63317631, -0.27892935],
       [ 0.24424385,  0.16625707, -0.5176098 ,  0.45458239],
       [-0.34450861, -0.72741742,  0.46196444, -1.40009699]])

In [27]:
names == 'Bob'

array([ True, False, False,  True, False, False, False])

In [28]:
data[names == 'Bob']       # rows where the name is 'Bob', i.e. the first and fourth rows (indices 0 and 3)

array([[-0.77866453, -0.14363185,  0.45449336,  0.29282518],
       [ 0.97645407,  1.60809084,  0.2549675 , -1.60341974]])

In [29]:
data[names == 'Bob', 2:]   # Bob's rows, columns from index 2 onward (the last two of the four columns)

array([[ 0.45449336,  0.29282518],
       [ 0.2549675 , -1.60341974]])

In [31]:
data[names == 'Bob', 3]    # Bob's rows, column index 3 only (the last column); the result is 1-D

array([ 0.29282518, -1.60341974])

In [32]:
mask = (names == 'Bob') | (names == 'Will')   # rows for Bob OR Will; besides |, ~ negates a mask and & combines masks with AND
data[mask]

array([[-0.77866453, -0.14363185,  0.45449336,  0.29282518],
       [ 0.291673  ,  0.33597604,  0.59822318, -0.28248214],
       [ 0.97645407,  1.60809084,  0.2549675 , -1.60341974],
       [-1.64342671,  1.6774101 , -0.63317631, -0.27892935]])

In [33]:
data[data < 0] = 0    # overwrite every negative element of data with 0 (modifies data in place)

In [34]:
data

array([[0.        , 0.        , 0.45449336, 0.29282518],
       [0.        , 1.16639265, 0.74755218, 0.        ],
       [0.291673  , 0.33597604, 0.59822318, 0.        ],
       [0.97645407, 1.60809084, 0.2549675 , 0.        ],
       [0.        , 1.6774101 , 0.        , 0.        ],
       [0.24424385, 0.16625707, 0.        , 0.45458239],
       [0.        , 0.        , 0.46196444, 0.        ]])

### 4.1.6 神奇索引

用整数list（或整数数组）进行索引，可以一次按指定顺序返回多行。与切片不同，神奇索引返回的是数据的拷贝，而不是视图

In [37]:
# Build an 8x4 float array in which every element of row k equals k.
# np.empty does not initialise its contents, so each row is filled explicitly.
arr = np.empty((8, 4))
for row_idx, row in enumerate(arr):
    row[:] = row_idx   # `row` is a view of arr[row_idx], so this writes into arr

In [38]:
arr

array([[0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [2., 2., 2., 2.],
       [3., 3., 3., 3.],
       [4., 4., 4., 4.],
       [5., 5., 5., 5.],
       [6., 6., 6., 6.],
       [7., 7., 7., 7.]])

In [40]:
arr[[4, 2]]   # select the rows at index 4 and index 2, in that order (the fifth and third rows)

array([[4., 4., 4., 4.],
       [2., 2., 2., 2.]])

In [41]:
arr[[-1]]     # select the last row; indexing with a list keeps the result 2-D (1 x 4)

array([[7., 7., 7., 7.]])

另一种用法，多个list指定多维坐标

In [42]:
arr = np.arange(32).reshape((8, 4))

In [43]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [45]:
arr[[1,5,7,2], [0,3,1,2]] # the two lists are paired up: elements at (1, 0), (5, 3), (7, 1), (2, 2)

array([ 4, 23, 29, 10])

### 4.1.7 转置和换轴

In [46]:
arr = np.arange(15).reshape(3, 5)
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [47]:
arr.T      # arr itself is not modified; the transpose is returned as a view (3x5 -> 5x3)

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [48]:
np.dot(arr.T, arr)   # for 2-D arrays np.dot is the matrix product (5x3 times 3x5 -> 5x5), not an element-wise multiply

array([[125, 140, 155, 170, 185],
       [140, 158, 176, 194, 212],
       [155, 176, 197, 218, 239],
       [170, 194, 218, 242, 266],
       [185, 212, 239, 266, 293]])

In [49]:
arr = np.arange(16).reshape((2,2,4))
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [50]:
arr.swapaxes(1,2)     # arguments are 0-based axis numbers: swap axis 1 and axis 2, so shape (2, 2, 4) becomes (2, 4, 2)

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])