## 数据操作

In [1]:
import numpy as np

In [2]:
# Build a 1-D array holding the consecutive integers 0 through 11.
x = np.arange(0, 12)
x

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [3]:
# Shape of x: a tuple with one entry per axis (a single axis here).
x.shape

(12,)

In [4]:
# View the same 12 elements as a matrix with 3 rows and 4 columns.
x.reshape(3, 4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [5]:
# A (2, 3, 4) array filled with floating-point zeros.
np.zeros((2, 3, 4))

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

In [6]:
# A (2, 3, 4) array filled with floating-point ones.
np.ones((2, 3, 4))

array([[[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]],

       [[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]])

In [7]:
# Draw a 3x4 array of samples from a normal (Gaussian) distribution:
# each element is sampled with mean 0 and standard deviation 1.
# NOTE(review): no seed is set in the visible cells, so the values change on every run.
np.random.normal(0 ,1, size=(3, 4))

array([[-0.58523902, -0.09034161,  1.55278253,  0.17012943],
       [-0.45208803,  0.53693315, -0.01808207, -0.0634471 ],
       [ 0.98464971,  1.22843659,  1.04562947,  1.02618511]])

In [8]:
# Two equal-length integer vectors for demonstrating element-wise arithmetic.
x = np.array([1, 2, 4, 8])
y = np.array([2] * 4)
# Every operator below is applied element by element.
(
    x + y,   # addition
    x - y,   # subtraction
    x * y,   # multiplication
    x / y,   # true division (float result)
    x ** y,  # exponentiation
)

(array([ 3,  4,  6, 10]),
 array([-1,  0,  2,  6]),
 array([ 2,  4,  8, 16]),
 array([0.5, 1. , 2. , 4. ]),
 array([ 1,  4, 16, 64]))

In [9]:
# Element-wise exponential e**x.
np.exp(x)

array([2.71828183e+00, 7.38905610e+00, 5.45981500e+01, 2.98095799e+03])

In [10]:
# Two 3x4 integer matrices reused by the cells that follow.
X = np.arange(12).reshape((3, 4))
Y = np.array([
    [2, 1, 4, 3],
    [1, 2, 3, 4],
    [4, 3, 2, 1],
])
X, Y

(array([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]]),
 array([[2, 1, 4, 3],
        [1, 2, 3, 4],
        [4, 3, 2, 1]]))

In [11]:
# axis=0 joins along the rows (the result has more rows);
# axis=1 joins along the columns (the result has more columns).
# Put differently: axis=0 appends records, axis=1 appends features.
np.concatenate([X, Y], axis=0), np.concatenate([X, Y], axis=1)

(array([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [ 2,  1,  4,  3],
        [ 1,  2,  3,  4],
        [ 4,  3,  2,  1]]),
 array([[ 0,  1,  2,  3,  2,  1,  4,  3],
        [ 4,  5,  6,  7,  1,  2,  3,  4],
        [ 8,  9, 10, 11,  4,  3,  2,  1]]))

In [12]:
# Element-wise equality: yields a boolean array with the same shape as X and Y.
X == Y

array([[False,  True, False,  True],
       [False, False, False, False],
       [False, False, False, False]])

In [13]:
# Sum of all elements, reduced to a single scalar.
X.sum()

66

In [14]:
# 广播机制；
# Broadcasting operands: a column vector of shape (3, 1) and a row vector of shape (1, 2).
a = np.arange(3)[:, np.newaxis]
b = np.arange(2)[np.newaxis, :]
a, b

(array([[0],
        [1],
        [2]]),
 array([[0, 1]]))

In [15]:
# Shapes (3, 1) and (1, 2) are broadcast to a common (3, 2) shape before adding.
a + b

array([[0, 1],
       [1, 2],
       [2, 3]])

In [16]:
# Indexing and slicing: -1 selects the last row.
X[-1]

array([ 8,  9, 10, 11])

In [17]:
# Last row, columns 2 and 3 (the stop index 4 is exclusive).
X[-1, 2:4]

array([10, 11])

In [18]:
# Note: n-dimensional array objects can be converted into one another.
# "ndarray" is the multi-dimensional array (the computing term); "Tensor" is the
# mathematical term (the original note marks it as the name preferred in AI).

## 数据预处理

In [19]:
import os

# Location of the toy dataset: ../data/house_tiny.csv
data_file = os.path.join('..', 'data', 'house_tiny.csv')
# exist_ok=True: create the directory only if it is missing, and do not raise
# when it already exists.
os.makedirs(os.path.dirname(data_file), exist_ok=True)

In [20]:
# Write the toy dataset: a header row followed by four samples.
# 'NA' marks a missing value.
with open(data_file, 'w') as f:
    f.writelines([
        'NumRooms,Alley,Price\n',  # column names
        'NA,Pave,127500\n',        # each following row is one data sample
        '2,NA,106000\n',
        '4,NA,178100\n',
        'NA,NA,140000\n',
    ])

In [21]:
import pandas as pd

In [23]:
# Load the CSV into a DataFrame; the literal 'NA' entries are read as missing values.
data = pd.read_csv(data_file)
data  # Alley: the type of alley the house is on; 'Pave' is presumably one such type.

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [25]:
# Split the table by position: the first two columns (NumRooms, Alley) are the
# inputs and the third column (Price) is the target.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Fill missing numeric values with the column mean. numeric_only=True is needed
# because Alley holds strings: on pandas >= 2.0, DataFrame.mean() raises
# TypeError for non-numeric columns instead of silently skipping them.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs, outputs

(   NumRooms Alley
 0       3.0  Pave
 1       2.0   NaN
 2       4.0   NaN
 3       3.0   NaN,
 0    127500
 1    106000
 2    178100
 3    140000
 Name: Price, dtype: int64)

In [30]:
# One-hot encode the categorical Alley column; dummy_na=True gives missing
# values their own indicator column (Alley_nan).
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 defaults to bool,
# which, combined with the float NumRooms column, turns inputs.values into an
# object array that cannot be converted to a tensor.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
inputs, type(inputs)

(   NumRooms  Alley_Pave  Alley_nan
 0       3.0           1          0
 1       2.0           0          1
 2       4.0           0          1
 3       3.0           0          1,
 pandas.core.frame.DataFrame)

### 转换为张量格式

In [33]:
import torch

# Convert the features with an explicit float dtype. If the indicator columns
# are bool (the pandas >= 2.0 default), `inputs.values` is an object array and
# torch.tensor() rejects it; to_numpy(dtype=float) always yields float64.
X = torch.tensor(inputs.to_numpy(dtype=float))
# The targets are a single integer column, so no dtype coercion is needed.
y = torch.tensor(outputs.to_numpy())
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

## 线性代数

torch 的一维张量不区分行向量和列向量：通常所说的“向量”就是形状为 (n,) 的一维张量；如果需要显式的列向量，要用形状为 (n, 1) 的二维张量（矩阵）来表示，行向量则对应形状为 (1, n) 的二维张量。

In [39]:
# Matrix: the integers 0..19 arranged as 5 rows by 4 columns.
A = torch.arange(0, 20).reshape((5, 4))
A

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19]])

In [36]:
# Transpose: rows and columns are swapped.
A.T

tensor([[ 0,  4,  8, 12, 16],
        [ 1,  5,  9, 13, 17],
        [ 2,  6, 10, 14, 18],
        [ 3,  7, 11, 15, 19]])

In [37]:
# A symmetric 3x3 matrix (it equals its own transpose).
B = torch.tensor([
    [1, 2, 3],
    [2, 0, 4],
    [3, 4, 5],
])
B

tensor([[1, 2, 3],
        [2, 0, 4],
        [3, 4, 5]])

In [38]:
# Element-wise comparison of B with its transpose; all True means B is symmetric.
B == B.T

tensor([[True, True, True],
        [True, True, True],
        [True, True, True]])

In [41]:
# Tensor: a 3-D array of shape (2, 3, 4) holding the integers 0..23.
X = torch.arange(0, 24).reshape((2, 3, 4))
X

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [42]:
# A 5x4 float32 matrix holding 0..19.
A = torch.arange(0, 20, dtype=torch.float32).reshape((5, 4))
# clone() allocates new memory, so B is an independent copy of A.
B = torch.clone(A)
A, A + B

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.],
         [16., 17., 18., 19.]]),
 tensor([[ 0.,  2.,  4.,  6.],
         [ 8., 10., 12., 14.],
         [16., 18., 20., 22.],
         [24., 26., 28., 30.],
         [32., 34., 36., 38.]]))

In [43]:
# Display both operands: the addition above produced a new tensor and left A and B unchanged.
A, B

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.],
         [16., 17., 18., 19.]]),
 tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.],
         [16., 17., 18., 19.]]))

In [44]:
# Hadamard (element-wise) product of matrices A and B.
A * B

tensor([[  0.,   1.,   4.,   9.],
        [ 16.,  25.,  36.,  49.],
        [ 64.,  81., 100., 121.],
        [144., 169., 196., 225.],
        [256., 289., 324., 361.]])

### 降维

<span class="burk">axis 等于哪个维度，求和后就把哪个维度消掉；
例如 A 的形状是 (3, 2)，A.sum(axis=0) 的结果形状就是 (2,)，即按列求和；
高维张量同样适用。</span>

In [45]:
# A float32 vector [0, 1, 2, 3], shown together with the sum of its elements.
x = torch.arange(0, 4, dtype=torch.float32)
x, torch.sum(x)

(tensor([0., 1., 2., 3.]), tensor(6.))

In [46]:
A

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [12., 13., 14., 15.],
        [16., 17., 18., 19.]])

In [47]:
# With no axis given, sum() reduces over every element and returns a scalar tensor.
A.shape, A.sum()

(torch.Size([5, 4]), tensor(190.))

In [49]:
# Reduce over axis 0 (the rows): one sum per column, so the result has shape (4,).
A_sum_axis0 = torch.sum(A, dim=0)
A_sum_axis0, A_sum_axis0.size()

(tensor([40., 45., 50., 55.]), torch.Size([4]))

In [50]:
# Reducing over both axes gives the same value as summing every element.
A.sum(axis=[0, 1])

tensor(190.)

In [51]:
# The mean equals the sum divided by the number of elements.
A.mean(), A.sum()/A.numel()

(tensor(9.5000), tensor(9.5000))

In [52]:
A

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [12., 13., 14., 15.],
        [16., 17., 18., 19.]])

In [54]:
# The per-column mean equals the per-column sum divided by the number of rows.
A.mean(axis=0), A.sum(axis=0)/A.shape[0]

(tensor([ 8.,  9., 10., 11.]), tensor([ 8.,  9., 10., 11.]))

### 非降维求和

In [56]:
# Row sums with the reduced axis kept as size 1, giving shape (5, 1) instead of (5,).
sum_A = torch.sum(A, dim=1, keepdim=True)
sum_A

tensor([[ 6.],
        [22.],
        [38.],
        [54.],
        [70.]])

In [57]:
A

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [12., 13., 14., 15.],
        [16., 17., 18., 19.]])

In [58]:
sum_A

tensor([[ 6.],
        [22.],
        [38.],
        [54.],
        [70.]])

In [60]:
# sum_A has shape (5, 1), so it is broadcast across the 4 columns of A:
# each row of A is divided by that row's sum.
A / sum_A

tensor([[0.0000, 0.1667, 0.3333, 0.5000],
        [0.1818, 0.2273, 0.2727, 0.3182],
        [0.2105, 0.2368, 0.2632, 0.2895],
        [0.2222, 0.2407, 0.2593, 0.2778],
        [0.2286, 0.2429, 0.2571, 0.2714]])

### 点积

In [61]:
# A vector of four ones; its dot product with x is the sum of x's elements.
y = torch.ones(4, dtype=torch.float32)
x, y, x.dot(y)

(tensor([0., 1., 2., 3.]), tensor([1., 1., 1., 1.]), tensor(6.))

In [63]:
x * y  # element-wise product: entries at matching positions are multiplied

tensor([0., 1., 2., 3.])

In [64]:
# Summing the element-wise product gives the same value as torch.dot(x, y).
torch.sum(x * y)

tensor(6.)

### 矩阵-向量 积