# Ch2 预备知识

In [1]:
# Getting Started:
import torch
import numpy as np

## $\S2.1$ 数据操作

In [2]:
# Tensor objects: a 1-D tensor holding the integers 0..11.
x = torch.arange(12)
# Shape of the tensor. (Bare expression: it is evaluated, but a cell only
# displays its last expression, so nothing is shown for this line.)
x.shape
# Total number of elements in the tensor (also not displayed).
x.numel()
# Three equivalent ways to ask for a 3x4 layout of the same 12 elements;
# -1 tells reshape to infer that dimension from the others.
y = x.reshape((3, 4))
y = x.reshape((3, -1))
y = x.reshape((-1, 4))
# Last expression of the cell: this is the value that gets displayed.
y.shape

torch.Size([3, 4])

In [3]:
# Special tensor constructors; every example below uses the same (2, 3, 4) shape.
# NumPy counterpart, shown for comparison with torch.zeros:
z1 = np.zeros(shape=(2, 3, 4))
print(z1)

# Tensor filled with zeros:
z2 = torch.zeros(2, 3, 4)
print(z2)

# Tensor filled with ones:
z3 = torch.ones(2, 3, 4)
print(z3)

# Random sampling with torch.randn. No seed is set in this cell, so the
# displayed values differ from run to run.
z4 = torch.randn(2, 3, 4)
z4

[[[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]]
tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])
tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])


tensor([[[-0.6013,  0.7790, -1.2362, -0.0614],
         [-0.1154, -0.4540,  0.5195, -0.0896],
         [-1.6176, -0.2600, -0.5417,  0.1161]],

        [[-0.9702,  1.0474, -0.5594, -0.3055],
         [ 0.7339,  0.6678,  0.0782,  0.0627],
         [-0.6554,  0.8100,  0.3852, -0.0982]]])

### $\S2.1.2$ 运算符

In [4]:
# Operators
# Element-wise arithmetic between two tensors of the same shape:
x = torch.tensor([1, 2, 4, 8])
y = torch.tensor([2, 2, 2, 2])
# +, -, *, / and ** are applied element by element. The `/` result is printed as
# floating point even though both operands are integer tensors.
print(x + y, x - y, x * y, x / y, x ** y, sep=" \n ")
# exp is applied element by element as well.
print(x.exp())

tensor([ 3,  4,  6, 10]) 
 tensor([-1,  0,  2,  6]) 
 tensor([ 2,  4,  8, 16]) 
 tensor([0.5000, 1.0000, 2.0000, 4.0000]) 
 tensor([ 1,  4, 16, 64])
tensor([2.7183e+00, 7.3891e+00, 5.4598e+01, 2.9810e+03])


In [5]:
# Tensor concatenation:
X = torch.arange(12, dtype=torch.float32).reshape(3, 4)
Y = torch.tensor([
    [2, 1, 4, 3],
    [1, 2, 3, 4],
    [4, 3, 2, 1],
])
# `dim` selects the direction of the join: dim=0 appends Y's rows below X's
# (6x4 result), dim=1 appends Y's columns to the right of X's (3x8 result).
torch.cat((X, Y), dim=0), torch.cat((X, Y), dim=1)

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [ 2.,  1.,  4.,  3.],
         [ 1.,  2.,  3.,  4.],
         [ 4.,  3.,  2.,  1.]]),
 tensor([[ 0.,  1.,  2.,  3.,  2.,  1.,  4.,  3.],
         [ 4.,  5.,  6.,  7.,  1.,  2.,  3.,  4.],
         [ 8.,  9., 10., 11.,  4.,  3.,  2.,  1.]]))

In [6]:
# Comparing two tensors with a logical operator builds a boolean tensor,
# element by element.
# NOTE: this is not the cell's last expression, so its value is not displayed.
X == Y

# Sum the elements of the tensor along dim 0, giving one total per column:
torch.sum(X, dim=0)

tensor([12., 15., 18., 21.])

### $\S2.1.3$ 广播机制
对于两个形状不同的张量，只要形状满足广播规则，仍然可以通过广播机制进行按元素运算。

In [7]:
# a is a 3x1 column and b is a 1x2 row: different shapes, same number of dimensions.
a = torch.arange(3).reshape(3, 1)
b = torch.arange(2).reshape(1, 2)
print(a, b, sep=" \n ")
# Broadcasting produces a 3x2 result. (Not the cell's last expression, so the
# sum is evaluated but not displayed.)
a + b

# The same mechanism in three dimensions: (6, 1, 1) + (1, 1, 6) gives (6, 1, 6).
c = torch.arange(6).reshape(6, 1, 1)
d = torch.arange(6).reshape(1, 1, 6)
print(c, d, sep=" \n ")
c + d

tensor([[0],
        [1],
        [2]]) 
 tensor([[0, 1]])
tensor([[[0]],

        [[1]],

        [[2]],

        [[3]],

        [[4]],

        [[5]]]) 
 tensor([[[0, 1, 2, 3, 4, 5]]])


tensor([[[ 0,  1,  2,  3,  4,  5]],

        [[ 1,  2,  3,  4,  5,  6]],

        [[ 2,  3,  4,  5,  6,  7]],

        [[ 3,  4,  5,  6,  7,  8]],

        [[ 4,  5,  6,  7,  8,  9]],

        [[ 5,  6,  7,  8,  9, 10]]])

### $\S2.1.4$ 索引和切片

In [8]:
# Tensor indexing and slicing:
print(X)
# X[-1] selects the last row and X[1:3] selects rows 1 and 2. Neither is the
# cell's last expression, so their values are not displayed.
X[-1]
X[1:3]

## Assigning through an index writes into X itself (in place), as the tensor
## printed next shows:
X[1,2] = 9
print(X)

## Assigning a scalar through a slice also writes into X in place; here every
## column of rows 0 and 1 is overwritten with 12:
X[0:2, :] = 12
print(X)

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])
tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  9.,  7.],
        [ 8.,  9., 10., 11.]])
tensor([[12., 12., 12., 12.],
        [12., 12., 12., 12.],
        [ 8.,  9., 10., 11.]])


### $\S2.1.5$ 内存节省

In [9]:
## This assignment stores the result of Y + X in newly allocated memory and
## rebinds the name Y to it:
before = id(Y)
Y = Y + X
# Compares the object identities before and after. (Not the cell's last
# expression, so the result is not displayed.)
id(Y) == before

## Updating in place, option 1 -- assign through a full slice:
## id(Z) is printed before and after to show that Z is still the same object.
Z = torch.zeros_like(Y)
print("id(Z):", id(Z))
Z[:] = X + Y
print("new id(Z):", id(Z))

## Updating in place, option 2 -- use an augmented-assignment operator:
## id(Y) is printed before and after to show that Y is still the same object.
print("id(Y):", id(Y))
Y += X+Y
print("new id(Y):", id(Y))

id(Z): 1859044371792
new id(Z): 1859044371792
id(Y): 1859047577776
new id(Y): 1859047577776


### $\S2.1.6$ 转换为其他Python对象

In [10]:
# Convert a tensor to a NumPy ndarray, and an ndarray back to a tensor:
A = X.numpy()
B = torch.tensor(A)
# Not the cell's last expression, so this tuple of types is not displayed.
type(A), type(B)

# Convert a one-element tensor to a Python scalar:
# via .item(), or via the built-in int() / float() (int() turns 3.5 into 3).
a = torch.tensor([3.5])
a, a.item(), int(a), float(a)

(tensor([3.5000]), 3.5, 3, 3.5)

## $\S2.2$ 数据预处理

In [11]:
# Create a small dataset and write it to ../data/house_tiny.csv:
import os

# exist_ok=True: do not raise an error when the directory already exists.
os.makedirs(os.path.join("..", "data"), exist_ok=True)

data_file = os.path.join("..", "data", "house_tiny.csv")
# Header row followed by one sample per row; the literal "NA" marks a missing value.
csv_rows = (
    "NumRooms,Alley,Price",
    "NA,Pave,127500",
    "2,NA,106000",
    "4,NA,178100",
    "NA,NA,140000",
)
# Pin the encoding so the file does not depend on the platform's default locale
# encoding; UTF-8 is also what pandas.read_csv assumes by default when reading.
with open(data_file, "w", encoding="utf-8") as f:
    f.writelines(row + "\n" for row in csv_rows)

In [12]:
# Read the dataset back from the CSV file:
import pandas as pd
# The "NA" fields of the file show up as NaN in the printed frame.
data = pd.read_csv(data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


### $\S2.2.2$ 缺失值处理

In [13]:
# Handle missing values by imputation (fill with the column mean).
# The first two columns are the inputs, the third column is the output.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2]
# mean(numeric_only=True) yields a Series of means for the numeric columns only,
# so fillna fills NumRooms and leaves the text column (Alley) as it is.
column_means = inputs.mean(numeric_only=True)
inputs = inputs.fillna(column_means)
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [14]:
# Convert the text column into dummy (indicator) columns. With these arguments
# the printed result has a single Alley_Pave column of 0/1 integers, and rows
# whose Alley value is missing get 0.
## NOTE(review): the original note here said, roughly, "the first run of this
## operation changes the arguments of get_dummies" -- intent unclear. Presumably it
## refers to `inputs` being overwritten, so that re-running the cell passes the
## already-encoded frame back into get_dummies; confirm.
inputs = pd.get_dummies(inputs, dtype=int)
print(inputs)

   NumRooms  Alley_Pave
0       3.0           1
1       2.0           0
2       4.0           0
3       3.0           0


### $\S2.2.3$ 转换为张量格式

In [15]:
# Convert the preprocessed inputs and the outputs to tensors via their
# underlying NumPy arrays. Note that this rebinds the names X and y.
X = torch.tensor(inputs.to_numpy())
y = torch.tensor(outputs.to_numpy())
X, y

(tensor([[3., 1.],
         [2., 0.],
         [4., 0.],
         [3., 0.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


NumRooms    2
Alley       1
Price       4
dtype: int64