In [2]:
import torch
import numpy as np

# 处理图像

In [4]:
import imageio  # can also use torchvision

In [None]:
# Decode the JPEG from disk; imageio hands back a NumPy array
# (presumably laid out as height x width x channel -- confirm via the printed shape).
dog_path = './image-dog/bobby.jpg'
img_arr = imageio.v3.imread(dog_path)
print(img_arr.shape)

# Wrap the array as a tensor, then move the last axis to the front
# so the layout becomes channel-first.
img = torch.from_numpy(img_arr)
out = img.permute(2, 0, 1)
print(out.shape)

In [6]:
import os

batch_size = 3
# Pre-allocate an (N, C, H, W) uint8 batch; every image is expected to be 256x256.
batch = torch.zeros(batch_size, 3, 256, 256, dtype=torch.uint8)

data_dir = './image-cats/'
# os.listdir returns entries in arbitrary order, so sort the names to give each
# image the same batch slot on every run. Keep at most `batch_size` files so that
# extra PNGs in the directory cannot index past the end of `batch`.
filenames = sorted(
    name for name in os.listdir(data_dir)
    if os.path.splitext(name)[-1] == '.png'
)[:batch_size]
for i, filename in enumerate(filenames):
    img_arr = imageio.v3.imread(os.path.join(data_dir, filename))
    img_t = torch.from_numpy(img_arr)
    img_t = img_t.permute(2, 0, 1)  # move the channel axis to the front
    img_t = img_t[:3]               # keep only the first three channels (drops e.g. alpha)
    batch[i] = img_t                # copy the image into its slot of the batch

# Convert to floating point before scaling.
batch = batch.float()
# Neural-network inputs work best in the 0..1 or -1..1 range, so scale the 8-bit values.
batch /= 255.0

# 处理CSV数据表

案例：分析白葡萄酒的各项指标和最终得分的关系

In [None]:
# FETCH DATA FROM CSV
import csv

wine_path = './tabular-wine/winequality-white.csv'
# The first row holds the column names, so skip it when parsing the numeric table.
wineq_np = np.loadtxt(wine_path, dtype=np.float32, delimiter=';', skiprows=1)
wineq = torch.from_numpy(wineq_np)

# Read only the header row to get the column names. The `with` block makes sure
# the file handle is closed (the bare open() call used before never closed it);
# newline='' is what the csv module recommends for files passed to csv.reader.
with open(wine_path, newline='') as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=';'))

wineq.shape, wineq.dtype

In [4]:
# SEPARATE SCORE FROM DATA
# Every column except the last is a feature; the last column is the score,
# converted to 64-bit integers.
data, target = wineq[:, :-1], wineq[:, -1].long()
# Show the shape the scores take once a trailing axis of size 1 is added.
print(target.unsqueeze(1).shape)

torch.Size([4898, 1])


```
如果要处理的一个变量是“连续值”, 如葡萄酒的密度, 那么直接使用数据(用float32最佳).
如果要处理的一个变量是“分类值”, 如葡萄酒的颜色名称, 那么最好进行one-hot独热编码, 即把标签5转为(0,0,0,0,1), 标签1转为(1,0,0,0,0)等.
如果要处理的一个变量是“序数值”(有顺序, 但相邻取值之间没有固定的距离概念), 就要根据任务取舍: 如果更看重顺序则视为连续数据, 如果更看重类别则视为分类数据, 进行one-hot编码.
```

In [7]:
# ONE-HOT ENCODING
# Start from an all-zero (num_samples, 10) matrix: one row per sample, one column per score.
num_samples = target.shape[0]
target_onehot = torch.zeros(num_samples, 10)
print(target_onehot.shape)
# Give the scores a trailing axis so they can act as column indices, then write
# 1.0 in place at each row's score column. scatter_ returns the modified tensor,
# which is what the cell displays.
score_index = target.unsqueeze(1)
target_onehot.scatter_(1, score_index, 1.0)

torch.Size([4898, 10])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

对数据进行归一化

In [None]:
# Standardise each column: subtract the per-column mean and divide by the
# per-column standard deviation. torch.var() computes the sample variance
# (denominator N-1); torch.mean() computes the average.
col_mean = torch.mean(data, dim=0)
col_var = torch.var(data, dim=0)
data_n = (data - col_mean) / torch.sqrt(col_var)

In [22]:
# Look for a threshold: split the samples into three groups by score.
# Comparison operators work element-wise and yield boolean masks.
bad_indexes = target <= 3
mid_indexes = torch.logical_and(target > 3, target < 7)
good_indexes = target >= 7

# Indexing with a boolean mask keeps only the rows where the mask is True.
bad_data = data[bad_indexes]
mid_data = data[mid_indexes]
good_data = data[good_indexes]

# Per-column averages within each group.
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

# One line per column: index, column name, then the bad / mid / good means.
# '{:6.2f}' right-aligns in 6 characters and rounds to two decimals
# (the same rounding as '%.3f' % 3.114514 -> '3.115', but with two places).
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    col_name, bad_avg, mid_avg, good_avg = args
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, col_name, bad_avg, mid_avg, good_avg))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42
