# 读取表格

In [86]:
import torch
import numpy as np

# Keep printed tensors short: tensors with more than 50 elements are summarised,
# showing 2 items at each edge of every dimension.
torch.set_printoptions(edgeitems=2, threshold=50)

In [87]:
import pandas as pd

In [88]:
# First attempt: read the CSV with read_csv's default separator (a comma).
df = pd.read_csv("../../myData/p1ch4/tabular-wine/winequality-white.csv")

In [89]:
# Preview the first rows to check how the file was parsed.
df.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


可以看到，格式明显不对：所有字段都被读进了同一列。这是因为 read_csv() 的默认分隔符为 ','，而这个葡萄酒数据文件使用 ';' 作为分隔符，因此需要通过 delimiter（或 sep）参数单独设置。

In [90]:
# Re-read the file, this time telling pandas that fields are separated by ';'.
df = pd.read_csv(
    "../../myData/p1ch4/tabular-wine/winequality-white.csv",
    sep=";",
)

In [91]:
# Preview again: each feature should now sit in its own column.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


而在书中，数值数据是用 np.loadtxt() 读取的，csv 库只用来读取表头（列名），如下

In [92]:
import csv

In [93]:
wine_path = "../../myData/p1ch4/tabular-wine/winequality-white.csv"
# The first row of the file is the header, so it is skipped with skiprows=1.
# Every value in the file is numeric, so dtype=np.float32 works for all columns.
# If other formats were present (e.g. strings), extra handling would be needed.
wineq_numpy = np.loadtxt(wine_path, dtype=np.float32, delimiter=";", skiprows=1)

In [94]:
# Display the loaded array (NumPy abbreviates the middle rows and columns).
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [95]:
# Open the CSV so its header row can be parsed with the csv module.
# - newline="" is what the csv documentation requires for file objects handed
#   to csv.reader, so the reader performs its own newline handling.
# - The handle is bound to a name instead of being created inline, so it can be
#   released explicitly with wine_file.close() once reading is finished.
wine_file = open(wine_path, "r", newline="")
reader = csv.reader(wine_file, delimiter=";")
reader

<_csv.reader at 0x2a58a3dff40>

In [96]:
# Pull the next row from the reader; on a freshly created reader this is the
# header row, i.e. the list of column names.
col_list = next(reader)
wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [97]:
# Wrap the NumPy array as a tensor (torch.from_numpy shares memory with the
# array rather than copying it).
wineq = torch.from_numpy(wineq_numpy)
wineq.shape, wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [98]:
# Features: every column except the last one (the last column is 'quality').
data = wineq[:, :-1]
data, data.shape

(tensor([[ 7.0000,  0.2700,  ...,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  ...,  0.4900,  9.5000],
         ...,
         [ 5.5000,  0.2900,  ...,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  ...,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [99]:
# Labels: the last column (quality score), converted to 32-bit integers.
quality_column = wineq[:, -1]
target = quality_column.int()
target, target.shape

(tensor([6, 6,  ..., 7, 6], dtype=torch.int32), torch.Size([4898]))

In [100]:
# Distinct quality scores that occur in the data, and how many there are.
labels = torch.unique(target)
labels_n = labels.size(0)
labels, labels_n

(tensor([3, 4, 5, 6, 7, 8, 9], dtype=torch.int32), 7)

In [101]:
# Zero-filled buffer for the one-hot encoding: one row per sample, one column per distinct score.
target_onehot = torch.zeros((target.size(0), labels_n))

In [102]:
# Inspect the labels (note their dtype in the output).
target

tensor([6, 6,  ..., 7, 6], dtype=torch.int32)

注意：scatter_() 要求索引张量的 dtype 为 int64；如果直接使用 int32 的 target 作为索引，会报错 "scatter(): Expected dtype int64 for index"。因此下面先把 target 转换为 int64。

In [103]:
# scatter_ only accepts int64 index tensors, so widen the labels first.
target = target.to(dtype=torch.int64)
# Map each quality score to its position among the sorted unique scores.
# This derives the one-hot column from the data itself instead of relying on a
# hard-coded offset of 3, so it stays valid even if the scores do not start at 3
# or are not contiguous.
class_index = torch.unique(target, sorted=True, return_inverse=True)[1]
# Write 1.0 into the column selected by class_index for every row (in place).
target_onehot.scatter_(1, class_index.unsqueeze(1), 1.0)
target_onehot, target_onehot.shape

(tensor([[0., 0.,  ..., 0., 0.],
         [0., 0.,  ..., 0., 0.],
         ...,
         [0., 0.,  ..., 0., 0.],
         [0., 0.,  ..., 0., 0.]]),
 torch.Size([4898, 7]))

In [104]:
# Show the first five one-hot rows as a sanity check.
target_onehot[:5]

tensor([[0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.]])

In [105]:
# Promote the features to float64 before computing the statistics below.
data = data.double()

In [106]:
# Per-feature mean, reducing over the sample dimension (dim 0).
data_mean = data.mean(dim=0)
data_mean, data_mean.shape

(tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
         1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01],
        dtype=torch.float64),
 torch.Size([11]))

In [107]:
# Check the dtype of the computed means.
data_mean.dtype

torch.float64

In [108]:
# Per-feature variance, reducing over the sample dimension (dim 0).
data_var = data.var(dim=0)
data_var, data_var.shape

(tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
         1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00],
        dtype=torch.float64),
 torch.Size([11]))

In [109]:
# Per-feature standard deviation, reducing over the sample dimension (dim 0).
data_std = data.std(dim=0)
data_std, data_std.shape

(tensor([8.4387e-01, 1.0079e-01, 1.2102e-01, 5.0721e+00, 2.1848e-02, 1.7007e+01,
         4.2498e+01, 2.9909e-03, 1.5100e-01, 1.1413e-01, 1.2306e+00],
        dtype=torch.float64),
 torch.Size([11]))

In [110]:
# Exact (==) element-wise comparison of the squared std against the variance;
# the output shows that some columns do not match bit-for-bit.
data_std**2 == data_var

tensor([ True,  True,  True,  True, False, False, False,  True,  True,  True,
        False])

In [111]:
# Same exact comparison, recomputing std and var directly instead of using the stored tensors.
torch.std(data, dim=0) ** 2 == torch.var(data, dim=0)

tensor([ True,  True,  True,  True, False, False, False,  True,  True,  True,
        False])

In [112]:
# Compare in the other direction: std against the square root of the variance (still an exact == test).
torch.std(data, dim=0) == torch.sqrt(torch.var(data, dim=0))

tensor([ True,  True,  True,  True, False,  True,  True,  True,  True,  True,
        False])

In [114]:
# Confirm what the statistics were computed on: values, shape and dtype of the features.
data, data.shape, data.dtype

(tensor([[ 7.0000,  0.2700,  ...,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  ...,  0.4900,  9.5000],
         ...,
         [ 5.5000,  0.2900,  ...,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  ...,  0.3200, 11.8000]], dtype=torch.float64),
 torch.Size([4898, 11]),
 torch.float64)

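可以看到，即使数据已经是 float64，std**2 与 var（以及 std 与 sqrt(var)）在个别列上仍然不完全相等——这是浮点数舍入误差造成的。因此比较浮点数时不应使用 ==，而应使用 torch.isclose() / torch.allclose() 这类带容差的比较。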

In [116]:
# Standardise every feature: subtract its mean, then divide by its standard deviation.
centered = data - data_mean
data_normalized = torch.div(centered, data_std)
data_normalized, data_normalized.shape

(tensor([[ 1.7208e-01, -8.1761e-02,  ..., -3.4915e-01, -1.3930e+00],
         [-6.5743e-01,  2.1587e-01,  ...,  1.3418e-03, -8.2419e-01],
         ...,
         [-1.6054e+00,  1.1666e-01,  ..., -9.6251e-01,  1.8574e+00],
         [-1.0129e+00, -6.7703e-01,  ..., -1.4882e+00,  1.0448e+00]],
        dtype=torch.float64),
 torch.Size([4898, 11]))

In [117]:
# Boolean mask of the samples whose quality score is 3 or lower.
bad_indexes = torch.le(target, 3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [118]:
# Boolean-mask indexing: keep only the feature rows where bad_indexes is True.
bad_data = data[bad_indexes]
bad_data.shape

torch.Size([20, 11])

In [119]:
# "mid": quality strictly between 3 and 7; "good": quality of 7 or higher.
mid_data = data[torch.logical_and(target > 3, target < 7)]
good_data = data[torch.ge(target, 7)]

In [120]:
# Per-feature mean of each quality group (reduced over the sample dimension).
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

In [121]:
# Raw view of the three per-group mean vectors.
bad_mean, mid_mean, good_mean

(tensor([7.6000e+00, 3.3325e-01, 3.3600e-01, 6.3925e+00, 5.4300e-02, 5.3325e+01,
         1.7060e+02, 9.9488e-01, 3.1875e+00, 4.7450e-01, 1.0345e+01],
        dtype=torch.float64),
 tensor([6.8869e+00, 2.8153e-01, 3.3644e-01, 6.7051e+00, 4.7841e-02, 3.5424e+01,
         1.4183e+02, 9.9447e-01, 3.1808e+00, 4.8707e-01, 1.0265e+01],
        dtype=torch.float64),
 tensor([6.7251e+00, 2.6535e-01, 3.2606e-01, 5.2615e+00, 3.8160e-02, 3.4550e+01,
         1.2525e+02, 9.9241e-01, 3.2151e+00, 5.0014e-01, 1.1416e+01],
        dtype=torch.float64))

In [126]:
# Side-by-side table of the per-feature means for the three quality groups.
header_line = "{:<3} {:<20} {:<6} {:<6} {:<6}".format("idx", "feature", "bad", "mid", "good")
print(header_line)

row_template = "{:<3} {:<20} {:<6.2f} {:<6.2f} {:<6.2f}"
# zip() stops at the shortest input, so the extra trailing column name in
# col_list (which has no matching mean) is never printed.
per_feature = zip(col_list, bad_mean, mid_mean, good_mean)
for row_no, (feature_name, bad_avg, mid_avg, good_avg) in enumerate(per_feature):
    print(row_template.format(row_no, feature_name, bad_avg, mid_avg, good_avg))

idx feature              bad    mid    good  
0   fixed acidity        7.60   6.89   6.73  
1   volatile acidity     0.33   0.28   0.27  
2   citric acid          0.34   0.34   0.33  
3   residual sugar       6.39   6.71   5.26  
4   chlorides            0.05   0.05   0.04  
5   free sulfur dioxide  53.33  35.42  34.55 
6   total sulfur dioxide 170.60 141.83 125.25
7   density              0.99   0.99   0.99  
8   pH                   3.19   3.18   3.22  
9   sulphates            0.47   0.49   0.50  
10  alcohol              10.35  10.26  11.42 


In [127]:
# Decision threshold: the mean total sulfur dioxide of the mid-quality group,
# as shown (rounded to two decimals) in the table printed above.
total_sulfur_threshold = 141.83
# Select the column by name rather than by a hard-coded position, so the slice
# stays correct if the column order of the file ever changes.
total_sulfur_data = data[:, col_list.index("total sulfur dioxide")]

In [128]:
# Predict True for every wine whose total sulfur dioxide is below the threshold.
predicted_indexes = total_sulfur_data < total_sulfur_threshold

In [129]:
# Inspect the prediction mask and its shape.
predicted_indexes, predicted_indexes.shape

(tensor([False,  True,  ...,  True,  True]), torch.Size([4898]))

In [131]:
# Check that the `<` operator gives the same mask as torch.lt.
temp = total_sulfur_data < total_sulfur_threshold
# torch.equal compares shape and all elements in one call and returns a plain
# Python bool; the builtin all() would instead loop over every element in Python.
torch.equal(temp, predicted_indexes)

True

In [132]:
# Number of samples predicted True (summing a bool mask counts its True entries).
predicted_indexes.sum()

tensor(2727)

In [133]:
# Ground-truth mask: True for wines with a quality score above 5.
actual_indexes = torch.gt(target, 5)
actual_indexes.shape, actual_indexes.sum()

(torch.Size([4898]), tensor(3258))

这里可以看出，n_matches 并不是准确率：它只统计预测和实际同时为 True 的样本数（即真阳性）。
要计算准确率，应该用 == 运算比较两者，统计预测与实际一致的样本数（包括二者都为 False 的情况），再除以样本总数。

In [134]:
# True positives: samples that are True in both the actual and the predicted mask.
both_true = actual_indexes & predicted_indexes
n_matches = both_true.sum().item()
n_matches

2018

In [136]:
# Total number of samples predicted True, as a plain Python number.
n_predicted = torch.sum(predicted_indexes).item()
n_predicted

2727

In [140]:
# Total number of samples that are actually True, as a plain Python number.
n_actual = torch.sum(actual_indexes).item()
n_actual

3258

In [141]:
# Precision: share of the predicted-True samples that are actually True.
precision = n_matches / n_predicted
precision

0.74000733406674

In [142]:
# Recall: share of the actually-True samples that were predicted True.
recall = n_matches / n_actual
recall

0.6193984039287906

In [138]:
# Count the samples where prediction and ground truth agree (both True or both False).
agreement = actual_indexes == predicted_indexes
n_acc = agreement.sum().item()
n_acc

2949

In [139]:
# Accuracy: agreeing samples divided by the total number of samples.
acc = n_acc / len(predicted_indexes)
acc

0.602082482645978

In [143]:
# F1 score: harmonic mean of precision and recall.
f1 = 2 * precision * recall / (precision + recall)
f1

0.6743525480367586