In [1]:
import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using {} device".format(device))

DIMENSIONS = 1024  # dimensionality of each hypervector
BATCH_SIZE = 12  # samples per batch; a GPU with enough memory can process several at once

Using cuda device


# 数据集

In [2]:
import os
import os.path
import pandas as pd
from torch.utils import data
from typing import Callable, Optional, Tuple, List

class ISOLET(data.Dataset):
    """
    Description
    ===
    `ISOLET <https://archive.ics.uci.edu/ml/datasets/isolet>`_ dataset.
    The data files must already be downloaded and extracted into ``<root>/isolet``;
    this class only reads them and never downloads or creates anything on disk.

    Args:
    root (string): Parent directory containing an ``isolet`` folder in which
        ``isolet1+2+3+4.data`` and ``isolet5.data`` exist.
    train (bool, optional): If True, creates dataset from ``isolet1+2+3+4.data``,
        otherwise from ``isolet5.data``.
    transform (callable, optional): A function/transform that takes in an torch.FloatTensor
        and returns a transformed version.
    target_transform (callable, optional): A function/transform that takes in the
        target and transforms it.

    Raises:
    RuntimeError: If ``<root>/isolet`` is not a directory or either data file is missing.

    Attributes populated on construction:
    data: float tensor holding every column of the selected file except the last.
    targets: long tensor holding the last column minus 1
        (assumes the file stores labels starting at 1, so they index into ``classes``).
    """

    # Human-readable class names; position in the list is the class index.
    classes: List[str] = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

    # File names of the two splits, relative to ``<root>/isolet``.
    _TRAIN_FILE = "isolet1+2+3+4.data"
    _TEST_FILE = "isolet5.data"

    def __init__(
            self,
            root: str,
            train: bool = True,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None
            ):
        super().__init__()
        self.root = os.path.expanduser(os.path.join(root, "isolet"))

        self.train = train
        self.transform = transform
        self.target_transform = target_transform

        # The directory is deliberately NOT created here: creating it before the
        # check would leave an empty folder behind for a mistyped ``root`` and
        # would make the directory test in ``_check_integrity`` always succeed.
        if not self._check_integrity():
            raise RuntimeError(
                "Dataset not found or corrupted. "
                f"Expected '{self._TRAIN_FILE}' and '{self._TEST_FILE}' in {self.root!r}."
            )

        self._load_data()

    def __len__(self) -> int:
        """
        Function
        ===
        Number of samples in the loaded split.

        Return
        ---
        the size of the first dimension of ``self.data``
        """
        return self.data.size(0)

    def __getitem__(self, index) -> Tuple[torch.FloatTensor, torch.LongTensor]:
        """
        Function
        ===
        Fetch one sample and its label, applying the optional transforms.

        Parameters
        ---
            index (int): Index

        Returns:
            Tuple: (sample, target) where target is the index of the target class
        """
        sample = self.data[index]
        label = self.targets[index]

        # Compare against None instead of relying on truthiness: a callable
        # object may legitimately be falsy (e.g. it defines ``__len__``).
        if self.transform is not None:
            sample = self.transform(sample)

        if self.target_transform is not None:
            label = self.target_transform(label)

        return sample, label

    def _check_integrity(self) -> bool:
        """
        Function
        ===
        Check that ``self.root`` is an existing directory and that it contains
        both the training and the test data file.

        Return
        ---
        True when the directory and both files are present, False otherwise
        """
        if not os.path.isdir(self.root):
            return False

        # TODO: Add more specific checks like an MD5 checksum
        return all(
            os.path.isfile(os.path.join(self.root, file_name))
            for file_name in (self._TRAIN_FILE, self._TEST_FILE)
        )

    def _load_data(self):
        """
        Function
        ===
        Read the split selected by ``self.train`` from ``self.root`` as a
        header-less CSV and store it in ``self.data`` and ``self.targets``.

        The last column is taken as the label and shifted down by 1; all other
        columns are taken as features.
        """
        file_name = self._TRAIN_FILE if self.train else self._TEST_FILE
        # Named ``table`` so the module-level ``data`` alias is not shadowed.
        table = pd.read_csv(os.path.join(self.root, file_name), header=None).values
        self.data = torch.tensor(table[:, :-1], dtype=torch.float)
        self.targets = torch.tensor(table[:, -1], dtype=torch.long) - 1


In [4]:
# Load both splits from disk first, then wrap each one in a loader.
train_ds = ISOLET("../../data", train=True)
test_ds = ISOLET("../../data", train=False)

# Only the training loader shuffles; the test loader keeps file order.
train_ld = data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_ld = data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

# Problem dimensions: one class per entry in `classes`, one feature per
# element along the last axis of the first sample.
num_classes = len(train_ds.classes)
num_features = train_ds[0][0].size(-1)
print(num_classes, num_features, train_ds.data.shape)

26 617


下面的代码单元实现了数据标准化（也称为 Z-score 标准化或零均值单位方差标准化）。这是一种常见的数据预处理步骤，用于机器学习和深度学习任务中。标准化的目的是将每个特征缩放为零均值和单位方差，这有助于加速模型的收敛，并在某些情况下提高模型性能。

让我们逐行解释：

```python
std, mean = torch.std_mean(train_ds.data, dim=0, keepdim=False)
```

- `torch.std_mean` 是 PyTorch 中的一个函数，它同时计算张量沿指定维度的标准差 (`std`) 和平均值 (`mean`)。
- `train_ds.data` 是一个包含训练数据的张量，其中每一行代表一个样本，每一列代表一个特征。
- `dim=0` 参数指定了沿着哪个维度计算标准差和均值。在这个例子中，`dim=0` 表示我们希望对每个特征（即每列）单独计算，而不是对每个样本（即每行）。
- `keepdim=False` 意味着输出的 `std` 和 `mean` 张量不会保留被缩减的维度。也就是说，如果输入是一个二维张量 `[N, F]`（N 个样本，F 个特征），那么输出将是形状为 `[F]` 的一维张量，表示每个特征的标准差和均值。

```python
def transform(sample):
    return (sample - mean) / std
```

- 这里定义了一个名为 `transform` 的函数，该函数接收一个 `sample` 作为参数，这个 `sample` 可能是单个样本或者是一批样本（一个二维张量）。
- 函数内部执行了标准化操作：对于每个特征，它从 `sample` 中减去对应的均值 `mean`，然后除以对应的标准差 `std`。这将使得转换后的特征具有零均值和单位方差。
- 注意，这里假定 `std` 和 `mean` 是之前已经计算好的，并且它们的形状与 `sample` 的特征数相匹配。

需要注意的是，在应用此转换时，`std` 中不应该有等于或接近零的值：在 PyTorch 中除以零不会抛出异常，而是得到 `inf` 或 `NaN`，并悄悄污染后续的计算。因此，在实际使用中，通常会给 `std` 加上一个小的常数（如 1e-7），以避免这种情况发生：

```python
epsilon = 1e-7
def transform(sample):
    return (sample - mean) / (std + epsilon)
```

此外，当使用这种标准化方法时，应该确保用相同的 `mean` 和 `std` 对训练集、验证集和测试集进行变换，以保证所有数据都在相同的尺度上。这意味着 `mean` 和 `std` 应该仅基于训练数据计算得出。

In [5]:
# Per-feature statistics come from the training split only, so that the
# test split is scaled with exactly the same parameters.
std, mean = torch.std_mean(train_ds.data, dim=0, keepdim=False)

# Floor the standard deviation: a constant feature has std == 0 and would
# otherwise turn into NaN/inf after the division. Any feature whose std is
# above the floor is left unchanged.
safe_std = std.clamp_min(1e-7)


def transform(sample):
    """Standardize `sample` feature-wise with the training-set statistics.

    Args:
        sample: Tensor whose last dimension broadcasts against `mean` and
            `safe_std` (one entry per feature column of `train_ds.data`).

    Returns:
        ``(sample - mean) / safe_std``, where `safe_std` is the training
        standard deviation floored at 1e-7.
    """
    return (sample - mean) / safe_std


train_ds.transform = transform
test_ds.transform = transform

# 训练

torchhd 中为 classifier 实现了多个 HDC 方法