In [None]:
import torch
import torch.nn.functional as F

import sys
sys.path.insert(0, '/home/maria/py/dl/my_d2l')
from my_package import preData
from my_package import fig
from my_package import hdc

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

# European Language 数据集

In [None]:
import os
import os.path
from torch.utils import data
from typing import Callable, Optional, Tuple, List

In [None]:
class EuropeanLanguages(data.Dataset):
    """European Languages dataset.

    As used in the paper `"A Robust and Energy-Efficient Classifier Using
    Brain-Inspired Hyperdimensional Computing" <https://iis-people.ee.ethz.ch/~arahimi/papers/ISLPED16.pdf>`_.
    The dataset contains sentences in 21 European languages,
    the training data was taken from `Wortschatz Corpora <https://wortschatz.uni-leipzig.de/en/download>`_
    and the testing data from `Europarl Parallel Corpus <https://www.statmt.org/europarl/>`_.

    The data is read from local text files only; this class does not download
    anything. The expected layout is ``<root>/Eurolang/training/<file>`` and
    ``<root>/Eurolang/testing/<file>`` for every name in :attr:`files`.
    Each non-empty line of a file (after whitespace normalisation) is one
    sample, and its label is the position of the file in :attr:`files`,
    which lines up with the language name at the same position in
    :attr:`classes`.

    Args:
        root (string): Directory that contains the ``Eurolang`` folder.
            A leading ``~`` is expanded.
        train (bool, optional): If True, loads the ``training`` split,
            otherwise the ``testing`` split.
        transform (callable, optional): A function/transform that takes in the
            sentence (a ``str``) and returns a transformed version.
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.

    Raises:
        RuntimeError: If any expected directory or file is missing.
    """

    # Human-readable class names; index i corresponds to files[i].
    classes: List[str] = [
        "Bulgarian",
        "Czech",
        "Danish",
        "Dutch",
        "German",
        "English",
        "Estonian",
        "Finnish",
        "French",
        "Greek",
        "Hungarian",
        "Italian",
        "Latvian",
        "Lithuanian",
        "Polish",
        "Portuguese",
        "Romanian",
        "Slovak",
        "Slovenian",
        "Spanish",
        "Swedish",
    ]

    # One text file per language, expected in both the training and testing folders.
    files: List[str] = [
        "bul.txt",
        "ces.txt",
        "dan.txt",
        "nld.txt",
        "deu.txt",
        "eng.txt",
        "est.txt",
        "fin.txt",
        "fra.txt",
        "ell.txt",
        "hun.txt",
        "ita.txt",
        "lav.txt",
        "lit.txt",
        "pol.txt",
        "por.txt",
        "ron.txt",
        "slk.txt",
        "slv.txt",
        "spa.txt",
        "swe.txt",
    ]

    def __init__(
            self,
            root: str,
            train: bool = True,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
    ):
        root = os.path.join(root, "Eurolang")
        root = os.path.expanduser(root)
        self.root = root
        # The folder is created if absent; the integrity check below then
        # reports the missing data files.
        os.makedirs(self.root, exist_ok=True)

        self.train = train
        self.transform = transform
        self.target_transform = target_transform

        if not self._check_integrity():
            raise RuntimeError(
                "Dataset not found or corrupted."
            )

        self._load_data()

    def __len__(self) -> int:
        """Return the number of samples in the loaded split."""
        return self.targets.size(0)

    def __getitem__(self, index) -> Tuple[str, torch.LongTensor]:
        """
        Args:
            index (int): Index

        Returns:
            Tuple[str, torch.LongTensor]: (sample, target) where target is the index of the target class.
            When ``transform`` / ``target_transform`` are set, the corresponding
            element is whatever that callable returns.
        """
        sample = self.data[index]
        target = self.targets[index]

        if self.transform:
            sample = self.transform(sample)

        if self.target_transform:
            target = self.target_transform(target)

        return sample, target

    def _check_integrity(self) -> bool:
        """Check that every expected data file is present under ``self.root``.

        Returns:
            bool: True only if ``self.root`` is a directory containing both a
            ``training`` and a ``testing`` sub-directory, each of which holds
            every file listed in ``self.files``. File contents are not inspected.
        """
        if not os.path.isdir(self.root):
            return False

        # Both splits are verified regardless of which one will be loaded.
        for split in ("training", "testing"):
            split_dir = os.path.join(self.root, split)
            if not os.path.isdir(split_dir):
                return False
            if not all(os.path.isfile(os.path.join(split_dir, name)) for name in self.files):
                return False

        return True

    def _load_data(self) -> None:
        """Read the selected split into ``self.data`` and ``self.targets``.

        ``self.data`` becomes a list of cleaned, non-empty lines and
        ``self.targets`` a ``torch.long`` tensor holding, for each line, the
        index of the file it came from.
        """
        data_dir = os.path.join(self.root, "training" if self.train else "testing")

        samples: List[str] = []
        labels: List[int] = []

        for class_label, filename in enumerate(self.files):
            # Explicit encoding so that decoding does not depend on the
            # platform's locale default.
            with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as file:
                cleaned = (self._clean_line(line) for line in file)
                lines = [line for line in cleaned if self._filter_line(line)]

            samples.extend(lines)
            labels.extend([class_label] * len(lines))

        self.data = samples
        self.targets = torch.tensor(labels, dtype=torch.long)

    def _clean_line(self, line):
        """Strip the line and collapse every run of whitespace into one space."""
        # str.split() with no argument both trims the ends and splits on any
        # whitespace run, so joining with " " performs both steps.
        return " ".join(line.split())

    def _filter_line(self, line):
        """Return True for lines that should be kept (i.e. non-empty ones)."""
        return line != ""


# 训练


In [None]:
# Every encoded sentence is cut or padded to this many character ids.
MAX_INPUT_SIZE = 128
# Id used to fill the tail of sentences shorter than MAX_INPUT_SIZE.
PADDING_IDX = 0

# Code points that delimit the supported alphabet.
ASCII_A, ASCII_Z, ASCII_SPACE = ord("a"), ord("z"), ord(" ")
# Vocabulary size: the letters a through z, plus space, plus padding.
NUM_TOKENS = (ASCII_Z - ASCII_A + 1) + 2

def char2int(char: str) -> int:
    """Map a single character to its zero-based integer identifier.

    ``"a"`` maps to 0 and ``"z"`` to ``ASCII_Z - ASCII_A``; the space
    character is remapped to the slot immediately after ``"z"``.
    No validation is performed: any other character yields
    ``ord(char) - ASCII_A``, which may be negative or beyond the alphabet.

    Args:
        char: A string of length one (required by ``ord``).

    Returns:
        The integer identifier of ``char``.
    """
    code_point = ord(char)
    is_space = code_point == ASCII_SPACE
    # Space does not sit next to the letters in ASCII, so it gets the id after "z".
    return (ASCII_Z - ASCII_A + 1) if is_space else (code_point - ASCII_A)


def transform(x: str) -> torch.Tensor:
    """Transform a string into a fixed-length tensor of character indices.

    The text is lower-cased and cut to at most ``MAX_INPUT_SIZE`` characters.
    Each character is mapped with ``char2int`` and shifted up by one, which
    keeps the value of ``PADDING_IDX`` out of the range used for letters.
    Shorter inputs are right-padded with ``PADDING_IDX``.

    Args:
        x: The sentence to encode.

    Returns:
        A 1-D ``torch.long`` tensor with exactly ``MAX_INPUT_SIZE`` elements.
    """
    # Lower-case BEFORE truncating: str.lower() can lengthen a string
    # (e.g. "İ" lower-cases to two code points), so truncating first could
    # yield more than MAX_INPUT_SIZE ids and break batching.
    chars = x.lower()[:MAX_INPUT_SIZE]
    char_ids = [char2int(char) + 1 for char in chars]

    # Multiplying a list by zero gives an empty list, so no length check is needed.
    char_ids.extend([PADDING_IDX] * (MAX_INPUT_SIZE - len(char_ids)))

    return torch.tensor(char_ids, dtype=torch.long)

In [None]:
BATCH_SIZE = 12

# EuropeanLanguages reads local files under ../data/Eurolang and its
# constructor has no `download` parameter, so passing one raises TypeError.
train_ds = EuropeanLanguages("../data", train=True, transform=transform)
train_ld = data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

# The evaluation split keeps its on-disk order (no shuffling).
test_ds = EuropeanLanguages("../data", train=False, transform=transform)
test_ld = data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)