In [1]:
import glob
import pathlib
import re

import torch
import torchvision
import tqdm

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision import transforms, models
from torchvision.datasets import FashionMNIST, ImageFolder
from torchvision.utils import save_image

from IPython.display import Image, display_jpeg

# Matches text that should be stripped from raw input: either a single
# punctuation mark out of  , . ( ) [ ] * : ;  or an HTML-style tag such as
# "<br />" (non-greedy, so each tag is matched on its own).
# NOTE: the two alternatives must be joined with "|". The previous pattern had
# a "]" in that position, which required a literal "]" followed by a tag and
# therefore never removed ordinary punctuation or tags.
# A raw string is used so the backslash escapes reach the regex engine intact.
remove_marks_regex = re.compile(r"[,\.\(\)\[\]\*:;]|<.*?>")
# Captures one "?" or "!" so a substitution can re-emit it via group 1
# (e.g. with a leading space to split it off from the preceding word).
shift_marks_regex = re.compile(r"([?!])")

In [2]:
# Lookup table with 10000 rows of 20-dimensional vectors; index 0 is the
# padding index.
emb = nn.Embedding(num_embeddings=10000, embedding_dim=20, padding_idx=0)
# The layer is indexed with an integer (int64) tensor of token ids.
inp = torch.tensor([1, 2, 5, 2, 10], dtype=torch.long)
# Calling the layer yields one float32 embedding vector per input id.
out = emb(inp)

In [6]:
def text2ids(text, vocab_dict):
    """Turn a text string into a list of vocabulary ids.

    Args:
        text: Input string to tokenize.
        vocab_dict: Mapping from token string to id; looked up with ``.get``.

    Returns:
        A list with one id per whitespace-separated token. Tokens missing
        from ``vocab_dict`` are mapped to 0.
    """
    # Drop everything that `remove_marks_regex` matches.
    cleaned = remove_marks_regex.sub("", text)
    # Put a space in front of each `shift_marks_regex` match ('?' / '!') so
    # that the mark is split off as its own token below.
    spaced = shift_marks_regex.sub(r" \1", cleaned)
    ids = []
    for word in spaced.split():
        ids.append(vocab_dict.get(word, 0))
    return ids

In [4]:
def list2tensor(token_idxes, max_len=100, padding=True):
    """Convert a list of token ids into an int64 tensor plus its token count.

    Args:
        token_idxes: List of integer token ids.
        max_len: Maximum number of ids kept; longer inputs are cut off.
        padding: When true, the ids are right-padded with 0 up to ``max_len``.

    Returns:
        A tuple ``(tensor, n_tokens)`` where ``tensor`` is a ``torch.int64``
        tensor of the (truncated, optionally padded) ids and ``n_tokens`` is
        the number of ids after truncation and before padding.
    """
    # Clip first so the reported length can never exceed max_len.
    clipped = token_idxes[:max_len] if len(token_idxes) > max_len else token_idxes
    n_tokens = len(clipped)
    if padding:
        n_pad = max_len - n_tokens
        clipped = clipped + [0] * n_pad
    ids = torch.tensor(clipped, dtype=torch.int64)
    return ids, n_tokens