In [1]:
import torch
import numpy as np
import string

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


## 文本处理

1. 分词/Tokenization
```py
list(str)     ## 1. by letter
str.split()   ## 2. by word
...........   ## 3. NLTK-ngrams: 有点类似k-mer，将n个相邻词合并为一个词段，稍后embed这个词段
```
2. 创建词表 vocab

3. 词表向量化：tf-idf/one-hot/散列编码/embedding

In [2]:
str = "Sing in me, Muse, and through me tell the story of that man skilled in all ways of contending, the wanderer, harried for years on end, after he plundered the stronghold on the proud height of Troy"

for c in string.punctuation:
    str = str.replace(c,' ').replace('  ',' ').lower()

In [3]:
vocab = dict([(v,k) for k,v in enumerate(set(str.split()))])
s = [vocab.get(v) for v in str.split()]  
print(s)

[4, 21, 14, 10, 5, 23, 14, 27, 19, 1, 2, 26, 16, 13, 21, 22, 9, 2, 24, 19, 15, 7, 11, 17, 12, 18, 28, 20, 0, 19, 8, 12, 19, 3, 25, 2, 6]


In [4]:
## 1. one-hot Mtx, each line for a word
b = np.zeros((len(s),len(vocab))) 
for index,v in enumerate(s):
    b[index,v] = 1
b

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
## 2. Embedding, each line for a word
emb_layer = torch.nn.Embedding(len(vocab),20)
emb = emb_layer(torch.LongTensor(s))
emb.shape

torch.Size([37, 20])

## TorchText

```
torchtext.datasets.* 提供常见数据集  但是与torch==2.3.0不相容；且容易出错，建议转换为list()


```
* 各种文本处理工具: https://pytorch.org/text/stable/data_utils.html   
* 忘记 vocab.set_default_index 后续调用如果遇到不在vocab中的词后程序会崩溃

In [6]:
import torchtext     ## pip install torchtext; pip install torchdata
torchtext.__version__

'0.16.2+cpu'

In [7]:
test_data = list(torchtext.datasets.IMDB(split='train'))
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
def yield_tokens(data):
    for (_,text) in data:
        yield tokenizer(text)

vocab = torchtext.vocab.build_vocab_from_iterator(yield_tokens(test_data),specials=['<pad>','<unk>'],min_freq=3)
vocab.set_default_index(vocab['<unk>'])

In [8]:
len(vocab)

27771

In [9]:
next(iter(test_data))

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

In [10]:
next(yield_tokens(test_data))

['i',
 'rented',
 'i',
 'am',
 'curious-yellow',
 'from',
 'my',
 'video',
 'store',
 'because',
 'of',
 'all',
 'the',
 'controversy',
 'that',
 'surrounded',
 'it',
 'when',
 'it',
 'was',
 'first',
 'released',
 'in',
 '1967',
 '.',
 'i',
 'also',
 'heard',
 'that',
 'at',
 'first',
 'it',
 'was',
 'seized',
 'by',
 'u',
 '.',
 's',
 '.',
 'customs',
 'if',
 'it',
 'ever',
 'tried',
 'to',
 'enter',
 'this',
 'country',
 ',',
 'therefore',
 'being',
 'a',
 'fan',
 'of',
 'films',
 'considered',
 'controversial',
 'i',
 'really',
 'had',
 'to',
 'see',
 'this',
 'for',
 'myself',
 '.',
 'the',
 'plot',
 'is',
 'centered',
 'around',
 'a',
 'young',
 'swedish',
 'drama',
 'student',
 'named',
 'lena',
 'who',
 'wants',
 'to',
 'learn',
 'everything',
 'she',
 'can',
 'about',
 'life',
 '.',
 'in',
 'particular',
 'she',
 'wants',
 'to',
 'focus',
 'her',
 'attentions',
 'to',
 'making',
 'some',
 'sort',
 'of',
 'documentary',
 'on',
 'what',
 'the',
 'average',
 'swede',
 'thought',


In [11]:
vocab['am']   ## index

229

In [12]:
vocab(['i','love'])

[12, 173]

In [13]:
vocab(next(yield_tokens(test_data)))

[12,
 1152,
 12,
 229,
 24140,
 49,
 71,
 350,
 1023,
 86,
 8,
 39,
 3,
 7332,
 15,
 2974,
 11,
 65,
 11,
 17,
 97,
 747,
 13,
 7813,
 2,
 12,
 110,
 563,
 15,
 37,
 97,
 11,
 17,
 17607,
 45,
 1465,
 2,
 16,
 2,
 11062,
 50,
 11,
 123,
 607,
 9,
 2635,
 14,
 799,
 4,
 1486,
 117,
 5,
 371,
 8,
 128,
 1205,
 3534,
 12,
 68,
 69,
 9,
 76,
 14,
 19,
 478,
 2,
 3,
 100,
 10,
 5504,
 190,
 5,
 265,
 3993,
 614,
 1271,
 830,
 6787,
 48,
 517,
 9,
 900,
 286,
 67,
 58,
 51,
 160,
 2,
 13,
 978,
 67,
 517,
 9,
 1215,
 56,
 12245,
 9,
 232,
 54,
 397,
 8,
 813,
 27,
 53,
 3,
 752,
 22749,
 206,
 51,
 906,
 1170,
 1461,
 151,
 22,
 3,
 2429,
 457,
 6,
 1609,
 1461,
 13,
 3,
 2670,
 1767,
 2,
 13,
 258,
 1742,
 7456,
 6,
 2317,
 1,
 8,
 16110,
 51,
 79,
 4553,
 27,
 2406,
 4,
 67,
 59,
 337,
 21,
 56,
 614,
 1549,
 4,
 9545,
 4,
 6,
 1187,
 418,
 2,
 53,
 890,
 75,
 51,
 12,
 229,
 24140,
 10,
 15,
 1595,
 207,
 702,
 4,
 14,
 17,
 1205,
 7457,
 2,
 68,
 4,
 3,
 337,
 6,
 718,
 143,
 31,
 168,
 

## DataLoader

设定 collate_fn batch ...

此例是为了torch.nn.EmbeddingBag的输入做准备


In [14]:
def collate_batch(data_batch):
    label_lst, text_lst, offset_lst = [],[],[]
    for _label, _text in data_batch:
        label_lst.append(_label)
        tk_text = vocab(tokenizer(_text))
        text_lst.append(torch.tensor(tk_text,dtype=torch.int64))
        offset_lst.append(len(tk_text))
    label_lst = torch.tensor(label_lst)
    text_lst = torch.cat(text_lst)
    offsets = torch.cat((torch.tensor([0]), torch.tensor(offset_lst[:-1]).cumsum(dim=0) ))
    return label_lst.to(device),text_lst.to(device),offsets.to(device)


test_dl = torch.utils.data.DataLoader(test_data, batch_size=64, shuffle=True, collate_fn = collate_batch)

In [15]:
for tt in test_dl:
    pass

tt

(tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([ 12, 206,  15,  ..., 689,  25,   2]),
 tensor([   0,  312,  547, 1260, 1444, 1597, 1822, 2291, 2503, 2674, 2812, 2943,
         3166, 3455, 4337, 4728, 4938, 6046, 6227, 6447]))

## Model

```
## torch.nn.EmbeddingBag 接受词表进行 Embedding，并对 Embedding 输出进行聚合：求和/均值/最大值/。。。
## 可以将一个批次中全部文本合并为一个长序列，并记录其中每一条文本的 **偏移值**（其所在位置）
```


In [16]:
class TextClassifyModel(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.Embed_Bag = torch.nn.EmbeddingBag(vocab_size,embed_dim)
        self.fc = torch.nn.Linear(embed_dim, 2)
    def forward(self, text, offset):
        embd = self.Embed_Bag(text,offset)
        return self.fc(embd)

model = TextClassifyModel(len(vocab),100)
model

TextClassifyModel(
  (Embed_Bag): EmbeddingBag(27771, 100, mode='mean')
  (fc): Linear(in_features=100, out_features=2, bias=True)
)

In [17]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

In [18]:
def train(dataloader, model, loss_fn, optimizer):
    lossSum = 0
    model.train()                                    ### set training mode
    for label_lst,text_lst,offsets in dataloader:
        # Compute prediction error
        pred = model(text_lst,offsets)
        loss = loss_fn(pred, label_lst)
        lossSum += loss.item()
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    avgTrainingLoss = lossSum/len(dataloader)
    return avgTrainingLoss

In [19]:
epochs = 10
for t in range(epochs):
    avgTrainingLoss = train(test_dl, model, loss_fn, optimizer)
    print(f'Epoch {t+1}----Train Loss:: {avgTrainingLoss:>7f}') 

Epoch 1----Train Loss:: 0.059540
Epoch 2----Train Loss:: 0.008917
Epoch 3----Train Loss:: 0.005182
Epoch 4----Train Loss:: 0.003663
Epoch 5----Train Loss:: 0.002834
Epoch 6----Train Loss:: 0.002312
Epoch 7----Train Loss:: 0.001951
Epoch 8----Train Loss:: 0.001689
Epoch 9----Train Loss:: 0.001488
Epoch 10----Train Loss:: 0.001331
