In [2]:
import pandas as pd
import numpy as np
import torch
from torchtext import data
from torchtext.vocab import Vectors

<img src="./imgs/torchText.png"  width="700" height="700" align="bottom" />


- 使用`Field`对象进行文本预处理，生成example。Field即为文本预处理方法，example为一条样本数据     
- 使用`Dataset`类生成数据集dataset    
- 使用`Iterator`生成迭代器    

### 定义预处理函数

In [3]:
def text_tokenize(x):
    """Split a raw sentence into tokens on runs of whitespace.

    Args:
        x: Object exposing ``split()`` (a ``str`` in this notebook).

    Returns:
        The list produced by ``x.split()``; empty for an empty or
        whitespace-only string.
    """
    tokens = x.split()
    return tokens

def label_tokenize(y):
    """Identity "tokenizer" for labels: hand the value back untouched.

    Args:
        y: The label value.

    Returns:
        The very same object that was passed in.
    """
    label = y
    return label

### 定义Field

In [None]:
# Preprocessing recipe for the sentence columns: tokenize with
# `text_tokenize` and use a fixed length of 40 positions per sentence
# (the batch tensors printed further down are 40x100).
TEXT = data.Field(
    sequential=True,
    tokenize=text_tokenize,
    fix_length=40,
)

# Preprocessing recipe for the label column: no vocabulary is used, so the
# label values are expected to be numeric already.
LABEL = data.Field(
    sequential=True,
    tokenize=label_tokenize,
    use_vocab=False,
)

In [31]:
def get_dataset(csv_data, text_field, label_field):
    """Turn a table of sentence pairs into a list of torchtext examples.

    Args:
        csv_data: Column-indexable table (a pandas DataFrame in this
            notebook) providing ``sentence1``, ``sentence2`` and
            ``gold_label``.
        text_field: Field applied to both sentence columns.
        label_field: Field applied to the ``gold_label`` column.

    Returns:
        A tuple ``(examples, fields)``: one ``data.Example`` per row, and the
        list of ``(column_name, field)`` pairs the examples were built with.
    """
    fields = [
        ('sentence1', text_field),
        ('sentence2', text_field),
        ('gold_label', label_field),
    ]
    rows = zip(csv_data['sentence1'], csv_data['sentence2'], csv_data['gold_label'])
    # Each label is wrapped in a one-element list; per the original author's
    # note, a bare label does not satisfy what the downstream loader expects.
    examples = [
        data.Example.fromlist([first, second, [label]], fields)
        for first, second, label in rows
    ]
    return examples, fields

In [32]:
# Load the tab-separated train/test splits into DataFrames; the columns used
# later are `sentence1`, `sentence2` and `gold_label`.
train_data = pd.read_csv('./data/SNLI/snli-train.txt',sep='\t')
test_data = pd.read_csv('./data/SNLI/snli-test.txt',sep='\t')

In [None]:
# Convert each DataFrame into a list of examples plus the matching field spec.
# Both splits share the same TEXT and LABEL field objects.
train_examples, train_fields = get_dataset(train_data, TEXT, LABEL)
test_examples, test_fields = get_dataset(test_data, TEXT, LABEL)

In [34]:
# Wrap the example lists and their field specs into Dataset objects.
train = data.Dataset(train_examples, train_fields)
test = data.Dataset(test_examples, test_fields)

In [35]:
# Inspect the attributes of the first training example: the two tokenized
# sentences and the list-wrapped label.
vars(train_examples[0])

{'sentence1': ['a',
  'person',
  'on',
  'a',
  'horse',
  'jumps',
  'over',
  'a',
  'broken',
  'down',
  'airplane'],
 'sentence2': ['a',
  'person',
  'is',
  'training',
  'his',
  'horse',
  'for',
  'a',
  'competition'],
 'gold_label': [2]}

In [36]:
# Load the pretrained word vectors (GloVe 6B, 300-d, judging by the file name).
vectors = Vectors(name='./data/glove.6B.300d.txt')

In [37]:
# Build the vocabulary from the training set and attach the pretrained vectors.
TEXT.build_vocab(train, vectors=vectors)

In [38]:
# stoi: string-to-index mapping of the vocabulary
# ('<unk>' is 0 and '<pad>' is 1 in the output below).
TEXT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000002255C3D0FA0>>,
            {'<unk>': 0,
             '<pad>': 1,
             'a': 2,
             'the': 3,
             'in': 4,
             'is': 5,
             'man': 6,
             'on': 7,
             'and': 8,
             'are': 9,
             'of': 10,
             'with': 11,
             'woman': 12,
             'two': 13,
             'people': 14,
             'to': 15,
             'at': 16,
             'wearing': 17,
             'an': 18,
             'his': 19,
             'young': 20,
             'men': 21,
             'playing': 22,
             'girl': 23,
             'boy': 24,
             'white': 25,
             'shirt': 26,
             'while': 27,
             'black': 28,
             'dog': 29,
             'sitting': 30,
             'blue': 31,
             'standing': 32,
             'her': 33,
             'red': 34,
             'group': 35,
     

In [39]:
# Embedding tensor attached to the vocabulary by `build_vocab(..., vectors=vectors)`.
# NOTE(review): presumably row i is the vector for the token with index i in
# `stoi`, and all-zero rows are tokens without a pretrained vector — confirm.
TEXT.vocab.vectors

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2971,  0.0940, -0.0967,  ...,  0.0597, -0.2285,  0.2960],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4338,  0.1425,  0.2357,  ...,  0.6148, -0.4354,  0.2586],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [40]:
# Number of entries in the vocabulary built from the training set.
len(TEXT.vocab)

36990

In [41]:
# The vocabulary only contains words that occur in the training data, hence
# its size of 36990, whereas the pretrained embedding file holds 400000 lines
# (one per word vector).
# Count the lines by streaming the file: `readlines()` would keep every line
# of the embedding file in memory just to take a length.
with open('./data/glove.6B.300d.txt', encoding='utf-8') as f:
    print(sum(1 for _ in f))

400000


In [42]:
# The ten most frequent tokens and their counts, as recorded when the
# vocabulary was built from the training set.
TEXT.vocab.freqs.most_common(10)

[('a', 1441009),
 ('the', 535465),
 ('in', 407630),
 ('is', 373876),
 ('man', 264886),
 ('on', 236194),
 ('and', 206479),
 ('are', 199291),
 ('of', 192412),
 ('with', 169465)]

In [43]:
from torchtext.data import Iterator

In [44]:
# Batch iterators over both splits: 100 examples per batch, no sorting,
# tensors on CPU, and a single pass over the data (repeat=False).
# NOTE(review): neither `train=` nor `shuffle=` is passed for `test_iter` —
# confirm the library default does not shuffle the test split if the order of
# predictions matters.
train_iter = Iterator(train, batch_size=100, sort=False, device=torch.device('cpu'), repeat=False)
test_iter = Iterator(test, batch_size=100, sort=False, device=torch.device('cpu'), repeat=False)

In [45]:
# Pull the first batch from the training iterator and print its summary
# followed by the three field tensors. In the output below the sentence
# tensors are 40x100 and the label tensor is 1x100.
batch=next(iter(train_iter))
print(batch)
print(batch.sentence1)
print(batch.sentence2)
print(batch.gold_label)


[torchtext.data.batch.Batch of size 100]
	[.sentence1]:[torch.LongTensor of size 40x100]
	[.sentence2]:[torch.LongTensor of size 40x100]
	[.gold_label]:[torch.LongTensor of size 1x100]
tensor([[  2,   2,   2,  ...,   2,   2,  14],
        [ 35,  20,   6,  ...,  72,  12, 162],
        [ 10,  23,   4,  ...,  29,  17,   7],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
tensor([[  3,   3,   3,  ...,   2,   2,  56],
        [ 35,  23,   6,  ...,  72, 102,  14],
        [ 10,   5,   5,  ...,  29,   5,   9],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
tensor([[0, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0, 2,
         2, 1, 1, 2, 0, 2, 0, 1, 0, 2, 1, 2, 0, 2, 2, 0, 1, 2, 2, 0, 2, 1, 1, 0,
         1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 