In [66]:
import torch
import torch.nn as nn
import torchtext
from torchtext import data
from torchtext.vocab import Vectors
from torchtext.data import Iterator, BucketIterator
import spacy
import numpy as np

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
tokenize = lambda x: [tok.text for tok in nlp.tokenizer(x)]

In [5]:
tokenize('Bozun is the strongest!')

['Bozun', 'is', 'the', 'strongest', '!']

In [6]:
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=200)
LABEL = data.Field(sequential=False, use_vocab=False)

In [8]:
fields = [("id", None), ("phrase", TEXT), ("sentiment", LABEL)]
examples = []

In [10]:
example1 = data.Example.fromlist([None, 'No one is stronger than Bozun!', 5], fields)

In [11]:
example1

<torchtext.data.example.Example at 0x7fe345033d50>

In [12]:
dir(example1)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'fromCSV',
 'fromJSON',
 'fromdict',
 'fromlist',
 'fromtree',
 'phrase',
 'sentiment']

In [13]:
example1.phrase

['no', 'one', 'is', 'stronger', 'than', 'bozun', '!']

In [14]:
example1.sentiment

5

In [15]:
example1.fromCSV

<bound method Example.fromCSV of <class 'torchtext.data.example.Example'>>

In [16]:
example1.fromCSV()

TypeError: fromCSV() missing 2 required positional arguments: 'data' and 'fields'

In [17]:
example1.__class__

torchtext.data.example.Example

In [20]:
example1.get_attribute('phrase')

AttributeError: 'Example' object has no attribute 'get_attribute'

In [21]:
example1.phrase

['no', 'one', 'is', 'stronger', 'than', 'bozun', '!']

In [25]:
example2 = data.Example.fromlist([None, 'Bozun is the strongest!', 3], fields)

In [26]:
example2

<torchtext.data.example.Example at 0x7fe345011dd0>

In [27]:
example2.phrase

['bozun', 'is', 'the', 'strongest', '!']

In [28]:
example2.sentiment

3

In [29]:
examples = [example1, example2]

In [30]:
examples

[<torchtext.data.example.Example at 0x7fe345033d50>,
 <torchtext.data.example.Example at 0x7fe345011dd0>]

In [31]:
fields

[('id', None),
 ('phrase', <torchtext.data.field.Field at 0x7fe345038d50>),
 ('sentiment', <torchtext.data.field.Field at 0x7fe345038e90>)]

In [35]:
cur_dataset = data.Dataset(examples, fields)

In [36]:
type(cur_dataset)

torchtext.data.dataset.Dataset

In [40]:
from torch.utils.data import Dataset

In [42]:
issubclass(cur_dataset, Dataset)

TypeError: issubclass() arg 1 must be a class

In [43]:
issubclass(data.Dataset, Dataset)

True

In [44]:
dir(cur_dataset)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'download',
 'examples',
 'fields',
 'filter_examples',
 'sort_key',
 'split',
 'splits']

In [45]:
cur_dataset.fields

{'id': None,
 'phrase': <torchtext.data.field.Field at 0x7fe345038d50>,
 'sentiment': <torchtext.data.field.Field at 0x7fe345038e90>}

In [46]:
cur_dataset.split

<bound method Dataset.split of <torchtext.data.dataset.Dataset object at 0x7fe347e6e150>>

In [47]:
cur_dataset[0]

<torchtext.data.example.Example at 0x7fe345033d50>

In [48]:
len(cur_dataset)

2

In [51]:
for i, example in enumerate(cur_dataset):
    print('i:{i} phrase:{phrase}'.format(i=i, phrase=example.phrase))

i:0 phrase:['no', 'one', 'is', 'stronger', 'than', 'bozun', '!']
i:1 phrase:['bozun', 'is', 'the', 'strongest', '!']


In [53]:
def my_dropout(text, p=0.5):
    text = text.strip().split()
    len_ = len(text)
    indexs = np.random.choice(len_, int(len_ * p))
    for i in indexs:
        text[i] = ''
    return ' '.join(text)

In [58]:
for i in range(3):
    print(my_dropout('bozun is the best!'))

bozun  the 
 is the 
 is  best!


In [59]:
np.random.choice(10, 5)

array([5, 8, 3, 1, 3])

In [60]:
cur_dataset[0].__dict__

{'phrase': ['no', 'one', 'is', 'stronger', 'than', 'bozun', '!'],
 'sentiment': 5}

In [61]:
cur_dataset[1].__dict__

{'phrase': ['bozun', 'is', 'the', 'strongest', '!'], 'sentiment': 3}

In [63]:
!ls -a

[1m[36m.[m[m                  [1m[36m.git[m[m               README.md
[1m[36m..[m[m                 .gitignore         [1m[36mdata[m[m
.DS_Store          [1m[36m.ipynb_checkpoints[m[m nlp_practice.ipynb


In [64]:
!mkdir .vector_cache

In [65]:
!ls -a |grep 'vector'

.vector_cache


In [67]:
from torch.utils.data import DataLoader
issubclass(Iterator, DataLoader)

False

In [68]:
cur_iterator = BucketIterator(dataset=cur_dataset, batch_size=1, shuffle=True, sort_within_batch=False, repeat=False)
cur_iterator

<torchtext.data.iterator.BucketIterator at 0x7fe3498401d0>

In [71]:
cur_iterator.data()

[<torchtext.data.example.Example at 0x7fe345011dd0>,
 <torchtext.data.example.Example at 0x7fe345033d50>]

In [70]:
for idx, batch in enumerate(cur_iterator):
    print(idx)
    print(batch)
    # print(dir(batch))

AttributeError: 'Field' object has no attribute 'vocab'