## IMDb 데이터셋을 TSV 형식으로 변환

In [4]:
import glob
import os
import io
import string

# Build the training TSV: one review per row, written as "<text>\t<label>\t".
# Positive reviews (label "1") are written first, then negative ones (label "0").
# The output is written as UTF-8 explicitly (the inputs are read as UTF-8) and
# the `with` block closes the file even if reading a review fails part-way.
with open("./data/IMDb_train.tsv", "w", encoding="utf-8") as f:
    for path, label in (("./data/aclImdb/train/pos/", "1"),
                        ("./data/aclImdb/train/neg/", "0")):
        for fname in glob.glob(os.path.join(path, "*.txt")):
            with io.open(fname, 'r', encoding="utf-8") as ff:
                # Only the first line of each review file is used.
                text = ff.readline()

            # A tab inside the review would be read back as a column separator,
            # and a trailing line break would split the row in two.
            text = text.rstrip("\r\n").replace("\t", " ")
            f.write(text + "\t" + label + "\t" + "\n")

In [5]:

# Build the test TSV: one review per row, written as "<text>\t<label>\t".
# The file is named IMDb_test.tsv because that is the name the dataset-loading
# cell passes as `test=`; the previous name "IMDb_text.tsv" was a typo that left
# the loader without its input file.
# The output is written as UTF-8 explicitly (the inputs are read as UTF-8) and
# the `with` block closes the file even if reading a review fails part-way.
with open("./data/IMDb_test.tsv", "w", encoding="utf-8") as f:
    for path, label in (("./data/aclImdb/test/pos/", "1"),
                        ("./data/aclImdb/test/neg/", "0")):
        for fname in glob.glob(os.path.join(path, "*.txt")):
            with io.open(fname, 'r', encoding="utf-8") as ff:
                # Only the first line of each review file is used.
                text = ff.readline()

            # A tab inside the review would be read back as a column separator,
            # and a trailing line break would split the row in two.
            text = text.rstrip("\r\n").replace("\t", " ")
            f.write(text + "\t" + label + "\t" + "\n")

## 전처리 및 단어 분할 함수 정의

In [6]:
import string
import re

# Show the characters that string.punctuation contains.
punctuation_chars = string.punctuation
print("구두 점 문자 : ", punctuation_chars)

# Translation table built once: "." and "," are padded with spaces so they
# become separate tokens; every other punctuation character becomes a space.
_PUNCTUATION_TABLE = str.maketrans({
    **{mark: " " for mark in string.punctuation if mark not in ".,"},
    ".": " . ",
    ",": " , ",
})


def preprocessing_text(text):
    """Clean a raw review string before whitespace tokenization.

    The literal "<br />" is removed first (replaced by the empty string).
    Then every character in ``string.punctuation`` except "." and "," is
    replaced by a single space, while "." and "," are surrounded by spaces.

    Args:
        text: The raw review text.

    Returns:
        The cleaned string.
    """
    without_breaks = re.sub("<br />", '', text)
    return without_breaks.translate(_PUNCTUATION_TABLE)


def tokenizer_punctuation(text):
    """Split *text* into tokens on runs of whitespace.

    ``str.split()`` without a separator already discards leading and
    trailing whitespace, so no explicit strip is needed.
    """
    tokens = text.split()
    return tokens

def tokenizer_with_preprocessing(text):
    """Run preprocessing_text on *text*, then tokenize it with tokenizer_punctuation."""
    return tokenizer_punctuation(preprocessing_text(text))

# Quick sanity check of the full pipeline on a short sentence.
sample_tokens = tokenizer_with_preprocessing("I like cats.")
print(sample_tokens)





구두 점 문자 :  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['I', 'like', 'cats', '.']


## 데이터 로더 작성

In [8]:
import torchtext

# Value passed as fix_length for the text field below.
max_length = 256

# Field describing the review text: tokenized with tokenizer_with_preprocessing,
# lower-cased, wrapped in <cls>/<eos> tokens, batch dimension first, and
# returned together with the sequence lengths (include_lengths=True).
TEXT = torchtext.legacy.data.Field(
    sequential=True,
    tokenize=tokenizer_with_preprocessing,
    use_vocab=True,
    lower=True,
    include_lengths=True,
    batch_first=True,
    fix_length=max_length,
    init_token="<cls>",
    eos_token="<eos>",
)

# Field describing the label: a single non-sequential value, no vocabulary.
LABEL = torchtext.legacy.data.Field(
    sequential=False,
    use_vocab=False,
)

In [12]:
# Load both TSV files from ./data/; the first column is mapped to the TEXT
# field as 'Text' and the second to the LABEL field as 'Label'.
train_val_ds, test_ds = torchtext.legacy.data.TabularDataset.splits(
    path='./data/',
    train='IMDb_train.tsv',
    test='IMDb_test.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)],
)

# Sanity check: dataset size and the first example.
first_example = train_val_ds[0]
print('훈련 및 검증 데이터 수', len(train_val_ds))
print('첫번째 훈련 및 검증 데이터', vars(first_example))


훈련 및 검증 데이터 수 25000
첫번째 훈련 및 검증 데이터 {'Text': ['the', 'wit', 'and', 'pace', 'and', 'three', 'show', 'stopping', 'busby', 'berkley', 'numbers', 'put', 'this', 'ahead', 'of', 'the', 'over', 'rated', '42nd', 'street', '.', 'this', 'is', 'the', 'definitive', '30', 's', 'musical', 'with', 'a', 'knockout', 'frenetic', 'performance', 'from', 'jimmy', 'cagney', '.', 'one', 'of', 'the', 'last', 'releases', 'before', 'the', 'motion', 'picture', 'production', 'code', 'was', 'strictly', 'enforced', '.', 'a', 'must', 'see', '.'], 'Label': '1'}


In [13]:
import random
# Seed the global RNG, then hand its state to split() explicitly.
# random.seed() returns None, so writing `random_state=random.seed(1234)`
# passed None and only produced a seeded split because seeding happened as a
# side effect of evaluating the argument. Passing random.getstate() states the
# intent directly.
random.seed(1234)
train_ds, val_ds = train_val_ds.split(
    split_ratio=0.8,
    random_state=random.getstate(),
)

# Sanity check: split sizes and the first training example.
print("훈련 데이터의 수 : ", len(train_ds))
print("검증 데이터의 수 : ", len(val_ds))
print("첫 번째 훈련 데이터: ",vars(train_ds[0]))

훈련 데이터의 수 :  20000
검증 데이터의 수 :  5000
첫 번째 훈련 데이터:  {'Text': ['if', 'you', 'want', 'to', 'know', 'what', 'kind', 'of', 'music', 'white', 'people', 'listened', 'to', 'in', '1974', ',', 'this', 'is', 'the', 'movie', 'for', 'you', '.', 'but', 'you', 'll', 'have', 'to', 'listen', 'to', 'a', 'lot', 'of', 'flutes', 'and', 'violins', ',', 'too', 'see', 'my', 'remarks', 'on', 'my', 'girl', '1', 'for', 'the', 'reference', '.', 'indulgent', 'admission', 'i', 'approached', 'my', 'girl', '2', 'with', 'cynicism', 'and', 'annoyance', ',', 'having', 'just', 'viewed', 'its', 'predecessor', '.', 'but', 'as', 'an', 'adoptee', 'preparing', 'to', 'finally', 'set', 'upon', 'a', 'search', 'for', 'my', 'birthmother', ',', 'my', 'girl', '2', 'made', 'me', 'look', ',', 'with', 'its', 'theme', 'of', 'searching', 'for', 'mother', '.', 'put', 'another', 'way', ',', 'anything', 'i', 'liked', 'about', 'my', 'girl', '2', 'had', 'nothing', 'whatsoever', 'to', 'do', 'with', 'my', 'girl', '2', ',', 'but', 'relating', 't

## vocabulary 작성

In [14]:
from torchtext.vocab import Vectors

# Load the pretrained word vectors from the local .vec file.
english_fasttext_vectors = Vectors(name="data/wiki-news-300d-1M.vec")

# Report the vector dimensionality and the number of entries in itos.
vector_dim = english_fasttext_vectors.dim
vector_count = len(english_fasttext_vectors.itos)
print("한 단어를 표현하는 차원 수 : ", vector_dim)
print("단어 수: ", vector_count)

  0%|          | 0/999994 [00:00<?, ?it/s]Skipping token b'999994' with 1-dimensional vector [b'300']; likely a header
100%|██████████| 999994/999994 [01:24<00:00, 11869.91it/s]


한 단어를 표현하는 차원 수 :  300
단어 수:  999994


In [15]:
# Build the vocabulary of the TEXT field from the training split only and
# attach the pretrained vectors loaded above.
# NOTE(review): min_freq=10 presumably drops tokens seen fewer than 10 times —
# confirm against the torchtext Field.build_vocab documentation.
TEXT.build_vocab(train_ds, vectors=english_fasttext_vectors, min_freq=10)

# Shape of the vocabulary's vector matrix.
print(TEXT.vocab.vectors.shape)
# Bare expression: it is not the last expression of the cell, so a notebook
# does not display it.
TEXT.vocab.vectors

# Last expression of the cell: the notebook displays the token -> index mapping.
TEXT.vocab.stoi

torch.Size([17969, 300])


defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7f4f7edf5610>>,
            {'<unk>': 0,
             '<pad>': 1,
             '<cls>': 2,
             '<eos>': 3,
             'the': 4,
             '.': 5,
             ',': 6,
             'and': 7,
             'a': 8,
             'of': 9,
             'to': 10,
             'is': 11,
             'it': 12,
             'in': 13,
             'i': 14,
             'this': 15,
             'that': 16,
             's': 17,
             'was': 18,
             'as': 19,
             'with': 20,
             'for': 21,
             'movie': 22,
             'but': 23,
             'film': 24,
             'you': 25,
             't': 26,
             'on': 27,
             'not': 28,
             'he': 29,
             'are': 30,
             'his': 31,
             'have': 32,
             'be': 33,
             'one': 34,
             'all': 35,
             'at': 36,
             'the

In [16]:
# One iterator per split, each yielding batches of 24 examples.
train_dl = torchtext.legacy.data.Iterator(
    train_ds,
    batch_size=24,
    train=True,
)

val_dl = torchtext.legacy.data.Iterator(
    val_ds,
    batch_size=24,
    train=False,
    sort=False,
)

test_dl = torchtext.legacy.data.Iterator(
    test_ds,
    batch_size=24,
    train=False,
    sort=False,
)

# Sanity check: pull the first validation batch and show its text and labels.
val_batches = iter(val_dl)
batch = next(val_batches)
print(batch.Text)
print(batch.Label)


(tensor([[   2,   15,   11,  ...,    1,    1,    1],
        [   2,   21,    8,  ...,    1,    1,    1],
        [   2,   14,  469,  ...,    1,    1,    1],
        ...,
        [   2,   14,  308,  ...,    1,    1,    1],
        [   2,  193, 5776,  ...,    0,    7,    3],
        [   2,  790,  437,  ...,   23,   21,    3]]), tensor([109,  60, 229, 256, 256, 256, 195, 152, 148, 172, 141, 104, 256, 232,
        150, 256, 256, 256, 152, 256, 256, 169, 256, 256]))
tensor([1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0])
