In [98]:
import os
import urllib.request

import pandas as pd
import torch

In [99]:
# Remote location of the IMDb review CSV and the local path it is stored at.
DATA_URL = "https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv"
DATA_FILE = "IMDb_Reviews.csv"

# Download only when the file is not already on disk, so re-running the
# notebook neither hits the network again nor fails while offline.
if not os.path.exists(DATA_FILE):
    # Fetch into a temporary name first and rename afterwards, so an
    # interrupted download can never be mistaken for a complete file.
    partial_file = DATA_FILE + ".part"
    urllib.request.urlretrieve(DATA_URL, filename=partial_file)
    os.replace(partial_file, DATA_FILE)

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x1d96b9f0518>)

In [100]:
# Parse the downloaded CSV (decoded as latin1) into a DataFrame and preview
# its first five rows.
df = pd.read_csv(
    'IMDb_Reviews.csv',
    encoding='latin1',
)
df.head(n=5)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [101]:
# Report how many rows (reviews) were loaded.
print('전체 샘플의 개수 : {}'.format(df.shape[0]))

전체 샘플의 개수 : 50000


In [102]:
from sklearn.model_selection import train_test_split

# Split the reviews into equally sized train and test halves; the fixed
# random_state keeps the split identical across runs.
train_df, test_df = train_test_split(
    df,
    random_state=0,
    test_size=0.5,
)
(train_df.shape, test_df.shape)

((25000, 2), (25000, 2))

In [103]:
# Write both splits to disk without the DataFrame index, so each CSV holds
# only the data columns.
for split_frame, csv_name in ((train_df, "train_data.csv"), (test_df, "test_data.csv")):
    split_frame.to_csv(csv_name, index=False)

In [104]:
# To install the dependency, run once: %pip install torchtext
# NOTE(review): Field / TabularDataset / Iterator used below belong to
# torchtext's legacy API; newer torchtext releases relocated and later removed
# it — confirm the installed version before running.
from torchtext import data # import the torchtext.data module

In [105]:
# Field describing how the raw review text is converted into token ids.
TEXT = data.Field(
    sequential=True,      # the value is a sequence of tokens
    tokenize=str.split,   # split on whitespace
    lower=True,           # lowercase the text
    use_vocab=True,       # map tokens to integer ids through a vocabulary
    fix_length=150,       # every example is brought to 150 tokens
    batch_first=True,     # batch dimension comes first in the tensor
)

In [106]:
# Field for the sentiment label: a single value per example that is used
# as-is (no vocabulary lookup) and marked as the prediction target.
LABEL = data.Field(
    is_target=True,
    use_vocab=False,
    sequential=False,
    batch_first=False,
)

In [107]:
from torchtext.data import TabularDataset

In [108]:
# Read the two CSV splits back as torchtext datasets. The header row is
# skipped and the columns are bound, in order, to the TEXT and LABEL fields.
train_data, test_data = TabularDataset.splits(
    path='.',
    format='csv',
    train='train_data.csv',
    test='test_data.csv',
    skip_header=True,
    fields=[('text', TEXT), ('label', LABEL)],
)

In [109]:
# Report the number of examples in each split.
for caption, split in (('훈련 샘플의 개수 : {}', train_data),
                       ('테스트 샘플의 개수 : {}', test_data)):
    print(caption.format(len(split)))

훈련 샘플의 개수 : 25000
테스트 샘플의 개수 : 25000


In [110]:
# Inspect the attributes (tokenised text and label) of the first training example.
print(train_data[0].__dict__)

{'text': ['this', 'movie', 'is', 'just', 'like', 'every', 'other', 'dutch', 'movie,', 'so', 'if', 'you', 'enjoy', 'movies', 'such', 'as', 'turks', 'fruit', 'and', 'de', 'kleine', 'blonde', 'dood.', 'then', 'you', 'might', 'be', 'okay', 'with', 'this', 'one', '(even', 'though', 'those', 'two', 'have', 'much', 'better', 'stories', 'and', 'actors)', 'zomerhitte', 'starts', 'strong', 'enough,', 'but', 'even', 'that', 'one', 'good', 'scene', 'ends', 'up', 'having', 'nothing', 'to', 'do', 'with', 'the', 'storyline.', "there's", 'a', 'lot', 'of', 'nudity', '(but', 'me', 'and', 'others', 'just', 'could', 'not', 'find', 'that', 'girl', 'attractive),', 'the', 'dialog', 'is', 'laughable', '(as', 'we', 'did', 'a', 'lot', 'to', 'the', 'annoyance', 'of', 'other', 'movie', 'watchers),', 'and', 'some', 'of', 'the', 'scenes', 'are', 'so', 'completely', 'random', 'that', 'this', 'is', 'more', 'of', 'an', 'unintentional', 'comedy', 'than', 'anything', 'else', '(like', 'a', 'random', 'scene', 'in', 'which

In [111]:
# Show which Field object is attached to each column name of the dataset.
field_items = train_data.fields.items()
print(field_items)

dict_items([('text', <torchtext.data.field.Field object at 0x000001D964BC1860>), ('label', <torchtext.data.field.Field object at 0x000001D964BC12E8>)])


In [112]:
# Build the vocabulary from the training split only: words must occur at
# least 10 times, and at most 10,000 words are kept.
TEXT.build_vocab(train_data, max_size=10000, min_freq=10)

In [113]:
# Report the size of the vocabulary that was just built.
vocab_size = len(TEXT.vocab)
print('단어 집합의 크기 : {}'.format(vocab_size))

단어 집합의 크기 : 10002


In [114]:
# Preview only the first 20 (token, index) pairs of the string-to-index
# mapping; printing the whole mapping would flood the notebook with one entry
# per vocabulary word.
print(list(TEXT.vocab.stoi.items())[:20])



In [115]:
from torchtext.data import Iterator

In [120]:
batch_size = 5

# Use the GPU when one is available and fall back to the CPU otherwise;
# a hard-coded 'cuda' fails on machines without a CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training batches: default Iterator behaviour (train mode, shuffled).
train_loader = Iterator(dataset=train_data, batch_size=batch_size, device=device)

# Evaluation batches: keep the examples in dataset order so results are
# reproducible. sort=False is explicit because a non-training Iterator
# otherwise tries to sort, and no sort_key is defined for this dataset.
test_loader = Iterator(dataset=test_data, batch_size=batch_size, device=device,
                       train=False, shuffle=False, sort=False)

In [121]:
# Take the first mini-batch from the training iterator.
train_batches = iter(train_loader)
batch = next(train_batches)

In [122]:
# Token-id tensor of the whole mini-batch (one row per review).
text_batch = batch.text
print(text_batch)

tensor([[  10,   26,    7,   42,    3, 2624,  345,  245,    6,  154,    0,    8,
          757,  267,  121,   64,   27,   41,   22,   62,    3, 1734,    5,   17,
          803, 5193,   13,  514,  733,    7,    3,   26, 1938,    3,  158,   16,
           50,  341, 1120,    0,    4,   27,  577,  511,  201,   35,    2,  291,
            2,  130,    7, 6263,    4,    0,    2,  256,  866,    0,    7,    0,
           18,   10,  871,    5, 8219,    7,   22,    0,    0,   11,    7,   52,
           37,   32, 2198, 3303,   64,   80, 5562,    0, 4579, 6205,    4, 2834,
           94,  100, 6940,   21,    0,   51,    7,    3,  619,  495,    5, 1527,
            8,    2,  775,   13,  514,    7,   22,  232,   15,    3,  288, 1313,
           51,   21,   46,  232,   30, 6961,   18,   12,    7,  111,   11, 1327,
            2,  753,    4, 7981,  996,    6,    0,   91,   16,  753, 7068,    9,
           81,  154, 1198,  174,  278,    6, 1955, 4659,    0,  799,    1,    1,
            1,    1,    1,  

In [123]:
# Token ids of the first review in the mini-batch.
first_review_ids = batch.text[0]
print(first_review_ids)

tensor([  10,   26,    7,   42,    3, 2624,  345,  245,    6,  154,    0,    8,
         757,  267,  121,   64,   27,   41,   22,   62,    3, 1734,    5,   17,
         803, 5193,   13,  514,  733,    7,    3,   26, 1938,    3,  158,   16,
          50,  341, 1120,    0,    4,   27,  577,  511,  201,   35,    2,  291,
           2,  130,    7, 6263,    4,    0,    2,  256,  866,    0,    7,    0,
          18,   10,  871,    5, 8219,    7,   22,    0,    0,   11,    7,   52,
          37,   32, 2198, 3303,   64,   80, 5562,    0, 4579, 6205,    4, 2834,
          94,  100, 6940,   21,    0,   51,    7,    3,  619,  495,    5, 1527,
           8,    2,  775,   13,  514,    7,   22,  232,   15,    3,  288, 1313,
          51,   21,   46,  232,   30, 6961,   18,   12,    7,  111,   11, 1327,
           2,  753,    4, 7981,  996,    6,    0,   91,   16,  753, 7068,    9,
          81,  154, 1198,  174,  278,    6, 1955, 4659,    0,  799,    1,    1,
           1,    1,    1,    1,    1,   