# tf.data

In [1]:
import os
import tensorflow as tf 
import numpy as np

from tensorflow.keras import preprocessing

In [2]:
# Toy corpus of six short Korean sentences used throughout the notebook.
samples = [
    '너 오늘 이뻐 보인다',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일이 있나봐',
    '나 좋은 일이 생겼어',
    '아 오늘 진짜 짜증나',
    '환상적인데, 정말 좋은거 같아',
]

# One single-element label per sentence, aligned by position with `samples`
# (the values appear to mark positive = 1 / negative = 0 sentiment).
label = [
    [1],
    [0],
    [1],
    [1],
    [0],
    [1],
]

In [3]:
# Fixed sequence length; passed as `maxlen` to pad_sequences when the encoded
# sentences are padded in the next cell.
MAX_LEN = 4

In [4]:
# Learn the vocabulary from the corpus; `word_index` maps each word to its id.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(samples)
word_index = tokenizer.word_index

# Encode every sentence as a list of word ids and pad at the end ('post')
# so that all rows have length MAX_LEN.
sequences = preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(samples),
    maxlen=MAX_LEN,
    padding='post',
)

In [5]:
# Show the encoded sentences, the vocabulary and the labels, one heading each.
for heading, value in (
    ("수치화된 텍스트 데이터: \n", sequences),
    ("각 단어의 인덱스: \n", word_index),
    ("라벨: ", label),
):
    print(heading, value)

수치화된 텍스트 데이터: 
 [[ 4  1  5  6]
 [ 7  1  8  9]
 [10  2  3 11]
 [12  2  3 13]
 [14  1 15 16]
 [17 18 19 20]]
각 단어의 인덱스: 
 {'오늘': 1, '좋은': 2, '일이': 3, '너': 4, '이뻐': 5, '보인다': 6, '나는': 7, '기분이': 8, '더러워': 9, '끝내주는데': 10, '있나봐': 11, '나': 12, '생겼어': 13, '아': 14, '진짜': 15, '짜증나': 16, '환상적인데': 17, '정말': 18, '좋은거': 19, '같아': 20}
라벨:  [[1], [0], [1], [1], [0], [1]]


In [7]:
# Slice the padded sequences and the labels along their first axis so that
# each dataset element is one (sequence, label) pair.
dataset = tf.data.Dataset.from_tensor_slices((sequences, label))
# Bare expression: displays the dataset's element shapes and dtypes.
dataset

<TensorSliceDataset shapes: ((4,), (1,)), types: (tf.int32, tf.int32)>

In [11]:
# Iterate the dataset directly; each element is a (sequence, label) pair of tensors.
for seq, lab in dataset:
    print(seq, lab)

tf.Tensor([4 1 5 6], shape=(4,), dtype=int32) tf.Tensor([1], shape=(1,), dtype=int32)
tf.Tensor([7 1 8 9], shape=(4,), dtype=int32) tf.Tensor([0], shape=(1,), dtype=int32)
tf.Tensor([10  2  3 11], shape=(4,), dtype=int32) tf.Tensor([1], shape=(1,), dtype=int32)
tf.Tensor([12  2  3 13], shape=(4,), dtype=int32) tf.Tensor([1], shape=(1,), dtype=int32)
tf.Tensor([14  1 15 16], shape=(4,), dtype=int32) tf.Tensor([0], shape=(1,), dtype=int32)
tf.Tensor([17 18 19 20], shape=(4,), dtype=int32) tf.Tensor([1], shape=(1,), dtype=int32)


In [None]:
# Number of consecutive elements grouped into each batch.
BATCH_SIZE = 2

# Rebuild the dataset from the raw arrays and group its elements into batches.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((sequences, label))
    .batch(BATCH_SIZE)
)

In [None]:
# Each iteration now yields a batch: up to BATCH_SIZE sequences and their labels.
for seq, lab in dataset:
    print(seq, lab)

In [None]:
# Rebuild the dataset and shuffle it; the shuffle buffer is as large as the
# dataset itself, so every element takes part in the shuffle.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((sequences, label))
    .shuffle(len(sequences))
)

In [None]:
# Same (sequence, label) pairs as before, but visited in shuffled order.
for seq, lab in dataset:
    print(seq, lab)

In [None]:
# How many times the whole dataset is replayed.
EPOCH = 2

# Rebuild the dataset and repeat it EPOCH times back to back.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((sequences, label))
    .repeat(EPOCH)
)

In [None]:
# The full set of (sequence, label) pairs is yielded EPOCH times in a row.
for seq, lab in dataset:
    print(seq, lab)

In [None]:
def mapping_fn(X, Y=None):
    """Wrap a feature value in a dict keyed by 'x' and pass the label through.

    Intended for use with ``Dataset.map``.

    Args:
        X: The feature value (here, one encoded sentence or a batch of them).
        Y: The matching label; defaults to ``None`` when there is no label.

    Returns:
        A ``(features, label)`` tuple where ``features`` is ``{'x': X}`` and
        ``label`` is ``Y`` unchanged.
    """
    # Local names deliberately avoid `input` (a builtin) and `label` (the
    # notebook-level list of labels) so that nothing is shadowed.
    features = {'x': X}
    return features, Y

In [None]:
# Rebuild the dataset and run every (sequence, label) pair through mapping_fn,
# which wraps the sequence in a {'x': ...} dict.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((sequences, label))
    .map(mapping_fn)
)

In [None]:
# After the map, `seq` is the {'x': ...} feature dict rather than a bare tensor.
for seq, lab in dataset:
    print(seq, lab)

In [None]:
# Combine the steps shown so far: map -> shuffle -> batch.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((sequences, label))
    .map(mapping_fn)
    .shuffle(len(sequences))
    .batch(BATCH_SIZE)
)

In [None]:
# `seq` is a dict whose 'x' entry holds a batch of sequences; `lab` holds the
# matching batch of labels.
for seq, lab in dataset:
    print(seq, lab)

In [None]:
# Complete input pipeline. BATCH_SIZE, EPOCH and mapping_fn are reused from
# the earlier cells rather than re-declared here: a second `def mapping_fn`
# would silently shadow the first, and re-assigning the constants would undo
# any change a reader made to them above.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((sequences, label))
    .map(mapping_fn)            # (sequence, label) -> ({'x': sequence}, label)
    .shuffle(len(sequences))    # buffer covers the whole dataset
    .batch(BATCH_SIZE)
    .repeat(EPOCH)              # replay the shuffled, batched data EPOCH times
)

# Each iteration yields a feature dict and the matching batch of labels.
for seq, lab in dataset:
    print(seq, lab)