# Keras_Encoding

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def keras_tokenizer(sentences, num_words, padding, truncating, oov_token="<OOV>", one_hot=False, max_len=None):
    """
    Tokenize, integer-encode, pad, and optionally one-hot encode sentences.

    A Keras ``Tokenizer`` is fitted on ``sentences``; the same sentences are
    then converted to integer sequences and padded/truncated to one length.

    Parameters:
    sentences (list of str): Sentences to fit the tokenizer on and to encode.
        May be empty, in which case the padded array has zero rows.
    num_words (int or None): The maximum number of words to keep, based on
        word frequency. A falsy value (None or 0) means no vocabulary cap.
    padding (str): The type of padding to apply ('pre' or 'post').
    truncating (str): The type of truncating to apply ('pre' or 'post').
    oov_token (str): Token for out-of-vocabulary words.
    one_hot (bool): If True, also return a one-hot encoding of the padded
        sequences; if False, ``one_hot_padded`` is None.
    max_len (int, optional): Length for padding/truncating. If None, the
        length of the longest encoded sentence is used (0 for empty input).

    Returns:
    tokenizer (Tokenizer): The Keras tokenizer fitted on the input sentences.
    sequences (list of list of int): The sentences as integer sequences.
    padded (ndarray): Padded sequences as a 2D array (sentences x max_len).
    one_hot_padded (ndarray or None): One-hot encoding of ``padded`` when
        ``one_hot`` is True, otherwise None.
    word_index (dict): Dictionary mapping words to their index.
    index_word (dict): Dictionary mapping indices to words.
    """

    # Build the tokenizer and learn the vocabulary from the sentences.
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(sentences)

    # Convert each sentence into a sequence of integer word indices.
    sequences = tokenizer.texts_to_sequences(sentences)

    # Without an explicit max_len, pad to the longest sequence. `default=0`
    # keeps an empty `sentences` list from raising ValueError in max().
    if max_len is None:
        max_len = max((len(seq) for seq in sequences), default=0)

    # Pad and truncate every sequence to max_len.
    padded = pad_sequences(sequences, maxlen=max_len, padding=padding, truncating=truncating)

    one_hot_padded = None
    if one_hot:
        # With no vocabulary cap, size the one-hot axis from the fitted
        # vocabulary (+1 for the padding index 0) rather than letting
        # to_categorical infer it from the data, which would make the width
        # depend on truncation and fail on empty input.
        num_classes = num_words if num_words else len(tokenizer.word_index) + 1
        one_hot_padded = to_categorical(padded, num_classes=num_classes)

    # Word -> index and index -> word lookups from the fitted tokenizer.
    word_index = tokenizer.word_index
    index_word = tokenizer.index_word

    return tokenizer, sequences, padded, one_hot_padded, word_index, index_word

# Demo corpus: five short sentences that share most of their vocabulary.
sentences = [
    "I love my dog.",
    "I love my cat.",
    "I love my dog and love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

# Encoding settings for the demo run.
num_words = 100          # vocabulary cap handed to the tokenizer
padding = "post"         # pad at the end of each sequence
truncating = "post"      # truncate at the end of each sequence
oov_token = "<OOV>"      # placeholder for out-of-vocabulary words
one_hot = False          # skip the one-hot encoding step

# max_len is left at its default, so the function picks the padded length.
tokenizer, sequences, padded, one_hot_padded, word_index, index_word = keras_tokenizer(
    sentences=sentences,
    num_words=num_words,
    padding=padding,
    truncating=truncating,
    oov_token=oov_token,
    one_hot=one_hot,
)

2024-11-14 08:46:03.424304: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-14 08:46:03.875020: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
