# Padding
- 고정된 길이 데이터가 처리에 효율적임. 근데 각 문장 길이가 다름 &rarr; 문장 길이를 동일하게 맞추는 작업 필요

**패딩 장점**
1. 입력형식 일관화
2. 병렬연산 최적화
3. 유연한 데이터 처리

In [4]:
preprocessed_sentences = [['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'],
                          ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'],
                          ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'],
                          ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
                          ['barber', 'went', 'huge', 'mountain']]

### 직접 구현

In [5]:
import torch
from collections import Counter

class TokenizerForPadding:
    def __init__(self, num_words=None, oov_token='<OOV>'):
        self.num_words = num_words
        self.oov_token = oov_token
        self.word_index = {}
        self.word_index = {}
        self.word_counts = Counter()

    def fit_on_texts(self, texts):
        # 빈도수 세기
        for sentence in texts:
            self.word_counts.update(word for word in sentence if word)

        # 빈도수 기간 vocabulary  생성
        vocab = [self.oov_token] + [word for word, _ in self.word_counts.most_common(self.num_words - 2 if self.num_words else None)]

        self.word_index = {word: i+1 for i, word in enumerate(vocab)}
        self.index_word = {i+1: word for word, i in self.word_index.items()}

    def texts_to_sequences(self, texts):
        return [[self.word_index.get(word, self.word_index[self.oov_token]) for word in sentence] for sentence in texts]

In [6]:
def pad_sequences(sequences, maxlen=None, padding='pre', truncating='pre', value=0):
    if maxlen is None:
        maxlen = max(len(seq) for seq in sequences)

        padded_sequences = []

        for seq in sequences:
            if len(seq) > maxlen:
                if truncating == 'pre':
                    seq = seq[-maxlen:]
                else: # post
                    seq = seq[:maxlen]

            else:
                pad_length = maxlen - len(seq)
                if padding == 'pre':
                    seq = [value] * pad_length + seq

                else:
                    seq += [value] * pad_length

            padded_sequences.append(seq)    

    return torch.tensor(padded_sequences)

In [7]:
tokenizer = TokenizerForPadding(num_words=15)
tokenizer.fit_on_texts(preprocessed_sentences)
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[2, 6],
 [2, 9, 6],
 [2, 4, 6],
 [10, 3],
 [3, 5, 4, 3],
 [4, 3],
 [2, 5, 7],
 [2, 5, 7],
 [2, 5, 3],
 [8, 8, 4, 3, 11, 2, 12],
 [2, 13, 4, 14]]

In [8]:
padded = pad_sequences(sequences, padding='pre', truncating='pre', maxlen=None)
padded

tensor([[ 0,  0,  0,  0,  0,  2,  6],
        [ 0,  0,  0,  0,  2,  9,  6],
        [ 0,  0,  0,  0,  2,  4,  6],
        [ 0,  0,  0,  0,  0, 10,  3],
        [ 0,  0,  0,  3,  5,  4,  3],
        [ 0,  0,  0,  0,  0,  4,  3],
        [ 0,  0,  0,  0,  2,  5,  7],
        [ 0,  0,  0,  0,  2,  5,  7],
        [ 0,  0,  0,  0,  2,  5,  3],
        [ 8,  8,  4,  3, 11,  2, 12],
        [ 0,  0,  0,  2, 13,  4, 14]])

### Keras Tokenizer 이용

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_sentences)
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[1, 5],
 [1, 8, 5],
 [1, 3, 5],
 [9, 2],
 [2, 4, 3, 2],
 [3, 2],
 [1, 4, 6],
 [1, 4, 6],
 [1, 4, 2],
 [7, 7, 3, 2, 10, 1, 11],
 [1, 12, 3, 13]]

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded = pad_sequences(sequences, padding='post', maxlen=3, truncating='post')
padded

array([[ 1,  5,  0],
       [ 1,  8,  5],
       [ 1,  3,  5],
       [ 9,  2,  0],
       [ 2,  4,  3],
       [ 3,  2,  0],
       [ 1,  4,  6],
       [ 1,  4,  6],
       [ 1,  4,  2],
       [ 7,  7,  3],
       [ 1, 12,  3]], dtype=int32)

##### 어린왕장 패딩
1. 텍스트 전처리 (토큰화/불용어/정제/정규화)
2. 정수 인코딩 Tokenizer (tensorflow.keras)
3. 패딩처리 pad_sequences (tensorflow.keras)

In [11]:
raw_text = """The Little Prince, written by Antoine de Saint-Exupéry, is a poetic tale about a young prince who travels from his home planet to Earth. The story begins with a pilot stranded in the Sahara Desert after his plane crashes. While trying to fix his plane, he meets a mysterious young boy, the Little Prince.

The Little Prince comes from a small asteroid called B-612, where he lives alone with a rose that he loves deeply. He recounts his journey to the pilot, describing his visits to several other planets. Each planet is inhabited by a different character, such as a king, a vain man, a drunkard, a businessman, a geographer, and a fox. Through these encounters, the Prince learns valuable lessons about love, responsibility, and the nature of adult behavior.

On Earth, the Little Prince meets various creatures, including a fox, who teaches him about relationships and the importance of taming, which means building ties with others. The fox's famous line, "You become responsible, forever, for what you have tamed," resonates with the Prince's feelings for his rose.

Ultimately, the Little Prince realizes that the essence of life is often invisible and can only be seen with the heart. After sharing his wisdom with the pilot, he prepares to return to his asteroid and his beloved rose. The story concludes with the pilot reflecting on the lessons learned from the Little Prince and the enduring impact of their friendship.

The narrative is a beautifully simple yet profound exploration of love, loss, and the importance of seeing beyond the surface of things."""

In [18]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# 문장 토큰
sent_tokens = sent_tokenize(raw_text.lower())

# 불용어
stopword_set = set(stopwords.words('english'))

# 사전 (key=단어, value=빈도)
vocab = {}

# 토큰/정제/정규화
preprocessed_sentences = []

# 소문자 변환, 토큰화, 불용어, 단어길이 2이하 제거
for st in sent_tokens:
    temp = ''
    for wt in word_tokenize(st):
        
        if wt not in stopword_set and len(wt) > 2:
            if wt in vocab:
                vocab[wt] += 1
            else:
                vocab[wt] = 1

            temp += wt + ' '
    
    preprocessed_sentences.append(temp[:-1])


vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
print(vocab)
print(preprocessed_sentences)


[('prince', 9), ('little', 6), ('pilot', 4), ('rose', 3), ('fox', 3), ('young', 2), ('planet', 2), ('earth', 2), ('story', 2), ('plane', 2), ('meets', 2), ('asteroid', 2), ('lessons', 2), ('love', 2), ('importance', 2), ('written', 1), ('antoine', 1), ('saint-exupéry', 1), ('poetic', 1), ('tale', 1), ('travels', 1), ('home', 1), ('begins', 1), ('stranded', 1), ('sahara', 1), ('desert', 1), ('crashes', 1), ('trying', 1), ('fix', 1), ('mysterious', 1), ('boy', 1), ('comes', 1), ('small', 1), ('called', 1), ('b-612', 1), ('lives', 1), ('alone', 1), ('loves', 1), ('deeply', 1), ('recounts', 1), ('journey', 1), ('describing', 1), ('visits', 1), ('several', 1), ('planets', 1), ('inhabited', 1), ('different', 1), ('character', 1), ('king', 1), ('vain', 1), ('man', 1), ('drunkard', 1), ('businessman', 1), ('geographer', 1), ('encounters', 1), ('learns', 1), ('valuable', 1), ('responsibility', 1), ('nature', 1), ('adult', 1), ('behavior', 1), ('various', 1), ('creatures', 1), ('including', 1), 

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation


# 문장 토큰
sent_tokens = sent_tokenize(raw_text.lower())

# 불용어
stopword_set = set(stopwords.words('english'))

# 사전 (key=단어, value=빈도)
vocab = {}

# 토큰/정제/정규화

# 소문자 변환, 토큰화, 불용어, 단어길이 2이하 제거

minlen = 3
for st in sent_tokens:
    word_tokens = [wt for wt in word_tokenize(st) if wt not in punctuation and wt not in stopword_set and len(wt) > minlen]


['little', 'prince', ',', 'written', 'antoine', 'de', 'saint-exupéry', ',', 'poetic', 'tale', 'young', 'prince', 'travels', 'home', 'planet', 'earth', '.']
['story', 'begins', 'pilot', 'stranded', 'sahara', 'desert', 'plane', 'crashes', '.']
['trying', 'fix', 'plane', ',', 'meets', 'mysterious', 'young', 'boy', ',', 'little', 'prince', '.']
['little', 'prince', 'comes', 'small', 'asteroid', 'called', 'b-612', ',', 'lives', 'alone', 'rose', 'loves', 'deeply', '.']
['recounts', 'journey', 'pilot', ',', 'describing', 'visits', 'several', 'planets', '.']
['planet', 'inhabited', 'different', 'character', ',', 'king', ',', 'vain', 'man', ',', 'drunkard', ',', 'businessman', ',', 'geographer', ',', 'fox', '.']
['encounters', ',', 'prince', 'learns', 'valuable', 'lessons', 'love', ',', 'responsibility', ',', 'nature', 'adult', 'behavior', '.']
['earth', ',', 'little', 'prince', 'meets', 'various', 'creatures', ',', 'including', 'fox', ',', 'teaches', 'relationships', 'importance', 'taming', ',

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# num_words: 단어사건에 사용할 개수
# oov_token: OOV토큰 명으로 사용할 이름 지정
tokenizer = Tokenizer(num_words=15, oov_token='<OOV>')
tokenizer.fit_on_texts(preprocessed_sentences)
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)

padded = pad_sequences(sequences, padding='post', truncating='post')
padded

array([[  2,   1,  16,  17,  18,  19,  20,  21,   6,   1,  22,  23,   7,
          8,   0,   0],
       [  9,  24,   3,  25,  26,  27,  10,  28,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 29,  30,  10,  11,  31,   6,  32,   2,   1,   0,   0,   0,   0,
          0,   0,   0],
       [  2,   1,  33,  34,  12,  35,  36,  37,  38,  39,   4,  40,  41,
          0,   0,   0],
       [ 42,  43,   3,  44,  45,  46,  47,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [  7,  48,  49,  50,  51,  52,  53,  54,  55,  56,   5,   0,   0,
          0,   0,   0],
       [ 57,   1,  58,  59,  13,  14,  60,  61,  62,  63,   0,   0,   0,
          0,   0,   0],
       [  8,   2,   1,  11,  64,  65,  66,   5,  67,  68,  15,  69,  70,
         71,  72,  73],
       [  5,  74,  75,  76,  77,  78,  79,  80,   1,  81,   4,   0,   0,
          0,   0,   0],
       [ 82,   2,   1,  83,  84,  85,  86,  87,  88,  89,   0,   0,   0,
          0,   0,   0],
       [ 90,  91,   3,  92,  9