# Int Encoding
- 문자 &rarr; 숫자 (단어의 등장 빈도로 인덱스 부여)
- 빈도 높은 단어 5,000개만 선택 (학습 시 시간/공간 복잡도 줄임)

In [1]:
raw_text = """The Little Prince, written by Antoine de Saint-Exupéry, is a poetic tale about a young prince who travels from his home planet to Earth. The story begins with a pilot stranded in the Sahara Desert after his plane crashes. While trying to fix his plane, he meets a mysterious young boy, the Little Prince.

The Little Prince comes from a small asteroid called B-612, where he lives alone with a rose that he loves deeply. He recounts his journey to the pilot, describing his visits to several other planets. Each planet is inhabited by a different character, such as a king, a vain man, a drunkard, a businessman, a geographer, and a fox. Through these encounters, the Prince learns valuable lessons about love, responsibility, and the nature of adult behavior.

On Earth, the Little Prince meets various creatures, including a fox, who teaches him about relationships and the importance of taming, which means building ties with others. The fox's famous line, "You become responsible, forever, for what you have tamed," resonates with the Prince's feelings for his rose.

Ultimately, the Little Prince realizes that the essence of life is often invisible and can only be seen with the heart. After sharing his wisdom with the pilot, he prepares to return to his asteroid and his beloved rose. The story concludes with the pilot reflecting on the lessons learned from the Little Prince and the enduring impact of their friendship.

The narrative is a beautifully simple yet profound exploration of love, loss, and the importance of seeing beyond the surface of things."""

### 인코딩

### 토큰화 + 정제/정규화

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# 문장 토큰
sent_tokens = sent_tokenize(raw_text.lower())

# 불용어
stopword_set = set(stopwords.words('english'))

# 사전 (key=단어, value=빈도)
vocab = {}


# 토큰/정제/정규 결과
preprocessed_sentences = []

# 토큰 iter
    # 소문자 변환
    # 단어 토큰
    # 불용어 제거
    # 길이 2이하인 단어 제거
for st in sent_tokens:
    temp = ''
    for wt in word_tokenize(st):
        
        if wt not in stopword_set and len(wt) > 2:
            if wt in vocab:
                vocab[wt] += 1
            else:
                vocab[wt] = 1

            temp += wt + ' '
    
    preprocessed_sentences.append(temp[:-1])


vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)
print(vocab)
print(preprocessed_sentences)


[('prince', 9), ('little', 6), ('pilot', 4), ('rose', 3), ('fox', 3), ('young', 2), ('planet', 2), ('earth', 2), ('story', 2), ('plane', 2), ('meets', 2), ('asteroid', 2), ('lessons', 2), ('love', 2), ('importance', 2), ('written', 1), ('antoine', 1), ('saint-exupéry', 1), ('poetic', 1), ('tale', 1), ('travels', 1), ('home', 1), ('begins', 1), ('stranded', 1), ('sahara', 1), ('desert', 1), ('crashes', 1), ('trying', 1), ('fix', 1), ('mysterious', 1), ('boy', 1), ('comes', 1), ('small', 1), ('called', 1), ('b-612', 1), ('lives', 1), ('alone', 1), ('loves', 1), ('deeply', 1), ('recounts', 1), ('journey', 1), ('describing', 1), ('visits', 1), ('several', 1), ('planets', 1), ('inhabited', 1), ('different', 1), ('character', 1), ('king', 1), ('vain', 1), ('man', 1), ('drunkard', 1), ('businessman', 1), ('geographer', 1), ('encounters', 1), ('learns', 1), ('valuable', 1), ('responsibility', 1), ('nature', 1), ('adult', 1), ('behavior', 1), ('various', 1), ('creatures', 1), ('including', 1), 

In [3]:
len(vocab)

109

##### 빈도수 기반 정제

In [4]:
# 인덱스 단어사전 생성
word_to_idx = {word: i+1 for i, (word, cnt) in enumerate(vocab)}

# 인덱스 단어사전 생성
idx_to_word = {i+1: word for i, (word, cnt) in enumerate(vocab)}
idx_to_word

{1: 'prince',
 2: 'little',
 3: 'pilot',
 4: 'rose',
 5: 'fox',
 6: 'young',
 7: 'planet',
 8: 'earth',
 9: 'story',
 10: 'plane',
 11: 'meets',
 12: 'asteroid',
 13: 'lessons',
 14: 'love',
 15: 'importance',
 16: 'written',
 17: 'antoine',
 18: 'saint-exupéry',
 19: 'poetic',
 20: 'tale',
 21: 'travels',
 22: 'home',
 23: 'begins',
 24: 'stranded',
 25: 'sahara',
 26: 'desert',
 27: 'crashes',
 28: 'trying',
 29: 'fix',
 30: 'mysterious',
 31: 'boy',
 32: 'comes',
 33: 'small',
 34: 'called',
 35: 'b-612',
 36: 'lives',
 37: 'alone',
 38: 'loves',
 39: 'deeply',
 40: 'recounts',
 41: 'journey',
 42: 'describing',
 43: 'visits',
 44: 'several',
 45: 'planets',
 46: 'inhabited',
 47: 'different',
 48: 'character',
 49: 'king',
 50: 'vain',
 51: 'man',
 52: 'drunkard',
 53: 'businessman',
 54: 'geographer',
 55: 'encounters',
 56: 'learns',
 57: 'valuable',
 58: 'responsibility',
 59: 'nature',
 60: 'adult',
 61: 'behavior',
 62: 'various',
 63: 'creatures',
 64: 'including',
 65: 'teac

In [5]:
vocab_szie = 15
word_to_idx = {word: index for word, index in word_to_idx.items() if index <= vocab_szie}
word_to_idx

{'prince': 1,
 'little': 2,
 'pilot': 3,
 'rose': 4,
 'fox': 5,
 'young': 6,
 'planet': 7,
 'earth': 8,
 'story': 9,
 'plane': 10,
 'meets': 11,
 'asteroid': 12,
 'lessons': 13,
 'love': 14,
 'importance': 15}

### OOV 처리

**OoV (out of vocabulary)** : 단어사전에 정의되지 않은 단어를 가리키는 키워드

In [6]:
word_to_idx['OOV'] = len(word_to_idx) + 16
word_to_idx

{'prince': 1,
 'little': 2,
 'pilot': 3,
 'rose': 4,
 'fox': 5,
 'young': 6,
 'planet': 7,
 'earth': 8,
 'story': 9,
 'plane': 10,
 'meets': 11,
 'asteroid': 12,
 'lessons': 13,
 'love': 14,
 'importance': 15,
 'OOV': 31}

##### 수열처리 (정수 인코딩)

In [7]:
encoded_sentences = []
oov_idx = word_to_idx['OOV']

for sentence in preprocessed_sentences:
    encoded_sentence = [word_to_idx.get(token, oov_idx) for token in sentence]
    print(sentence)
    print(encoded_sentences)
    print()
    encoded_sentences.append(encoded_sentence)

little prince written antoine saint-exupéry poetic tale young prince travels home planet earth
[]

story begins pilot stranded sahara desert plane crashes
[[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31]]

trying fix plane meets mysterious young boy little prince
[[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31], [31, 31, 31, 31, 31, 31, 31,

### keras tokenizer

In [8]:
!pip install tensorflow



In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
tokenizer = Tokenizer(num_words=15, oov_token='<OOV>')
tokenizer.fit_on_texts(preprocessed_sentences)
tokenizer.word_index # corpus 전체 대상으로 생성

{'<OOV>': 1,
 'prince': 2,
 'little': 3,
 'pilot': 4,
 'rose': 5,
 'fox': 6,
 'young': 7,
 'planet': 8,
 'earth': 9,
 'story': 10,
 'plane': 11,
 'meets': 12,
 'asteroid': 13,
 'lessons': 14,
 'love': 15,
 'importance': 16,
 'written': 17,
 'antoine': 18,
 'saint': 19,
 'exupéry': 20,
 'poetic': 21,
 'tale': 22,
 'travels': 23,
 'home': 24,
 'begins': 25,
 'stranded': 26,
 'sahara': 27,
 'desert': 28,
 'crashes': 29,
 'trying': 30,
 'fix': 31,
 'mysterious': 32,
 'boy': 33,
 'comes': 34,
 'small': 35,
 'called': 36,
 'b': 37,
 '612': 38,
 'lives': 39,
 'alone': 40,
 'loves': 41,
 'deeply': 42,
 'recounts': 43,
 'journey': 44,
 'describing': 45,
 'visits': 46,
 'several': 47,
 'planets': 48,
 'inhabited': 49,
 'different': 50,
 'character': 51,
 'king': 52,
 'vain': 53,
 'man': 54,
 'drunkard': 55,
 'businessman': 56,
 'geographer': 57,
 'encounters': 58,
 'learns': 59,
 'valuable': 60,
 'responsibility': 61,
 'nature': 62,
 'adult': 63,
 'behavior': 64,
 'various': 65,
 'creatures': 66

In [None]:
tokenizer.index_word # corpus 전체 대상으로 생성

{1: '<OOV>',
 2: 'prince',
 3: 'little',
 4: 'pilot',
 5: 'rose',
 6: 'fox',
 7: 'young',
 8: 'planet',
 9: 'earth',
 10: 'story',
 11: 'plane',
 12: 'meets',
 13: 'asteroid',
 14: 'lessons',
 15: 'love',
 16: 'importance',
 17: 'written',
 18: 'antoine',
 19: 'saint',
 20: 'exupéry',
 21: 'poetic',
 22: 'tale',
 23: 'travels',
 24: 'home',
 25: 'begins',
 26: 'stranded',
 27: 'sahara',
 28: 'desert',
 29: 'crashes',
 30: 'trying',
 31: 'fix',
 32: 'mysterious',
 33: 'boy',
 34: 'comes',
 35: 'small',
 36: 'called',
 37: 'b',
 38: '612',
 39: 'lives',
 40: 'alone',
 41: 'loves',
 42: 'deeply',
 43: 'recounts',
 44: 'journey',
 45: 'describing',
 46: 'visits',
 47: 'several',
 48: 'planets',
 49: 'inhabited',
 50: 'different',
 51: 'character',
 52: 'king',
 53: 'vain',
 54: 'man',
 55: 'drunkard',
 56: 'businessman',
 57: 'geographer',
 58: 'encounters',
 59: 'learns',
 60: 'valuable',
 61: 'responsibility',
 62: 'nature',
 63: 'adult',
 64: 'behavior',
 65: 'various',
 66: 'creatures'

In [None]:
tokenizer.word_counts # corpus 전체 대상으로 생성

OrderedDict([('little', 6),
             ('prince', 9),
             ('written', 1),
             ('antoine', 1),
             ('saint', 1),
             ('exupéry', 1),
             ('poetic', 1),
             ('tale', 1),
             ('young', 2),
             ('travels', 1),
             ('home', 1),
             ('planet', 2),
             ('earth', 2),
             ('story', 2),
             ('begins', 1),
             ('pilot', 4),
             ('stranded', 1),
             ('sahara', 1),
             ('desert', 1),
             ('plane', 2),
             ('crashes', 1),
             ('trying', 1),
             ('fix', 1),
             ('meets', 2),
             ('mysterious', 1),
             ('boy', 1),
             ('comes', 1),
             ('small', 1),
             ('asteroid', 2),
             ('called', 1),
             ('b', 1),
             ('612', 1),
             ('lives', 1),
             ('alone', 1),
             ('rose', 3),
             ('loves', 1),
           

In [None]:
# 정수 인코딩
sequences = tokenizer.texts_to_sequences(preprocessed_sentences)
sequences

[[3, 2, 1, 1, 1, 1, 1, 1, 7, 2, 1, 1, 8, 9],
 [10, 1, 4, 1, 1, 1, 11, 1],
 [1, 1, 11, 12, 1, 7, 1, 3, 2],
 [3, 2, 1, 1, 13, 1, 1, 1, 1, 1, 5, 1, 1],
 [1, 1, 4, 1, 1, 1, 1],
 [8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6],
 [1, 2, 1, 1, 14, 1, 1, 1, 1, 1],
 [9, 3, 2, 12, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1],
 [6, 1, 1, 1, 1, 1, 1, 1, 2, 1, 5],
 [1, 3, 2, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 4, 1, 1, 13, 1, 5],
 [10, 1, 4, 1, 14, 1, 3, 2, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]