<a href="https://colab.research.google.com/github/Jeanrain-lee/practise_nlp/blob/master/practise_integer_encoding1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

참조 :: https://wikidocs.net/31766

˙ 컴퓨터는 텍스트보다 숫자 처리에 능함 </br>
=> 텍스트를 숫자로 바꾸는 여러 기법 존재, 방법들을 적용시키기 위해 각 단어를 고유 숫자에 mapping 시키는 전처리작업이 필요할 수도 </br>
=> 랜덤으로 숫자를 부여하기도 하지만 보통은 빈도수가 높은 단어들만 사용하기 위해 단어에 대한 빈도수를 기준으로 정렬한 뒤 부여


# 정수 인코딩 (Integer Encoding)

In [0]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [19]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."

In [21]:
# 문장 토큰화
text = sent_tokenize(text)
print(text)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


In [22]:
# 문장으로 토큰화된 텍스트를 단어 토큰화로 쪼개기
# 정제 작업도 병행
vocab = {}
sentences = []
stop_words = set(stopwords.words('english'))

for i in text:
  sentence = word_tokenize(i)
  result = []

  for word in sentence:
    word = word.lower()
    if word not in stop_words:  # 불용어라면 제거 
      if len(word) > 2: # 길이가 짧은 단어 제거
        result.append(word)
        if word not in vocab:
          vocab[word] = 0
        vocab[word] += 1
  sentences.append(result)
print(sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


˙ 텍스트를 숫자로 변환하기 전 단어가 텍스트일 때만 가능한 최대한의 전처리 끝내야

In [23]:
print(vocab)
print(vocab["barber"])

{'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}
8


In [24]:
vocab_sorted = sorted(vocab.items(), key = lambda x:x[1], reverse=True)
print(vocab_sorted)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]


In [25]:
# 빈도수가 높을 수록 낮은 정수 인덱스 부여
word_to_index = {}
i=0
for (word, frequency) in vocab_sorted :
  if frequency > 1: # 빈도수 낮은 단어는 처음부터 제외
    i += 1
    word_to_index[word] = i
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


In [0]:
# 빈도수 상의 n개만 사용하고 싶을 때 (예를 들어 5개만 쓰고 싶다)
vocab_size = 5
words_frequency = [w for w,c in word_to_index.items() if c >= vocab_size + 1]

for w in words_frequency:
  del word_to_index[w]

In [0]:
# word_to_index에 저장된 3개의 단어를 sentence에서 정수로 바꾸기
# 이 때 단어 집합에 없는 단어를 Out-Of-Vocabulary(OOV)라고 부름
word_to_index['OOV'] = len(word_to_index) + 1

In [29]:
encoded = []
for s in sentences:
  temp=[]
  for w in s:
    try:
      temp.append(word_to_index[w])
    except KeyError:
      temp.append(word_to_index['OOV'])
    encoded.append(temp)
print(encoded)

[[1, 5], [1, 5], [1, 7, 5], [1, 7, 5], [1, 7, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5], [7, 2], [7, 2], [2, 4, 3, 2], [2, 4, 3, 2], [2, 4, 3, 2], [2, 4, 3, 2], [3, 2], [3, 2], [1, 4, 7], [1, 4, 7], [1, 4, 7], [1, 4, 7], [1, 4, 7], [1, 4, 7], [1, 4, 2], [1, 4, 2], [1, 4, 2], [7, 7, 3, 2, 7, 1, 7], [7, 7, 3, 2, 7, 1, 7], [7, 7, 3, 2, 7, 1, 7], [7, 7, 3, 2, 7, 1, 7], [7, 7, 3, 2, 7, 1, 7], [7, 7, 3, 2, 7, 1, 7], [7, 7, 3, 2, 7, 1, 7], [1, 7, 3, 7], [1, 7, 3, 7], [1, 7, 3, 7], [1, 7, 3, 7]]
