In [8]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch every NLTK resource this notebook relies on, not only the stop-word
# list: sent_tokenize / word_tokenize need the Punkt sentence-tokenizer models.
# Older NLTK releases look the models up as 'punkt', newer ones as 'punkt_tab';
# both ids are requested so the notebook runs from a fresh environment.
# NOTE(review): an id unknown to the installed NLTK index is expected to be
# reported as an error message rather than raised - confirm on your version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Keep the untouched paragraph under its own name so this cell can be re-run:
# sent_tokenize expects a single string, not the list a previous run stored
# in `text`.
raw_text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! \
        The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. \
        His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. \
        the barber went up a huge mountain."

# `text` holds the list of sentence strings produced by the tokenizer.
text = sent_tokenize(raw_text)
print(text)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


In [11]:
# Tokenize each sentence, keep the informative lowercase words, and count them.
vocab = {}       # word -> number of occurrences across all sentences
sentences = []   # one list of kept words per input sentence
stop_words = set(stopwords.words('english'))

for raw_sentence in text:
    # Lowercase before filtering so the stop-word test and the counts ignore case.
    lowered = (tok.lower() for tok in word_tokenize(raw_sentence))
    # Drop stop words and anything of two characters or fewer.
    kept = [w for w in lowered if len(w) > 2 and w not in stop_words]

    for w in kept:
        vocab[w] = vocab.get(w, 0) + 1
    sentences.append(kept)

print(sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [13]:
# Rank the (word, count) pairs from most to least frequent. The sort is
# stable, so pairs with equal counts keep the order they have in `vocab`.
vocab_sorted = list(vocab.items())
vocab_sorted.sort(key=lambda pair: pair[1], reverse=True)
print(vocab_sorted)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]


In [14]:
# Give every word that occurs more than once a 1-based index, following the
# order of vocab_sorted; words seen only once get no index.
frequent_words = [word for word, freq in vocab_sorted if freq > 1]
word2idx = {word: position for position, word in enumerate(frequent_words, start=1)}
idx = len(frequent_words)  # highest index handed out (0 when nothing qualified)

print(word2idx)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


In [15]:
# Despite its name, this constant is compared against a word's index in
# word2idx (not against a frequency): entries whose index exceeds it are dropped.
MIN_FREQ_TO_USE = 5

# Collect the keys first; deleting while iterating over the dict would raise.
words_to_remove = [word for word, index in word2idx.items() if index > MIN_FREQ_TO_USE]
for word in words_to_remove:
    word2idx.pop(word)

print(word2idx)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5}


In [18]:
# Reserve the index right after the known words for out-of-vocabulary tokens.
# Counting only the non-OOV entries keeps the result stable when this cell is
# re-run: with `len(word2idx) + 1` a second run would count the existing 'OOV'
# entry as well and push its index one higher.
oov_index = sum(1 for known in word2idx if known != 'OOV') + 1
word2idx['OOV'] = oov_index

# Replace each word with its index; anything missing from word2idx maps to OOV.
encoded = [[word2idx.get(word, oov_index) for word in line] for line in sentences]
print(encoded)

[[1, 5], [1, 7, 5], [1, 3, 5], [7, 2], [2, 4, 3, 2], [3, 2], [1, 4, 7], [1, 4, 7], [1, 4, 2], [7, 7, 3, 2, 7, 1, 7], [1, 7, 3, 7]]


# NLTK 사용하기

In [19]:
from nltk import FreqDist
import numpy as np

In [20]:
# Count words straight from the nested lists. Flattening with np.hstack would
# turn every word into a NumPy string scalar (shown as np.str_('...') under
# NumPy 2) and raises on an empty `sentences` list; a plain generator keeps
# ordinary str keys and handles the empty case.
word_counts = FreqDist(word for sentence in sentences for word in sentence)

# Keep the MIN_FREQ_TO_USE most frequent (word, count) pairs.
vocab = word_counts.most_common(MIN_FREQ_TO_USE)
print(vocab)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]
