In [1]:
# Download the novels: fetch NLTK's 'gutenberg' data package
import nltk
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to C:\Users\ktc
[nltk_data]     m\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [2]:
# List the novels: show the file ids available in the gutenberg corpus
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# Load the raw text of "Emma" and preview its first 1024 characters
emma_raw = nltk.corpus.gutenberg.raw("austen-emma.txt")
preview = emma_raw[:1024]
print(preview)

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died too long ago for her to have more than an indistinct
remembrance of her caresses; and her place had been supplied
by an excellent woman as governess, who had fallen little short
of a mother in affection.

Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.  Between _them_ it was more the intimacy
of sisters.  Even before Miss Taylor had ceased to hold the nominal
office of governess, the mildness o

In [4]:
# Split the first 1000 characters into sentences and print the fourth one
from nltk.tokenize import sent_tokenize

opening_sentences = sent_tokenize(emma_raw[:1000])
print(opening_sentences[3])

Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.


In [5]:
# Split a short excerpt of the novel into word-level tokens
from nltk.tokenize import word_tokenize

excerpt = emma_raw[50:85]
word_tokenize(excerpt)

['Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'a']

In [6]:
# Tokenize on runs of word characters only (regex \w: letters, digits and
# underscore), so punctuation tokens are dropped.
from nltk.tokenize import RegexpTokenizer

# Raw string literal: "\w" inside a normal string is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python 3.12+). The pattern value itself
# is unchanged.
retokenize = RegexpTokenizer(r"[\w]+")
retokenize.tokenize(emma_raw[50:85])

['Emma', 'Woodhouse', 'handsome', 'clever', 'a']

In [7]:
# Sample inflected words used as input for the stemming demos
words = [
    'lives',
    'crying',
    'flies',
    'dying',
]

In [8]:
# Suffix stripping with PorterStemmer
from nltk.stem import PorterStemmer

st = PorterStemmer()
# Apply the stemmer to every sample word and display the resulting list
list(map(st.stem, words))

['live', 'cri', 'fli', 'die']

In [9]:
# Suffix stripping with LancasterStemmer
from nltk.stem import LancasterStemmer

st = LancasterStemmer()
# Apply the stemmer to every sample word and display the resulting list
list(map(st.stem, words))

['liv', 'cry', 'fli', 'dying']

In [10]:
# Sample inflected words used as input for the lemmatization demo
words = [
    'lives',
    'crying',
    'flies',
    'dying',
]

In [11]:
# Lemmatization: restore each sample word to its base form
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()
# No part of speech is passed, so the lemmatizer's default is used
list(map(lm.lemmatize, words))

['life', 'cry', 'fly', 'dying']

In [12]:
# Lemmatize "dying" as a verb by passing pos="v"
lm.lemmatize("dying", pos="v")

'die'

In [13]:
# Part-of-speech tagging: tokenize a sentence, then tag every token
from nltk.tag import pos_tag

sentence = "Emma refused to permit us to obtain the refuse permit"
sentence_tokens = word_tokenize(sentence)
tagged_list = pos_tag(sentence_tokens)
tagged_list

[('Emma', 'NNP'),
 ('refused', 'VBD'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

In [14]:
# Extract nouns: keep the tokens whose tag is exactly "NN"
nouns_list = [token for token, tag in tagged_list if tag == "NN"]
nouns_list

['refuse', 'permit']

In [15]:
# [1. Tokenizing] Split the whole novel into word tokens and save them to
# 'nltk-word.txt', one token per line.
# NOTE: this rebinds `words`, which an earlier cell used for the 4-word sample list.
from nltk.tokenize import word_tokenize

words = word_tokenize(emma_raw)
# The context manager closes (and therefore flushes) the file when the block ends;
# the explicit encoding keeps the output independent of the platform default.
with open('nltk-word.txt', 'w', encoding='utf-8') as w:
    w.writelines(f"{token}\n" for token in words)

In [16]:
# [2. Tagging] POS-tag every token and save the result to 'nltk-tag.txt'
# as one "<token> <tag>" pair per line.
tagged = nltk.pos_tag(words)
# The context manager closes (and therefore flushes) the file when the block ends;
# the explicit encoding keeps the output independent of the platform default.
with open('nltk-tag.txt', 'w', encoding='utf-8') as t:
    t.writelines(f"{token} {tag}\n" for token, tag in tagged)

In [17]:
# [3. Extract nouns] Collect every tagged word whose tag is 'NN' or 'NNP'.
# Other tags (e.g. 'NNS', 'NNPS') are not matched by this filter.
allnoun = [word for word, pos in tagged if pos in ('NN', 'NNP')]

# Write the collected nouns to 'Full_nouns2.txt', one per line. The context
# manager closes (and therefore flushes) the file when the block ends; the
# explicit encoding keeps the output independent of the platform default.
with open('Full_nouns2.txt', 'w', encoding='utf-8') as g:
    g.writelines(f"{noun}\n" for noun in allnoun)

In [18]:
# (Optional) Plot the 10 most frequent words
from nltk import Text
import matplotlib.pyplot as plt

# Tokenize the whole novel with the regex tokenizer, then wrap it in an NLTK Text
emma_word_tokens = retokenize.tokenize(emma_raw)
text = Text(emma_word_tokens, name="Emma")
text.plot(10)
plt.show()

<Figure size 640x480 with 1 Axes>

In [19]:
# (Optional) Count how often each proper noun appears in the novel
from nltk import FreqDist

# Titles and salutations that are excluded from the name counts
stopwords = ["Mr.", "Mrs.", "Miss", "Mr", "Mrs", "Dear"]

# POS-tag the regex-tokenized novel, then keep NNP tokens that are not excluded
emma_tokens = pos_tag(retokenize.tokenize(emma_raw))
names_list = [
    token
    for token, tag in emma_tokens
    if tag == "NNP" and token not in stopwords
]
fd_names = FreqDist(names_list)

# Total number of counted tokens, the count for "Emma", and its relative frequency
fd_names.N(), fd_names["Emma"], fd_names.freq("Emma")

(7863, 830, 0.10555767518758744)

In [20]:
# The 5 most frequently occurring words in fd_names
fd_names.most_common(5)

[('Emma', 830),
 ('Harriet', 491),
 ('Weston', 439),
 ('Knightley', 389),
 ('Elton', 385)]