# Document Similarity using NLTK and gensim

### tokenization, word counts, and possibly calculated tf-idf scores for words

### Tokenization in NLTK

In [1]:
import nltk
import string

from collections import Counter
import urllib

# nltk 코퍼스 파일 받아오고 열기
text = nltk.corpus.gutenberg.raw("burgess-busterbrown.txt")

# raw text를 모두 소문자로 바꾸고 특수문자를 제거한 뒤 tokenize하기
def get_tokens(text):
    lowers = text.lower()
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    return tokens

tokens = get_tokens(text)
print(tokens)


['The', 'Adventures', 'of', 'Buster', 'Bear', 'by', 'Thornton', 'W', 'Burgess', '1920', 'I', 'BUSTER', 'BEAR', 'GOES', 'FISHING', 'Buster', 'Bear', 'yawned', 'as', 'he', 'lay', 'on', 'his', 'comfortable', 'bed', 'of', 'leaves', 'and', 'watched', 'the', 'first', 'early', 'morning', 'sunbeams', 'creeping', 'through', 'the', 'Green', 'Forest', 'to', 'chase', 'out', 'the', 'Black', 'Shadows', 'Once', 'more', 'he', 'yawned', 'and', 'slowly', 'got', 'to', 'his', 'feet', 'and', 'shook', 'himself', 'Then', 'he', 'walked', 'over', 'to', 'a', 'big', 'pine', 'tree', 'stood', 'up', 'on', 'his', 'hind', 'legs', 'reached', 'as', 'high', 'up', 'on', 'the', 'trunk', 'of', 'the', 'tree', 'as', 'he', 'could', 'and', 'scratched', 'the', 'bark', 'with', 'his', 'great', 'claws', 'After', 'that', 'he', 'yawned', 'until', 'it', 'seemed', 'as', 'if', 'his', 'jaws', 'would', 'crack', 'and', 'then', 'sat', 'down', 'to', 'think', 'what', 'he', 'wanted', 'for', 'breakfast', 'While', 'he', 'sat', 'there', 'trying'

In [2]:
# tokenize된 문서 안에서 각 단어의 빈도 세기
count = Counter(tokens)

# 가장 많이 쓰인 단어부터 상위 10개 확인
print(count.most_common(10))

[('the', 639), ('he', 562), ('and', 484), ('to', 426), ('of', 326), ('that', 285), ('a', 275), ('was', 274), ('it', 239), ('Buster', 216)]


In [3]:
# nltk를 이용해 stopword 제거하기 
from nltk.corpus import stopwords

tokens = get_tokens(text)
filtered = [w for w in tokens if not w in stopwords.words('english')]

# stopword가 제거된 토큰들의 빈도 세기
count = Counter(filtered)

# 가장 많이 쓰인 토큰부터 상위 100개 확인
print(count.most_common(100))

[('Buster', 216), ('I', 156), ('Bear', 137), ('He', 116), ('Little', 111), ('Joe', 108), ('one', 99), ('Brown', 95), ('Farmer', 94), ('little', 92), ('boy', 88), ('Green', 81), ('see', 73), ('Forest', 66), ('It', 61), ('could', 56), ('Blacky', 51), ('said', 50), ('know', 48), ('Otter', 47), ('time', 47), ('would', 46), ('way', 44), ('fish', 44), ('Then', 43), ('But', 43), ('berries', 43), ('great', 41), ('Sammy', 41), ('afraid', 40), ('away', 40), ('Billy', 39), ('looked', 39), ('pail', 39), ('big', 38), ('trout', 37), ('You', 36), ('Old', 36), ('tree', 35), ('get', 33), ('Brook', 32), ('right', 32), ('Laughing', 30), ('And', 30), ('Now', 30), ('thing', 30), ('Chatterer', 28), ('Mink', 28), ('make', 27), ('pool', 27), ('Pasture', 27), ('So', 26), ('think', 25), ('thought', 25), ('frightened', 25), ('Jay', 24), ('go', 24), ('sure', 24), ('saw', 24), ('back', 24), ('bad', 24), ('There', 24), ('made', 24), ('heard', 23), ('else', 23), ('went', 23), ('That', 23), ('even', 23), ('head', 23)

In [4]:
# 각 토큰의 stem 얻기

from nltk.stem.porter import *

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)

# stemmize된 토큰들의 빈도 세기
count = Counter(stemmed)

# 가장 많이 쓰인 stem부터 상위 100개 확인
print(count.most_common(100))

[('buster', 229), ('littl', 207), ('I', 156), ('bear', 152), ('He', 116), ('joe', 112), ('one', 103), ('brown', 102), ('farmer', 100), ('boy', 95), ('green', 87), ('see', 78), ('fish', 69), ('look', 68), ('forest', 66), ('It', 61), ('could', 56), ('blacki', 52), ('know', 52), ('berri', 52), ('great', 51), ('pool', 51), ('otter', 51), ('time', 51), ('laugh', 50), ('said', 50), ('big', 49), ('would', 46), ('way', 45), ('thing', 45), ('then', 43), ('but', 43), ('pail', 43), ('make', 42), ('sammi', 42), ('get', 42), ('come', 42), ('old', 42), ('tree', 41), ('go', 40), ('afraid', 40), ('away', 40), ('billi', 39), ('trout', 37), ('right', 36), ('you', 36), ('brook', 32), ('and', 32), ('chatter', 30), ('now', 30), ('think', 29), ('thought', 29), ('head', 28), ('eat', 28), ('mink', 28), ('smile', 28), ('sure', 27), ('frighten', 27), ('run', 27), ('pastur', 27), ('So', 26), ('jay', 25), ('thief', 25), ('even', 25), ('made', 25), ('els', 24), ('saw', 24), ('back', 24), ('bad', 24), ('there', 24)

### Tf-Idf in gensim
###### gensim 설치 명령: pip install --upgrade gensim
###### gensim installation instructions: https://radimrehurek.com/gensim/install.html

In [11]:
import nltk
import string
import gensim

# 지금까지 했던 tokenize, stopword 제거, punctuation 제거, stemmize를 모두 수행하는 새 tokenize 함수
def tokenize(text):
    tokens = get_tokens(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

# nltk gutenberg 텍스트의 목록 불러오기
titles = nltk.corpus.gutenberg.fileids()
print(titles)

# 목록에 있는 텍스트를 순서대로 tokenize(stemize)한 뒤 하나의 리스트로 모으기
text_tokens = []
for title in titles:
    text = nltk.corpus.gutenberg.raw(title)
    tokens = tokenize(text)
    text_tokens.append(tokens)
    

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [6]:
# 각 리스트에 들어 있는 tokenized text의 형태
print(text_tokens[0])

['emma', 'by', 'jane', 'austen', '1816', 'volum', 'I', 'chapter', 'I', 'emma', 'woodhous', 'handsom', 'clever', 'and', 'rich', 'with', 'a', 'comfort', 'home', 'and', 'happi', 'disposit', 'seem', 'to', 'unit', 'some', 'of', 'the', 'best', 'bless', 'of', 'exist', 'and', 'had', 'live', 'nearli', 'twenti', 'one', 'year', 'in', 'the', 'world', 'with', 'veri', 'littl', 'to', 'distress', 'or', 'vex', 'her', 'she', 'wa', 'the', 'youngest', 'of', 'the', 'two', 'daughter', 'of', 'a', 'most', 'affection', 'indulg', 'father', 'and', 'had', 'in', 'consequ', 'of', 'her', 'sister', 's', 'marriag', 'been', 'mistress', 'of', 'hi', 'hous', 'from', 'a', 'veri', 'earli', 'period', 'her', 'mother', 'had', 'die', 'too', 'long', 'ago', 'for', 'her', 'to', 'have', 'more', 'than', 'an', 'indistinct', 'remembr', 'of', 'her', 'caress', 'and', 'her', 'place', 'had', 'been', 'suppli', 'by', 'an', 'excel', 'woman', 'as', 'gover', 'who', 'had', 'fallen', 'littl', 'short', 'of', 'a', 'mother', 'in', 'affect', 'sixtee

In [7]:
# gensim 패키지를 이용하여 단어들을 숫자 인덱스로 바꾸어 쓰기
# 딕셔너리 형태로 저장

from gensim import corpora
dictionary = corpora.Dictionary(text_tokens)

# 인덱스로 단어 호출하기 
print(dictionary[10])
print(dictionary[100])

# 단어로 인덱스 호출하기 
print(dictionary.token2id['road'])
print(dictionary.token2id['like'])

Ah
_evening_
3521
2543


In [14]:
# 딕셔너리 내 단어들의 총 개수 
print("Number of words in dictionary:",len(dictionary))

# 각 인덱스가 부여된 단어들의 목록 
for i in dictionary.keys():
    print(i, dictionary[i])

Number of words in dictionary: 26306
0 000
1 10
2 1816
3 23rd
4 24th
5 26th
6 28th
7 7th
8 8th
9 A
10 Ah
11 Am
12 An
13 As
14 At
15 Be
16 By
17 C
18 Do
19 Dr
20 E
21 Em
22 F
23 Go
24 Ha
25 He
26 I
27 II
28 IV
29 IX
30 If
31 In
32 Is
33 It
34 K
35 La
36 M
37 MY
38 Ma
39 Me
40 Mr
41 My
42 N
43 No
44 Of
45 Oh
46 On
47 Or
48 S
49 So
50 St
51 To
52 V
53 VI
54 W
55 We
56 X
57 XI
58 XV
59 _
60 _______
61 _a_
62 _accepted_
63 _adair_
64 _addition_
65 _all_
66 _almost_
67 _alone_
68 _amor_
69 _and_
70 _answer_
71 _any_
72 _appropriation_
73 _as_
74 _assistance_
75 _at_
76 _bath_
77 _be_
78 _been_
79 _blunder_
80 _boiled_
81 _both_
82 _bride_
83 _broke_
84 _caro_
85 _cause_
86 _chaperon_
87 _compassion_
88 _compliments_
89 _court_
90 _courtship_
91 _did_
92 _dissolved_
93 _dixon_
94 _dixons_
95 _doubts_
96 _each_
97 _eighteen_
98 _elton_
99 _engagement_
100 _evening_
101 _felt_
102 _first_
103 _gentleman_
104 _great_
105 _greater_
106 _had_
107 _half_
108 _happily_
109 _has_
110 _have_
111 _he_


973 concept
974 concern
975 concert
976 concess
977 concili
978 concis
979 conclud
980 conclus
981 concurr
982 condemn
983 condescend
984 condescens
985 condit
986 condol
987 conduc
988 conduct
989 confeder
990 confer
991 confess
992 confid
993 confidant
994 confidenti
995 confin
996 confirm
997 confound
998 confus
999 congratul
1000 congratulatori
1001 conjectur
1002 conjug
1003 connect
1004 connexion
1005 conniv
1006 conquer
1007 conquest
1008 conscienc
1009 consciou
1010 conscious
1011 consent
1012 consequ
1013 conservatori
1014 consid
1015 consider
1016 consist
1017 consol
1018 constanc
1019 constant
1020 constantli
1021 constern
1022 constitut
1023 constrain
1024 construct
1025 consult
1026 consumpt
1027 contain
1028 contempl
1029 contempt
1030 contemptu
1031 contend
1032 content
1033 conting
1034 continu
1035 contract
1036 contradict
1037 contrari
1038 contrast
1039 contribut
1040 contrit
1041 contriv
1042 controul
1043 conundrum
1044 conveni
1045 convers
1046 convey
1047 convict

1973 greatcoat
1974 greater
1975 greatest
1976 greatli
1977 green
1978 greensward
1979 greet
1980 grew
1981 grey
1982 grief
1983 griev
1984 grievanc
1985 grievou
1986 grievous
1987 gross
1988 grossli
1989 ground
1990 groundless
1991 groundwork
1992 group
1993 grove
1994 grow
1995 grown
1996 grudg
1997 gruel
1998 guard
1999 guess
2000 guest
2001 guid
2002 guidabl
2003 guidanc
2004 guileless
2005 guilt
2006 guilti
2007 guinea
2008 ha
2009 haberdash
2010 habit
2011 habitu
2012 had
2013 hair
2014 hairdress
2015 half
2016 hall
2017 hand
2018 handl
2019 handsom
2020 handsomest
2021 handwrit
2022 hang
2023 hannah
2024 haphazard
2025 happen
2026 happi
2027 happier
2028 happiest
2029 happili
2030 harbour
2031 hard
2032 harden
2033 hardi
2034 hardli
2035 hardship
2036 harm
2037 harmless
2038 harmoni
2039 harmonis
2040 harp
2041 harri
2042 harriet
2043 harsh
2044 harshli
2045 hart_
2046 hartfield
2047 hasten
2048 hasti
2049 hastili
2050 hat
2051 hate
2052 hatr
2053 haunt
2054 hautboy
2055 have
20

2722 misinterpret
2723 misl
2724 miss
2725 misspent
2726 mistak
2727 mistaken
2728 mistress
2729 misunderstand
2730 misunderstood
2731 mitchel
2732 mix
2733 mixtur
2734 mock
2735 mode
2736 model
2737 moder
2738 modern
2739 modest
2740 modesti
2741 modestli
2742 moment
2743 momentari
2744 monarch
2745 monday
2746 money
2747 month
2748 moonlight
2749 moral
2750 moralis
2751 more
2752 moreov
2753 morn
2754 morrow
2755 mortal
2756 mortif
2757 mortifi
2758 most
2759 mostli
2760 mother
2761 motherli
2762 motion
2763 motionless
2764 motiv
2765 motto
2766 mount
2767 mourn
2768 mourner
2769 mouth
2770 move
2771 movement
2772 mr
2773 much
2774 mud
2775 muffin
2776 mule
2777 multipl
2778 multipli
2779 mum
2780 murmur
2781 muse
2782 music
2783 musician
2784 muslin
2785 must
2786 muster
2787 mute
2788 mutter
2789 mutton
2790 mutual
2791 my
2792 myself
2793 mysteri
2794 nail
2795 naivet
2796 name
2797 narr
2798 narrat
2799 narrow
2800 nash
2801 nativ
2802 natur
2803 nay
2804 near
2805 nearer
2806 ne

3472 respect
3473 respit
3474 rest
3475 restless
3476 restor
3477 restrain
3478 restraint
3479 restrict
3480 result
3481 resum
3482 retain
3483 retent
3484 reticul
3485 retir
3486 retort
3487 retract
3488 retreat
3489 retrospect
3490 return
3491 reveal
3492 reveri
3493 revers
3494 reviv
3495 revolt
3496 reward
3497 rheumat
3498 ribband
3499 ribbon
3500 rice
3501 rich
3502 richard
3503 richardson
3504 richest
3505 richli
3506 richmond
3507 rid
3508 ridden
3509 riddl
3510 ride
3511 ridicul
3512 right
3513 rightli
3514 ring
3515 ripen
3516 rise
3517 risk
3518 rival
3519 river
3520 rivet
3521 road
3522 roast
3523 rob
3524 robe
3525 robert
3526 rode
3527 romanc
3528 romant
3529 room
3530 root
3531 rose
3532 rough
3533 round
3534 roundabout
3535 roundli
3536 rous
3537 rout
3538 routin
3539 row
3540 ru
3541 rub
3542 rubber
3543 rude
3544 ruin
3545 rule
3546 rumin
3547 rumour
3548 run
3549 rung
3550 rush
3551 s
3552 sack
3553 sacr
3554 sacrif
3555 sacrific
3556 sad
3557 saddl
3558 sadli
3559 s

4472 visit
4473 visitor
4474 vivac
4475 vocal
4476 voic
4477 void
4478 volubl
4479 volum
4480 voluntari
4481 voluntarili
4482 vote
4483 vouch
4484 vouchsaf
4485 vow
4486 vulgar
4487 wa
4488 wade
4489 wainscot
4490 wait
4491 waiter
4492 waiv
4493 wake
4494 wakefield
4495 walk
4496 walker
4497 wall
4498 walli
4499 wallis
4500 walnut
4501 waltz
4502 wan
4503 wander
4504 want
4505 wanton
4506 war
4507 ware
4508 warfar
4509 warm
4510 warmer
4511 warmest
4512 warmli
4513 warmth
4514 warrant
4515 wash
4516 wast
4517 watch
4518 water
4519 waver
4520 wax
4521 way
4522 we
4523 weak
4524 weaken
4525 weaker
4526 weakli
4527 wealth
4528 wear
4529 weari
4530 weather
4531 wed
4532 wedg
4533 wednesday
4534 week
4535 weekli
4536 weigh
4537 weight
4538 welch
4539 welcom
4540 welfar
4541 well
4542 went
4543 wept
4544 were
4545 west
4546 western
4547 weston
4548 wet
4549 weymouth
4550 what
4551 whatev
4552 wheat
4553 when
4554 whenc
4555 whenev
4556 where
4557 wherev
4558 whether
4559 whi
4560 which
4561 

5472 spine
5473 sportsmen
5474 sprang
5475 sprung
5476 squir
5477 stage
5478 stamp
5479 stead
5480 steadfast
5481 steal
5482 stern
5483 stevenson
5484 stile
5485 strafford
5486 straight
5487 straiten
5488 streight
5489 streighten
5490 strenuou
5491 strictest
5492 strip
5493 stroll
5494 studious
5495 stuff
5496 sturdi
5497 suaviti
5498 substanti
5499 subtleti
5500 sulta
5501 summarili
5502 summit
5503 sunni
5504 superfici
5505 superfin
5506 supplement
5507 surfac
5508 surgeon
5509 surmount
5510 surnam
5511 surveyor
5512 suscept
5513 suspend
5514 swear
5515 sweeter
5516 swing
5517 switch
5518 sympath
5519 sympathet
5520 tale
5521 tap
5522 tattersal
5523 tauntingli
5524 taunton
5525 tawni
5526 teeth
5527 tempera
5528 templ
5529 tenant
5530 tenantri
5531 tenement
5532 test
5533 testimoni
5534 thame
5535 theatr
5536 thenc
5537 theori
5538 thicker
5539 thickest
5540 thornberri
5541 thoroughfar
5542 thread
5543 thrill
5544 thursday
5545 tight
5546 titl
5547 today
5548 toil
5549 toilsom
5550 t

6471 wipe
6472 withdrew
6473 withstood
6474 wittic
6475 wittiest
6476 worthiest
6477 worthless
6478 wrung
6479 yew
6480 100
6481 101
6482 102
6483 103
6484 104
6485 105
6486 106
6487 107
6488 108
6489 109
6490 110
6491 111
6492 112
6493 113
6494 114
6495 115
6496 116
6497 117
6498 118
6499 119
6500 120
6501 121
6502 122
6503 123
6504 124
6505 125
6506 126
6507 127
6508 128
6509 129
6510 130
6511 131
6512 132
6513 133
6514 134
6515 135
6516 136
6517 137
6518 138
6519 139
6520 140
6521 141
6522 142
6523 143
6524 144
6525 145
6526 146
6527 147
6528 148
6529 149
6530 150
6531 151
6532 152
6533 153
6534 154
6535 155
6536 156
6537 157
6538 158
6539 159
6540 160
6541 161
6542 162
6543 163
6544 164
6545 165
6546 166
6547 167
6548 168
6549 169
6550 170
6551 171
6552 172
6553 173
6554 174
6555 175
6556 176
6557 51
6558 52
6559 53
6560 54
6561 55
6562 56
6563 57
6564 58
6565 59
6566 60
6567 61
6568 62
6569 63
6570 64
6571 65
6572 66
6573 67
6574 68
6575 69
6576 70
6577 71
6578 72
6579 73
6580 74


7471 betwixt
7472 beulah
7473 bewail
7474 bewaileth
7475 bewar
7476 bewray
7477 bewrayeth
7478 bezai
7479 bezaleel
7480 bezek
7481 bezer
7482 bibl
7483 bichri
7484 bidden
7485 biddeth
7486 bidkar
7487 bier
7488 bigtha
7489 bigthan
7490 bigthana
7491 bigvai
7492 bildad
7493 bileam
7494 bilgah
7495 bilgai
7496 bilhah
7497 bilhan
7498 billow
7499 bilshan
7500 bimhal
7501 bindeth
7502 binea
7503 binnui
7504 birsha
7505 birthright
7506 birzavith
7507 bishlam
7508 bishoprick
7509 bite
7510 biteth
7511 bithiah
7512 bithron
7513 bithynia
7514 bitten
7515 bittern
7516 bizjothjah
7517 biztha
7518 blacker
7519 blackish
7520 blade
7521 blasphem
7522 blasphemest
7523 blasphemeth
7524 blasphemi
7525 blastu
7526 blaze
7527 bleat
7528 blemish
7529 blessed
7530 blessest
7531 blesseth
7532 blew
7533 blindeth
7534 blindfold
7535 bloodguilti
7536 bloodi
7537 bloodthirsti
7538 blotteth
7539 bloweth
7540 blueness
7541 boanerg
7542 boar
7543 boaster
7544 boastest
7545 boasteth
7546 boaz
7547 bocheru
7548 boc

8221 disallow
8222 disannul
8223 disannulleth
8224 disappointeth
8225 discerneth
8226 discipl
8227 discomfitur
8228 discovereth
8229 diseas
8230 disguiseth
8231 dishan
8232 dishon
8233 dishonest
8234 dishonesti
8235 dishonour
8236 dishonourest
8237 dishonoureth
8238 disobey
8239 disorderli
8240 dispossess
8241 dissembleth
8242 dissens
8243 dissimul
8244 dissolvest
8245 distaff
8246 distil
8247 distribut
8248 distributeth
8249 diver
8250 divers
8251 divideth
8252 divineth
8253 dizahab
8254 dodai
8255 dodanim
8256 dodavah
8257 dodo
8258 doeg
8259 doest
8260 doeth
8261 dominion
8262 doorkeep
8263 dophkah
8264 dor
8265 dorca
8266 dost
8267 dothan
8268 doubletongu
8269 doubteth
8270 dough
8271 downsit
8272 downward
8273 dowri
8274 drag
8275 dragon
8276 dram
8277 drave
8278 draweth
8279 dreameth
8280 dreg
8281 dresser
8282 dresseth
8283 drewest
8284 driedst
8285 drieth
8286 drinker
8287 drinketh
8288 driver
8289 driveth
8290 dromedari
8291 droppeth
8292 dropsi
8293 dross
8294 drought
8295 dr

8971 godli
8972 goest
8973 goeth
8974 gog
8975 golan
8976 golden
8977 golgotha
8978 goliath
8979 gomer
8980 gomorrah
8981 gomorrha
8982 goodlier
8983 goodliest
8984 goodman
8985 gopher
8986 gore
8987 gorgeou
8988 gorgeous
8989 goshen
8990 gospel
8991 gotten
8992 gourd
8993 gozan
8994 graf
8995 graff
8996 grain
8997 grape
8998 grapegather
8999 grapeglean
9000 grasshopp
9001 gravecloth
9002 graven
9003 graveth
9004 grayhead
9005 greav
9006 grecia
9007 greec
9008 greedi
9009 greedili
9010 greek
9011 greenish
9012 greeteth
9013 greyhead
9014 greyhound
9015 grieveth
9016 grind
9017 grinder
9018 grisl
9019 groan
9020 groaneth
9021 grope
9022 gropeth
9023 groweth
9024 grudgingli
9025 gudgodah
9026 guestchamb
9027 guiltless
9028 gulf
9029 guni
9030 gunit
9031 gur
9032 gurbaal
9033 gush
9034 gutter
9035 haahashtari
9036 habaiah
9037 habakkuk
9038 habaziniah
9039 habergeon
9040 habor
9041 hachaliah
9042 hachilah
9043 hachmoni
9044 hachmonit
9045 hadad
9046 hadadez
9047 hadadrimmon
9048 hadar
904

9970 kibzaim
9971 kid
9972 kidney
9973 kidron
9974 killedst
9975 killest
9976 killeth
9977 kiln
9978 kin
9979 kinah
9980 kindleth
9981 kine
9982 kingli
9983 kinsfolk
9984 kinsman
9985 kinsmen
9986 kinswoman
9987 kinswomen
9988 kir
9989 kirharaseth
9990 kirhareseth
9991 kirharesh
9992 kirher
9993 kiriathaim
9994 kirioth
9995 kirjath
9996 kirjathaim
9997 kirjatharba
9998 kirjatharim
9999 kirjathba
10000 kirjathhuzoth
10001 kirjathjearim
10002 kirjathsannah
10003 kirjathseph
10004 kish
10005 kishi
10006 kishion
10007 kishon
10008 kison
10009 kithlish
10010 kitron
10011 kittim
10012 knead
10013 kneadingtrough
10014 knewest
10015 knocketh
10016 knop
10017 knowest
10018 knoweth
10019 koa
10020 kohath
10021 kohathit
10022 kolaiah
10023 korah
10024 korahit
10025 korathit
10026 kore
10027 korhit
10028 koz
10029 kushaiah
10030 laadah
10031 laadan
10032 laban
10033 laboureth
10034 lachish
10035 lack
10036 lackest
10037 lacketh
10038 lad
10039 ladder
10040 lade
10041 laden
10042 ladeth
10043 lael


10970 paul
10971 paulu
10972 pave
10973 pavilion
10974 paw
10975 paweth
10976 payeth
10977 peaceabl
10978 peacemak
10979 peacock
10980 pedahel
10981 pedahzur
10982 pedaiah
10983 pedigre
10984 peel
10985 pekah
10986 pekahiah
10987 pekod
10988 pelaiah
10989 pelaliah
10990 pelatiah
10991 peleg
10992 pelet
10993 peleth
10994 pelethit
10995 pelican
10996 pelonit
10997 penc
10998 peniel
10999 peninnah
11000 pennyworth
11001 pentecost
11002 penuel
11003 peor
11004 peradventur
11005 perazim
11006 perceivest
11007 perceiveth
11008 perdit
11009 pere
11010 peresh
11011 perez
11012 perezuzza
11013 perezuzzah
11014 performeth
11015 perga
11016 pergamo
11017 perida
11018 peril
11019 perish
11020 perisheth
11021 perizzit
11022 perjur
11023 pernici
11024 persecutest
11025 persecutor
11026 persi
11027 persia
11028 persian
11029 persuadest
11030 persuadeth
11031 persud
11032 pertain
11033 pertaineth
11034 peruda
11035 perverteth
11036 pestil
11037 pestl
11038 peter
11039 pethahiah
11040 pethor
11041 pet

11970 shuphamit
11971 shuppim
11972 shur
11973 shushan
11974 shuthalhit
11975 shuthelah
11976 shutteth
11977 shuttl
11978 sia
11979 siaha
11980 sibbecai
11981 sibbechai
11982 sibboleth
11983 sibmah
11984 sibraim
11985 sichem
11986 sickl
11987 siddim
11988 sidon
11989 sidonian
11990 sieg
11991 siev
11992 sift
11993 sighest
11994 sigheth
11995 signet
11996 signif
11997 signifieth
11998 sihon
11999 sihor
12000 sila
12001 silla
12002 siloah
12003 siloam
12004 silvanu
12005 silverl
12006 simeon
12007 simeonit
12008 similitud
12009 simon
12010 simri
12011 sina
12012 sinai
12013 sinew
12014 singer
12015 singeth
12016 sinim
12017 sinit
12018 sinnest
12019 sinneth
12020 sion
12021 siphmoth
12022 sippai
12023 sirah
12024 sirion
12025 sisamai
12026 sisera
12027 sith
12028 sitnah
12029 sittest
12030 sitteth
12031 sivan
12032 sixscor
12033 sixteenth
12034 sixth
12035 sixtyfold
12036 skip
12037 skippedst
12038 skull
12039 slack
12040 slain
12041 slander
12042 slanderest
12043 slandereth
12044 slang


12970 wrath
12971 wreathen
12972 wrest
12973 wrestl
12974 wring
12975 writest
12976 writeth
12977 wrongeth
12978 wroth
12979 wroughtest
12980 yarn
12981 yea
12982 yearn
12983 yell
12984 yesternight
12985 yieldeth
12986 yoke
12987 yokefellow
12988 yonder
12989 zaanaim
12990 zaanan
12991 zaanannim
12992 zaavan
12993 zabad
12994 zabbai
12995 zabbud
12996 zabdi
12997 zabdiel
12998 zabud
12999 zabulon
13000 zaccai
13001 zacchaeu
13002 zacchur
13003 zaccur
13004 zacharia
13005 zachariah
13006 zacher
13007 zadok
13008 zaham
13009 zair
13010 zalaph
13011 zalmon
13012 zalmonah
13013 zalmunna
13014 zamzummim
13015 zanoah
13016 zaphnathpaaneah
13017 zaphon
13018 zara
13019 zarah
13020 zare
13021 zareah
13022 zareathit
13023 zarephath
13024 zaretan
13025 zarethshahar
13026 zarhit
13027 zartanah
13028 zarthan
13029 zatthu
13030 zattu
13031 zavan
13032 zaza
13033 zebadiah
13034 zebah
13035 zebaim
13036 zebede
13037 zebina
13038 zeboiim
13039 zeboim
13040 zebudah
13041 zebul
13042 zebulonit
13043 zeb

14219 soo
14220 sprawl
14221 squeak
14222 squeez
14223 sternli
14224 stigand
14225 stingi
14226 straighten
14227 suet
14228 sulki
14229 sulkili
14230 teacup
14231 teapot
14232 telescop
14233 terrier
14234 thatch
14235 thunderstorm
14236 tilli
14237 titter
14238 toast
14239 toffe
14240 tougher
14241 treacl
14242 trot
14243 tuck
14244 tunnel
14245 tureen
14246 tut
14247 ugh
14248 uglif
14249 uglifi
14250 uncork
14251 underton
14252 unrol
14253 untwist
14254 verdict
14255 waist
14256 walru
14257 wearili
14258 weren
14259 whisker
14260 wig
14261 wonderland
14262 wooden
14263 wow
14264 writh
14265 yelp
14266 yer
14267 zealand
14268 00
14269 01
14270 02
14271 1000
14272 10000
14273 1240
14274 1500
14275 1739
14276 1909
14277 1971
14278 1991
14279 1994
14280 1997
14281 1998
14282 1999
14283 2000
14284 2001
14285 2002
14286 2003
14287 2004
14288 2500
14289 3000
14290 38655
14291 4000
14292 4109
14293 501
14294 6000
14295 622154
14296 9000
14297 AN
14298 AS
14299 Al
14300 B
14301 Eh
14302 G
143

15469 republican
15470 research
15471 restaur
15472 retic
15473 retina
15474 retriev
15475 reverber
15476 reverenti
15477 revisit
15478 revolutionari
15479 revolutionist
15480 rhetor
15481 rhinoceros
15482 rhodesian
15483 rhododendron
15484 rhododendru
15485 rim
15486 ripost
15487 ripper
15488 ritual
15489 rivulet
15490 rococo
15491 roland
15492 rollick
15493 rosinant
15494 rotat
15495 rowdi
15496 royalti
15497 ruder
15498 rudimentari
15499 ruffian
15500 rummier
15501 runaway
15502 runner
15503 ruskin
15504 russian
15505 rustic
15506 ruthless
15507 sac
15508 sacrament
15509 sacred
15510 sane
15511 saner
15512 sanest
15513 saniti
15514 saracen
15515 sarcasm
15516 satchel
15517 sate
15518 saturn
15519 sauvera
15520 savageri
15521 scan
15522 scarf
15523 schoolgirl
15524 schopenhau
15525 scientist
15526 scilli
15527 scoot
15528 scotch
15529 scotchman
15530 scotchmen
15531 scraggi
15532 scrubbi
15533 scuttl
15534 seascap
15535 seawe
15536 section
15537 secularist
15538 seesaw
15539 selim
15

16469 larrikin
16470 latchkey
16471 laughabl
16472 lawsuit
16473 leaner
16474 lectern
16475 leed
16476 len
16477 leonardo
16478 letharg
16479 libel
16480 librarian
16481 likeli
16482 liqueur
16483 liverpool
16484 livid
16485 lobsterish
16486 lopsid
16487 ludwig
16488 lugubri
16489 lure
16490 lynch
16491 macnab
16492 maggi
16493 magyar
16494 major
16495 malform
16496 malta
16497 malvoli
16498 mandolin
16499 manuscript
16500 marco
16501 marconi
16502 margin
16503 maria
16504 marseillais
16505 matchbox
16506 matchwood
16507 mathemat
16508 matterhorn
16509 mauric
16510 max
16511 mayor
16512 meaner
16513 mecca
16514 mediaev
16515 melodrama
16516 merlin
16517 mew
16518 mida
16519 midland
16520 migrat
16521 militar
16522 millionair
16523 mineralogist
16524 miscellan
16525 misfir
16526 mishap
16527 misprint
16528 missil
16529 modernist
16530 modul
16531 moneylend
16532 monger
16533 monkhous
16534 monomaniac
16535 montano
16536 motorist
16537 mottl
16538 mousetrap
16539 mousquetair
16540 mousta

17469 unforgett
17470 unfurl
17471 unhelp
17472 unhurri
17473 unknow
17474 unmask
17475 unnerv
17476 unpoet
17477 unslung
17478 unsolv
17479 untam
17480 urchin
17481 utilitarian
17482 vacuum
17483 valor
17484 vant
17485 vegetarian
17486 ven
17487 verg
17488 vestig
17489 vizard
17490 wagner
17491 waterfal
17492 wheeler
17493 whistler
17494 whitish
17495 wickedest
17496 wilk
17497 witcheri
17498 witherspoon
17499 workshop
17500 wrecker
17501 zay
17502 ze
17503 zem
17504 zmite
17505 zoo
17506 zso
17507 zumpt
17508 1414
17509 15o
17510 1795
17511 1799
17512 17oo
17513 17th
17514 1828
17515 18th
17516 22nd
17517 25th
17518 299
17519 2nd
17520 3rd
17521 421
17522 440
17523 4th
17524 5th
17525 6th
17526 8oo
17527 BY
17528 DR
17529 Di
17530 ET
17531 Ev
17532 FA
17533 GO
17534 Il
17535 J
17536 MA
17537 Re
17538 SO
17539 Th
17540 UP
17541 _h_arbour
17542 _h_arm
17543 _h_earth
17544 ab
17545 abash
17546 acclam
17547 accomplic
17548 accoutr
17549 acid
17550 acqua
17551 acquistato
17552 addison
175

18468 abaft
18469 abandonedli
18470 abe
18471 abeam
18472 abjectli
18473 abjectu
18474 ablut
18475 aborigin
18476 abort
18477 aboundingli
18478 abreast
18479 absorbingli
18480 accessori
18481 acerb
18482 achil
18483 actest
18484 actium
18485 acushnet
18486 adamit
18487 adhes
18488 adieux
18489 adio
18490 admeasur
18491 admonitori
18492 adolesc
18493 adown
18494 adrift
18495 adroit
18496 adroitli
18497 adroop
18498 advent
18499 advert
18500 aerat
18501 affghanistan
18502 affidavit
18503 afir
18504 afoam
18505 aforesaid
18506 aforethought
18507 afoul
18508 afric
18509 aft
18510 agassiz
18511 aggreg
18512 aggriev
18513 aglow
18514 agonizingli
18515 agrarian
18516 ahoy
18517 aint
18518 airley
18519 airth
18520 alb
18521 albatross
18522 albemarl
18523 albicor
18524 albino
18525 alcov
18526 aldermen
18527 aldrovandi
18528 aldrovandu
18529 aleak
18530 alew
18531 alfr
18532 algerin
18533 algier
18534 aliment
18535 alleghani
18536 alleghanian
18537 allot
18538 alluringli
18539 almanac
18540 alm

19468 frobish
19469 froissart
19470 froth
19471 fruition
19472 fuddl
19473 fuego
19474 fulller
19475 functionari
19476 fungi
19477 furl
19478 furthest
19479 fuse
19480 fuzz
19481 g
19482 gabl
19483 gaff
19484 gaffman
19485 gainst
19486 gallantli
19487 galli
19488 galliot
19489 gallipago
19490 gallopingli
19491 gam
19492 gambog
19493 gamesom
19494 gami
19495 gander
19496 gangway
19497 gardenni
19498 gardin
19499 garneri
19500 gase
19501 gaseou
19502 gash
19503 gastric
19504 gaudiest
19505 gaunt
19506 gauntlet
19507 gayer
19508 gayhead
19509 gazett
19510 ge
19511 gees
19512 genera
19513 geneva
19514 genius
19515 gentler
19516 genu
19517 geograph
19518 geometr
19519 geometri
19520 ger
19521 germain
19522 gesner
19523 gestat
19524 gesticul
19525 gette
19526 gettest
19527 ghent
19528 giddili
19529 gilder
19530 gill
19531 ginger
19532 gingerli
19533 girth
19534 gizzard
19535 glacier
19536 glade
19537 gleig
19538 glidest
19539 glim
19540 glitteringli
19541 globul
19542 globular
19543 gloomies

20718 salin
20719 salisburi
20720 saltcellar
20721 sam
20722 samphir
20723 sanctiti
20724 sanctorum
20725 sanctum
20726 sandpap
20727 sanguinari
20728 sapl
20729 saratoga
20730 sarmon
20731 sartainti
20732 sartin
20733 sashless
20734 satieti
20735 savanna
20736 savesoul
20737 savor
20738 savori
20739 saxon
20740 sayst
20741 scandinavian
20742 scaramouch
20743 scarri
20744 scentless
20745 schmerenburgh
20746 schooner
20747 schouten
20748 scimetar
20749 scollop
20750 sconc
20751 scorbut
20752 scorchingli
20753 scoresbi
20754 scorpio
20755 scougin
20756 scragg
20757 scrimp
20758 scroug
20759 scud
20760 sculptur
20761 scupper
20762 scyth
20763 sealin
20764 seaman
20765 seamless
20766 searchingli
20767 sebastian
20768 sebond
20769 secluded
20770 sed
20771 seethingli
20772 seeva
20773 segment
20774 seignori
20775 selectest
20776 sellin
20777 semin
20778 semirami
20779 semiweekli
20780 seneca
20781 senor
20782 sequenti
20783 serenest
20784 sette
20785 settler
20786 severest
20787 seychel
2078

21718 bidst
21719 biserta
21720 bitumin
21721 bizanc
21722 blamest
21723 blest
21724 blis
21725 boder
21726 borea
21727 bosporu
21728 bossi
21729 briareo
21730 brigad
21731 brind
21732 bullion
21733 busiri
21734 buxom
21735 cadmu
21736 caecia
21737 calabria
21738 calamit
21739 callow
21740 calmest
21741 calumni
21742 cambalu
21743 cani
21744 caparison
21745 capitolin
21746 capricorn
21747 carnag
21748 casbeen
21749 casiu
21750 caspian
21751 castalian
21752 cathaian
21753 causey
21754 centrick
21755 cerast
21756 cerberean
21757 cere
21758 chaldaea
21759 cham
21760 champain
21761 charlemain
21762 charybdi
21763 chemick
21764 chemo
21765 chersones
21766 cherubick
21767 chimera
21768 choisest
21769 choral
21770 chrysolit
21771 cinctur
21772 circean
21773 circlet
21774 circumflu
21775 circumfus
21776 circumscrib
21777 cleombrotu
21778 climbest
21779 clo
21780 clomb
21781 cocytu
21782 coetern
21783 colick
21784 colur
21785 communic
21786 compeer
21787 compuls
21788 concav
21789 conglob
21790

22717 vallombrosa
22718 vanguard
22719 vassalag
22720 vernant
22721 vertumnu
22722 viand
22723 viceger
22724 viciou
22725 viewest
22726 vilifi
22727 virtual
22728 virtuousest
22729 volant
22730 volli
22731 volubil
22732 volumin
22733 voluptu
22734 wakest
22735 warriour
22736 washi
22737 weakest
22738 welkin
22739 wiseli
22740 woodbin
22741 worthier
22742 wrack
22743 yeanl
22744 zenith
22745 zephyru
22746 zodiack
22747 zophiel
22748 1599
22749 Br
22750 Co
22751 Ne
22752 Tu
22753 Vp
22754 Vs
22755 Y
22756 abler
22757 abou
22758 accidental
22759 actu
22760 addrest
22761 aduantag
22762 afear
22763 affabiliti
22764 affayr
22765 affraid
22766 agen
22767 alarum
22768 alass
22769 alchymi
22770 aliu
22771 anchys
22772 antonio
22773 apparrel
22774 appea
22775 apprehensiu
22776 arriu
22777 artemidoru
22778 artimedoru
22779 aswel
22780 attyr
22781 aueng
22782 auoyd
22783 aym
22784 ayr
22785 barr
22786 bastardi
22787 batchellor
22788 batchellour
22789 battel
22790 begg
22791 begger
22792 behauiour


23967 prouok
23968 prowd
23969 puft
23970 puh
23971 punisht
23972 puppet
23973 purgat
23974 pursi
23975 pursuest
23976 puzel
23977 pyrat
23978 pyrrhu
23979 quarel
23980 quickness
23981 quiddit
23982 quietu
23983 quil
23984 quillet
23985 quoth
23986 rac
23987 ranck
23988 rankli
23989 rapsidi
23990 rashness
23991 ratl
23992 raue
23993 rauell
23994 readiness
23995 reak
23996 recam
23997 recogniz
23998 recoueri
23999 redeliu
24000 reechi
24001 relatiu
24002 releef
24003 releeu
24004 relieu
24005 rellish
24006 remast
24007 remembraunc
24008 remou
24009 rendeuou
24010 renish
24011 rerul
24012 reseru
24013 respeak
24014 responsiu
24015 retook
24016 retyr
24017 reueal
24018 reuengeful
24019 reuennew
24020 reuerend
24021 reuert
24022 reuisit
24023 reuolut
24024 rew
24025 reway
24026 reynol
24027 reynoldo
24028 riual
24029 riuer
24030 riuet
24031 ro
24032 robusti
24033 romag
24034 rore
24035 rosin
24036 rosincr
24037 rosincran
24038 rossiu
24039 rouc
24040 rows
24041 sadness
24042 sai
24043 sall

25217 eschylu
25218 escul
25219 estray
25220 estuari
25221 ethnologist
25222 excelsi
25223 excelsior
25224 excrementiti
25225 excresc
25226 exhaustless
25227 exud
25228 exurg
25229 faithfulest
25230 fatherhood
25231 faucet
25232 faust
25233 feaster
25234 feb
25235 fecund
25236 feejeeman
25237 femm
25238 fernando
25239 fervor
25240 fest
25241 fester
25242 fete
25243 fetich
25244 feuillag
25245 fiat
25246 ficht
25247 filter
25248 finder
25249 firelock
25250 firemen
25251 firth
25252 flabbi
25253 flaccid
25254 flageolet
25255 flagg
25256 flagger
25257 flagston
25258 flatboatmen
25259 flauntest
25260 flex
25261 floodgat
25262 florenc
25263 fluent
25264 foetor
25265 font
25266 fop
25267 foray
25268 forcep
25269 foremen
25270 foretruck
25271 forthgoer
25272 foulest
25273 foundri
25274 fraction
25275 fractur
25276 frailest
25277 framer
25278 fratricid
25279 freest
25280 frenchwoman
25281 freshier
25282 friabl
25283 friendlili
25284 frolicsom
25285 frontest
25286 fructifi
25287 fusillad
25288 

26217 unwrap
26218 unwrit
26219 uprisen
26220 upros
26221 upsprang
26222 uranu
26223 usk
26224 ust
26225 vapor
26226 vapori
26227 vaquero
26228 vasco
26229 vaster
26230 veneer
26231 venereale
26232 venu
26233 vervain
26234 vexer
26235 victress
26236 vina
26237 virgil
26238 virginian
26239 visor
26240 vitreou
26241 viva
26242 vocalist
26243 volga
26244 voter
26245 voyagest
26246 wabash
26247 wacho
26248 wahsatch
26249 waitest
26250 wakedst
26251 walla
26252 wallabout
26253 wallachian
26254 walt
26255 warili
26256 warrante
26257 warsaw
26258 watchfir
26259 weeper
26260 welder
26261 welshman
26262 weser
26263 westernr
26264 whirr
26265 whitman
26266 whoe
26267 wholesal
26268 wielder
26269 willamett
26270 winder
26271 winterwork
26272 wisconsines
26273 wobbl
26274 wolverin
26275 womanhood
26276 woodman
26277 woodpil
26278 woolen
26279 woolli
26280 workwomen
26281 wouldist
26282 wrestler
26283 wrig
26284 xxiv
26285 xxix
26286 xxv
26287 xxvi
26288 xxvii
26289 xxviii
26290 xxx
26291 xxxi
2629

In [9]:
# 위에서 정의한 인덱스에 따라 각 텍스트를 bag of words의 형태로 바꾸어 코퍼스를 구성
corpus = [dictionary.doc2bow(text) for text in text_tokens]

print(dictionary.doc2bow(text_tokens[0]))

[(0, 2), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 125), (10, 46), (11, 3), (12, 12), (13, 49), (14, 34), (15, 5), (16, 13), (17, 2), (18, 60), (19, 2), (20, 11), (21, 1), (22, 4), (23, 3), (24, 2), (25, 441), (26, 3178), (27, 4), (28, 3), (29, 3), (30, 110), (31, 70), (32, 20), (33, 400), (34, 1), (35, 2), (36, 2), (37, 1), (38, 1), (39, 9), (40, 1153), (41, 108), (42, 1), (43, 126), (44, 12), (45, 185), (46, 15), (47, 4), (48, 2), (49, 50), (50, 1), (51, 56), (52, 3), (53, 3), (54, 4), (55, 89), (56, 3), (57, 3), (58, 3), (59, 2), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 3), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 2), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 2), (99, 1), (100, 1), (101, 1), (102, 2), (103, 1), (104, 1), (105, 1), (106, 2), (107,

In [15]:
# create a tf-idf model from the corpus
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

# tf-idf 모델에 따라 문서에 사용된 단어들의 값이 산출됨
tf_idf[corpus[0]]

TfidfModel(num_docs=18, num_nnz=86391)


[(0, 0.0009857032771977616),
 (1, 0.0005195780566590131),
 (2, 0.0007950422372556407),
 (3, 0.0007950422372556407),
 (4, 0.0007950422372556407),
 (5, 0.0007950422372556407),
 (6, 0.0015900844745112815),
 (7, 0.0007950422372556407),
 (8, 0.0007950422372556407),
 (10, 0.001490311564409252),
 (11, 0.0001504512400704869),
 (12, 0.00018866785461690386),
 (14, 0.0005345589214145609),
 (15, 0.00025075206678414487),
 (17, 0.0006043811973135201),
 (18, 0.0009433392730845193),
 (19, 0.0004461182348585569),
 (20, 0.0028576793116245726),
 (21, 0.0007950422372556407),
 (22, 0.0014093640480543558),
 (23, 9.719423246147295e-05),
 (24, 0.0002709268144751991),
 (27, 0.00035805020940692735),
 (28, 0.0004063902217127986),
 (29, 0.0006691773522878354),
 (32, 0.0003144464243615064),
 (34, 0.00035234101201358896),
 (35, 0.0006043811973135201),
 (36, 0.0003813220798842416),
 (37, 0.00025978902832950656),
 (38, 0.00035234101201358896),
 (39, 0.0012191706651383959),
 (40, 0.18641700779840284),
 (42, 0.00041372

In [16]:
# a similarity measure object in tf-idf space
sims = gensim.similarities.Similarity('', tf_idf[corpus], num_features=len(dictionary))
print(type(sims))

# Each shard is stored to disk under output_prefix.shard_number.
# If you don’t specify an output prefix, a random filename in temp will be used.

<class 'gensim.similarities.docsim.Similarity'>


In [30]:
print(sims[tf_idf[corpus[4]]])

[0.00940046 0.01223217 0.00904714 0.0654783  0.99999994 0.04898324
 0.01037068 0.0084695  0.01106989 0.0309748  0.01021332 0.03754831
 0.04145316 0.22050872 0.0223679  0.02730013 0.02785867 0.16470636]


In [17]:
# create a query document and convert it to tf-idf

# query document을 위의 방법에 따라 tokenize 하기
query_doc = [w.lower() for w in tokenize("Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence.")]
print(query_doc)

['emma', 'woodhous', 'handsom', 'clever', 'and', 'rich', 'with', 'a', 'comfort', 'home', 'and', 'happi', 'disposit', 'seem', 'to', 'unit', 'some', 'of', 'the', 'best', 'bless', 'of', 'exist']


In [18]:
# query document를 위 코퍼스의 인덱스에 따라 bag of words로 변환
query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)

[(226, 1), (395, 2), (615, 1), (643, 1), (884, 1), (926, 1), (1336, 1), (1493, 1), (1613, 1), (2019, 1), (2026, 1), (2129, 1), (2900, 2), (3501, 1), (3638, 1), (3825, 1), (4101, 1), (4154, 1), (4335, 1), (4601, 1), (4614, 1)]


In [19]:
# query document를 위 tf-idf 모델에 따라 표현
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

[(643, 0.030591767869065194), (884, 0.152665730756644), (1336, 0.152665730756644), (1493, 0.5706847667109823), (1613, 0.152665730756644), (2019, 0.12791083392549962), (3501, 0.014845745129688415), (4335, 0.152665730756644), (4614, 0.7507157663249546)]


In [28]:
# document similarities to query
print(sims[query_doc_tf_idf])

[4.9028519e-01 3.4097787e-03 2.3907379e-03 1.2988232e-04 1.2709189e-03
 1.3461430e-03 1.4980003e-04 4.3051751e-04 1.8733502e-03 3.0567136e-03
 7.2922016e-04 2.7870864e-03 1.2974076e-03 2.0353233e-03 1.8961601e-06
 6.0734787e-04 1.8355380e-04 4.4891909e-03]
