In [1]:
import nltk
# Fetch the NLTK data packages this notebook relies on. The log output shows
# that a package which is already installed is reported as up-to-date.
# The value of the last call is what the cell displays as its result.
nltk.download('reuters')  # Reuters corpus, read later through nltk.corpus.reuters
nltk.download('punkt')  # Punkt tokenizer data; presumably needed for sentence splitting - TODO confirm

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Hyeon\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hyeon\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [3]:
# Step 1 : Bag of words
from nltk.corpus import reuters
from collections import Counter, defaultdict

# Count how often every token occurs in the whole Reuters corpus.
counts = Counter(reuters.words())

# The corpus size equals the sum of all token counts, so derive it from the
# counter instead of walking the corpus a second time with len(reuters.words()).
total_count = sum(counts.values())

# The 20 most frequent tokens (raw counts)
print(counts.most_common(n=20))

# Convert the raw counts into relative frequencies in place; later cells read
# `counts` as a unigram probability table.
for word in counts:
    counts[word] /= total_count

# Sanity check: the relative frequencies should add up to ~1.0
# (up to floating-point rounding).
print(sum(counts.values()))

[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037), ('vs', 14120), ('-', 13705), ('for', 12785), ('dlrs', 11730), ("'", 11272), ('The', 10968), ('000', 10277), ('1', 9977), ('s', 9298), ('pct', 9093)]
1.0000000000006808


In [4]:
import random
# Generate 100 words by drawing each one independently from the unigram
# distribution stored in `counts`. Word order is not modelled at all, so the
# result is a random bag of words rather than a sentence.
#
# random.choices performs the weighted draw (cumulative weights + bisection),
# replacing the manual "accumulate frequencies until they reach a random
# threshold" scan. Unlike that scan it cannot skip a draw because of
# floating-point rounding, so exactly 100 words are always produced.
vocabulary = list(counts)
text = random.choices(vocabulary, weights=[counts[w] for w in vocabulary], k=100)

print(' '.join(text))

, immune foreign noted investing interests central apparent . 800 from the industry CORP 126 Ltd Corp a which thirds , said foreign FOR stand 7 U , meeting kilo said . highly Japan -- reported seasonally probably . dlrs O the have guilders of other >, ultimately , informing 1987 GM > a , 95 of its . . . and gains loss QTR is can 3 23 QTR . Financial A new Total bank will April when will . . an valued , would , the dlrs Six Travel 9 international PAK plus unless shifting > . .


In [5]:
# Probability of the generated text: the product of the relative frequencies
# of its words.
from operator import mul
from functools import reduce

# Look up each word's value in `counts` lazily, then fold the values together
# by multiplication, starting from 1.0.
word_probabilities = (counts[w] for w in text)
text_probability = reduce(mul, word_probabilities, 1.0)
print(text_probability)

1.57588941009163e-293


In [6]:
# Step 2 : Bi-gram & Tri-gram
from nltk import bigrams, trigrams

# Use the first tokenized sentence of the corpus as a small example input
# for the n-gram helpers.
corpus_sentences = reuters.sents()
first_sentence = corpus_sentences[0]
print(first_sentence)

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']


In [7]:
### Inspect the bi-grams of the example sentence
# Each entry pairs the banner to print with the keyword arguments passed to
# bigrams(): first the plain bi-grams, then the variant padded on both sides.
bigram_views = [
    ('\n----- Bi-gram 결과 확인 -----', {}),
    ('\n ----- 패딩된 Bi-gram 결과 확인 -----', {'pad_left': True, 'pad_right': True}),
]

for banner, pad_options in bigram_views:
    print(banner)
    print(list(bigrams(first_sentence, **pad_options)))


----- Bi-gram 결과 확인 -----
[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), ('that', 'the'), ('the', 'row'), ('row', 'could'), ('could', 'inflict'), ('inflict', 'far'), ('far', '-'), ('-', 'reaching'), ('reaching', 'economic'), ('economic', 'damage'), ('damage', ','), (',', 'businessmen'), ('businessmen', 'and'), ('and', 'officials'), ('officials', 'said'), ('said', '.')]

 ----- 패딩된 Bi-gram 결과 확인 -----
[(None, 'ASIAN'), ('ASIAN', 'EXPORTERS'), ('EXPO

In [9]:
# Inspect the tri-grams of the example sentence
# Each entry pairs the banner to print with the keyword arguments passed to
# trigrams(): first the plain tri-grams, then the variant padded on both sides.
trigram_views = [
    ('\n ----- Tri-gram 결과 확인 -----', {}),
    ('\n ----- 패딩된 Tri-gram 결과 확인 -----', {'pad_left': True, 'pad_right': True}),
]

for banner, pad_options in trigram_views:
    print(banner)
    print(list(trigrams(first_sentence, **pad_options)))


 ----- Tri-gram 결과 확인 -----
[('ASIAN', 'EXPORTERS', 'FEAR'), ('EXPORTERS', 'FEAR', 'DAMAGE'), ('FEAR', 'DAMAGE', 'FROM'), ('DAMAGE', 'FROM', 'U'), ('FROM', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.-'), ('S', '.-', 'JAPAN'), ('.-', 'JAPAN', 'RIFT'), ('JAPAN', 'RIFT', 'Mounting'), ('RIFT', 'Mounting', 'trade'), ('Mounting', 'trade', 'friction'), ('trade', 'friction', 'between'), ('friction', 'between', 'the'), ('between', 'the', 'U'), ('the', 'U', '.'), ('U', '.', 'S'), ('.', 'S', '.'), ('S', '.', 'And'), ('.', 'And', 'Japan'), ('And', 'Japan', 'has'), ('Japan', 'has', 'raised'), ('has', 'raised', 'fears'), ('raised', 'fears', 'among'), ('fears', 'among', 'many'), ('among', 'many', 'of'), ('many', 'of', 'Asia'), ('of', 'Asia', "'"), ('Asia', "'", 's'), ("'", 's', 'exporting'), ('s', 'exporting', 'nations'), ('exporting', 'nations', 'that'), ('nations', 'that', 'the'), ('that', 'the', 'row'), ('the', 'row', 'could'), ('row', 'could', 'inflict'), ('could', 'inflict', 'far'), ('inflict', '

In [11]:
### Build a tri-gram model from the Reuters corpus
print('\n ----- Tri-gram 모델 생성 -----')

# model[(w1, w2)][w3] is the number of times w3 follows the pair (w1, w2).
# Both levels are defaultdicts, so an unseen entry reads as 0.
model = defaultdict(lambda: defaultdict(int))

for sentence in reuters.sents():
    # Pad both ends so that sentence starts and ends are counted as well.
    padded_trigrams = trigrams(sentence, pad_left=True, pad_right=True)
    for trigram in padded_trigrams:
        context, following = trigram[:2], trigram[2]
        model[context][following] += 1

# Spot check: how often "economists" follows "what the".
print(model['what', 'the']['economists'])


 ----- Tri-gram 모델 생성 -----
2


In [12]:
# Turn the raw counts into probabilities: within each two-word context,
# divide every follower count by the total count of that context.
for followers in model.values():
    total_count = float(sum(followers.values()))

    for w3 in followers:
        followers[w3] /= total_count

# Spot check: the share of "economists" among the words following "what the".
print(model['what', 'the']['economists'])

0.043478260869565216


In [16]:
### Generate text with the tri-gram language model
print('\n ----- 언어 모델을 이용해 텍스트 생성하기 -----')
import random

# Start from the left padding (None, None): the context in front of the first
# word of a sentence.
text = [None, None]
# Running product of the conditional probabilities of the sampled tokens.
prob = 1.0

sentence_finished = False

while not sentence_finished:
    # Distribution over the next token given the last two tokens. .get() is
    # used so that an unseen context does not insert an empty entry into the
    # defaultdict.
    followers = model.get(tuple(text[-2:]))
    if not followers:
        # No known continuation for this context: stop instead of spinning
        # forever without appending anything.
        break

    # Weighted draw of the next token. random.choices always returns a token,
    # so the end-of-sentence test below can never fire on the untouched
    # initial [None, None] padding.
    candidates = list(followers)
    word = random.choices(candidates, weights=list(followers.values()))[0]

    prob *= followers[word]
    text.append(word)

    # Two consecutive None tokens are the right padding, i.e. the sentence end.
    sentence_finished = text[-2:] == [None, None]

print('텍스트의 확률 : ', prob)
# Drop only the None padding when rendering the sentence.
print('문장 : ', ' '.join([t for t in text if t is not None]))
print('tri-grams : ', text)


 ----- 언어 모델을 이용해 텍스트 생성하기 -----
텍스트의 확률 :  1.51093304494128e-12
문장 :  Bilzerian said he fears that the financial futures room , Sladoje said .
tri-grams :  [None, None, 'Bilzerian', 'said', 'he', 'fears', 'that', 'the', 'financial', 'futures', 'room', ',', 'Sladoje', 'said', '.', None, None]
