In [1]:
from collections import defaultdict
import math

def _parse_tagged_sentence(text):
    """Split one ``word/tag word/tag ...`` string into a list of tuples.

    Tokens are separated by single spaces; empty tokens (produced by
    repeated spaces or by an empty/blank string) are dropped. Each token is
    split on its LAST ``/`` so that words which themselves contain ``/``
    keep it (``"1/2/NUM"`` -> ``("1/2", "NUM")``). A token with no ``/`` at
    all yields a 1-tuple ``(token,)``.
    """
    return [
        tuple(token.rsplit("/", 1))
        for token in text.strip().split(" ")
        if token  # skip the '' pieces that consecutive spaces / blank input create
    ]


def sent_processing(lines):
    """Convert raw ``word/tag`` text into ``(word, tag)`` tuples.

    Args:
        lines: either a ``list`` of strings (one sentence per element) or a
            single ``str`` holding one sentence.

    Returns:
        For a list: a list of sentences, each a list of ``(word, tag)``
        tuples. Elements that contain no tokens (blank lines) are skipped, so
        every returned sentence is non-empty.
        For a str: one list of ``(word, tag)`` tuples (empty for a blank
        string).

    Raises:
        TypeError: if ``lines`` is neither a list nor a str.
    """
    if isinstance(lines, list):
        sentences = (_parse_tagged_sentence(line) for line in lines)
        return [sent for sent in sentences if sent]

    if isinstance(lines, str):
        return _parse_tagged_sentence(lines)

    # Raise instead of print + exit(1): terminating the interpreter (or the
    # notebook kernel) from a helper hides the error from the caller.
    raise TypeError(
        "wrong type of input sentence: expected list or str, got %s"
        % type(lines).__name__
    )

    
# Load the raw tagged corpus; every element of `lines` is one line of the file.
with open("./data/corpus.txt", "r", encoding='utf-8') as f:
    lines = list(f)  # iterating a text file yields its lines, newlines kept

# Turn each raw line into a list of (word, tag) pairs.
corpus = sent_processing(lines)

In [2]:
# Preview only the first two sentences; displaying the whole corpus floods the cell output.
corpus[:2]

[[('"', 'g'),
  ('한식', 'CMC'),
  ('도시락', 'CMC'),
  ('프랜차이즈', 'CMC'),
  ('"', 'g'),
  ('본도', 'CMC'),
  ('이', 'fpd'),
  ('시', 'fmb'),
  ('락', 'fmoc'),
  ('"', 'g'),
  ('(', 'g'),
  ('대표', 'CMC'),
  ('김철호', 'CMP'),
  (')', 'g'),
  ('은', 'CMC'),
  ('본격', 'CMC'),
  ('적', 'fps'),
  ('이', 'fpd'),
  ('ㄴ', 'fmotg'),
  ('무더위', 'CMC'),
  ('의', 'fjcg'),
  ('시작', 'CMC'),
  ('을', 'fjco'),
  ('앞두', 'YBD'),
  ('고', 'fmoc'),
  ('"', 'g'),
  ('전복', 'CMC'),
  ('삼계', 'CMC'),
  ('밥', 'CMC'),
  ('도시락', 'CMC'),
  ('"', 'g'),
  ('을', 'fjco'),
  ('선물', 'CMC'),
  ('하', 'fpd'),
  ('는', 'fmotg'),
  ('온라인', 'CMC'),
  ('이벤트', 'CMC'),
  ('&lt;', 'g'),
  ('당신', 'CT'),
  ('의', 'fjcg'),
  ('마음', 'CMC'),
  ('을', 'fjco'),
  ('전하', 'YBD'),
  ('시', 'fmb'),
  ('어', 'fmof'),
  ('요', 'fjb'),
  ('!', 'g'),
  ('>', 'g'),
  ('를', 'fjb'),
  ('진행', 'CMC'),
  ('하', 'fpd'),
  ('ㄴ다', 'fmof'),
  ('.', 'g')],
 [('"', 'g'),
  ('나노', 'CMC'),
  ('세계', 'CMC'),
  ('에', 'fjcao'),
  ('나타나', 'YBD'),
  ('는', 'fmotg'),
  ('"', 'g'),
  ('마법', 'CM

In [5]:
def train(corpus):
    """Estimate HMM emission and transition log-probabilities by counting.

    Args:
        corpus: iterable of sentences, each a sequence of ``(word, tag)``
            pairs. Empty sentences are skipped.

    Returns:
        ``(pos2words_prob, trans_prob)`` where

        * ``pos2words_prob[tag][word]`` is ``log(count(word, tag) / count(tag))``
          (emission log-probability), and
        * ``trans_prob[(prev_tag, tag)]`` is
          ``log(count(prev_tag, tag) / count(prev_tag as first element))``
          (transition log-probability). Sentence boundaries are encoded with
          the pseudo-tags ``'BOS'`` and ``'EOS'``.

        Both are ``defaultdict`` objects whose default value is ``0``:
        indexing a missing key with ``[]`` inserts and returns 0 (a
        log-probability of 0). Use ``.get(key, fallback)`` to look up unseen
        events without modifying the tables.

    Side effects:
        Prints the per-tag token counts (``base``) followed by an empty line.
    """
    pos2words_freq = defaultdict(lambda: defaultdict(int))  # tag -> word -> count
    trans_freq = defaultdict(int)  # (prev_tag, tag) -> count

    # sent format: [(word, tag), (word, tag), ..., (word, tag)]
    for sent in corpus:
        if not sent:
            # Nothing to count; sent[0] / sent[-1] below would raise IndexError.
            continue

        tags = [pos for _, pos in sent]

        for word, pos in sent:
            pos2words_freq[pos][word] += 1

        # Adjacent tag pairs inside the sentence.
        for bigram in zip(tags, tags[1:]):
            trans_freq[bigram] += 1

        trans_freq[('BOS', tags[0])] += 1   # sentence-initial transition
        trans_freq[(tags[-1], 'EOS')] += 1  # sentence-final transition

    # --- Emission: p(x | y) = count(x, y) / count(y) ---
    # base[tag] = total number of tokens carrying that tag.
    base = {pos: sum(words.values()) for pos, words in pos2words_freq.items()}
    print("base : ", base)
    print("")

    pos2words_prob = defaultdict(lambda: defaultdict(int))
    for pos, words in pos2words_freq.items():
        tag_total = base[pos]
        for word, count in words.items():
            pos2words_prob[pos][word] = math.log(count / tag_total)

    # --- Transition: p(y_t | y_(t-1)) = count(y_(t-1), y_t) / count(y_(t-1)) ---
    # The denominator is how often the PREVIOUS tag starts a bigram (this
    # includes bigrams ending in 'EOS' and those starting with 'BOS').
    prev_tag_total = defaultdict(int)
    for (pos0, _), count in trans_freq.items():
        prev_tag_total[pos0] += count

    trans_prob = defaultdict(int)
    for (pos0, pos1), count in trans_freq.items():
        trans_prob[(pos0, pos1)] = math.log(count / prev_tag_total[pos0])

    return pos2words_prob, trans_prob

In [8]:
# Fit the emission / transition tables on the corpus; train() also prints the per-tag counts.
pos2words, trans = train(corpus)

print('명사 라면의 확률:', pos2words['CMC']['라면']) # log-probability of '라면' as a noun (CMC), e.g. Shin Ramyun, Jin Ramyun
print('연결어미 라면의 확률:', pos2words['fmoc']['라면']) # log-probability of '라면' as a connective ending (fmoc), "if it is ..."; the connective-ending value is the higher one

base :  {'g': 6936, 'CMC': 24862, 'fpd': 4131, 'fmb': 1553, 'fmoc': 4158, 'CMP': 1387, 'fps': 758, 'fmotg': 4024, 'fjcg': 1245, 'fjco': 2913, 'YBD': 4268, 'CT': 424, 'fmof': 3678, 'fjb': 2906, 'fjcao': 3596, 'CS': 1987, 'fjcaq': 98, 'YA': 1435, 'CMD': 2299, 'fjcc': 81, 'YBH': 2035, 'SBO': 1285, 'fjj': 331, 'fjcs': 1973, 'SG': 858, 'fph': 371, 'SBJ': 434, 'F': 1893, 'UM': 293, 'K': 15, 'fmotn': 402, 'fpp': 6, 'fmota': 14, 'fjcI': 2}

명사 라면의 확률: -9.427948631791715
연결어미 라면의 확률: -5.6937321388027


In [17]:
class HMM_tagger(object):
    """Scores an already-tagged sentence with a bigram hidden Markov model.

    Args:
        pos2words: mapping ``tag -> {word: emission log-probability}``.
        trans: mapping ``(prev_tag, tag) -> transition log-probability``;
            sentence boundaries use the pseudo-tags ``'BOS'`` / ``'EOS'``.
        unk: log-probability substituted for any emission or transition that
            is missing from the tables. Defaults to -15.
    """

    def __init__(self, pos2words, trans, unk=-15):
        self.pos2words = pos2words
        self.trans = trans
        self.unk = unk  # fallback log-probability for unseen events
        self.eos = 'EOS'
        self.bos = 'BOS'

    def sent_log_prob(self, sent):
        """Return the length-normalised log-probability of a tagged sentence.

        The score is the sum of every emission log-probability, every
        tag-bigram transition log-probability, the ``BOS -> first tag`` and
        ``last tag -> EOS`` transitions, divided by the number of tokens.
        Lookups use ``.get`` so unseen events fall back to ``self.unk``
        without inserting keys into the tables.

        Args:
            sent: non-empty sequence ``[(word, tag), ..., (word, tag)]``.

        Returns:
            float: the normalised log-probability.

        Raises:
            ValueError: if ``sent`` is empty (there is no first/last tag and
                the length normalisation would divide by zero).
        """
        if len(sent) == 0:
            raise ValueError("cannot score an empty sentence")

        # Emission terms: log p(word | tag), or unk when the pair was never seen.
        log_prob = sum(
            self.pos2words.get(tag, {}).get(word, self.unk)
            for word, tag in sent
        )

        # Transition terms between consecutive tags inside the sentence.
        tags = [tag for _, tag in sent]
        log_prob += sum(
            self.trans.get(bigram, self.unk)
            for bigram in zip(tags, tags[1:])
        )

        # Sentence-boundary transitions.
        log_prob += self.trans.get((self.bos, tags[0]), self.unk)
        log_prob += self.trans.get((tags[-1], self.eos), self.unk)

        # Length normalisation so sentences of different lengths are comparable.
        log_prob /= len(sent)

        return log_prob

In [18]:
# Score two competing analyses of the same surface string and print each
# with its length-normalised log-probability.
tagger = HMM_tagger(pos2words, trans)
test_sent1 = "감기/CMC 는/fjb 줄이/YBD 다/fmof ./g"
test_sent2 = "감기/fmotg 는/fjb 줄/CMC 이다/fjj ./g"
for tagged_sentence in (test_sent1, test_sent2):
    sentence_score = tagger.sent_log_prob(sent_processing(tagged_sentence))
    print("%s: %f" % (tagged_sentence, sentence_score))

감기/CMC 는/fjb 줄이/YBD 다/fmof ./g: -5.489636
감기/fmotg 는/fjb 줄/CMC 이다/fjj ./g: -14.037157
