# Naive Bayes Classifier

* 베이즈 정리를 이용한 가장 간단한 supervised classifier.
* 모든 feature의 결합확률분포를 구하기가 힘든것을 감안하여, 모든 feature가 서로 독립임을 가정하여 사용함.(그래서 naive 라는 말이 붙음)
* 간단한것에 비해 성능은 좋지만, 한계도 많다.
* 실제 데이터는 대부분 feature 사이에 correlation이 존재하기 때문에, 실제로는 많이 사용하지 못한다. 예전에는 sentiment analysis나 spam filtering에 많이 사용했었다.

![토픽 모델링의 예](figs/example1.png)

![토픽 모델링의 예](figs/example2.png)

## English Dataset

In [1]:
from nltk.tokenize import word_tokenize
import nltk

In [2]:
train = [('i like you', 'pos'), 
         ('i hate you', 'neg'), 
         ('you like me', 'neg'),
         ('i like her', 'pos')]

In [17]:
all_words = set(word.lower() for sentence in train for word in word_tokenize(sentence[0]))
all_words

{'hate', 'her', 'i', 'like', 'me', 'you'}

In [4]:
t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]
t

[({'i': True,
   'me': False,
   'her': False,
   'hate': False,
   'like': True,
   'you': True},
  'pos'),
 ({'i': True,
   'me': False,
   'her': False,
   'hate': True,
   'like': False,
   'you': True},
  'neg'),
 ({'i': False,
   'me': True,
   'her': False,
   'hate': False,
   'like': True,
   'you': True},
  'neg'),
 ({'i': True,
   'me': False,
   'her': True,
   'hate': False,
   'like': True,
   'you': False},
  'pos')]

In [5]:
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()

Most Informative Features
                    like = True              pos : neg    =      1.7 : 1.0
                    hate = False             pos : neg    =      1.7 : 1.0
                     her = False             neg : pos    =      1.7 : 1.0
                      me = False             pos : neg    =      1.7 : 1.0
                       i = True              pos : neg    =      1.7 : 1.0
                     you = True              neg : pos    =      1.7 : 1.0


In [6]:
test_sentence = 'i like MeRui'
test_sent_features = {word.lower(): (word in word_tokenize(test_sentence.lower())) for word in all_words}
test_sent_features

{'i': True,
 'me': False,
 'her': False,
 'hate': False,
 'like': True,
 'you': False}

In [7]:
classifier.classify(test_sent_features)

'pos'

## Korean Dataset

In [8]:
from konlpy.tag import Twitter

In [9]:
pos_tagger = Twitter()

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [10]:
train = [('메리가 좋아', 'pos'), 
         ('고양이도 좋아', 'pos'),
         ('난 수업이 지루해', 'neg'),
         ('메리는 이쁜 고양이야', 'pos'),
         ('난 마치고 메리랑 놀거야', 'pos')]

In [11]:
all_words = set(word.lower() for sentence in train for word in word_tokenize(sentence[0]))
all_words

{'고양이도',
 '고양이야',
 '난',
 '놀거야',
 '마치고',
 '메리가',
 '메리는',
 '메리랑',
 '수업이',
 '이쁜',
 '좋아',
 '지루해'}

In [12]:
t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]
t

[({'메리가': True,
   '난': False,
   '마치고': False,
   '놀거야': False,
   '수업이': False,
   '메리랑': False,
   '지루해': False,
   '메리는': False,
   '좋아': True,
   '고양이야': False,
   '고양이도': False,
   '이쁜': False},
  'pos'),
 ({'메리가': False,
   '난': False,
   '마치고': False,
   '놀거야': False,
   '수업이': False,
   '메리랑': False,
   '지루해': False,
   '메리는': False,
   '좋아': True,
   '고양이야': False,
   '고양이도': True,
   '이쁜': False},
  'pos'),
 ({'메리가': False,
   '난': True,
   '마치고': False,
   '놀거야': False,
   '수업이': True,
   '메리랑': False,
   '지루해': True,
   '메리는': False,
   '좋아': False,
   '고양이야': False,
   '고양이도': False,
   '이쁜': False},
  'neg'),
 ({'메리가': False,
   '난': False,
   '마치고': False,
   '놀거야': False,
   '수업이': False,
   '메리랑': False,
   '지루해': False,
   '메리는': True,
   '좋아': False,
   '고양이야': True,
   '고양이도': False,
   '이쁜': True},
  'pos'),
 ({'메리가': False,
   '난': True,
   '마치고': True,
   '놀거야': True,
   '수업이': False,
   '메리랑': True,
   '지루해': False,
   '메리는': False,
   '좋아': False,
   '고양이야': F

In [13]:
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()

Most Informative Features
                       난 = True              neg : pos    =      2.5 : 1.0
                      좋아 = False             neg : pos    =      1.5 : 1.0
                     마치고 = False             neg : pos    =      1.1 : 1.0
                     메리는 = False             neg : pos    =      1.1 : 1.0
                     놀거야 = False             neg : pos    =      1.1 : 1.0
                    고양이도 = False             neg : pos    =      1.1 : 1.0
                     메리가 = False             neg : pos    =      1.1 : 1.0
                     메리랑 = False             neg : pos    =      1.1 : 1.0
                      이쁜 = False             neg : pos    =      1.1 : 1.0
                    고양이야 = False             neg : pos    =      1.1 : 1.0


In [14]:
test_sentence = '난 수업이 마치면 메리랑 놀거야'

In [15]:
test_sent_features = {word.lower(): (word in word_tokenize(test_sentence.lower())) for word in all_words}
test_sent_features

{'메리가': False,
 '난': True,
 '마치고': False,
 '놀거야': True,
 '수업이': True,
 '메리랑': True,
 '지루해': False,
 '메리는': False,
 '좋아': False,
 '고양이야': False,
 '고양이도': False,
 '이쁜': False}

In [16]:
classifier.classify(test_sent_features)

'neg'

In [17]:
def tokenize(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]

In [18]:
train_docs = [(tokenize(row[0]), row[1]) for row in train]
train_docs

[(['메리/Noun', '가/Josa', '좋다/Adjective'], 'pos'),
 (['고양이/Noun', '도/Josa', '좋다/Adjective'], 'pos'),
 (['난/Noun', '수업/Noun', '이/Josa', '지루하다/Adjective'], 'neg'),
 (['메리/Noun', '는/Josa', '이쁘다/Adjective', '고양이/Noun', '야/Josa'], 'pos'),
 (['난/Noun', '마치/Noun', '고/Josa', '메리/Noun', '랑/Josa', '놀다/Verb'], 'pos')]

In [19]:
tokens = [t for d in train_docs for t in d[0]]
tokens

['메리/Noun',
 '가/Josa',
 '좋다/Adjective',
 '고양이/Noun',
 '도/Josa',
 '좋다/Adjective',
 '난/Noun',
 '수업/Noun',
 '이/Josa',
 '지루하다/Adjective',
 '메리/Noun',
 '는/Josa',
 '이쁘다/Adjective',
 '고양이/Noun',
 '야/Josa',
 '난/Noun',
 '마치/Noun',
 '고/Josa',
 '메리/Noun',
 '랑/Josa',
 '놀다/Verb']

In [20]:
def term_exists(doc):
    return {word: (word in set(doc)) for word in tokens}

In [21]:
train_xy = [(term_exists(d), c) for d,c in train_docs]
train_xy

[({'메리/Noun': True,
   '가/Josa': True,
   '좋다/Adjective': True,
   '고양이/Noun': False,
   '도/Josa': False,
   '난/Noun': False,
   '수업/Noun': False,
   '이/Josa': False,
   '지루하다/Adjective': False,
   '는/Josa': False,
   '이쁘다/Adjective': False,
   '야/Josa': False,
   '마치/Noun': False,
   '고/Josa': False,
   '랑/Josa': False,
   '놀다/Verb': False},
  'pos'),
 ({'메리/Noun': False,
   '가/Josa': False,
   '좋다/Adjective': True,
   '고양이/Noun': True,
   '도/Josa': True,
   '난/Noun': False,
   '수업/Noun': False,
   '이/Josa': False,
   '지루하다/Adjective': False,
   '는/Josa': False,
   '이쁘다/Adjective': False,
   '야/Josa': False,
   '마치/Noun': False,
   '고/Josa': False,
   '랑/Josa': False,
   '놀다/Verb': False},
  'pos'),
 ({'메리/Noun': False,
   '가/Josa': False,
   '좋다/Adjective': False,
   '고양이/Noun': False,
   '도/Josa': False,
   '난/Noun': True,
   '수업/Noun': True,
   '이/Josa': True,
   '지루하다/Adjective': True,
   '는/Josa': False,
   '이쁘다/Adjective': False,
   '야/Josa': False,
   '마치/Noun': False,
   '고/Jo

In [22]:
classifier = nltk.NaiveBayesClassifier.train(train_xy)

In [23]:
test_sentence = [("난 수업이 마치면 메리랑 놀거야")]

In [24]:
test_docs = pos_tagger.pos(test_sentence[0])
test_docs

[('난', 'Noun'),
 ('수업', 'Noun'),
 ('이', 'Josa'),
 ('마치', 'Noun'),
 ('면', 'Josa'),
 ('메리', 'Noun'),
 ('랑', 'Josa'),
 ('놀거야', 'Verb')]

In [25]:
classifier.show_most_informative_features()

Most Informative Features
                  난/Noun = True              neg : pos    =      2.5 : 1.0
                 메리/Noun = False             neg : pos    =      2.5 : 1.0
            좋다/Adjective = False             neg : pos    =      1.5 : 1.0
                고양이/Noun = False             neg : pos    =      1.5 : 1.0
                  도/Josa = False             neg : pos    =      1.1 : 1.0
                  는/Josa = False             neg : pos    =      1.1 : 1.0
                  고/Josa = False             neg : pos    =      1.1 : 1.0
                 놀다/Verb = False             neg : pos    =      1.1 : 1.0
                  가/Josa = False             neg : pos    =      1.1 : 1.0
                 마치/Noun = False             neg : pos    =      1.1 : 1.0


In [26]:
test_sent_features = {word: (word in tokens) for word in test_docs}
test_sent_features

{('난', 'Noun'): False,
 ('수업', 'Noun'): False,
 ('이', 'Josa'): False,
 ('마치', 'Noun'): False,
 ('면', 'Josa'): False,
 ('메리', 'Noun'): False,
 ('랑', 'Josa'): False,
 ('놀거야', 'Verb'): False}

In [27]:
classifier.classify(test_sent_features)

'pos'

In [32]:
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset="all")
X = news.data
y = news.target

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model1 = Pipeline([
    ('vect', CountVectorizer()),
    ('model', MultinomialNB()),
])
model2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('model', MultinomialNB()),
])
model3 = Pipeline([
    ('vect', TfidfVectorizer(stop_words="english")),
    ('model', MultinomialNB()),
])
model4 = Pipeline([
    ('vect', TfidfVectorizer(stop_words="english",
                             token_pattern=r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b")),
    ('model', MultinomialNB()),
])

In [34]:
%%time
from sklearn.model_selection import cross_val_score, KFold

for i, model in enumerate([model1, model2, model3, model4]):
    scores = cross_val_score(model, X, y, cv=5)
    print(("Model{0:d}: Mean score: {1:.3f}").format(i + 1, np.mean(scores)))

Model1: Mean score: 0.855
Model2: Mean score: 0.856
Model3: Mean score: 0.883
Model4: Mean score: 0.888
CPU times: user 1min 26s, sys: 2.95 s, total: 1min 29s
Wall time: 1min 23s
