<a href="https://colab.research.google.com/github/Jisang-hwang93/NLP_Class/blob/master/10%20Naive%20Bayes%20Classifier%20Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Document Classification**

##**1.Naive Bayes Classifier 구현**
**스팸 메일 필터링**

### **1-1. 데이터 전처리**

In [1]:
# 메일과 메일의 종류 데이터
mail = ['me free lottery',
        'free get free you',
        'you free scholarship',
        'free to contact me',
        'you won award',
        'you ticket lottery']

mail_type = ["spam", "spam", "normal", "normal", "normal", "spam"]

In [2]:
# 각 문장 토큰화
lines = []
tokens = []

for i in range(len(mail)):
    lines.append(mail[i].split(" "))
    # 단어 뭉치 생성
    for word in lines[i]:
        tokens.append(word)
tokens = list(set(tokens))

lines, tokens

([['me', 'free', 'lottery'],
  ['free', 'get', 'free', 'you'],
  ['you', 'free', 'scholarship'],
  ['free', 'to', 'contact', 'me'],
  ['you', 'won', 'award'],
  ['you', 'ticket', 'lottery']],
 ['award',
  'scholarship',
  'won',
  'lottery',
  'to',
  'me',
  'you',
  'contact',
  'get',
  'ticket',
  'free'])

In [3]:
# 메일 분류 확인
import pandas as pd

mail_classify = {"메일" : lines, "분류": mail_type}
df = pd.DataFrame(mail_classify)

df

Unnamed: 0,메일,분류
0,"[me, free, lottery]",spam
1,"[free, get, free, you]",spam
2,"[you, free, scholarship]",normal
3,"[free, to, contact, me]",normal
4,"[you, won, award]",normal
5,"[you, ticket, lottery]",spam


In [4]:
# spam, normal 메일 개수
num_spam = 0
num_norm = 0

for label in mail_classify["분류"]:
    if label == "spam":
        num_spam += 1
    if label == "normal":
        num_norm += 1

# 총 메일 개수
num_total = num_spam + num_norm

# 사전 확률 계산
spam_prior = num_spam / num_total
norm_prior = num_norm / num_total

spam_prior, norm_prior

(0.5, 0.5)

In [5]:
import numpy as np

spam_word = []
norm_word = []
total_spam = 0
total_norm = 0

for i in range(len(tokens)):
    count_spam = 0
    count_norm = 0
    for j in range(len(lines)):
        if mail_classify["분류"][j] == "spam":
            if tokens[i] in lines[j]:
                count_spam += lines[j].count(tokens[i])
        if mail_classify["분류"][j] == "normal":
            if tokens[i] in lines[j]:
                count_norm += lines[j].count(tokens[i])

    spam_word.append(count_spam)
    norm_word.append(count_norm)
    total_spam += count_spam
    total_norm += count_norm

total_spam, total_norm

(10, 10)

### **1-2. Laplace Smoothing**

In [21]:
# Laplace Smoothing
laplace_spam = []
laplace_norm = []

for i in range(len(tokens)):
    laplace_spam.append((spam_prior+spam_word[i])/(2*spam_prior+total_spam)*100)

for i in range(len(tokens)):
    laplace_norm.append((norm_prior+norm_word[i])/(2*norm_prior+total_norm)*100)

In [22]:
word_table = np.array([spam_word, norm_word, laplace_spam, laplace_norm])

df = pd.DataFrame(word_table.T, index=tokens, columns=["spam", "normal", "P(w|spam)", "P(w|normal)"])
df.sort_index(axis=0)

Unnamed: 0,spam,normal,P(w|spam),P(w|normal)
award,0.0,1.0,4.545455,13.636364
contact,0.0,1.0,4.545455,13.636364
free,3.0,2.0,31.818182,22.727273
get,1.0,0.0,13.636364,4.545455
lottery,2.0,0.0,22.727273,4.545455
me,1.0,1.0,13.636364,13.636364
scholarship,0.0,1.0,4.545455,13.636364
ticket,1.0,0.0,13.636364,4.545455
to,0.0,1.0,4.545455,13.636364
won,0.0,1.0,4.545455,13.636364


### **1-3. Log 이용**
**Log의 성질을 활용. 곱셈을 덧셈으로 변환해 Underflow를 방지함**

In [28]:
# 로그 이용 언더 플로우
log_spam = []
log_norm = []

for i in range(len(tokens)):
    log_spam.append(np.log(laplace_spam[i]/100))
    log_norm.append(np.log(laplace_norm[i]/100))

In [31]:
word_table = np.array([spam_word, norm_word, laplace_spam, laplace_norm, log_spam, log_norm])

df = pd.DataFrame(word_table.T, index=tokens, columns=["spam", "normal", "P(w|spam)", "P(w|normal)", "Log(P(w|spam))", "Log(P(w|normal))"])
df.sort_index(axis=0)

Unnamed: 0,spam,normal,P(w|spam),P(w|normal),Log(P(w|spam)),Log(P(w|normal))
award,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243
contact,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243
free,3.0,2.0,31.818182,22.727273,-1.145132,-1.481605
get,1.0,0.0,13.636364,4.545455,-1.99243,-3.091042
lottery,2.0,0.0,22.727273,4.545455,-1.481605,-3.091042
me,1.0,1.0,13.636364,13.636364,-1.99243,-1.99243
scholarship,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243
ticket,1.0,0.0,13.636364,4.545455,-1.99243,-3.091042
to,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243
won,0.0,1.0,4.545455,13.636364,-3.091042,-1.99243


### **1-4. 스팸 확률 구하기**

In [62]:
# 스팸 필터링 : 입력값
check_list = "free lottery"

check_token = []

check_token.append(check_list.split(" "))

check_token

[['free', 'lottery']]

In [67]:
# 입력 토큰 로그들의 합
import math

spam_filter = 0
norm_filter = 0

for i in range(len(check_token[0])):
    spam_filter += df['Log(P(w|spam))'][check_token[0][i]]
    norm_filter += df['Log(P(w|normal))'][check_token[0][i]]

spam_filter, norm_filter

(-2.626736845227218, -4.572646994282531)

In [68]:
# 입력 토큰 로그합과 사전확률 로그의 합
spam_filter = math.exp(spam_filter + np.log(spam_prior))
norm_filter = math.exp(norm_filter + np.log(norm_prior))

spam_filter, norm_filter

(0.03615702479338842, 0.00516528925619835)

In [69]:
# 스팸/정상 메일일 확률
spam_prob = spam_filter / (spam_filter + norm_filter)
norm_prob = norm_filter / (spam_filter + norm_filter)

spam_prob, norm_prob

(0.8749999999999999, 0.12500000000000008)

### **1-5. 최종 결과**

In [82]:
print("{}라는 토큰이 있는 메일이 스팸일 확률 : {:.2f}%".format(check_list, spam_prob*100))
print("{}라는 토큰이 있는 메일이 정상일 확률 : {:.2f}%".format(check_list, norm_prob*100))

free lottery라는 토큰이 있는 메일이 스팸일 확률 : 87.50%
free lottery라는 토큰이 있는 메일이 정상일 확률 : 12.50%
