<a href="https://colab.research.google.com/github/Jisang-hwang93/NLP_Class/blob/master/10%20Naive%20Bayes%20Multi%20Classifier%20Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Naive Bayes Calssifier를 이용한 다중 문항 분류**

## **1. Naive Bayes Classifier 다중 분류 구현**
**메일 필터링**

### **1-1. 데이터 전처리**

In [1]:
# 메일과 메일의 종류 데이터
mail = ['me free lottery',
        'free get free you',
        'you free scholarship',
        'free to contact me',
        'you won award',
        'you ticket lottery']

mail_type = ["spam", "spam", "normal", "normal", "normal", "spam"]

In [2]:
# 각 문장 토큰화
lines = []
tokens = []

for i in range(len(mail)):
    lines.append(mail[i].split(" "))
    # 단어 뭉치 생성
    for word in lines[i]:
        tokens.append(word)
tokens = list(set(tokens))

# 메일의 종류
types = list(set(mail_type))

lines, tokens, types

([['me', 'free', 'lottery'],
  ['free', 'get', 'free', 'you'],
  ['you', 'free', 'scholarship'],
  ['free', 'to', 'contact', 'me'],
  ['you', 'won', 'award'],
  ['you', 'ticket', 'lottery']],
 ['me',
  'award',
  'get',
  'contact',
  'ticket',
  'won',
  'you',
  'free',
  'lottery',
  'scholarship',
  'to'],
 ['spam', 'normal'])

In [3]:
# 메일 분류 확인
import pandas as pd

mail_classify = {"메일" : lines, "분류" : mail_type}
df = pd.DataFrame(mail_classify)

df

Unnamed: 0,메일,분류
0,"[me, free, lottery]",spam
1,"[free, get, free, you]",spam
2,"[you, free, scholarship]",normal
3,"[free, to, contact, me]",normal
4,"[you, won, award]",normal
5,"[you, ticket, lottery]",spam


### **1-2. 토큰별 분류**

In [4]:
# 토큰별 분류 카운트
import numpy as np

type_matrix = []

for tokens_idx in range(len(tokens)):
    type_count = [0 for a in range(len(types))]
    for type_idx in range(len(types)):
        for line_idx, label in enumerate(mail_classify["분류"]):
            if label == types[type_idx]:
                if tokens[tokens_idx] in lines[line_idx]:
                    type_count[type_idx] += lines[line_idx].count(tokens[tokens_idx])
    type_matrix.append(type_count)

matrix_table = pd.DataFrame(type_matrix, index=tokens, columns=types)
matrix_table

Unnamed: 0,spam,normal
me,1,1
award,0,1
get,1,0
contact,0,1
ticket,1,0
won,0,1
you,2,2
free,3,2
lottery,2,0
scholarship,0,1


### **1-3. 토큰별 확률 계산**

In [5]:
# 사전확률 계산
prior_prob = []

for type_idx in types:
    count_type = 0
    for i in range(len(lines)):
        if type_idx == mail_classify['분류'][i]:
            count_type += 1

    prior_prob.append(count_type/len(lines))

prior_prob, types

([0.5, 0.5], ['spam', 'normal'])

In [6]:
# Laplace Smoothing
laplace = []
k = 0.5

for i in range(len(tokens)):
    laplace_tmp = []
    for j in range(len(types)):
        laplace_tmp.append((k+type_matrix[i][j])/(2*k+matrix_table[types[j]].sum())*100)
    laplace.append(laplace_tmp)

laplace_type = ["P(w|{})".format(i) for i in types]
laplace_type, laplace

(['P(w|spam)', 'P(w|normal)'],
 [[13.636363636363635, 13.636363636363635],
  [4.545454545454546, 13.636363636363635],
  [13.636363636363635, 4.545454545454546],
  [4.545454545454546, 13.636363636363635],
  [13.636363636363635, 4.545454545454546],
  [4.545454545454546, 13.636363636363635],
  [22.727272727272727, 22.727272727272727],
  [31.818181818181817, 22.727272727272727],
  [22.727272727272727, 4.545454545454546],
  [4.545454545454546, 13.636363636363635],
  [4.545454545454546, 13.636363636363635]])

In [7]:
# 로그 이용 언더 플로우
log = []

for i in range(len(tokens)):
    log_tmp = []
    for j in range(len(types)):
        log_tmp.append(np.log(laplace[i][j]/100))
    log.append(log_tmp)
log_type = ["Log(P(w|{}))".format(i) for i in types]
log_type, log

(['Log(P(w|spam))', 'Log(P(w|normal))'],
 [[-1.9924301646902063, -1.9924301646902063],
  [-3.0910424533583156, -1.9924301646902063],
  [-1.9924301646902063, -3.0910424533583156],
  [-3.0910424533583156, -1.9924301646902063],
  [-1.9924301646902063, -3.0910424533583156],
  [-3.0910424533583156, -1.9924301646902063],
  [-1.4816045409242156, -1.4816045409242156],
  [-1.1451323043030026, -1.4816045409242156],
  [-1.4816045409242156, -3.0910424533583156],
  [-3.0910424533583156, -1.9924301646902063],
  [-3.0910424533583156, -1.9924301646902063]])

In [8]:
df = pd.concat((pd.DataFrame(type_matrix, index=tokens, columns=types), pd.DataFrame(laplace, index=tokens, columns=laplace_type), pd.DataFrame(log, index=tokens, columns=log_type)), axis=1)
df

Unnamed: 0,spam,normal,P(w|spam),P(w|normal),Log(P(w|spam)),Log(P(w|normal))
me,1,1,13.636364,13.636364,-1.99243,-1.99243
award,0,1,4.545455,13.636364,-3.091042,-1.99243
get,1,0,13.636364,4.545455,-1.99243,-3.091042
contact,0,1,4.545455,13.636364,-3.091042,-1.99243
ticket,1,0,13.636364,4.545455,-1.99243,-3.091042
won,0,1,4.545455,13.636364,-3.091042,-1.99243
you,2,2,22.727273,22.727273,-1.481605,-1.481605
free,3,2,31.818182,22.727273,-1.145132,-1.481605
lottery,2,0,22.727273,4.545455,-1.481605,-3.091042
scholarship,0,1,4.545455,13.636364,-3.091042,-1.99243


### **1-4. 입력 데이터 처리**

In [9]:
# 스팸 필터링 : 입력값
check_list = "free lottery"

check_token = []

check_token.append(check_list.split(" "))

check_token

[['free', 'lottery']]

In [10]:
# 입력 토큰 로그들의 합
import math

filter = []
tmp = []
for i in range(len(check_token[0])):
    tmp.append(df[df.index==check_token[0][i]][("Log(P(w|{}))".format(i) for i in types)].sum())

for i in range(len(types)):
    filter.append(math.exp(sum(tmp[i]) + np.log(prior_prob[i])))

prob = []

for i in range(len(types)):
    prob.append(filter[i] / sum(filter))

filter, prob

([0.03615702479338842, 0.00516528925619835],
 [0.8749999999999999, 0.12500000000000008])

### **1-5. 최종 결과**

In [11]:
for i in range(len(check_token[0])):
    print("{}라는 토큰이 있는 메일이 스팸일 확률 : {:.2f}%".format(check_token[0][i], prob[i]*100))

free라는 토큰이 있는 메일이 스팸일 확률 : 87.50%
lottery라는 토큰이 있는 메일이 스팸일 확률 : 12.50%


## **2. Naive Bayes Classifier 다중 분류 Class화**
**메일 필터링**

### **2-1. Naive Bayes Multi Classifier Class**

In [12]:
import pandas as pd
import numpy as np
import math

class multi_nbc():
    def __init__(self):
        self.lines = []
        self.tokens = []
        self.types = []
        self.type_matrix = []
        self.matrix_table = []
        self.prior_prob = []
        self.laplace = []
        self.log = []
        self.token_matirx = []
        self.check_token = []
        self.prob = []
    
    # 각 문장 토큰화
    def classify_mail(self, mail, mail_type):
        BagOfWords = []
        for i in range(len(mail)):
            self.lines.append(mail[i].split(" "))
            # 단어 뭉치 생성
            for word in self.lines[i]:
                BagOfWords.append(word)
        self.tokens = list(set(BagOfWords))
        # 메일의 종류
        self.types = list(set(mail_type))
        # 메일 분류 확인
        self.mail_classify = {"메일" : self.lines, "분류" : mail_type}
        
        return self.lines, self.tokens, self.types, self.mail_classify

    # 토큰별 분류 카운트
    def count_matrix(self):
        for i in range(len(self.tokens)):
            type_count = [0 for a in range(len(self.types))]
            for j in range(len(self.types)):
                for l, label in enumerate(self.mail_classify["분류"]):
                    if label == self.types[j]:
                        if self.tokens[i] in self.lines[l]:
                            type_count[j] += self.lines[l].count(self.tokens[i])
            self.type_matrix.append(type_count)
            
        self.matrix_table = pd.DataFrame(self.type_matrix, index=self.tokens, columns=self.types)

        return self.type_matrix, self.matrix_table
    
    # 사전확률 계산
    def cal_prior(self):
        for type_idx in self.types:
            count_type = 0
            for i in range(len(self.lines)):
                if type_idx == self.mail_classify['분류'][i]:
                    count_type += 1
            self.prior_prob.append(count_type/len(self.lines))

        return self.prior_prob

    # Laplace Smoothing
    def cal_laplace(self, k):
        for i in range(len(self.tokens)):
            laplace_tmp = []
            for j in range(len(self.types)):
                laplace_tmp.append((k+self.type_matrix[i][j])/(2*k+self.matrix_table[self.types[j]].sum())*100)
            self.laplace.append(laplace_tmp)
        laplace_type = ["P(w|{})".format(i) for i in self.types]

        # 로그 이용 언더 플로우
        for i in range(len(self.tokens)):
            log_tmp = []
            for j in range(len(self.types)):
                log_tmp.append(np.log(self.laplace[i][j]/100))
            self.log.append(log_tmp)
        log_type = ["Log(P(w|{}))".format(i) for i in self.types]
        
        # 최종 매트릭스
        self.token_matrix = pd.concat((pd.DataFrame(self.type_matrix, index=self.tokens, columns=self.types),
                                       pd.DataFrame(self.laplace, index=self.tokens, columns=laplace_type),
                                       pd.DataFrame(self.log, index=self.tokens, columns=log_type)), axis=1)
        
        return self.token_matrix.sort_index(axis=0)

    # 스팸 필터링 : 입력값
    def input_check(self, check_list): # 해당 코드를 좀 더 직관적으로 풀어 쓸 수 있는 방안을 고려해봐야할 듯
        self.check_token.append(check_list.split(" "))
        # 입력 토큰 로그 합
        filter = []
        filter_tmp = []
        for c in range(len(self.check_token[0])):
            filter_tmp.append(self.token_matrix[self.token_matrix.index==self.check_token[0][c]][("Log(P(w|{}))".format(t) for t in self.types)].sum())
        # 입력 토큰 로그합
        filter = [0 for a in range(len(self.types))]
        for c in range(len(self.check_token[0])):
            for t in range(len(self.types)):
                filter[t] += filter_tmp[c][t]
        # 입력 토큰 로그합과 사전확률 로그의 합
        for t in range(len(self.types)):
            filter[t] = math.exp(filter[t] + np.log(self.prior_prob[t]))
        #각 토큰별 확률 계산
        for t in range(len(self.types)):
            self.prob.append(filter[t] / sum(filter))

        return self.check_token, self.prob
    
    # 최종 결과값
    def nbc_result(self):
        if len(self.check_token[0]) >= len(self.types):
            for c in range(len(self.check_token[0])):
                print("{}라는 토큰이 있는 메일이 {}일 확률 : {:.2f}%".format(self.check_token[0], self.types[c], self.prob[c]*100))
        
        elif len(self.check_token[0]) < len(self.types):
            for i in range(len(self.types)):
                print("{}라는 토큰이 있는 메일이 {}일 확률 : {:.2f}%".format(self.check_token[0], self.types[i], self.prob[i]*100))

    # 자동 실행
    def run(self, mail, mail_type, k, check_list):
        self.classify_mail(mail, mail_type)
        self.count_matrix()
        self.cal_prior()
        self.cal_laplace(k)
        self.input_check(check_list)
        self.nbc_result()

### **2-2. 결과 확인**


In [13]:
# 메일과 메일의 종류 데이터
mail = ['I love you',
        'love happy weekend',
        'bore work job',
        'I hate you',
        'bore weekend',
        'happy together',
        'normal day']

mail_type = ["긍정", "긍정", "부정", "부정", "부정", "긍정", "중립"]

check_list = "happy weekend bore"

In [14]:
mnbc = multi_nbc()

In [15]:
mnbc.run(mail, mail_type, 0.5, check_list)

['happy', 'weekend', 'bore']라는 토큰이 있는 메일이 중립일 확률 : 23.08%
['happy', 'weekend', 'bore']라는 토큰이 있는 메일이 부정일 확률 : 38.46%
['happy', 'weekend', 'bore']라는 토큰이 있는 메일이 긍정일 확률 : 38.46%
