# Make Classifier From Scratch
최소한의 패키지를 가지고 다중 클래스분류기 만들기 프로젝트

2018310412 인공지능융합전공 고준서

### 전처리기 만들기
clean_text( ) 함수에 도큐먼트 묶음을 넣으면 전처리 한 후 다시 돌려보냄

In [2]:
from nltk.corpus import names
all_names = set(names.words())

In [3]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [4]:
def letters_only(word):
    return word.isalpha()

In [5]:
def clean_text(doc):
    cleaned_doc = []
    for word in doc.split(' '):
        word = word.lower()
  
        if letters_only(word) and word not in all_names and len(word) > 2: # remove number and punc. and name entity
            cleaned_doc.append(lemmatizer.lemmatize(word, "v"))
    return ' '.join(cleaned_doc) 

### Vectorizer를 만들어준다.
나이브 베이즈가 성능이 제일 좋아서 이걸 제출하지만, SVM도 해봤기 때문에 tfidf와 CountVectorizer도 필요했다. 
Vectorizer에 파라미터로 tfidf를 True혹은 False로 해서 tfidf를 해줄지 말지를 결정할 수 있도록 했다. 그 외에도 성능을 체크하기 위해 max_len, stop_word같은 옵션을 두었다.

In [63]:
import math
import numpy as np
from nltk.corpus import stopwords 

class Vectorizer:
    def __init__(self, max_len = 100000, tfidf = False, stop_word = False, max_df = float("inf")):
        self.max_len = max_len
        self.tfidf = tfidf
        self.stop_word = stop_word
        self.max_df = max_df
    
    def vocab_(self):
        return self.word_count
    
    def make_vector(self, word_count):
        self.word_count = word_count
        sort_by_count = sorted(word_count.items(), key = lambda x:x[1][0], reverse = True)
        how_many = len(sort_by_count)
        if self.max_len > how_many:
            how_many = how_many
        elif self.max_len <= how_many:
            how_many = self.max_len
            
        count = 0
        index_of_words = {}
        self.idfs = []
        for key, value in sort_by_count:
            if count >= how_many:
                break
            if self.stop_word == True:
                if key not in stopwords.words("english"):
                    index_of_words[key] = count
                    if self.tfidf == True:
                        self.idfs.append(value[2])
                    count += 1
                else:
                    continue
            else:
                index_of_words[key] = count
                if self.tfidf == True:
                    self.idfs.append(value[2])
                count += 1

        self.max_len = how_many
        self.index_of_words = index_of_words
                
        
    def tfidfVect(self, word_count):
        nD = len(self.texts)
        for key, value in sorted(word_count.items(), key = lambda x:x[1][0], reverse = True):
            idf = math.log(nD/(1+value[1]))
            word_count[key].append(idf)
            word_count[key][0] = value[0] * idf
        
        self.make_vector(word_count)
        
    def countVect(self, word_count):
        self.make_vector(word_count)
        
    def fit(self, texts):
        self.texts = texts
        word_count = {}
        #word_count = {단어: [전체단어개수,단어가 나온 문장개수]}
        for text in texts:
            word_in_text = []
            
            for word in text.split():
                if word not in word_count.keys():
                    word_count[word] = [1, 1]
                    word_in_text.append(word)
                else:
                    word_count[word][0] += 1
                    word_in_text.append(word)
            
            for i in set(word_in_text):
                word_count[i][1] += 1
        
        chaier = []
        for key, value in word_count.items():
            if value[0] > self.max_df:
                chaier.append(key)
        for i in chaier:
            del word_count[i]
        
        if self.tfidf:
            self.tfidfVect(word_count)
        
        else:
            self.countVect(word_count)
        
    def transform(self, texts):
        ret_list = [[0 for i in range(self.max_len)] for i in range(len(texts))]
        for i in range(len(texts)):
            text = texts[i]
            for word in text.split():
#                 print(word)
                if word in self.index_of_words.keys():
#                     print(self.index_of_words[word])
                    ret_list[i][self.index_of_words[word]] += 1
        
        ret_list = np.array(ret_list)
        idfs = np.array(self.idfs)
        print(ret_list.shape, idfs.shape)
        if self.tfidf == True:
            ret_list = ret_list * idfs
        
        return ret_list

### NaiveBayes Classifier를 만들어준다.
class로 만들어서 여러번의 함수를 호출하지 않고도 동일한 작업을 수행할 수 있도록 한다.
smoothing옵션을 파라미터로 줄 수 있다. 

In [64]:
from collections import defaultdict

class NaiveBayes:
    def __init__(self, smoothing = 0):
        self.smoothing = smoothing
        self.term_docs_train_dict = {}
        self.y_ = {}
        
    def _split_sort(self, X, y):
#         print(X.shape)
        for label in range(self.how_many_label):
            y_temp = []
            x_temp = []
            for i in range(len(y)):
#                 print(i)
                if y[i] == label:
                    y_temp.append(1)
                    x_temp.append(X[i])
                else:
                    y_temp.append(-1)
                    x_temp.append(X[i])
            self.term_docs_train_dict[label] = np.array(x_temp)
            self.y_[label] = y_temp
    
    def _get_label_index(self, labels):
        label_index = defaultdict(list)
        for index, label in enumerate(labels):
            label_index[label].append(index)
        return label_index
    
    def _get_prior(self, label_index):
        prior = {label: len(index) for label, index in label_index.items()}
        pr_zero = prior[1]
        pr_one = prior[-1]

        prior[1] = pr_zero/(pr_zero+ pr_one)
        prior[-1] = pr_one/(pr_zero+pr_one)

        return prior
    
    def _get_likelihood(self, term_doc_matrix, label_index, smoothing=0):
        likelihood = {}
        word_count = term_doc_matrix.shape[1]
        total_word_len_in_s = {}
        total_word_set_in_s = {}
        for label, index in label_index.items():
            whole = 0
            for ix in index:
                whole += sum(term_doc_matrix[ix])
            total_word_len_in_s[label] = whole
        for label, index in label_index.items():
            whole = 0
            for ix in index:
                count = 0
                for tmp in term_doc_matrix[ix]:
                    if tmp != 0:
                        count += 1
                whole += count
            total_word_set_in_s[label] = whole
        likelihood[1] = np.zeros(word_count)
        likelihood[-1] = np.zeros(word_count)
        lab_one = np.array([term_doc_matrix[idx] for idx in label_index[-1]])
        lab_zero = np.array([term_doc_matrix[idx] for idx in label_index[1]])
        sum_one = lab_one.sum(axis = 0)
        sum_zero = lab_zero.sum(axis = 0)
        for i in range(word_count):
            likelihood[1][i] = (sum_zero[i]+smoothing) / (total_word_len_in_s[1]+total_word_set_in_s[1])
            likelihood[-1][i] = (sum_one[i]+smoothing) / (total_word_len_in_s[-1]+total_word_set_in_s[-1])

        return likelihood
    
    
    def _get_posterior(self, term_doc_matrix, prior, likelihood):

        a = []
        tt = term_doc_matrix.sum(axis=1)
        for i in range(term_doc_matrix.shape[0]):
            a.append((term_doc_matrix[i]/tt[i])*100)
        term_doc_matrix = np.array(a)


        num_docs = term_doc_matrix.shape[0]
        posteriors = [None for i in range(num_docs)]
        posterior_yes = likelihood[1]
        posterior_no = likelihood[-1]


        posterior_yes = (np.log(posterior_yes) * term_doc_matrix).sum(axis = 1) + np.log(prior[1])
        posterior_no = (np.log(posterior_no) * term_doc_matrix).sum(axis = 1) + np.log(prior[-1])
        posterior_yes = np.exp(posterior_yes)
        posterior_no = np.exp(posterior_no)
        num_docs = term_doc_matrix.shape[0]

        for i in range(num_docs):
            if posterior_yes[i] == 0 and posterior_no[i] == 0:
                if posterior_yes[i] < posterior_no[i]:        
                    tmp = {-1: 0, 1:1}
                else:
                    tmp = {-1:1,1:0}
                posteriors[i] =tmp
                continue

            tmp = {-1: posterior_no[i]/(posterior_yes[i]+posterior_no[i]), 1:posterior_yes[i]/(posterior_yes[i]+posterior_no[i])}

            posteriors[i] =tmp

        return posteriors

    
    def fit(self, X, y):
        self.how_many_label = len(set(y))
        self._split_sort(X, y)
        self.label_index_dict = {}
        for lab in range(self.how_many_label):
            label_index = self._get_label_index(self.y_[lab])
            self.label_index_dict[lab] = label_index

        self.prior_dict = {}
        for lab in range(self.how_many_label):
            prior = self._get_prior(self.label_index_dict[lab])
            self.prior_dict[lab] = prior
        
        self.likelihood_dict = {}
        for lab in range(self.how_many_label):
            likelihood = self._get_likelihood(self.term_docs_train_dict[lab], self.label_index_dict[lab], self.smoothing) 
            self.likelihood_dict[lab] = likelihood
        
    def predict(self, X):
        posteriors_dict = {}
        for lab in range(self.how_many_label):
            posteriors = self._get_posterior(X, self.prior_dict[lab], self.likelihood_dict[lab])
            posteriors_dict[lab] = posteriors
            
        a = {}
        for text in range(X.shape[0]):
            max_id = -1
            max_val = -float("inf")
            for lab in range(5):    
                if max_val < posteriors_dict[lab][text][1]:
                    max_id = lab
                    max_val = posteriors_dict[lab][text][1]
            a[text] = max_id
        return list(a.values())

### 정확도 측정 함수
a,b의 값을 비교해 같으면 count를 올리고 전체 개수로 나눈다.

In [65]:
def accuracy_function(a, b):
    b = list(b)
    a = list(a)
    count = 0
    for i in range(len(b)):
        if a[i] == b[i]:
            count+= 1
    return count / len(b)

### pandas를 통해 train과 test에 해당하는 csv파일을 불러와 저장한다.

In [66]:
import pandas as pd
train_df = pd.read_csv("TRAIN.csv")
test_df = pd.read_csv("TEST.csv")
x_train = train_df.iloc[:, 2]
y_train = train_df.iloc[:,-1].astype(int)
x_train = [clean_text(doc) for doc in x_train]
x_test = test_df["Post"]
x_test = [clean_text(doc) for doc in x_test]
print(len(x_train), len(x_test))

350 150


### train data분리
test데이터는 label이 없기 때문에 정확성 테스트를 위해 train data를 분할해준다. 분할해주는 함수를 직접 만들어 사용하였다.

In [67]:
def train_test_spliter(X, y, test_rate = 0.858):
    standard = int(len(X)* test_rate)
    return (X[:standard], X[standard:]), (y[:standard], y[standard:])

In [68]:
(train_x_train,test_x_train), (train_y_train, test_y_train) = train_test_spliter(x_train, y_train)

### 라벨이 있는 train data를 분리한 것을 통해 정확도를 테스트한다.
제작한 Vectorizer, NaiveBayes를 활용하여 분석한다.

In [100]:
cv = Vectorizer(max_len = 600, tfidf = False, stop_word = True, max_df = 500)
cv.fit(x_train)
term_docs_train = cv.transform(train_x_train)
term_docs_test = cv.transform(test_x_train)

(300, 600) (0,)
(50, 600) (0,)


In [101]:
NB = NaiveBayes(smoothing = 1)
NB.fit(term_docs_train, train_y_train)
y_pred = NB.predict(term_docs_test)

In [102]:
accuracy_function(y_pred, test_y_train)

0.32

In [103]:
cv.vocab_()

{'dbt': [5, 3],
 'mind': [265, 122],
 'explain': [87, 60],
 'little': [317, 140],
 'bite': [240, 114],
 'sorry': [358, 117],
 'hear': [306, 120],
 'suppose': [66, 51],
 'actually': [313, 151],
 'mean': [357, 139],
 'definitely': [110, 63],
 'person': [444, 153],
 'honestly': [78, 59],
 'sure': [404, 154],
 'why': [488, 169],
 'shun': [2, 3],
 'sound': [466, 148],
 'anything': [469, 166],
 'plan': [138, 76],
 'pain': [307, 129],
 'ish': [1, 2],
 'cut': [127, 70],
 'each': [132, 84],
 'tire': [336, 98],
 'concern': [63, 42],
 'symptoms': [26, 19],
 'wake': [84, 59],
 'bad': [376, 150],
 'dream': [62, 44],
 'hit': [91, 66],
 'rock': [34, 25],
 'bottom': [12, 12],
 'pleasant': [7, 8],
 'lady': [4, 5],
 'conduct': [3, 4],
 'therapy': [87, 56],
 'sessions': [1, 2],
 'kill': [479, 202],
 'especially': [112, 66],
 'since': [220, 127],
 'youve': [384, 121],
 'carry': [44, 36],
 'chance': [136, 83],
 'bet': [35, 29],
 'days': [176, 101],
 'row': [5, 6],
 'ever': [393, 152],
 'message': [75, 45],

### 라벨이 없는 test data를 예측하여 csv파일로 바꾸어준다.

In [104]:
cv = Vectorizer(1000, tfidf = False, stop_word = True)
cv.fit(x_train)
term_docs_train = cv.transform(x_train)
test_term_docs = cv.transform(x_test)

(350, 1000) (0,)
(150, 1000) (0,)


In [105]:
NB = NaiveBayes(smoothing = 1)
NB.fit(term_docs_train, y_train)
tes_pred = NB.predict(test_term_docs)

In [106]:
for i in range(len(test_pred)):
    print(tes_pred[i], test_pred[i])

4 4
4 4
4 4
1 1
2 2
0 0
3 3
3 3
3 3
1 1
3 1
4 3
0 0
0 0
0 0
4 4
4 4
3 3
4 4
4 4
1 1
4 4
0 0
4 4
3 3
4 4
4 4
4 4
4 4
3 0
4 4
1 1
0 0
3 4
4 4
1 1
4 4
0 0
0 0
4 4
4 4
4 4
4 4
2 2
4 4
0 4
4 4
4 3
1 1
0 0
4 4
4 4
0 0
3 3
0 0
3 3
0 0
1 1
0 0
4 4
4 4
4 4
0 0
0 0
3 3
1 4
4 4
4 4
3 3
4 4
0 0
3 3
4 4
4 4
4 4
4 4
4 4
3 3
1 1
4 4
3 3
3 1
4 4
4 4
3 4
1 1
3 3
4 4
4 4
1 1
4 4
4 4
3 0
4 4
3 3
4 4
4 4
4 4
4 4
4 4
0 4
4 4
0 0
0 0
1 1
1 1
4 4
3 3
1 1
0 0
4 4
4 4
0 0
0 0
0 0
3 3
4 4
0 0
4 4
1 1
3 3
1 1
2 2
0 0
4 4
3 3
4 4
0 0
0 0
1 1
4 4
4 4
4 4
0 0
4 4
4 4
0 0
3 3
3 3
4 4
0 0
4 4
4 4
3 3
4 4
0 0
4 4
4 4
1 1
3 3


In [80]:
pd.DataFrame(test_pred).to_csv("2018310412.csv", index=False)