# Naive Bayes
= 특성들 사이의 독립(=한 특성의 결과가 다른 특성에 영향을 주지 않는다)을 가정하는 베이즈 정리를 적용한 확률적인 알고리즘이다.

> 입력 특성에 따라 3개의 분류기가 존재한다.
> - 가우시안 나이브 베이즈 분류기
> - 베르누이 나이브 베이즈 분류기
> - 다항 나이브 베이즈 분류기

> ***parameter***  
> *BernoulliNB, MultinomialNB는 모델의 복잡도를 조절 가능한 **alpha**를 갖는다.*  


> GaussianNB의 경우 대부분 고차원인 데이터셋에 사용되고,  
> BernoulliNB, MultinomialNB은 텍스트와 같은 희소한 데이터를 카운트하는 데 사용된다.

In [21]:
import warnings
warnings.filterwarnings(action='ignore')
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.datasets import fetch_covtype, fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn import metrics
import matplotlib.pyplot as plt

## covtype dataset

In [2]:
covtype = fetch_covtype()
print(covtype.DESCR)

.. _covtype_dataset:

Forest covertypes
-----------------

The samples in this dataset correspond to 30×30m patches of forest in the US,
collected for the task of predicting each patch's cover type,
i.e. the dominant species of tree.
There are seven covertypes, making this a multiclass classification problem.
Each sample has 54 features, described on the
`dataset's homepage <https://archive.ics.uci.edu/ml/datasets/Covertype>`__.
Some of the features are boolean indicators,
while others are discrete or continuous measurements.

**Data Set Characteristics:**

    Classes                        7
    Samples total             581012
    Dimensionality                54
    Features                     int

:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
it returns a dictionary-like 'Bunch' object
with the feature matrix in the ``data`` member
and the target values in ``target``. If optional argument 'as_frame' is
set to 'True', it will return ``data`` and ``target`

In [3]:
covtype_df = pd.DataFrame(data = covtype.data)
covtype_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396.0,153.0,20.0,85.0,17.0,108.0,240.0,237.0,118.0,837.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581008,2391.0,152.0,19.0,67.0,12.0,95.0,240.0,237.0,119.0,845.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581009,2386.0,159.0,17.0,60.0,7.0,90.0,236.0,241.0,130.0,854.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581010,2384.0,170.0,15.0,60.0,5.0,90.0,230.0,245.0,143.0,864.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
covtype_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
count,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,...,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0
mean,2959.365301,155.656807,14.103704,269.428217,46.418855,2350.146611,212.146049,223.318716,142.528263,1980.291226,...,0.044175,0.090392,0.077716,0.002773,0.003255,0.000205,0.000513,0.026803,0.023762,0.01506
std,279.984734,111.913721,7.488242,212.549356,58.295232,1559.25487,26.769889,19.768697,38.274529,1324.19521,...,0.205483,0.286743,0.267725,0.052584,0.056957,0.01431,0.022641,0.161508,0.152307,0.121791
min,1859.0,0.0,0.0,0.0,-173.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2809.0,58.0,9.0,108.0,7.0,1106.0,198.0,213.0,119.0,1024.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2996.0,127.0,13.0,218.0,30.0,1997.0,218.0,226.0,143.0,1710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3163.0,260.0,18.0,384.0,69.0,3328.0,231.0,237.0,168.0,2550.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3858.0,360.0,66.0,1397.0,601.0,7117.0,254.0,254.0,254.0,7173.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
covtype.target

array([5, 5, 2, ..., 3, 3, 3])

In [6]:
covtype_X = covtype.data
covtype_y = covtype.target

In [7]:
covtype_X_train, covtype_X_test, covtype_y_train, covtype_y_test = train_test_split(covtype_X, covtype_y, 
                                                                                    test_size = 0.2, random_state = 1999)

In [8]:
print("dataset shape : {}".format(covtype_X.shape))
print("train shape : {}".format(covtype_X_train.shape))
print("test shape : {}".format(covtype_X_test.shape))

dataset shape : (581012, 54)
train shape : (464809, 54)
test shape : (116203, 54)


In [9]:
scaler = StandardScaler()

covtype_X_train_scaler = scaler.fit_transform(covtype_X_train)
covtype_X_test_scaler = scaler.transform(covtype_X_test)

In [10]:
covtype_scaler_df = pd.DataFrame(data = covtype_X_train_scaler)
covtype_scaler_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
count,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,...,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0,464809.0
mean,-4.900202e-16,2.808848e-17,8.061444e-16,-7.330369e-16,-2.087621e-16,-2.280752e-16,1.184295e-17,2.415142e-16,1.383988e-16,-8.443742e-17,...,3.00146e-16,9.170598e-15,8.068721e-15,-1.960082e-14,8.38792e-15,-5.053533e-15,-1.404638e-15,1.912168e-15,7.757928e-15,-1.058551e-16
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,...,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-3.931815,-1.390707,-1.881546,-1.267732,-3.765307,-1.508202,-7.91591,-11.28356,-3.722965,-1.495339,...,-0.2151944,-0.314573,-0.2910973,-0.05269314,-0.05733573,-0.01414646,-0.02277634,-0.1661724,-0.1560155,-0.1234023
25%,-0.5373816,-0.8722141,-0.6812554,-0.759578,-0.6769576,-0.7986879,-0.5279288,-0.5207186,-0.6140992,-0.7232763,...,-0.2151944,-0.314573,-0.2910973,-0.05269314,-0.05733573,-0.01414646,-0.02277634,-0.1661724,-0.1560155,-0.1234023
50%,0.1307858,-0.2553866,-0.1477928,-0.2420133,-0.2823351,-0.2258162,0.2183319,0.1361686,0.01289899,-0.2035314,...,-0.2151944,-0.314573,-0.2910973,-0.05269314,-0.05733573,-0.01414646,-0.02277634,-0.1661724,-0.1560155,-0.1234023
75%,0.7310645,0.9335709,0.5190353,0.5390387,0.3868073,0.6267554,0.7034014,0.6919963,0.6660221,0.4302853,...,-0.2151944,-0.314573,-0.2910973,-0.05269314,-0.05733573,-0.01414646,-0.02277634,-0.1661724,-0.1560155,-0.1234023
max,3.210787,1.827524,6.920586,5.272402,9.514597,3.057451,1.561601,1.551003,2.912766,3.923454,...,4.646962,3.178913,3.435277,18.9778,17.44113,70.68908,43.90522,6.017848,6.409618,8.103577


## newsgroup dataset

In [11]:
newsgroup = fetch_20newsgroups()
print(newsgroup.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [12]:
newsgroup_train = fetch_20newsgroups(subset = "train")
newsgroup_test = fetch_20newsgroups(subset = "test")

In [13]:
X_train, y_train = newsgroup_train.data, newsgroup_train.target
X_test, y_test = newsgroup_test.data, newsgroup_test.target

### Vectorization

- 텍스트 데이터는 기계하습 모델에 입력 할 수 없다.
- 벡터화하는 과정은 텍스트 데이터를 실수 벡터로 변환하여 기계학습 모델에 입력 할 수 있도록 하는 전처리 과정이다.
- sklearn에서는 Count, Tf-idf, Hashing의 세가지 방법을 지원한다.

#### CountVectorizer
- 문서에 존재하는 단어의 수를 세서 벡터를 생성하는 방식이다.

In [14]:
count_vectorizer = CountVectorizer()

X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

#### HashingVectorizer
- 각 단어를 해쉬값으로 표현한다.
- 미리 정해진 크기의 벡터로 표현한다.

In [15]:
hash_vectorizer = HashingVectorizer(n_features = 1000)

X_train_hash = hash_vectorizer.fit_transform(X_train)
X_test_hash = hash_vectorizer.transform(X_test)

#### TfidfVectorizer

- 문서에 나온 단어의 빈도(term frequency)와 역문서 빈도(inverse document frequency)를 곱해서 계산한다.
- 각 빈도는 일반적으로 로그 스케일링후 사용된다.

In [16]:
tfidf_vectorizer = TfidfVectorizer()

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Gaussian NB
= 입력 특성이 가우시안(정규)분포를 갖는다고 가정

In [30]:
GaussianNB_model = GaussianNB()
GaussianNB_model.fit(covtype_X_train_scaler, covtype_y_train)

GaussianNB()

In [32]:
train_predict = GaussianNB_model.predict(covtype_X_train_scaler)
test_predict = GaussianNB_model.predict(covtype_X_test_scaler)

In [36]:
accuracy = metrics.accuracy_score(covtype_y_train, train_predict)
f1 = metrics.f1_score(covtype_y_train, predict, average = None)

print("train accuracy : {}".format(accuracy))
print("train f1 score : {}".format(f1))

train accuracy : 0.08784038174820195
train f1 score : [0.04057288 0.01784529 0.33400414 0.13626012 0.04334818 0.07134045
 0.23557879]


# Bernoulli NB 
= 입력 특성이 베르누이 분포에 의해 생성된 이진 값을 갖는 다고 가정

## count

In [38]:
count_model = BernoulliNB()
count_model.fit(X_train_count, y_train)

BernoulliNB()

In [39]:
count_predict = count_model.predict(X_train_count)
count_accuracy = metrics.accuracy_score(y_train, count_predict)
count_f1_score = metrics.f1_score(y_train, count_predict, average = None)

print("count accuracy : {}".format(count_accuracy))
print("count f1_score : {}".format(count_f1_score))

count accuracy : 0.7821283365741559
count f1_score : [0.80096502 0.8538398  0.13858268 0.70686337 0.85220126 0.87944493
 0.51627694 0.84532672 0.89064976 0.87179487 0.94561404 0.91331546
 0.84627832 0.89825848 0.9047619  0.79242424 0.84693878 0.84489796
 0.67329545 0.14742015]


In [40]:
count_predict = count_model.predict(X_test_count)
count_accuracy = metrics.accuracy_score(y_test, count_predict)
count_f1_score = metrics.f1_score(y_test, count_predict, average = None)

print("count accuracy : {}".format(count_accuracy))
print("count f1_score : {}".format(count_f1_score))

count accuracy : 0.6307753584705258
count f1_score : [0.47086247 0.60643564 0.01       0.56014047 0.6953405  0.70381232
 0.44829721 0.71878646 0.81797753 0.81893491 0.90287278 0.74794521
 0.61647059 0.64174455 0.76967096 0.63555114 0.64285714 0.77971474
 0.31382979 0.00793651]


## hash

In [46]:
hash_model = BernoulliNB()
hash_model.fit(X_train_hash, y_train)

BernoulliNB()

In [47]:
hash_predict = hash_model.predict(X_train_hash)
hash_accuracy = metrics.accuracy_score(y_train, hash_predict)
hash_f1_score = metrics.f1_score(y_train, hash_predict, average = None)

print("hash accuracy : {}".format(hash_accuracy))
print("hash f1_score : {}".format(hash_f1_score))

hash accuracy : 0.5951917977726711
hash f1_score : [0.74226804 0.49415205 0.45039019 0.59878155 0.57327935 0.63929619
 0.35390947 0.59851301 0.72695347 0.68123862 0.79809524 0.70532319
 0.54703833 0.66862745 0.61889927 0.74707471 0.6518668  0.60485269
 0.5324165  0.54576271]


In [48]:
hash_predict = hash_model.predict(X_test_hash)
hash_accuracy = metrics.accuracy_score(y_test, hash_predict)
hash_f1_score = metrics.f1_score(y_test, hash_predict, average = None)

print("hash accuracy : {}".format(hash_accuracy))
print("hash f1_score : {}".format(hash_f1_score))

hash accuracy : 0.4430430164630908
hash f1_score : [0.46678636 0.33826638 0.29391892 0.45743329 0.41939121 0.46540881
 0.34440068 0.46464646 0.62849873 0.53038674 0.63782051 0.55251799
 0.32635983 0.34266886 0.46105919 0.61780105 0.46197991 0.54591837
 0.27513228 0.3307888 ]


## tfidf

In [49]:
tfidf_model = BernoulliNB()
tfidf_model.fit(X_train_tfidf, y_train)

BernoulliNB()

In [50]:
tfidf_predict = tfidf_model.predict(X_train_tfidf)
tfidf_accuracy = metrics.accuracy_score(y_train, tfidf_predict)
tfidf_f1_score = metrics.f1_score(y_train, tfidf_predict, average = None)

print("tfidf accuracy : {}".format(tfidf_accuracy))
print("tfidf f1_score : {}".format(tfidf_f1_score))

tfidf accuracy : 0.7821283365741559
tfidf f1_score : [0.80096502 0.8538398  0.13858268 0.70686337 0.85220126 0.87944493
 0.51627694 0.84532672 0.89064976 0.87179487 0.94561404 0.91331546
 0.84627832 0.89825848 0.9047619  0.79242424 0.84693878 0.84489796
 0.67329545 0.14742015]


In [51]:
tfidf_predict = tfidf_model.predict(X_test_tfidf)
tfidf_accuracy = metrics.accuracy_score(y_test, tfidf_predict)
tfidf_f1_score = metrics.f1_score(y_test, tfidf_predict, average = None)

print("tfidf accuracy : {}".format(tfidf_accuracy))
print("tfidf f1_score : {}".format(tfidf_f1_score))

tfidf accuracy : 0.6307753584705258
tfidf f1_score : [0.47086247 0.60643564 0.01       0.56014047 0.6953405  0.70381232
 0.44829721 0.71878646 0.81797753 0.81893491 0.90287278 0.74794521
 0.61647059 0.64174455 0.76967096 0.63555114 0.64285714 0.77971474
 0.31382979 0.00793651]


## Multinomial NB
= 입력 특성이 다항분포에 의해 생성된 빈도수 값을 갖는다고 가정

## count

In [52]:
multi_model = MultinomialNB()
multi_model.fit(X_train_count, y_train)

MultinomialNB()

In [53]:
multi_predict = multi_model.predict(X_train_count)
multi_accuracy = metrics.accuracy_score(y_train, multi_predict)
multi_f1_score = metrics.f1_score(y_train, multi_predict, average = None)

print("train accuracy : {}".format(multi_accuracy))
print("train f1_score : {}".format(multi_f1_score))

train accuracy : 0.9245182959165635
train f1_score : [0.95228426 0.904      0.25073746 0.81402003 0.96669513 0.88350983
 0.90710383 0.97014925 0.98567818 0.99325464 0.98423237 0.95399516
 0.95703454 0.98319328 0.98584513 0.95352564 0.97307002 0.97467249
 0.95157895 0.86526946]


In [54]:
multi_predict = multi_model.predict(X_train_count)
multi_accuracy = metrics.accuracy_score(y_train, multi_predict)
multi_f1_score = metrics.f1_score(y_train, multi_predict, average = None)

print("train accuracy : {}".format(multi_accuracy))
print("train f1_score : {}".format(multi_f1_score))

test accuracy : 0.7728359001593202
test f1_score : [0.77901431 0.7008547  0.00501253 0.64516129 0.79178082 0.73370166
 0.76550681 0.88779285 0.93951094 0.91390728 0.94594595 0.78459938
 0.72299169 0.84635417 0.86029412 0.80846561 0.78665077 0.89281211
 0.60465116 0.48695652]


## tfidf

In [56]:
multi_model = MultinomialNB()
multi_model.fit(X_train_tfidf, y_train)

MultinomialNB()

In [57]:
multi_predict = multi_model.predict(X_train_tfidf)
multi_accuracy = metrics.accuracy_score(y_train, multi_predict)
multi_f1_score = metrics.f1_score(y_train, multi_predict, average = None)

print("train accuracy : {}".format(multi_accuracy))
print("train f1_score : {}".format(multi_f1_score))

train accuracy : 0.9326498143892522
train f1_score : [0.87404162 0.95414462 0.95726496 0.92863002 0.97812773 0.97440273
 0.91090909 0.97261411 0.98659966 0.98575021 0.98026316 0.94033413
 0.9594478  0.98032506 0.97755611 0.77411003 0.93506494 0.97453907
 0.90163934 0.45081967]


In [59]:
multi_predict = multi_model.predict(X_test_tfidf)
multi_accuracy = metrics.accuracy_score(y_test, multi_predict)
multi_f1_score = metrics.f1_score(y_test, multi_predict, average = None)

print("train accuracy : {}".format(multi_accuracy))
print("train f1_score : {}".format(multi_f1_score))

train accuracy : 0.7738980350504514
train f1_score : [0.63117871 0.72       0.72778561 0.72104019 0.81309686 0.81643836
 0.7958884  0.88135593 0.93450882 0.91071429 0.92917167 0.73583093
 0.69732938 0.81907433 0.86559803 0.60728118 0.76286353 0.92225201
 0.57977528 0.24390244]
