# Scikit Learn을 사용한 나이브 베이지안 분류

* Naive 어리숙한 : 모든 사건을 독립적이라고 가정

In [2]:
from google.colab import files
uploaded = files.upload()
filename = list(uploaded.keys())[0]

Saving weather.csv to weather.csv


In [3]:
import pandas as pd
data = pd.read_csv(filename, sep=',')

In [5]:
# Naive Bayes 모듈 중 다항분포 나이브 베이스 가져와 인스턴스화 하기
from sklearn.naive_bayes import MultinomialNB
Multinomial_model = MultinomialNB()


In [6]:
Multinomial_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
# mapping dic 만들기
outlook_dic = {'overcast':0, 'rainy':1, 'sunny':2}
temperature_dic = {'cool':0, 'hot':1, 'mild':2}
humidity_dic = {'high':0, 'normal':1}
windy_dic = {False:0, True:1}

In [8]:
# 딕셔너리를 활용해 데이터에 매핑
data['outlook'] = data['outlook'].map(outlook_dic)
data['temperature'] = data['temperature'].map(temperature_dic)
data['humidity'] = data['humidity'].map(humidity_dic)
data['windy'] = data['windy'].map(windy_dic)

In [9]:
data

Unnamed: 0,outlook,temperature,humidity,windy,play
0,0,1,0,0,yes
1,0,0,1,1,yes
2,0,2,0,1,yes
3,0,1,1,0,yes
4,1,2,0,0,yes
5,1,0,1,0,yes
6,1,0,1,1,no
7,1,2,1,0,yes
8,1,2,0,1,no
9,2,1,0,0,no


In [14]:
# multinational 모델 학습
Multinomial_model.fit(data.iloc[:, :4], data['play'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# 오늘 날씨 예측
Multinomial_model.predict([[1, 2, 0, 1]])

array(['yes'], dtype='<U3')

In [17]:
# 계산된 확률
Multinomial_model.predict_proba([[2, 2, 0, 1]])

array([[0.57311795, 0.42688205]])

연속적인 데이터 iris 나이브 베이지안 분류

In [18]:
from sklearn.datasets import load_iris
iris = load_iris()

In [19]:
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target

In [20]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [21]:
# 가우시안 모델 인스턴스화
from sklearn.naive_bayes import GaussianNB
gaussian_model = GaussianNB()

In [22]:
# 데이터를 학습데이터와 테스트 데이터로 분할 후 학습 데이터로 학습

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris_df.iloc[:, :4], iris_df['species'], test_size=0.33)

In [24]:
gaussian_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
# 테스트 데이터로 성능평가
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, gaussian_model.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.95      1.00      0.97        18
           2       1.00      0.94      0.97        17

    accuracy                           0.98        50
   macro avg       0.98      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50



In [26]:
print(confusion_matrix(y_test, gaussian_model.predict(X_test)))

[[15  0  0]
 [ 0 18  0]
 [ 0  1 16]]
