# Naive Bayes

In [1]:
import pandas as pd
data = pd.read_csv('weather.csv', sep=',')

In [2]:
data

Unnamed: 0,outlook,temperature,humidity,windy,play
0,overcast,hot,high,False,yes
1,overcast,cool,normal,True,yes
2,overcast,mild,high,True,yes
3,overcast,hot,normal,False,yes
4,rainy,mild,high,False,yes
5,rainy,cool,normal,False,yes
6,rainy,cool,normal,True,no
7,rainy,mild,normal,False,yes
8,rainy,mild,high,True,no
9,sunny,hot,high,False,no


In [3]:
# 인스턴스화
from sklearn.naive_bayes import MultinomialNB

In [4]:
multinomial_model = MultinomialNB()

In [5]:
multinomial_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [6]:
# 숫자로 변환 
# mapping dictionary

outlook_dic = {"overcast":0, "rainy":1, "sunny":2}
temperature_dic = {'cool':0, 'hot':1, 'mild':2}
humidity_dic = {'high':0, 'normal':1}
windy_dic = {False:0, True:1}

In [7]:
# 딕셔너리를 활용해 데이터 맵핑

data['outlook'] = data['outlook'].map(outlook_dic)
data['temperature'] = data['temperature'].map(temperature_dic)
data['humidity'] = data['humidity'].map(humidity_dic)
data['windy'] = data['windy'].map(windy_dic)


In [8]:
data

Unnamed: 0,outlook,temperature,humidity,windy,play
0,0,1,0,0,yes
1,0,0,1,1,yes
2,0,2,0,1,yes
3,0,1,1,0,yes
4,1,2,0,0,yes
5,1,0,1,0,yes
6,1,0,1,1,no
7,1,2,1,0,yes
8,1,2,0,1,no
9,2,1,0,0,no


In [9]:
# 모델 학습
multinomial_model.fit(data.iloc[:, :4], data['play'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# 예측
multinomial_model.predict([[1, 2, 0, 1]])

array(['yes'], dtype='<U3')

In [11]:
# 계산된 확률
multinomial_model.predict_proba([[2, 2, 0, 1]])

array([[0.57311795, 0.42688205]])

## 연속적인 데이터

In [12]:
from sklearn.datasets import load_iris
iris = load_iris()

In [13]:
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target

In [14]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [15]:
# 가우시안 모델 인스턴스화
from sklearn.naive_bayes import GaussianNB
gaussian_model = GaussianNB()

In [16]:
# 트레인/테스트 분할 후 학습 데이터로 학습

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris_df.iloc[:, :4],
                                                    iris_df['species'],
                                                    test_size=0.33)

In [17]:
gaussian_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [18]:
# 성능 평가

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, gaussian_model.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        20
           2       0.91      1.00      0.95        20

    accuracy                           0.96        50
   macro avg       0.97      0.97      0.97        50
weighted avg       0.96      0.96      0.96        50



In [19]:
print(confusion_matrix(y_test, gaussian_model.predict(X_test)))

[[10  0  0]
 [ 0 18  2]
 [ 0  0 20]]
