### 분류 : Naive Bayes
1. 사전확률 및 추가정보를 기반으로 사후 확률을 추론 : 베이즈 추정 기반 분류
2. 종속변수 각 범주의 등장 빈도를 사전확률(prior) 설정이 중요
3. 각 데이터의 사전 확률을 기반으로 사후 확률(posterior)을 계산

In [115]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

|라벨|특징벡터|
|---|---|
| 1|[1,2,3,4,5,6,7,8]|
| 2|[1,1,3,4,5,6,6,7]|
| 3|[2,1,2,3,4,8,8,8]|

#### 테스트 데이터

|테스트 데이터|
|---|
|[2,2,4,5,6,8,8,8]|


#### 1. 정규분포(가우시안 분포)
$$
{p(x; \mu, \sigma^{2}) = \frac{1}{\sqrt{2\pi\sigma^{2}}}\exp^{\Bigl\{-\frac{(x-\mu)^{2}}{2\sigma^{2}}\Bigr\}}
}
$$ 

$$
x=연속변수, \mu=평균, \sigma=분산
$$

In [None]:
- GaussianNB클래스로 모델 객체를 생성하고 독립변수와 종속변수를 fit()으로 지정해서 실행한다
- predict_proba(X)로 예측 확률값을 계산한다
- 만약에 이진분류를 할 경우 출력된 예측 확률값의 두 번째 열이 1이 될 확률로 계산된다

In [14]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
X = np.array([[1,2,3,4,5,6,7,8],
              [1,1,3,4,5,6,6,7],
              [2,1,2,3,4,8,8,8]]) #특징 벡터

y = np.array([1,2,3]) # y라벨

t = np.array([2,2,4,5,6,8,8,8]).reshape(1,-1) # 테스트 데이터

clf = GaussianNB()
clf.fit(X,y)

clf.predict(t)
# clf.predict(X[[2]])

array([1])

In [20]:
X

array([[1, 2, 3, 4, 5, 6, 7, 8],
       [1, 1, 3, 4, 5, 6, 6, 7],
       [2, 1, 2, 3, 4, 8, 8, 8]])

2. 베르누이 분포
    - 특징이 0과 1로 표현되는 벡터형을 사용할 때 
$$
{p(x;q) = q^{x}(1-q)^{1-x}
}
$$

In [15]:
from sklearn.naive_bayes import BernoulliNB
clf = GaussianNB()
clf.fit(X,y)

clf.predict(t)

array([1])

3. 다항분포: MultinomialNB(*, alpha=1.0, fit_prior=True, class_prior=None)

In [19]:
from sklearn.naive_bayes import MultinomialNB
t = np.array([2,3,4,5,6,8,8,8])
clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf.fit(X,y)

clf.predict([t])

array([1])

In [52]:
# 1. iris 로드해서 GaussianNB()로 분류해보자 _accuracy_
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn import datasets

# X, y = load_iris(return_X_y=True)
iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

model = GaussianNB()
model.fit(X_train,y_train)

predicted =model.predict(X_test)

metrics.accuracy_score(y_test, predicted)


0.9466666666666667

In [53]:
model.theta_

array([[4.97586207, 3.35862069, 1.44827586, 0.23448276],
       [5.935     , 2.71      , 4.185     , 1.3       ],
       [6.77692308, 3.09230769, 5.73461538, 2.10769231]])

In [64]:
res = model.predict_proba(X) #예측 확률값
# res

In [65]:
# 80% 이상 예측확률만 리턴
res_class = (res>0.8) #True FALSE로 리턴
res_class = (res>0.8)+0
# res_class

In [69]:
#2. diabates.cvs를 로드해서, BMI가 0초과한 데이터만 사용해서 나이브 베이즈 분류
# Outcome을 종속 변수로 하고 나머지를 독립변수로 할 때 종속 변수의 사전 확률은
import pandas as pd
df = pd.read_csv('../../ML_DATA/diabetes.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [77]:
df_sub = df.loc[df['BMI']>0,]
df_sub

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [78]:
#2-2 Outcome을 종속 변수로 하고 나머지를 독립변수로 할 때 종속 변수의 사전 확률은?
# - 만약에 이진분류를 할 경우 출력된 예측 확률값의 두 번째 열이 1이 될 확률로 계산된다
df_sub['Outcome'].value_counts(normalize=True) #0.351387

0    0.648613
1    0.351387
Name: Outcome, dtype: float64

In [136]:
# 3. 혈당 혈압 나이을 독립변수로 하고 당뇨발병 여부를 종속 변수로 했을 때 정확도는 얼마인가
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn import datasets
df = pd.read_csv('../../ML_DATA/diabetes.csv')
df.head()
# Glucose BloodPressure Age y = Outcome
X = df[['Glucose','BloodPressure','Age']]
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

model = GaussianNB()
model.fit(X_train,y_train)

predicted =model.predict(X_test)

metrics.accuracy_score(y_test, predicted)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


0.7447916666666666

In [137]:
# 3. 혈당 혈압 나이을 독립변수로 하고 당뇨발병 여부를 종속 변수로 했을 때 정확도는 얼마인가
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn import datasets
df = pd.read_csv('../../ML_DATA/diabetes.csv')
df.head()
# Glucose BloodPressure Age y = Outcome
X = df.loc[:,['Glucose','BloodPressure','Age']]
y = df['Outcome']

model = GaussianNB().fit(X,y)
model.fit(X,y)

pred = model.predict_proba(X)

pred_class = (pred[:, 1]>0.5)+0
pred_class[:4]

accuracy_score(y_pred= pred_class, y_true=y)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


array([1, 0, 1, 0])

0.7552083333333334


 4. 임신유무, 연령대, BMI, 혈당을 독립변수로 하고 
 - 당뇨별 발병 유무를 종속 변수로 했을 때 나이브베이즈와 로지스틱회귀분석을 실시하고 둘 중 정확도가 높은 모델
 - 조건 1 : 
    - BMI가 0을 초과한 것을 사용한다 
    - 데이터 세트를 8:2로 분할
    - seed는 123
 - 조건2
     - 연령 age가 21인 경우 20으로 , 39일 경우 30으로 계산
 - 조건3
     - 조지스틱회귀분설을 실시하고 임계값을 0.5로 한다

In [128]:
#4.번 문제
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('../../ML_DATA/diabetes.csv')
df = df.loc[df['BMI']>0,]

# 조건1-1 BMI가 0을 초과한 것을 사용한다
X = df.loc[:,['Pregnancies','Age','BMI','Glucose','Outcome']]
y = df['Outcome']

# 조건 2 연령 age가 21인 경우 20으로 , 39일 경우 30으로 계산 -> 버림
X['Age'].replace(21,20, inplace=True)
X['Age'].replace(39,30, inplace=True)

# 조건 1-2, 1-3 연령 age가 21인 경우 20으로 , 39일 경우 30으로 계산
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)


# 조건 3 조지스틱회귀분설을 실시하고 임계값을 0.5로 한다


Unnamed: 0,Pregnancies,Age,BMI,Glucose
0,6,50,33.6,148
1,1,31,26.6,85
2,8,32,23.3,183
3,1,21,28.1,89
4,0,33,43.1,137
...,...,...,...,...
763,10,63,32.9,101
764,2,27,36.8,122
765,5,30,26.2,121
766,1,47,30.1,126


In [129]:
# X['Age'].loc[X['Age']==30]

In [152]:
model = GaussianNB().fit(X_train,y_train)
model.fit(X_train,y_train)

pred = model.predict_proba(X_test)

pred_class = (pred[:, 1]>0.5)+0
pred_class[:4]

accuracy_score(y_pred= pred_class, y_true=y_test)

array([1, 0, 0, 1])

0.7447916666666666

In [153]:
logistic = LogisticRegression()
logistic.fit(X_train,y_train)

logistic_pridict = logistic.predict(X_test)
accuracy_score(y_pred= logistic_pridict, y_true=y_test)

0.7369791666666666

In [154]:
import pandas as pd
df = pd.read_csv('../../ML_DATA/diabetes.csv')
df
#4-2 조건
#BMI가 0 초과한 것을 사용하며 학습/ 평가 데이터 세트를 8:2로 분할, Seed는 123으로 한다
#연령 AGE가 21인 경우 20으로, 39일경우 30으로 계산한다.
df = df.loc[df["BMI"] > 0, ]
df["Age_g"] = (df["Age"] // 10) * 10
df["is_preg"] = (df["Pregnancies"] > 0) + 0
df.head(2)
df_train, df_test = train_test_split(df, train_size = 0.8, random_state = 123)
df_train.head(2)
model = GaussianNB().fit(X = df_train.loc[:, ["is_preg", "Age_g", "BMI", "Glucose"]],
y = df_train["Outcome"])
pred = model.predict_proba(df_test.loc[:, ["is_preg", "Age_g", "BMI", "Glucose"]])
pred[:4, ]
accuracy_score(y_pred = (pred[:, 1] > 0.5) + 0,
y_true = df_test["Outcome"])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age_g"] = (df["Age"] // 10) * 10
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["is_preg"] = (df["Pregnancies"] > 0) + 0


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Age_g,is_preg
0,6,148,72,35,0,33.6,0.627,50,1,50,1
1,1,85,66,29,0,26.6,0.351,31,0,30,1


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Age_g,is_preg
247,0,165,90,33,680,52.3,0.427,23,0,20,0
659,3,80,82,31,70,34.2,1.292,27,1,20,1


array([[0.09436402, 0.90563598],
       [0.74783283, 0.25216717],
       [0.11042961, 0.88957039],
       [0.57991266, 0.42008734]])

0.8026315789473685

In [155]:
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression()
model_lr.fit(X = df_train.loc[:, ["is_preg", "Age_g", "BMI", "Glucose"]],
y = df_train["Outcome"])
pred_lr = model_lr.predict_proba(df_test.loc[:, ["is_preg", "Age_g", "BMI", "Glucose"]])
pred_lr = pred_lr[:, 1]
pred_lr_class = (pred_lr > 0.5) + 0
accuracy_score(y_true = df_test["Outcome"], y_pred = pred_lr_class)

0.8289473684210527