-- Bayes Theorem --

Bayes' theorem describes the probability of an event based on prior knowledge of conditions that may be related to the event.

Example applications:
- spam filters;
- sentiment analysis (emotion mining);
- document classification (separation of documents).

##### Credit Risk

In [None]:
import pandas as pd

# load the credit-risk table from disk
base_credit_risk = pd.read_csv('../examples/credit_risk.csv')

# the first four columns are the categorical features,
# the fifth column is the result to be predicted
x_credit_risk = base_credit_risk.iloc[:, :4].values
y_credit_risk = base_credit_risk.iloc[:, 4].values

In [None]:
# display the feature matrix (categories) that will be used for training
x_credit_risk

In [None]:
# display the target values (results) that will be used for training
y_credit_risk

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# one independent encoder per categorical column, so every column keeps
# its own label -> integer mapping
(
    label_encoder_history,
    label_encoder_debt,
    label_encoder_collateral,
    label_encoder_income,
) = (LabelEncoder() for _ in range(4))

In [None]:
# replace each categorical column, in place, by the integer codes produced
# by its dedicated encoder (column order: history, debt, collateral, income)
column_encoders = (
    label_encoder_history,
    label_encoder_debt,
    label_encoder_collateral,
    label_encoder_income,
)
for column_index, column_encoder in enumerate(column_encoders):
    x_credit_risk[:, column_index] = column_encoder.fit_transform(
        x_credit_risk[:, column_index]
    )

# display the encoded feature matrix
x_credit_risk

In [None]:
# persist the encoded features and the targets so they can be loaded later
# without repeating the encoding steps
import pickle

credit_risk_payload = [x_credit_risk, y_credit_risk]
with open('../examples/credit_risk.pkl', 'wb') as pickle_file:
    pickle.dump(credit_risk_payload, pickle_file)

In [None]:
# fit a Gaussian Naive Bayes classifier on the credit-risk features/targets
naive_credit_risk = GaussianNB()
naive_credit_risk.fit(X=x_credit_risk, y=y_credit_risk)

In [None]:
# two hand-encoded example customers to classify
samples_to_classify = [
    # good history (0), high debt (0), no collateral (1), income > 35 (2)
    # -> expected result: "baixo"
    [0, 0, 1, 2],
    # bad history (2), high debt (0), adequate collateral (0), income < 15 (0)
    # -> expected result: "moderado"
    [2, 0, 0, 0],
]

# run the prediction algorithm on both samples
prevision = naive_credit_risk.predict(samples_to_classify)
# display the predicted classes
prevision

##### Base Credit data

In [None]:
import pickle

# load the pre-processed credit data set (train/test features and targets)
# NOTE: unpickling can execute arbitrary code; only load files you trust
with open('../examples/credit.pkl', 'rb') as credit_file:
    credit_splits = pickle.load(credit_file)

x_credit_trainning, y_credit_trainning, x_credit_test, y_credit_test = credit_splits

In [None]:
# display the shapes of the training features and training targets
x_credit_trainning.shape, y_credit_trainning.shape

In [None]:
# display the shapes of the test features and test targets
x_credit_test.shape, y_credit_test.shape

In [None]:
from sklearn.naive_bayes import GaussianNB

# fit a Gaussian Naive Bayes classifier on the credit training split
naive_credit_data = GaussianNB()
naive_credit_data.fit(X=x_credit_trainning, y=y_credit_trainning)

In [None]:
# classify every record of the credit test split
predict = naive_credit_data.predict(X=x_credit_test)
# display the predicted labels
predict

In [None]:
from sklearn.metrics import accuracy_score

# accuracy of the predictions compared against the true test labels
accuracy_score(y_true=y_credit_test, y_pred=predict)

In [None]:
from sklearn.metrics import confusion_matrix

# rows are the true classes and columns the predicted classes; entries on
# the main diagonal (row index == column index) count the records that
# were classified correctly
confusion_matrix(y_true=y_credit_test, y_pred=predict)

#### Base census

In [None]:
import pickle

# load the pre-processed census data set (train/test features and targets)
# NOTE: unpickling can execute arbitrary code; only load files you trust
with open('../examples/census.pkl', 'rb') as census_file:
    census_splits = pickle.load(census_file)

x_census_trainning, y_census_trainning, x_census_test, y_census_test = census_splits

In [None]:
# display the shapes of the training features and training targets
x_census_trainning.shape, y_census_trainning.shape

In [None]:
# display the shapes of the test features and test targets
x_census_test.shape, y_census_test.shape

In [None]:
from sklearn.naive_bayes import GaussianNB

# build the Gaussian Naive Bayes classifier and fit it on the census
# training split (fit returns the fitted estimator itself)
naive_census = GaussianNB().fit(x_census_trainning, y_census_trainning)

# classify every record of the census test split
predict = naive_census.predict(X=x_census_test)
# display the predicted labels
predict

In [None]:
from sklearn.metrics import accuracy_score

# accuracy of the census predictions compared against the true test labels
accuracy_score(y_true=y_census_test, y_pred=predict)

In [None]:
from sklearn.metrics import confusion_matrix

# rows are the true classes and columns the predicted classes; the main
# diagonal counts the correctly classified census records
confusion_matrix(y_true=y_census_test, y_pred=predict)