# Pré processamento da Base de Dados

# Importando Bibliotecas

In [82]:
import pandas as pd # Para tratar dos Data Series e Data Frames
from sklearn.impute import SimpleImputer # Essa biblioteca cuida dos dados faltantes
import numpy as np # Biblioteca de Álgebra Linear em Python
from sklearn.preprocessing import StandardScaler # Cuidará do escalonamento dos atributos
from sklearn.model_selection import train_test_split # Função que fará automaticamente a divisão da base de dados
from sklearn.naive_bayes import GaussianNB # Algoritmo Naive Bayes para treinamento supervisionado

In [50]:
dados_cred = pd.read_csv('credit_data.csv')

In [51]:
dados_cred

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [52]:
# Criação das variáveis previsores e classe (matriz de recursos e dependente)
previsores = dados_cred.iloc[:,1:4].values
classe = dados_cred.iloc[:,4].values


In [53]:
print(previsores)

[[6.61559251e+04 5.90170151e+01 8.10653213e+03]
 [3.44151540e+04 4.81171531e+01 6.56474502e+03]
 [5.73171701e+04 6.31080495e+01 8.02095330e+03]
 ...
 [4.43114493e+04 2.80171669e+01 5.52278669e+03]
 [4.37560566e+04 6.39717958e+01 1.62272260e+03]
 [6.94365796e+04 5.61526170e+01 7.37883360e+03]]


In [54]:
print(classe)

[0 0 0 ... 1 0 0]


In [57]:
# Renomeando colunas
dados_cred.rename({'clientid':'ID Cliente','income':'Renda Anual','age':'Idade','loan':\
                   'Emprestado','default':'Quitado'},axis=1, inplace=True)

In [58]:
dados_cred

Unnamed: 0,ID Cliente,Renda Anual,Idade,Emprestado,Quitado
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


# Tratamento de valores inconsistentes

In [9]:
# Verificando os valores NaN ('Not a number')
dados_cred.isnull().sum()

ID Cliente     0
Renda Anual    0
Idade          3
Emprestado     0
Quitado        0
dtype: int64

In [59]:
# Deletando a coluna ID Cliente
dados_cred.drop('ID Cliente',axis=1,inplace=True)

In [60]:
# Verificando quarenta primeiros registros
dados_cred.head(41)

Unnamed: 0,Renda Anual,Idade,Emprestado,Quitado
0,66155.925095,59.017015,8106.532131,0
1,34415.153966,48.117153,6564.745018,0
2,57317.170063,63.108049,8020.953296,0
3,42709.534201,45.751972,6103.64226,0
4,66952.688845,18.584336,8770.099235,1
5,24904.06414,57.471607,15.498598,0
6,48430.359613,26.809132,5722.581981,0
7,24500.141984,32.897548,2971.00331,1
8,40654.892537,55.496853,4755.82528,0
9,25075.872771,39.776378,1409.230371,0


In [61]:
# Separando os dados com problemas localizados acima
dados_cred.loc[[15,21,26,28,30,31]]

Unnamed: 0,Renda Anual,Idade,Emprestado,Quitado
15,50501.726689,-28.218361,3977.287432,0
21,32197.620701,-52.42328,4244.057136,0
26,63287.038908,-36.496976,9595.286289,0
28,59417.805406,,2082.625938,0
30,48528.852796,,6155.78467,0
31,23526.302555,,2862.010139,0


In [62]:
# Corrigindo dados com valores negativos da coluna Idade
dados_cred.loc[dados_cred['Idade']<0]

Unnamed: 0,Renda Anual,Idade,Emprestado,Quitado
15,50501.726689,-28.218361,3977.287432,0
21,32197.620701,-52.42328,4244.057136,0
26,63287.038908,-36.496976,9595.286289,0


In [63]:
# Média dos valores negativos
dados_cred['Idade'][dados_cred.Idade>0].mean()

40.92770044906149

In [64]:
# Substituição dos valores negativos pela média dos mesmos valores
dados_cred.loc[dados_cred.Idade < 0,'Idade'] = 40.92

In [65]:
# Tratando dos valores NaN ('Not a number')
dados_cred.loc[[28,30,31]]

Unnamed: 0,Renda Anual,Idade,Emprestado,Quitado
28,59417.805406,,2082.625938,0
30,48528.852796,,6155.78467,0
31,23526.302555,,2862.010139,0


In [None]:
# substituição dos valores 'NaN' 
# from sklearn.impute import SimpleImputer - Uso da biblioteca scikit learn(Responsável pela
# correção dos valores faltantes)
# import numpy as np - Uso da biblioteca Numpy
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(previsores[:, 0:3]) # Método fit que ajustará os valores
previsores[:, 0:3] = imputer.transform(previsores[:,0:3]) # Comando que transformará os dados faltantes

# Escalonamento dos Atributos

In [76]:
# Deixando os atributos de previsores na mesma escala

# from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Treinamento da Base de Dados (previsores e classe) 

# Aprendizagem Supervisionada - Naive Bayes

In [79]:
# from sklearn.model_selection import train_test_split - Função que fará automaticamente a divisão da base de dados
previsores_treinamento,previsores_teste, classe_treinamento, classe_teste = train_test_split\
(previsores,classe,test_size=0.25, random_state=0)

In [83]:
# from sklearn.naive_bayes import GaussianNB
classificador = GaussianNB()
classificador.fit(previsores_treinamento,classe_treinamento) # Construção da tabela de probabilidade com Naive Bayes

GaussianNB()

In [84]:
previsoes = classificador.predict(previsores_teste) # Criação da variável previsões para análise de probabilidades

In [85]:
# Análise entre a classe_teste e a previsão do Naive Bayes(variável previsoes)
print(previsoes)

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 

In [86]:
print(classe_teste)

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0
 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 

In [90]:
# Importando o módulo Metrics para verificar a acurácia do algoritmo Naive Bayes
from sklearn.metrics import confusion_matrix, accuracy_score
precisao = accuracy_score(classe_teste, previsoes)

In [94]:
# Percentual de acerto do Naive Bayes nessa base de dados
precisao// 100

0.0

In [91]:
# Quantos registros o algoritmo Naive Bayes acertou
matriz = confusion_matrix(classe_teste, previsoes)

In [93]:
# Dos 500 registros analisados para ler corretamente os dados abaixo é preciso somar
# 430 + 39 = 469. Portanto dos 500 registros de teste 469 o Naive Bayes acertou!

# Para saber o percentual de erros basta somar 25 + 6 = 31. Se for somado 430 + 6 + 25 + 39 
# teremos o total dos registros que é 500.

# O Naive Bayes acertou mais na linha dos clientes que não pagaram os emprestimos
#( valor 1 da classe)
# E o Naive Bayes errou a parte dos clientes que pagaram seus empréstimos(valor 0 da classe)
print(matriz)

[[430   6]
 [ 25  39]]
