In [1]:
import pandas as pd
import numpy as np
import statistics
from scipy import stats

In [2]:
# Load the credit data and discard every row that has a missing value,
# then report the resulting (rows, columns) shape.
dataset = pd.read_csv('credit_data.csv').dropna()
dataset.shape

(1997, 5)

In [3]:
# Preview the DataFrame after the rows with missing values were dropped.
dataset

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [4]:
# Feature matrix: the columns at positions 1, 2 and 3 as a NumPy array.
# In the preview above these are income, age and loan; position 0
# (i#clientid) is an identifier and is left out.
X = dataset.iloc[:,1:4].values
X

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [5]:
# Target vector: the column at position 4 (c#default in the preview above),
# a 0/1 label per client.
y = dataset.iloc[:,4].values
y

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  # Uses several decision trees
from sklearn.metrics import accuracy_score

In [7]:
# Hold-out experiment: 30 stratified 80/20 train/test splits. For every split
# each classifier is fitted on the training part and its test accuracy is
# appended to the matching list, giving 30 accuracies per model.
resultados_nb = []
resultados_lr = []
resultados_rf = []
for i in range(30):
    # The loop index seeds the split, so each repetition uses a different
    # but reproducible partition of the data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, stratify = y, random_state = i
    )

    nb = GaussianNB()
    nb.fit(X_train, y_train)
    resultados_nb.append(accuracy_score(y_test, nb.predict(X_test)))

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    resultados_lr.append(accuracy_score(y_test, lr.predict(X_test)))

    # The forest is itself randomised (bootstrap samples, feature subsets).
    # Seeding it with the same index keeps the whole experiment reproducible;
    # without a random_state the accuracies change on every run.
    rf = RandomForestClassifier(random_state = i)
    rf.fit(X_train, y_train)
    resultados_rf.append(accuracy_score(y_test, rf.predict(X_test)))

In [8]:
# GaussianNB test accuracy for each of the 30 hold-out splits.
print(resultados_nb)

[0.925, 0.925, 0.9325, 0.925, 0.92, 0.905, 0.9175, 0.9175, 0.9125, 0.9325, 0.9225, 0.9125, 0.935, 0.9175, 0.925, 0.9175, 0.9275, 0.92, 0.9325, 0.92, 0.93, 0.905, 0.9175, 0.9325, 0.9425, 0.9375, 0.94, 0.92, 0.935, 0.925]


In [9]:
# LogisticRegression test accuracy for each of the 30 hold-out splits.
print(resultados_lr)

[0.9325, 0.91, 0.9125, 0.9225, 0.9075, 0.89, 0.91, 0.9075, 0.8775, 0.915, 0.9175, 0.9, 0.925, 0.9175, 0.9025, 0.9125, 0.9525, 0.91, 0.9225, 0.9075, 0.925, 0.905, 0.9075, 0.945, 0.9225, 0.9275, 0.9225, 0.9175, 0.91, 0.9]


In [10]:
# RandomForestClassifier test accuracy for each of the 30 hold-out splits.
print(resultados_rf)

[0.9725, 0.9875, 0.98, 0.9975, 0.98, 0.9875, 0.985, 0.985, 0.9825, 0.9875, 0.98, 0.9825, 0.9775, 0.985, 0.9775, 0.9775, 0.985, 0.98, 0.9875, 0.985, 0.98, 0.9775, 0.9775, 0.9825, 0.99, 0.995, 0.9925, 0.985, 0.99, 0.9825]


In [11]:
# Check the container type before converting: the results are plain lists.
type(resultados_nb)

list

In [12]:
# Turn the three accuracy lists into NumPy arrays so the array methods
# (.mean() etc.) can be used on them, then confirm the new type.
resultados_nb, resultados_lr, resultados_rf = map(
    np.array, (resultados_nb, resultados_lr, resultados_rf)
)
type(resultados_nb)

numpy.ndarray

Média

In [13]:
# Mean accuracy per model, in the order (naive Bayes, logistic regression,
# random forest).
tuple(acc.mean() for acc in (resultados_nb, resultados_lr, resultados_rf))

(0.92425, 0.9145, 0.9838333333333332)

Moda

In [14]:
# Most frequent naive Bayes accuracy according to the standard library.
# NOTE(review): in the recorded run 0.925 and 0.9175 both occur 5 times.
# statistics.mode returns the first mode it encounters, whereas scipy's
# stats.mode reports the smallest tied value, so the two cells disagree.
statistics.mode(resultados_nb)

0.925

In [15]:
# Mode and its count for each model's accuracies, using SciPy; order is
# (naive Bayes, logistic regression, random forest).
tuple(stats.mode(acc) for acc in (resultados_nb, resultados_lr, resultados_rf))

(ModeResult(mode=0.9175, count=5),
 ModeResult(mode=0.9075, count=4),
 ModeResult(mode=0.985, count=6))

Mediana

In [16]:
# Median accuracy per model, in the order (naive Bayes, logistic regression,
# random forest).
tuple(np.median(acc) for acc in (resultados_nb, resultados_lr, resultados_rf))

(0.925, 0.9125, 0.98375)

Variância

In [17]:
# Ask NumPy not to use scientific notation when printing.
# NOTE(review): the recorded output below still shows the scalar variances
# in scientific notation, so this mainly helps later array displays.
np.set_printoptions(suppress = True)
# Variance of the accuracies per model (np.var default, i.e. population variance).
tuple(np.var(acc) for acc in (resultados_nb, resultados_lr, resultados_rf))

(8.756250000000001e-05, 0.00020933333333333337, 3.11388888888889e-05)

In [18]:
# Show the individual random forest accuracies (a NumPy array at this point).
resultados_rf

array([0.9725, 0.9875, 0.98  , 0.9975, 0.98  , 0.9875, 0.985 , 0.985 ,
       0.9825, 0.9875, 0.98  , 0.9825, 0.9775, 0.985 , 0.9775, 0.9775,
       0.985 , 0.98  , 0.9875, 0.985 , 0.98  , 0.9775, 0.9775, 0.9825,
       0.99  , 0.995 , 0.9925, 0.985 , 0.99  , 0.9825])

Desvio padrão

In [19]:
# Standard deviation of the accuracies per model, in the order
# (naive Bayes, logistic regression, random forest).
tuple(np.std(acc) for acc in (resultados_nb, resultados_lr, resultados_rf))

(0.00935748363610645, 0.014468356276140472, 0.005580223014261069)

Coeficiente de variação

In [20]:
# Coefficient of variation (std / mean) per model, expressed as a percentage.
tuple(
    stats.variation(acc) * 100
    for acc in (resultados_nb, resultados_lr, resultados_rf)
)

(1.0124407504578252, 1.5821056616884057, 0.5671919038720383)

# EXERCÍCIO: VALIDAÇÃO CRUZADA 

In [21]:
from sklearn.model_selection import cross_val_score, KFold

In [22]:
# Cross-validation experiment: 30 repetitions of shuffled 10-fold CV. For each
# repetition the mean of the 10 fold scores is stored per classifier.
resultados_nb_cv = []
resultados_lr_cv = []
resultados_rf_cv = []

for i in range(30):
    # n_splits is the number of folds the dataset is divided into; the loop
    # index seeds the shuffle so every repetition uses a different but
    # reproducible fold assignment, shared by the three models.
    kfold = KFold(n_splits = 10, shuffle = True, random_state = i)

    nb = GaussianNB()
    scores = cross_val_score(nb, X, y, cv = kfold)
    resultados_nb_cv.append(scores.mean())

    lr = LogisticRegression()
    scores = cross_val_score(lr, X, y, cv = kfold)
    resultados_lr_cv.append(scores.mean())

    # Seed the forest too: it is randomised internally, so without a
    # random_state the scores change on every run.
    rf = RandomForestClassifier(random_state = i)
    scores = cross_val_score(rf, X, y, cv = kfold)
    resultados_rf_cv.append(scores.mean())

# After the loop `scores` still holds the random forest fold scores of the
# last repetition; the following cells inspect it.

In [23]:
# `scores` is left over from the final cross_val_score call of the loop above:
# the 10 per-fold random forest scores for the KFold seeded with 29.
scores

array([0.985     , 0.98      , 0.985     , 0.985     , 0.98      ,
       0.99      , 0.99      , 0.98994975, 0.98994975, 1.        ])

In [24]:
# Mean of those 10 fold scores, i.e. the last value appended to resultados_rf_cv.
scores.mean()

0.9874899497487437

In [26]:
# Mean 10-fold score of GaussianNB for each of the 30 repetitions.
print(resultados_nb_cv)

[0.9248618090452261, 0.9249170854271356, 0.9248894472361808, 0.9228819095477387, 0.9248718592964824, 0.9253894472361809, 0.9243844221105528, 0.9248894472361808, 0.9248718592964824, 0.9238894472361808, 0.9253844221105527, 0.9244020100502512, 0.9253743718592965, 0.924394472361809, 0.9253869346733667, 0.9248819095477387, 0.9258844221105527, 0.924894472361809, 0.9238819095477387, 0.9258844221105529, 0.9258944723618091, 0.9253894472361811, 0.92436432160804, 0.9263844221105527, 0.9228944723618092, 0.9253894472361809, 0.9248869346733668, 0.9253919597989949, 0.9253819095477386, 0.9258844221105527]


In [27]:
# Mean 10-fold score of LogisticRegression for each of the 30 repetitions.
print(resultados_lr_cv)

[0.9113718592964826, 0.9103668341708543, 0.918359296482412, 0.9158768844221106, 0.9133417085427136, 0.9108718592964824, 0.9188366834170856, 0.9188969849246231, 0.913356783919598, 0.9148693467336683, 0.9098718592964824, 0.9183542713567838, 0.9183793969849248, 0.9213618090452261, 0.9098643216080402, 0.914356783919598, 0.9103693467336684, 0.915856783919598, 0.9128542713567839, 0.9123668341708543, 0.9208944723618091, 0.9108668341708542, 0.9103467336683417, 0.9133768844221105, 0.9098592964824121, 0.9148768844221105, 0.910859296482412, 0.9113768844221104, 0.9113668341708543, 0.9098442211055277]


In [28]:
# Mean 10-fold score of RandomForestClassifier for each of the 30 repetitions.
print(resultados_rf_cv)

[0.9854773869346733, 0.9874899497487437, 0.9829748743718593, 0.9859698492462312, 0.9884824120603015, 0.9864824120603016, 0.9884798994974874, 0.9869849246231155, 0.9874798994974874, 0.9879849246231156, 0.9879798994974875, 0.9874773869346735, 0.9879773869346733, 0.9874798994974874, 0.9869874371859296, 0.9894849246231155, 0.9849798994974874, 0.9874748743718593, 0.9864824120603014, 0.9839773869346734, 0.9879773869346733, 0.9879824120603015, 0.9854798994974875, 0.9864748743718593, 0.9879824120603015, 0.9854874371859296, 0.9859773869346734, 0.9864773869346735, 0.9884899497487437, 0.9874899497487437]


In [29]:
# Coefficient of variation (std / mean) of the cross-validated means per
# model, expressed as a percentage.
tuple(
    stats.variation(cv_means) * 100
    for cv_means in (resultados_nb_cv, resultados_lr_cv, resultados_rf_cv)
)

(0.08641071566366061, 0.38801026116292653, 0.1424941724844509)