In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

#### Lendo os dados

In [2]:
# Load the breast-cancer table from a CSV referenced by a relative path.
DATA_FILE = 'breast_cancer_.csv'
df = pd.read_csv(DATA_FILE)
# Preview the first five rows (5 is the default row count of head()).
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0,0


#### Verificando se existem itens faltantes

In [3]:
# Print column dtypes and non-null counts to reveal which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
0     561 non-null float64
1     565 non-null float64
2     558 non-null float64
3     564 non-null float64
4     564 non-null float64
5     560 non-null float64
6     561 non-null float64
7     566 non-null float64
8     558 non-null float64
9     566 non-null float64
10    564 non-null float64
11    559 non-null float64
12    564 non-null float64
13    560 non-null float64
14    563 non-null float64
15    566 non-null float64
16    559 non-null float64
17    566 non-null float64
18    560 non-null float64
19    566 non-null float64
20    559 non-null float64
21    566 non-null float64
22    564 non-null float64
23    563 non-null float64
24    560 non-null float64
25    564 non-null float64
26    563 non-null float64
27    562 non-null float64
28    563 non-null float64
29    565 non-null float64
30    561 non-null float64
y     569 non-null int64
dtypes: float64(31), int64(1)
mem

#### Preenchendo itens faltantes.
Como observado na célula anterior, existem alguns valores faltantes; portanto, a célula seguinte preenche os valores faltantes com a média da coluna.

In [4]:
# Replace every missing value with the mean of its own column.
# fillna() returns a new frame that is re-bound to `df`, so the cell is
# idempotent and does not rely on in-place mutation.
# NOTE(review): the column means are computed over the whole dataset, i.e.
# before the train/test split, so test rows influence the imputed values.
# Consider imputing inside the modelling pipeline instead -- TODO confirm.
df = df.fillna(df.mean())

In [5]:
# Preview the first rows again after the imputation step.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.627451,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0,0


In [6]:
# Re-check the non-null counts after the imputation step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
0     569 non-null float64
1     569 non-null float64
2     569 non-null float64
3     569 non-null float64
4     569 non-null float64
5     569 non-null float64
6     569 non-null float64
7     569 non-null float64
8     569 non-null float64
9     569 non-null float64
10    569 non-null float64
11    569 non-null float64
12    569 non-null float64
13    569 non-null float64
14    569 non-null float64
15    569 non-null float64
16    569 non-null float64
17    569 non-null float64
18    569 non-null float64
19    569 non-null float64
20    569 non-null float64
21    569 non-null float64
22    569 non-null float64
23    569 non-null float64
24    569 non-null float64
25    569 non-null float64
26    569 non-null float64
27    569 non-null float64
28    569 non-null float64
29    569 non-null float64
30    569 non-null float64
y     569 non-null int64
dtypes: float64(31), int64(1)
mem

#### Separação dos dados

In [7]:
# Features are every column except the last one; the last column ('y') is the label.
# df.values is one float array (31 float columns + 1 int column), so y is float too.
# NOTE(review): column '30' looks binary and equals 'y' in the previewed rows --
# verify that it is not a copy of the label leaking into X.
X = df.values[:, :-1]
y = df.values[:, -1]
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Definição dos modelos e padronização

In [8]:
# Hyper-parameter grids. Keys are '<pipeline step>__<parameter>'; the Bagging
# grid reaches one level deeper to tune the wrapped decision tree.
gbParam = {'estimator__learning_rate':[0.1, 0.05, 0.01], 'estimator__n_estimators':[50, 100, 200], 'estimator__max_depth':[3, 5, 7]}
oParam = {'estimator__n_estimators':[50, 100, 200], 'estimator__max_depth':[3, 5, 7]}
bParam = {'estimator__n_estimators':[50, 100, 200], 'estimator__base_estimator__max_depth':[3, 5, 7]}

# Metrics collected for every candidate of every grid search.
SCORING = ['accuracy', 'precision', 'recall']


def grid_search(estimator, param_grid, X, y):
    """Run a 3-fold grid search for `estimator` placed behind a StandardScaler.

    Parameters
    ----------
    estimator : classifier used as the 'estimator' step of the pipeline.
    param_grid : dict mapping 'estimator__<param>' names to candidate values.
    X, y : training features and labels the search is fitted on.

    Returns
    -------
    The fitted GridSearchCV. With refit=False no final model is refitted;
    the per-candidate scores are read from `cv_results_`.

    NOTE: `iid` (removed in scikit-learn 0.24) and `base_estimator` (renamed to
    `estimator` in 1.2) tie this cell to an older scikit-learn. Update both,
    plus the 'estimator__base_estimator__max_depth' grid key, when upgrading.
    """
    pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', estimator)])
    search = GridSearchCV(pipeline, param_grid, cv=3, scoring=SCORING, iid=True,
                          return_train_score=True, refit=False)
    search.fit(X, y)
    return search


# Bagging with Decision Tree.
# random_state is set on the ensemble itself so the bootstrap samples, and
# therefore the reported scores, are reproducible like the other two models.
BDT = grid_search(
    BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                      oob_score=True, n_jobs=-1, random_state=42),
    bParam, X_train, y_train)

# Random Forest
RF = grid_search(RandomForestClassifier(random_state=42), oParam, X_train, y_train)

# Gradient Boosting
GB = grid_search(GradientBoostingClassifier(random_state=42), gbParam, X_train, y_train)

# Avaliação do método Ensemble BaggingClassifier -- Q1

#### Observando os valores do BaggingClassifier com o Decision Tree

In [9]:
# Mean cross-validated test metrics next to the hyper-parameters of each Bagging candidate.
bdt_columns = [
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_recall',
    'param_estimator__base_estimator__max_depth',
    'param_estimator__n_estimators',
]
pd.DataFrame(BDT.cv_results_, columns=bdt_columns)

Unnamed: 0,mean_test_accuracy,mean_test_precision,mean_test_recall,param_estimator__base_estimator__max_depth,param_estimator__n_estimators
0,0.995604,0.993076,1.0,3,50
1,0.995604,0.993076,1.0,3,100
2,0.995604,0.993076,1.0,3,200
3,0.995604,0.993076,1.0,5,50
4,0.995604,0.993076,1.0,5,100
5,0.995604,0.993076,1.0,5,200
6,0.995604,0.993076,1.0,7,50
7,0.995604,0.993076,1.0,7,100
8,0.995604,0.993076,1.0,7,200


Podemos observar que todas as combinações retornaram exatamente a mesma performance.

#### Observando os valores do Random Forest

In [10]:
# Mean cross-validated test metrics next to the hyper-parameters of each Random Forest candidate.
rf_columns = [
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_recall',
    'param_estimator__max_depth',
    'param_estimator__n_estimators',
]
pd.DataFrame(RF.cv_results_, columns=rf_columns)

Unnamed: 0,mean_test_accuracy,mean_test_precision,mean_test_recall,param_estimator__max_depth,param_estimator__n_estimators
0,0.984615,0.976215,1.0,3,50
1,0.982418,0.97611,0.996507,3,100
2,0.978022,0.972607,0.993013,3,200
3,0.995604,0.993076,1.0,5,50
4,0.993407,0.989691,1.0,5,100
5,0.993407,0.989691,1.0,5,200
6,0.995604,0.993076,1.0,7,50
7,0.995604,0.993076,1.0,7,100
8,0.993407,0.989691,1.0,7,200


Podemos observar que nas linhas 3, 6 e 7 foram obtidos os melhores resultados, utilizando os parâmetros
3 ---> max_depth: 5 | n_estimators 50
6 ---> max_depth: 7 | n_estimators 50
7 ---> max_depth: 7 | n_estimators 100

#### Observando os valores do Gradient Boosting

In [11]:
# Mean cross-validated test metrics next to the hyper-parameters of each Gradient Boosting candidate.
gb_columns = [
    'mean_test_accuracy',
    'mean_test_precision',
    'mean_test_recall',
    'param_estimator__learning_rate',
    'param_estimator__max_depth',
    'param_estimator__n_estimators',
]
pd.DataFrame(GB.cv_results_, columns=gb_columns)

Unnamed: 0,mean_test_accuracy,mean_test_precision,mean_test_recall,param_estimator__learning_rate,param_estimator__max_depth,param_estimator__n_estimators
0,0.993407,0.99304,0.996497,0.1,3,50
1,0.993407,0.99304,0.996497,0.1,3,100
2,0.993407,0.99304,0.996497,0.1,3,200
3,0.991209,0.989583,0.996497,0.1,5,50
4,0.991209,0.989583,0.996497,0.1,5,100
5,0.991209,0.989583,0.996497,0.1,5,200
6,0.991209,0.989583,0.996497,0.1,7,50
7,0.991209,0.989583,0.996497,0.1,7,100
8,0.991209,0.989583,0.996497,0.1,7,200
9,0.993407,0.99304,0.996497,0.05,3,50


Já para o GradientBoosting, várias combinações apresentaram a mesma performance, como nas linhas: 0, 1, 2, 9, 10, 11 e 20

#### Criando um modelo com os melhores parâmetros obtidos no GridSearch, treinado em todo o conjunto de treino

O RandomForest e o Bagging com DecisionTree obtiveram os melhores resultados. Portanto irei escolher o RandomForest por ser mais rápido para mim.

O StandardScaler é ajustado apenas nos dados de treino e usado para transformá-los.
Os dados de teste devem ser apenas transformados (sem novo ajuste); por isso o scaler ajustado é guardado em uma variável.

In [12]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to the test split (the test split is never fitted on).
scaler = StandardScaler()
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

In [13]:
# Train the final Random Forest with one of the top grid-search settings
# (max_depth=5, n_estimators=50) on the scaled training split.
# random_state matches the seed used during the grid search, so this cell
# builds the same forest -- and reports the same scores -- on every run.
rf = RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42)
rf.fit(X_train_trans, y_train)
pred = rf.predict(X_test_trans)
# Accuracy, precision and recall on the held-out test split.
accuracy_score(y_test, pred), precision_score(y_test, pred), recall_score(y_test, pred)

(1.0, 1.0, 1.0)

O modelo ficou bem melhor, com todos os dados, acertando todos os dados do conjunto de teste.

# Criar uma rede MultiLayerPerceptron -- Q2

In [14]:
# Minimal MLP: one hidden ReLU layer followed by a single sigmoid unit for the binary label.
n_features = X_train.shape[1]
mpl = Sequential([
    Dense(32, activation='relu', input_shape=(n_features,)),
    Dense(1, activation='sigmoid'),
])
# TODO: also track precision and recall.
mpl.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

Não sei muito bem como funcionam as camadas, mas o problema é bem simples; portanto usei apenas uma camada densa oculta, seguida de uma camada de saída com 1 unidade e ativação sigmoid, que produz valores entre 0 e 1. Usei binary crossentropy como loss, pois estamos trabalhando com um problema de classificação binária.

In [15]:
# Print the layer-by-layer architecture and parameter counts.
mpl.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                1024      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,057
Trainable params: 1,057
Non-trainable params: 0
_________________________________________________________________


In [16]:
# The training split already holds 80% of the dataset; 10% of that split is
# held out for validation while fitting.
# NOTE(review): X_train is passed unscaled here even though a fitted
# StandardScaler exists -- consider training on X_train_trans instead (the
# data used for evaluation would then need the same transform).
history = mpl.fit(x=X_train, y=y_train, epochs=100, validation_split=0.1);

Train on 409 samples, validate on 46 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [17]:
# One row per epoch, ordered from highest to lowest training accuracy ('acc').
history_df = pd.DataFrame(history.history)
history_df.sort_values(by='acc', ascending=False)

Unnamed: 0,loss,acc,val_loss,val_acc
97,0.140538,0.955990,2.453725,0.565217
75,0.206822,0.948655,1.636848,0.695652
70,0.193087,0.946210,0.808793,0.891304
81,0.247838,0.943765,2.049324,0.760870
57,0.232228,0.941320,2.111295,0.760870
...,...,...,...,...
47,0.644376,0.823961,1.934744,0.695652
1,2.921303,0.823961,4.956324,0.826087
2,2.853500,0.819071,4.724637,0.847826
4,2.445927,0.811736,3.663115,0.869565


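A época com a melhor acurácia de treino foi a de índice 97 (primeira linha da tabela acima); note que a acurácia de validação nessa mesma época foi de apenas 0.565.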

In [18]:
# Run one more epoch on the whole training split (no validation hold-out).
# The earlier `history` object is deliberately NOT passed as a callback: Keras
# already records each fit() call in its own History object, and re-using the
# old one as a callback would alter the per-epoch log of the 100-epoch run.
mpl.fit(X_train, y_train, epochs=1);

Train on 455 samples


#### Realizando o teste do modelo.

In [19]:
# Compute loss and accuracy on the held-out test split; the trailing semicolon
# hides the returned values, leaving only Keras' own progress output.
mpl.evaluate(x=X_test, y=y_test);



# Uau ele foi muito bem :)