# Modelagem

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

In [2]:
data = load_breast_cancer()

In [3]:
#Atributos
X = pd.DataFrame(data=data.data, columns=data.feature_names)
#Variável dependente
y = pd.DataFrame(data=data.target, columns=['diagnostic'])

In [4]:
df = pd.concat([X, y],axis = 1)

In [5]:
matrix_corr = df.corr()

In [6]:
not_related_cols = matrix_corr['diagnostic'].abs().sort_values().head().index.tolist()

In [7]:
df.drop(not_related_cols, axis=1, inplace=True)

Não há dados ausentes então eu não preciso pensar em fazer algum tipo de imputação.

Vou fazer a separação de treino e teste. Depois criar a pipeline com a normalização, logo em seguida aplicar o modelo de regressão logística e de árvore de decisão.

Porém não verifiquei se havia outliers.

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, test_index in split.split(df, df['diagnostic']):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

In [9]:
diagnostic = strat_train_set.drop('diagnostic', axis=1)
diagnostic_labels = strat_train_set['diagnostic'].copy()

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

diagnostic_num_tr = num_pipeline.fit_transform(diagnostic)

In [11]:
# Pensando num caso maior, que haveria outros tipos de colunas para trabalhar.
from sklearn.compose import ColumnTransformer

num_attribs = list(diagnostic)
cat_attribs = []

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    # ('cat', OneHotEncoder(), cat_attribs)
])

diagnostic_prepared = full_pipeline.fit_transform(diagnostic)

In [12]:
# Modelagem
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(diagnostic_prepared, diagnostic_labels)

In [13]:
# Fazendo teste em alguns instância do treinamento
some_data = diagnostic.iloc[:5]
some_labels = diagnostic_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predições: ", log_reg.predict(some_data_prepared))


Predições:  [1 1 1 1 1]


In [14]:
print("Rótulos:", list(some_labels))

Rótulos: [1, 1, 1, 1, 1]


In [15]:
# Verificando a medida de desempenho sobre o treinamento
from sklearn.metrics import accuracy_score, precision_score
diagnostic_predictions = log_reg.predict(diagnostic_prepared)
acc = accuracy_score(diagnostic_predictions, diagnostic_labels)
prc = precision_score(diagnostic_predictions, diagnostic_labels)
print('Acurácia: ', acc, '\nPrecisão: ', prc)

Acurácia:  0.9874371859296482 
Precisão:  0.996


Utilizando o outro modelo:

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(diagnostic_prepared, diagnostic_labels)

In [17]:
# Fazendo teste em alguns instância do treinamento
some_data = diagnostic.iloc[:5]
some_labels = diagnostic_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predições: ", knn.predict(some_data_prepared))

Predições:  [1 1 1 1 1]


In [18]:
diagnostic_predictions = knn.predict(diagnostic_prepared)
acc = accuracy_score(diagnostic_predictions, diagnostic_labels)
prc = precision_score(diagnostic_predictions, diagnostic_labels)
print('Acurácia: ', acc, '\nPrecisão: ', prc)

Acurácia:  0.9798994974874372 
Precisão:  0.992


Validação Cruzada, para melhor avaliação

In [19]:
from sklearn.model_selection import cross_val_score

scores_lreg = cross_val_score(log_reg, diagnostic_prepared, diagnostic_labels,
                             scoring='accuracy', cv=10)

scores_knn = cross_val_score(knn, diagnostic_prepared, diagnostic_labels,
                             scoring='accuracy', cv=10)


In [20]:
print(scores_lreg)

[1.         0.975      0.975      0.925      0.975      1.
 1.         0.975      1.         0.97435897]


In [21]:
print(scores_knn)

[1.         0.95       1.         0.975      0.925      0.95
 0.925      1.         1.         0.97435897]


## Fine-Tune do Modelo

Buscar hiperparâmetros dos dois modelos utilizados e fazer os testes.

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], 
              "solver": ['lbfgs', 'liblinear']}# l1 lasso l2 ridge

log_reg_gs = LogisticRegression()
grid_search = GridSearchCV(log_reg_gs, param_grid, scoring = 'accuracy',
                           cv=5, return_train_score=True)

grid_search.fit(diagnostic_prepared, diagnostic_labels)


In [23]:
grid_search.best_params_

{'C': 0.8, 'solver': 'liblinear'}

In [24]:
grid_search.best_estimator_

In [25]:
cv_res = grid_search.cv_results_
for acc, params in zip(cv_res['mean_test_score'], cv_res['params']):
    print(acc, params)

0.977373417721519 {'C': 0.1, 'solver': 'lbfgs'}
0.9748734177215189 {'C': 0.1, 'solver': 'liblinear'}
0.9799050632911392 {'C': 0.2, 'solver': 'lbfgs'}
0.9774050632911392 {'C': 0.2, 'solver': 'liblinear'}
0.9799050632911392 {'C': 0.3, 'solver': 'lbfgs'}
0.9774050632911392 {'C': 0.3, 'solver': 'liblinear'}
0.9799050632911392 {'C': 0.4, 'solver': 'lbfgs'}
0.9799050632911392 {'C': 0.4, 'solver': 'liblinear'}
0.9799050632911392 {'C': 0.5, 'solver': 'lbfgs'}
0.9799050632911392 {'C': 0.5, 'solver': 'liblinear'}
0.9799050632911392 {'C': 0.6, 'solver': 'lbfgs'}
0.9799050632911392 {'C': 0.6, 'solver': 'liblinear'}
0.9799050632911392 {'C': 0.7, 'solver': 'lbfgs'}
0.9799050632911392 {'C': 0.7, 'solver': 'liblinear'}
0.9799050632911392 {'C': 0.8, 'solver': 'lbfgs'}
0.9824367088607595 {'C': 0.8, 'solver': 'liblinear'}
0.9824367088607595 {'C': 0.9, 'solver': 'lbfgs'}
0.9799367088607596 {'C': 0.9, 'solver': 'liblinear'}
0.9774367088607596 {'C': 1, 'solver': 'lbfgs'}
0.9774367088607596 {'C': 1, 'solver'

## Avaliar o sistema no conjunto de teste

Por enquanto fazer com a regressão logística

In [26]:
final_modelo = grid_search.best_estimator_

X_test = strat_test_set.drop('diagnostic', axis=1)
y_test = strat_test_set['diagnostic'].copy()

X_test_prepared = full_pipeline.transform(X_test)
predicoes_finais = final_modelo.predict(X_test_prepared)

acc = accuracy_score(predicoes_finais, y_test)
prc = precision_score(predicoes_finais, y_test)
print('Acurácia: ', acc, '\nPrecisão: ', prc)

Acurácia:  0.9766081871345029 
Precisão:  0.9813084112149533



---

## Extra
Fazer Pipeline com Predições direto!

Salvar Modelo em .pkl.

In [27]:
import util

util.salva_modelo_pkl(final_modelo, 'logRegression')

In [28]:
modelo_novo = util.importa_modelo_pkl('logRegression_070924.pkl')

In [29]:
predicoes_teste = modelo_novo.predict(X_test_prepared)
acc = accuracy_score(predicoes_teste, y_test)
prc = precision_score(predicoes_teste, y_test)

print('Acurácia: ', acc, '\nPrecisão: ', prc)

Acurácia:  0.9766081871345029 
Precisão:  0.9813084112149533


In [41]:
from sklearn.tree import DecisionTreeClassifier

full_pipeline_model = Pipeline([
    ('num_proc', StandardScaler()),
    ('estimator', DecisionTreeClassifier())
])

In [42]:
full_pipeline_model.fit(diagnostic, diagnostic_labels)

In [43]:
full_pipeline_model.predict(some_data)

array([1, 1, 1, 1, 1])

In [44]:
predicoes_teste = full_pipeline_model.predict(X_test)
acc = accuracy_score(predicoes_teste, y_test)
prc = precision_score(predicoes_teste, y_test)

print('Acurácia: ', acc, '\nPrecisão: ', prc)

Acurácia:  0.9005847953216374 
Precisão:  0.8878504672897196


In [56]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    'max_depth' : [1, 2, 3, 4, 5]
}

dtree = DecisionTreeClassifier()

random_search = RandomizedSearchCV(dtree, params, scoring = 'accuracy', n_iter=5,
                                   cv=5, return_train_score=True)

random_search.fit(diagnostic_prepared, diagnostic_labels)

In [57]:
random_search.best_estimator_

In [59]:
cv_res = random_search.cv_results_
for acc, params in zip(cv_res['mean_test_score'], cv_res['params']):
    print(acc, params)

0.9045253164556961 {'max_depth': 1}
0.9449050632911392 {'max_depth': 2}
0.944873417721519 {'max_depth': 3}
0.9474050632911393 {'max_depth': 4}
0.9423101265822783 {'max_depth': 5}
