# [Breast Cancer Coimbra](http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Coimbra)


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics as mt
from sklearn import preprocessing
import numpy as np
import pandas as pd

**Este notebook tem o objetivo de prever se um paciente tem ou não câncer de mama**

##### Existem 9 preditores, todos quantitativos, e uma variável dependente binária, indicando a presença ou ausência de câncer de mama. 
##### Os preditores são dados e parâmetros antropométricos que podem ser coletados em análises de sangue de rotina. 
##### Modelos de previsão baseados nesses preditores, se precisos, podem ser usados ​​como biomarcadores de câncer de mama.

In [2]:
data = pd.read_csv('/content/drive/MyDrive/1-MESTRADO/IA/Colab Notebooks/datasets/dataR2.csv', sep=',')

In [3]:
data

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.500000,70,2.707,0.467409,8.8071,9.702400,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.124670,91,4.498,1.009651,17.9393,22.432040,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.169560,12.76600,928.220,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.819240,10.57635,773.920,1
...,...,...,...,...,...,...,...,...,...,...
111,45,26.850000,92,3.330,0.755688,54.6800,12.100000,10.96000,268.230,2
112,62,26.840000,100,4.530,1.117400,12.4500,21.420000,7.32000,330.160,2
113,65,32.050000,97,5.730,1.370998,61.4800,22.540000,10.33000,314.050,2
114,72,25.590000,82,2.820,0.570392,24.9600,33.750000,3.27000,392.460,2


### **Dados**

Idade (anos)\
IMC (Kg/m2)\
Glucose (mg/dL)\
Insulina (µU/mL)\
HOMA\
Leptina (ng/mL)\
Adiponectina (µg / mL)\
Resistina (ng / mL)\
MCP-1 (pg / dL)\

**Classification:**\
1 = saudáveis\
2 = pacientes





**Amostra inicial dos dados**

In [4]:
data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


**Amostra final dos dados**

In [5]:
data.tail()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
111,45,26.85,92,3.33,0.755688,54.68,12.1,10.96,268.23,2
112,62,26.84,100,4.53,1.1174,12.45,21.42,7.32,330.16,2
113,65,32.05,97,5.73,1.370998,61.48,22.54,10.33,314.05,2
114,72,25.59,82,2.82,0.570392,24.96,33.75,3.27,392.46,2
115,86,27.18,138,19.91,6.777364,90.28,14.11,4.35,90.09,2


**Informações sobre o conjunto de dados** (quantidade de linhas e colunas, além das informações sobre o tipo de cada dado)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             116 non-null    int64  
 1   BMI             116 non-null    float64
 2   Glucose         116 non-null    int64  
 3   Insulin         116 non-null    float64
 4   HOMA            116 non-null    float64
 5   Leptin          116 non-null    float64
 6   Adiponectin     116 non-null    float64
 7   Resistin        116 non-null    float64
 8   MCP.1           116 non-null    float64
 9   Classification  116 non-null    int64  
dtypes: float64(7), int64(3)
memory usage: 9.2 KB


**Verificação se existe algum dado nulo.** A soma retornou 0 em todas as colunas, podemos concluir que não possuem campos nulos.

In [7]:
data.isnull().sum()

Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64


**Dados estatísticos sobre o dataset**

In [8]:
data.describe()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,57.301724,27.582111,97.793103,10.012086,2.694988,26.61508,10.180874,14.725966,534.647,1.551724
std,16.112766,5.020136,22.525162,10.067768,3.642043,19.183294,6.843341,12.390646,345.912663,0.499475
min,24.0,18.37,60.0,2.432,0.467409,4.311,1.65602,3.21,45.843,1.0
25%,45.0,22.973205,85.75,4.35925,0.917966,12.313675,5.474282,6.881763,269.97825,1.0
50%,56.0,27.662416,92.0,5.9245,1.380939,20.271,8.352692,10.82774,471.3225,2.0
75%,71.0,31.241442,102.0,11.18925,2.857787,37.3783,11.81597,17.755207,700.085,2.0
max,89.0,38.578759,201.0,58.46,25.050342,90.28,38.04,82.1,1698.44,2.0


**count:** contagem não nula\
**mean:**  média\
**std:** desvio padrão\
**25%:** percentil\
**50%:** mediana



**Separando o valor classificatório dos demais dados**

In [9]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

**Separando os dados para treino e teste**

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
X_train

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
9,75,23.000000,83,4.952,1.013839,17.1270,11.578990,7.09130,318.302
33,43,34.422174,89,23.194,5.091856,31.2128,8.300955,6.71026,960.246
64,59,22.832879,98,6.862,1.658774,14.9037,4.230105,8.20490,355.310
66,54,24.218750,86,3.730,0.791257,8.6874,3.705230,10.34455,635.049
28,35,35.250761,90,6.817,1.513374,50.6094,6.966895,22.03703,667.928
...,...,...,...,...,...,...,...,...,...
106,45,29.384757,90,4.713,1.046286,23.8479,6.644245,15.55625,621.273
14,38,23.340000,75,5.782,1.069670,15.2600,17.950000,9.35000,165.020
92,52,30.801249,87,30.212,6.483495,29.2739,6.268540,24.24591,764.667
51,77,25.900000,85,4.580,0.960273,13.7400,9.753260,11.77400,488.829


In [12]:
X_test

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
83,71,25.510204,112,10.395,2.871792,19.0653,5.4861,42.7447,799.898
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92
42,75,27.3,85,5.197,1.089638,10.39,9.000805,7.5767,335.393
40,76,29.218408,83,5.376,1.100646,28.562,7.36996,8.04375,698.789
10,34,21.47,78,3.469,0.667436,14.57,13.11,6.92,354.6
47,78,25.3,60,3.508,0.519184,6.633,10.567295,4.6638,209.749
110,54,36.05,119,11.91,3.495982,89.27,8.01,5.06,218.28
36,66,31.23859,82,4.181,0.845677,16.2247,4.267105,3.29175,634.602
70,45,20.26,92,3.44,0.780651,7.65,16.67,7.84,193.87
11,29,23.01,82,5.663,1.145436,35.59,26.72,4.58,174.8


**Instanciando o modelo**

In [13]:
model = KNeighborsClassifier(n_neighbors=3)

**Treino do modelo KNN**

In [14]:
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

**Criando a variável `y_predict` e adicionando o modelo treinado com o conjunto `X_test`**

In [15]:
y_predict = model.predict(X_test)

In [16]:
y_predict

array([2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1])

In [17]:
y_predict.shape[0]

35

In [18]:
(y_test == y_predict).sum()

18

**Cálculo da métrica Acurácia** 

In [19]:
(y_test == y_predict).sum() / y_test.shape[0]

0.5142857142857142

In [20]:
mt.accuracy_score(y_test, y_predict)

0.5142857142857142

**Classificação com MLPClassifier** 

In [21]:
model_neural_network = MLPClassifier(hidden_layer_sizes=800)

In [22]:
model_neural_network.fit(X_train, y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=800, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [24]:
y_predict_neural_network = model_neural_network.predict(X_test)

In [25]:
mt.accuracy_score(y_test, y_predict_neural_network)

0.8