# Prevendo a Ocorrência de Câncer

## Etapa 1 - Coletando os Dados

In [1]:
dados <- read.csv("dataset.csv", stringsAsFactors = FALSE)

In [2]:
str(dados)

'data.frame':	569 obs. of  32 variables:
 $ id               : int  87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
 $ diagnosis        : chr  "B" "B" "B" "B" ...
 $ radius_mean      : num  12.3 10.6 11 11.3 15.2 ...
 $ texture_mean     : num  12.4 18.9 16.8 13.4 13.2 ...
 $ perimeter_mean   : num  78.8 69.3 70.9 73 97.7 ...
 $ area_mean        : num  464 346 373 385 712 ...
 $ smoothness_mean  : num  0.1028 0.0969 0.1077 0.1164 0.0796 ...
 $ compactness_mean : num  0.0698 0.1147 0.078 0.1136 0.0693 ...
 $ concavity_mean   : num  0.0399 0.0639 0.0305 0.0464 0.0339 ...
 $ points_mean      : num  0.037 0.0264 0.0248 0.048 0.0266 ...
 $ symmetry_mean    : num  0.196 0.192 0.171 0.177 0.172 ...
 $ dimension_mean   : num  0.0595 0.0649 0.0634 0.0607 0.0554 ...
 $ radius_se        : num  0.236 0.451 0.197 0.338 0.178 ...
 $ texture_se       : num  0.666 1.197 1.387 1.343 0.412 ...
 $ perimeter_se     : num  1.67 3.43 1.34 1.85 1.34 ...
 $ area_se          : num  1

In [22]:
dados

diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
Benigno,12.320,12.39,78.85,464.1,0.10280,0.06981,0.039870,0.037000,0.1959,...,13.50,15.64,86.97,549.1,0.1385,0.12660,0.124200,0.09391,0.2827,0.06771
Benigno,10.600,18.95,69.28,346.4,0.09688,0.11470,0.063870,0.026420,0.1922,...,11.88,22.94,78.28,424.8,0.1213,0.25150,0.191600,0.07926,0.2940,0.07587
Benigno,11.040,16.83,70.92,373.2,0.10770,0.07804,0.030460,0.024800,0.1714,...,12.41,26.44,79.93,471.4,0.1369,0.14820,0.106700,0.07431,0.2998,0.07881
Benigno,11.280,13.39,73.00,384.8,0.11640,0.11360,0.046350,0.047960,0.1771,...,11.92,15.77,76.53,434.0,0.1367,0.18220,0.086690,0.08611,0.2102,0.06784
Benigno,15.190,13.21,97.65,711.8,0.07963,0.06934,0.033930,0.026570,0.1721,...,16.20,15.73,104.50,819.1,0.1126,0.17370,0.136200,0.08178,0.2487,0.06766
Benigno,11.570,19.04,74.20,409.7,0.08546,0.07722,0.054850,0.014280,0.2031,...,13.07,26.98,86.43,520.5,0.1249,0.19370,0.256000,0.06664,0.3035,0.08284
Benigno,11.510,23.93,74.52,403.5,0.09261,0.10210,0.111200,0.041050,0.1388,...,12.48,37.16,82.28,474.2,0.1298,0.25170,0.363000,0.09653,0.2112,0.08732
Maligno,13.810,23.75,91.56,597.8,0.13230,0.17680,0.155800,0.091760,0.2251,...,19.20,41.85,128.50,1153.0,0.2226,0.52090,0.464600,0.20130,0.4432,0.10860
Benigno,10.490,19.29,67.41,336.1,0.09989,0.08578,0.029950,0.012010,0.2217,...,11.54,23.31,74.22,402.8,0.1219,0.14860,0.079870,0.03203,0.2826,0.07552
Benigno,11.060,14.96,71.49,373.9,0.10330,0.09097,0.053970,0.033410,0.1776,...,11.92,19.90,79.76,440.0,0.1418,0.22100,0.229900,0.10750,0.3301,0.09080


## Etapa 2 - Pré-Processamento

### Excluindo a coluna ID

In [4]:
# Remove the `id` column from the data frame.
dados[["id"]] <- NULL

### Ajustando o label da variável alvo

In [5]:
# Relabel the target: "M" becomes "Maligno", anything else "Benigno".
# ifelse() is already vectorized, so no per-element sapply() is needed
# (which was slower and attached the old values as names to the result).
dados$diagnosis <- ifelse(dados$diagnosis == "M", "Maligno", "Benigno")

### Muitos classificadores requerem que as variáveis sejam do tipo Fator

In [6]:
table(dados$diagnosis)


Benigno Maligno 
    357     212 

In [7]:
dados$diagnosis <- factor(dados$diagnosis, levels = c("Benigno", "Maligno"), labels = c("Benigno", "Maligno"))

In [8]:
str(dados$diagnosis)

 Factor w/ 2 levels "Benigno","Maligno": 1 1 1 1 1 1 1 2 1 1 ...


### Verificando a proporção

In [9]:
round(prop.table(table(dados$diagnosis)) * 100, digits = 1) 


Benigno Maligno 
   62.7    37.3 

### Medidas de Tendência Central

summary(dados[c("radius_mean", "area_mean", "smoothness_mean")])

### Criando uma função de normalização

In [12]:
# Min-max normalization: linearly rescales a numeric vector to [0, 1].
#
# x: numeric vector.
# Returns a numeric vector of the same length as `x`. A constant vector
# (max == min) maps to all zeros instead of the NaN produced by 0 / 0.
# NA values in `x` propagate: the whole result is NA, as before.
normalizar <- function(x) {
  limites <- range(x)
  amplitude <- limites[2] - limites[1]
  # Guard the zero-range case; an NA range falls through and propagates NA.
  if (!is.na(amplitude) && amplitude == 0) {
    return(x - limites[1])
  }
  (x - limites[1]) / amplitude
}

### Testando a função de normalização - os resultados devem ser idênticos

In [13]:
normalizar(c(1, 2, 3, 4, 5))

In [14]:
normalizar(c(10, 20, 30, 40, 50))

### Normalizando os dados

In [19]:
# Normalize every feature column. Dropping the first column (the
# `diagnosis` label) by position avoids hard-coding the column count.
dados_norm <- as.data.frame(lapply(dados[-1], normalizar))

In [21]:
dados_norm

radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0.2526859,0.0906324,0.2422777,0.13599152,0.4529205,0.15468376,0.093416120,0.18389662,0.4540404,0.20197978,...,0.1981501,0.09648188,0.1820808,0.08943669,0.4446279,0.09635106,0.099201278,0.32271478,0.2487680,0.08310376
0.1712812,0.3124789,0.1761454,0.08606575,0.3994764,0.29237470,0.149648547,0.13131213,0.4353535,0.31486942,...,0.1405194,0.29104478,0.1388017,0.05888714,0.3310440,0.21752966,0.153035144,0.27237113,0.2710428,0.13662600
0.1921056,0.2407846,0.1874784,0.09743372,0.4971563,0.17992761,0.071368322,0.12326044,0.3303030,0.28306655,...,0.1593739,0.38432836,0.1470193,0.07034015,0.4340619,0.11730749,0.085223642,0.25536082,0.2824759,0.15590975
0.2034644,0.1244505,0.2018520,0.10235419,0.5756974,0.28900067,0.108598875,0.23836978,0.3590909,0.22662174,...,0.1419424,0.09994670,0.1300862,0.06114825,0.4327412,0.15029446,0.069241214,0.29591065,0.1058545,0.08395645
0.3885182,0.1183632,0.3721927,0.24106045,0.2437483,0.15324213,0.079498594,0.13205765,0.3338384,0.11541702,...,0.2942014,0.09888060,0.2693859,0.15579532,0.2735918,0.14204771,0.108785942,0.28103093,0.1817465,0.08277581
0.2171896,0.3155225,0.2101444,0.11291622,0.2963799,0.17741243,0.128514527,0.07097416,0.4904040,0.26769166,...,0.1828531,0.39872068,0.1793914,0.08240759,0.3548174,0.16145181,0.204472843,0.22900344,0.2897694,0.18234291
0.2143499,0.4808928,0.2123557,0.11028632,0.3609280,0.25372677,0.260543580,0.20402584,0.1656566,0.33150800,...,0.1618641,0.67004264,0.1587230,0.07102831,0.3871756,0.21772371,0.289936102,0.33171821,0.1078257,0.21172767
0.3232051,0.4748055,0.3301085,0.19270414,0.7192381,0.48285381,0.365042174,0.45606362,0.6015152,0.51074136,...,0.4009249,0.79504264,0.3889138,0.23785883,1.0000000,0.47890289,0.371086262,0.69175258,0.5651488,0.35130526
0.1660751,0.3239770,0.1632230,0.08169671,0.4266498,0.20366849,0.070173383,0.05969185,0.5843434,0.31276327,...,0.1284240,0.30090618,0.1185816,0.05348014,0.3350063,0.11769557,0.063793930,0.11006873,0.2485709,0.13433032
0.1930522,0.1775448,0.1914173,0.09773065,0.4574343,0.21958776,0.126452671,0.16605368,0.3616162,0.40248526,...,0.1419424,0.21002132,0.1461726,0.06262289,0.4664201,0.18793841,0.183626198,0.36941581,0.3422038,0.23455333


## Etapa 3: Treinando o modelo com KNN

### Carregando o pacote class

In [23]:
library(class)

### Criando dados de treino e dados de teste

In [24]:
dados_treino <- dados_norm[1:469, ]

In [25]:
dados_teste <- dados_norm[470:569, ]

### Criando os labels para os dados de treino e de teste

In [26]:
dados_treino_labels <- dados[1:469, 1]

In [27]:
dados_teste_labels <- dados[470:569, 1]

In [28]:
length(dados_treino_labels)

In [29]:
length(dados_teste_labels)

### Criando o modelo

In [30]:
# Classify each row of `dados_teste` with k-nearest neighbours (k = 21),
# using `dados_treino` as the reference set and `dados_treino_labels` as
# the known classes. The result holds one predicted label per test row.
modelo_knn_v1 <- knn(train = dados_treino, 
                     test = dados_teste,
                     cl = dados_treino_labels, 
                     k = 21)

### A função knn() retorna um objeto do tipo fator com as previsões para cada exemplo no dataset de teste

In [31]:
summary(modelo_knn_v1)

## Etapa 4: Avaliando e Interpretando o Modelo

### Carregando o gmodels

In [33]:
# Install gmodels only when it is missing, so re-running the notebook
# does not trigger a download every time.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}

also installing the dependencies 'gtools', 'gdata'



package 'gtools' successfully unpacked and MD5 sums checked
package 'gdata' successfully unpacked and MD5 sums checked
package 'gmodels' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\luciano\AppData\Local\Temp\RtmpMfLJi7\downloaded_packages


In [34]:
library(gmodels)

"package 'gmodels' was built under R version 3.6.3"

### Criando uma tabela cruzada dos dados previstos x dados reais

In [35]:
CrossTable(x = dados_teste_labels, y = modelo_knn_v1, prop.chisq = FALSE)


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
                   | modelo_knn_v1 
dados_teste_labels |   Benigno |   Maligno | Row Total | 
-------------------|-----------|-----------|-----------|
           Benigno |        61 |         0 |        61 | 
                   |     1.000 |     0.000 |     0.610 | 
                   |     0.968 |     0.000 |           | 
                   |     0.610 |     0.000 |           | 
-------------------|-----------|-----------|-----------|
           Maligno |         2 |        37 |        39 | 
                   |     0.051 |     0.949 |     0.390 | 
                   |     0.032 |     1.000 |           | 
                   |     0.020 |     0.370 |           | 
-------------------|-----------|-----------|-----------|
      Column Total |        63 |        37

## Etapa 5: Otimizando a Performance do Modelo

### Usando a função scale() para padronizar o z-score 

In [36]:
dados_z <- as.data.frame(scale(dados[-1]))

### Confirmando transformação realizada com sucesso

In [37]:
summary(dados_z$area_mean)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-1.4532 -0.6666 -0.2949  0.0000  0.3632  5.2459 

### Criando novos datasets de treino e de teste

In [38]:
dados_treino <- dados_z[1:469, ]

In [39]:
dados_teste <- dados_z[470:569, ]

### Criando os Novos Labels

In [40]:
dados_treino_labels <- dados[ 1: 469, 1] 

In [41]:
dados_teste_labels <- dados[ 470: 569, 1]

### Reclassificando

In [42]:
# Re-run the k-nearest-neighbours classification (k = 21) on the current
# `dados_treino` / `dados_teste`, which at this point hold the z-score
# standardized features built from `dados_z`.
modelo_knn_v2 <- knn(train = dados_treino, 
                     test = dados_teste,
                     cl = dados_treino_labels, 
                     k = 21)

### Criando uma tabela cruzada dos dados previstos x dados reais

In [43]:
CrossTable(x = dados_teste_labels, y = modelo_knn_v2, prop.chisq = FALSE)


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
                   | modelo_knn_v2 
dados_teste_labels |   Benigno |   Maligno | Row Total | 
-------------------|-----------|-----------|-----------|
           Benigno |        61 |         0 |        61 | 
                   |     1.000 |     0.000 |     0.610 | 
                   |     0.924 |     0.000 |           | 
                   |     0.610 |     0.000 |           | 
-------------------|-----------|-----------|-----------|
           Maligno |         5 |        34 |        39 | 
                   |     0.128 |     0.872 |     0.390 | 
                   |     0.076 |     1.000 |           | 
                   |     0.050 |     0.340 |           | 
-------------------|-----------|-----------|-----------|
      Column Total |        66 |        34

## Etapa 6: Construindo um Modelo com Algoritmo Support Vector Machine (SVM)

### Definindo a semente para resultados reproduzíveis

In [44]:
set.seed(40) 

### Prepara o dataset

In [45]:
dados <- read.csv("dataset.csv", stringsAsFactors = FALSE)

In [46]:
dados$id = NULL

In [47]:
# Flag each row with 1 (roughly 80% of rows) or 0 using a uniform draw;
# the flag is used afterwards to split the data into train and test sets.
dados$index <- as.numeric(runif(nrow(dados)) < 0.8)

In [48]:
dados

diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst,index
B,12.320,12.39,78.85,464.1,0.10280,0.06981,0.039870,0.037000,0.1959,...,15.64,86.97,549.1,0.1385,0.12660,0.124200,0.09391,0.2827,0.06771,1
B,10.600,18.95,69.28,346.4,0.09688,0.11470,0.063870,0.026420,0.1922,...,22.94,78.28,424.8,0.1213,0.25150,0.191600,0.07926,0.2940,0.07587,0
B,11.040,16.83,70.92,373.2,0.10770,0.07804,0.030460,0.024800,0.1714,...,26.44,79.93,471.4,0.1369,0.14820,0.106700,0.07431,0.2998,0.07881,1
B,11.280,13.39,73.00,384.8,0.11640,0.11360,0.046350,0.047960,0.1771,...,15.77,76.53,434.0,0.1367,0.18220,0.086690,0.08611,0.2102,0.06784,1
B,15.190,13.21,97.65,711.8,0.07963,0.06934,0.033930,0.026570,0.1721,...,15.73,104.50,819.1,0.1126,0.17370,0.136200,0.08178,0.2487,0.06766,1
B,11.570,19.04,74.20,409.7,0.08546,0.07722,0.054850,0.014280,0.2031,...,26.98,86.43,520.5,0.1249,0.19370,0.256000,0.06664,0.3035,0.08284,1
B,11.510,23.93,74.52,403.5,0.09261,0.10210,0.111200,0.041050,0.1388,...,37.16,82.28,474.2,0.1298,0.25170,0.363000,0.09653,0.2112,0.08732,1
M,13.810,23.75,91.56,597.8,0.13230,0.17680,0.155800,0.091760,0.2251,...,41.85,128.50,1153.0,0.2226,0.52090,0.464600,0.20130,0.4432,0.10860,1
B,10.490,19.29,67.41,336.1,0.09989,0.08578,0.029950,0.012010,0.2217,...,23.31,74.22,402.8,0.1219,0.14860,0.079870,0.03203,0.2826,0.07552,1
B,11.060,14.96,71.49,373.9,0.10330,0.09097,0.053970,0.033410,0.1776,...,19.90,79.76,440.0,0.1418,0.22100,0.229900,0.10750,0.3301,0.09080,1


### Dados de treino e teste

In [49]:
trainset <- dados[dados$index==1,]

In [50]:
testset <- dados[dados$index==0,]

### Obter o índice 

In [51]:
trainColNum <- grep('index', names(trainset))

### Remover o índice dos datasets

In [52]:
# Drop the helper `index` column by name. Negating a grep() position is
# fragile: with no match, `df[, -integer(0)]` selects zero columns.
trainset <- trainset[, names(trainset) != "index", drop = FALSE]

In [53]:
# Drop the helper `index` column by name, using testset's own column
# names instead of a position computed from another data frame.
testset <- testset[, names(testset) != "index", drop = FALSE]

### Obter índice de coluna da variável target no conjunto de dados

In [54]:
typeColNum <- grep('diag',names(dados))


### Cria o modelo

In [56]:
# Install e1071 only when it is missing, so re-running the notebook
# does not trigger a download every time.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}

package 'e1071' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\luciano\AppData\Local\Temp\RtmpMfLJi7\downloaded_packages


In [57]:
library(e1071)

"package 'e1071' was built under R version 3.6.3"

In [58]:
# Fit a support vector machine on the training set: `diagnosis` is the
# response and every other column of `trainset` is a predictor, using
# C-classification with a radial kernel.
modelo_svm_v1 <- svm(diagnosis ~ ., 
                     data = trainset, 
                     type = 'C-classification', 
                     kernel = 'radial')

### Previsões nos dados de treino

In [59]:
pred_train <- predict(modelo_svm_v1, trainset) 

### Percentual de previsões corretas com dataset de treino

In [60]:
mean(pred_train == trainset$diagnosis)  

### Previsões nos dados de teste

In [61]:
pred_test <- predict(modelo_svm_v1, testset) 

### Percentual de previsões corretas com dataset de teste

In [62]:
mean(pred_test == testset$diagnosis)  

### Confusion Matrix

In [63]:
table(pred_test, testset$diagnosis)

         
pred_test  B  M
        B 63  1
        M  0 47

## Etapa 7: Construindo um Modelo com Algoritmo de Árvore de Decisão (rpart)

### Criando o modelo

In [64]:
library(rpart)

In [65]:
# NOTE: rpart() fits a single decision tree, not a random forest, despite
# the `modelo_rf_v1` name. `method = "class"` is stated explicitly rather
# than relying on rpart guessing it from the response type.
modelo_rf_v1 <- rpart(
  diagnosis ~ .,
  data = trainset,
  method = "class",
  control = rpart.control(cp = 0.0005)
)

### Previsões nos dados de teste

In [66]:
# Predicted class label for each row of the test set.
tree_pred <- predict(object = modelo_rf_v1, newdata = testset, type = "class")

### Percentual de previsões corretas com dataset de teste

In [67]:
mean(tree_pred==testset$diagnosis) 

### Confusion Matrix

In [68]:
table(tree_pred, testset$diagnosis)

         
tree_pred  B  M
        B 62  3
        M  1 45