https://archive.ics.uci.edu/dataset/14/breast+cancer

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [2]:
breast_cancer = pd.read_csv("breast_cancer.csv")
breast_cancer

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


In [3]:
X = breast_cancer.iloc[:,1:]
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...
281,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,40-49,ge40,30-34,3-5,no,3,left,left_low,no


In [4]:
y = breast_cancer.iloc[:, :1]
y

Unnamed: 0,Class
0,no-recurrence-events
1,no-recurrence-events
2,no-recurrence-events
3,no-recurrence-events
4,no-recurrence-events
...,...
281,recurrence-events
282,recurrence-events
283,recurrence-events
284,recurrence-events


In [5]:
y.value_counts()

Class               
no-recurrence-events    201
recurrence-events        85
Name: count, dtype: int64

## Pre-processing

In [6]:
X["age"]

0      30-39
1      40-49
2      40-49
3      60-69
4      40-49
       ...  
281    30-39
282    30-39
283    60-69
284    40-49
285    50-59
Name: age, Length: 286, dtype: object

In [7]:
def calculate_midpoint(rrange):
    lower, upper = map(int, rrange.split('-'))
    return (lower + upper) / 2
 
X["age"] = X["age"].apply(calculate_midpoint)
X["age"]

0      34.5
1      44.5
2      44.5
3      64.5
4      44.5
       ... 
281    34.5
282    34.5
283    64.5
284    44.5
285    54.5
Name: age, Length: 286, dtype: float64

In [8]:
X["menopause"]
X["menopause"] = X["menopause"].map({"lt40": 0, "ge40": 1, "premeno": 2})
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,34.5,2,30-34,0-2,no,3,left,left_low,no
1,44.5,2,20-24,0-2,no,2,right,right_up,no
2,44.5,2,20-24,0-2,no,2,left,left_low,no
3,64.5,1,15-19,0-2,no,2,right,left_up,no
4,44.5,2,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...
281,34.5,2,30-34,0-2,no,2,left,left_up,no
282,34.5,2,20-24,0-2,no,3,left,left_up,yes
283,64.5,1,20-24,0-2,no,1,right,left_up,no
284,44.5,1,30-34,3-5,no,3,left,left_low,no


In [9]:
X["tumor-size"]
X["tumor-size"] = X["tumor-size"].apply(calculate_midpoint)
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,34.5,2,32.0,0-2,no,3,left,left_low,no
1,44.5,2,22.0,0-2,no,2,right,right_up,no
2,44.5,2,22.0,0-2,no,2,left,left_low,no
3,64.5,1,17.0,0-2,no,2,right,left_up,no
4,44.5,2,2.0,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,0-2,no,2,left,left_up,no
282,34.5,2,22.0,0-2,no,3,left,left_up,yes
283,64.5,1,22.0,0-2,no,1,right,left_up,no
284,44.5,1,32.0,3-5,no,3,left,left_low,no


In [10]:
X["inv-nodes"]
X["inv-nodes"] = X["inv-nodes"].apply(calculate_midpoint)
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,34.5,2,32.0,1.0,no,3,left,left_low,no
1,44.5,2,22.0,1.0,no,2,right,right_up,no
2,44.5,2,22.0,1.0,no,2,left,left_low,no
3,64.5,1,17.0,1.0,no,2,right,left_up,no
4,44.5,2,2.0,1.0,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,1.0,no,2,left,left_up,no
282,34.5,2,22.0,1.0,no,3,left,left_up,yes
283,64.5,1,22.0,1.0,no,1,right,left_up,no
284,44.5,1,32.0,4.0,no,3,left,left_low,no


In [11]:
X["node-caps"]

0      no
1      no
2      no
3      no
4      no
       ..
281    no
282    no
283    no
284    no
285    no
Name: node-caps, Length: 286, dtype: object

In [12]:
X[X["node-caps"] == "?"]

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
145,44.5,2,27.0,1.0,?,2,left,right_low,yes
163,64.5,1,27.0,4.0,?,1,right,left_up,yes
164,64.5,1,27.0,4.0,?,1,right,left_low,yes
183,54.5,1,32.0,10.0,?,3,left,left_up,yes
184,54.5,1,32.0,10.0,?,3,left,left_low,yes
233,74.5,1,17.0,10.0,?,1,left,left_low,yes
263,54.5,0,22.0,1.0,?,1,left,left_up,no
264,54.5,0,22.0,1.0,?,1,left,left_low,no


In [13]:
# Removendo valores nulos
X = X[X["node-caps"] != "?"]
X


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,34.5,2,32.0,1.0,no,3,left,left_low,no
1,44.5,2,22.0,1.0,no,2,right,right_up,no
2,44.5,2,22.0,1.0,no,2,left,left_low,no
3,64.5,1,17.0,1.0,no,2,right,left_up,no
4,44.5,2,2.0,1.0,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,1.0,no,2,left,left_up,no
282,34.5,2,22.0,1.0,no,3,left,left_up,yes
283,64.5,1,22.0,1.0,no,1,right,left_up,no
284,44.5,1,32.0,4.0,no,3,left,left_low,no


In [14]:
X.loc[:,"node-caps"] = X["node-caps"].map({"no": 0, "yes": 1})
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,34.5,2,32.0,1.0,0,3,left,left_low,no
1,44.5,2,22.0,1.0,0,2,right,right_up,no
2,44.5,2,22.0,1.0,0,2,left,left_low,no
3,64.5,1,17.0,1.0,0,2,right,left_up,no
4,44.5,2,2.0,1.0,0,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,1.0,0,2,left,left_up,no
282,34.5,2,22.0,1.0,0,3,left,left_up,yes
283,64.5,1,22.0,1.0,0,1,right,left_up,no
284,44.5,1,32.0,4.0,0,3,left,left_low,no


In [15]:
X["breast"]


0       left
1      right
2       left
3      right
4      right
       ...  
281     left
282     left
283    right
284     left
285     left
Name: breast, Length: 278, dtype: object

In [16]:
X.loc[:,"breast"] = X["breast"].map({"left": 0, "right": 1})
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,34.5,2,32.0,1.0,0,3,0,left_low,no
1,44.5,2,22.0,1.0,0,2,1,right_up,no
2,44.5,2,22.0,1.0,0,2,0,left_low,no
3,64.5,1,17.0,1.0,0,2,1,left_up,no
4,44.5,2,2.0,1.0,0,2,1,right_low,no
...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,1.0,0,2,0,left_up,no
282,34.5,2,22.0,1.0,0,3,0,left_up,yes
283,64.5,1,22.0,1.0,0,1,1,left_up,no
284,44.5,1,32.0,4.0,0,3,0,left_low,no


In [17]:
X["breast-quad"]


0       left_low
1       right_up
2       left_low
3        left_up
4      right_low
         ...    
281      left_up
282      left_up
283      left_up
284     left_low
285     left_low
Name: breast-quad, Length: 278, dtype: object

In [18]:
X[X["breast-quad"] == "?"]


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
206,54.5,1,32.0,1.0,0,3,0,?,no


In [19]:
X = X[X["breast-quad"] != "?"]
X



Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,34.5,2,32.0,1.0,0,3,0,left_low,no
1,44.5,2,22.0,1.0,0,2,1,right_up,no
2,44.5,2,22.0,1.0,0,2,0,left_low,no
3,64.5,1,17.0,1.0,0,2,1,left_up,no
4,44.5,2,2.0,1.0,0,2,1,right_low,no
...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,1.0,0,2,0,left_up,no
282,34.5,2,22.0,1.0,0,3,0,left_up,yes
283,64.5,1,22.0,1.0,0,1,1,left_up,no
284,44.5,1,32.0,4.0,0,3,0,left_low,no


In [20]:
# one-hot encoding
# https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
breast_quad_encoded = pd.get_dummies(X['breast-quad'], prefix='breast-quad')
X = pd.concat([X, breast_quad_encoded], axis=1)
X.drop("breast-quad", axis=1, inplace=True)
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,irradiat,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up
0,34.5,2,32.0,1.0,0,3,0,no,False,True,False,False,False
1,44.5,2,22.0,1.0,0,2,1,no,False,False,False,False,True
2,44.5,2,22.0,1.0,0,2,0,no,False,True,False,False,False
3,64.5,1,17.0,1.0,0,2,1,no,False,False,True,False,False
4,44.5,2,2.0,1.0,0,2,1,no,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,1.0,0,2,0,no,False,False,True,False,False
282,34.5,2,22.0,1.0,0,3,0,yes,False,False,True,False,False
283,64.5,1,22.0,1.0,0,1,1,no,False,False,True,False,False
284,44.5,1,32.0,4.0,0,3,0,no,False,True,False,False,False


In [21]:
X["irradiat"]


0       no
1       no
2       no
3       no
4       no
      ... 
281     no
282    yes
283     no
284     no
285     no
Name: irradiat, Length: 277, dtype: object

In [22]:
X["irradiat"] = X["irradiat"].map({"no": 0, "yes": 1})
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,irradiat,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up
0,34.5,2,32.0,1.0,0,3,0,0,False,True,False,False,False
1,44.5,2,22.0,1.0,0,2,1,0,False,False,False,False,True
2,44.5,2,22.0,1.0,0,2,0,0,False,True,False,False,False
3,64.5,1,17.0,1.0,0,2,1,0,False,False,True,False,False
4,44.5,2,2.0,1.0,0,2,1,0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,1.0,0,2,0,0,False,False,True,False,False
282,34.5,2,22.0,1.0,0,3,0,1,False,False,True,False,False
283,64.5,1,22.0,1.0,0,1,1,0,False,False,True,False,False
284,44.5,1,32.0,4.0,0,3,0,0,False,True,False,False,False


In [23]:
X

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,irradiat,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up
0,34.5,2,32.0,1.0,0,3,0,0,False,True,False,False,False
1,44.5,2,22.0,1.0,0,2,1,0,False,False,False,False,True
2,44.5,2,22.0,1.0,0,2,0,0,False,True,False,False,False
3,64.5,1,17.0,1.0,0,2,1,0,False,False,True,False,False
4,44.5,2,2.0,1.0,0,2,1,0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,34.5,2,32.0,1.0,0,2,0,0,False,False,True,False,False
282,34.5,2,22.0,1.0,0,3,0,1,False,False,True,False,False
283,64.5,1,22.0,1.0,0,1,1,0,False,False,True,False,False
284,44.5,1,32.0,4.0,0,3,0,0,False,True,False,False,False


In [24]:
k = 2
kmeans = KMeans(n_clusters=k).fit(X)
# kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X)
kmeans.labels_

array([1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0], dtype=int32)

In [25]:
pd.DataFrame(data=kmeans.labels_, columns=["Labels"]).value_counts()


Labels
0         144
1         133
Name: count, dtype: int64

In [26]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Class   286 non-null    object
dtypes: object(1)
memory usage: 2.4+ KB


In [27]:
y.value_counts()

Class               
no-recurrence-events    201
recurrence-events        85
Name: count, dtype: int64