In [1]:
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering

In [2]:
breast_cancer = pd.read_csv("breast_cancer.csv")
breast_cancer.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [3]:
# encontrando instancias com valores faltantes
breast_cancer[breast_cancer.apply(lambda row: any(row == "?"), axis=1)]

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
145,no-recurrence-events,40-49,premeno,25-29,0-2,?,2,left,right_low,yes
163,no-recurrence-events,60-69,ge40,25-29,3-5,?,1,right,left_up,yes
164,no-recurrence-events,60-69,ge40,25-29,3-5,?,1,right,left_low,yes
183,no-recurrence-events,50-59,ge40,30-34,9-11,?,3,left,left_up,yes
184,no-recurrence-events,50-59,ge40,30-34,9-11,?,3,left,left_low,yes
206,recurrence-events,50-59,ge40,30-34,0-2,no,3,left,?,no
233,recurrence-events,70-79,ge40,15-19,9-11,?,1,left,left_low,yes
263,recurrence-events,50-59,lt40,20-24,0-2,?,1,left,left_up,no
264,recurrence-events,50-59,lt40,20-24,0-2,?,1,left,left_low,no


In [4]:
breast_cancer.replace("?", pd.NA, inplace=True)
# removendo instancias com valores faltantes
breast_cancer = breast_cancer.dropna()
breast_cancer.shape

(277, 10)

In [5]:
X = breast_cancer.iloc[:, 1:]
X.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [6]:
y = breast_cancer.iloc[:, :1]
y.value_counts()

Class               
no-recurrence-events    196
recurrence-events        81
Name: count, dtype: int64

___

In [7]:
from sklearn.preprocessing import LabelBinarizer, OrdinalEncoder

oe = OrdinalEncoder()
X["age"] = oe.fit_transform(X[["age"]])
# X["age"] = X["age"].map({"10-19": 0, "20-29": 1, "30-39": 2, "40-49": 3, "50-59": 4, "60-69": 5, "70-79": 6, "80-89": 7, "90-99": 8})

X["menopause"] = oe.fit_transform(X[["menopause"]])
# X["menopause"] = X["menopause"].map({"lt40": 0, "ge40": 1, "premeno": 2})

X["tumor-size"] = oe.fit_transform(X[["tumor-size"]])
# X["tumor-size"] = X["tumor-size"].map({"0-4": 0, "5-9": 1, "10-14": 2, "15-19": 3, "20-24": 4, "25-29": 5, "30-34": 6, "35-39": 7, "40-44": 8, "45-49": 9, "50-54": 10, "55-59": 11})

X["inv-nodes"] = oe.fit_transform(X[["inv-nodes"]])
# X["inv-nodes"] = X["inv-nodes"].map({"0-2": 0, "3-5": 1, "6-8": 2, "9-11": 3, "12-14": 4, "15-17": 5, "18-20": 6, "21-23": 7, "24-26": 8, "27-29": 9, "30-32": 10, "33-35": 11, "36-39": 12})

X["breast-quad"] = oe.fit_transform(X[["breast-quad"]])
# X["breast-quad"] = X["breast-quad"].map({"left_up": 0, "left_low": 1, "right_up": 2, "right_low": 3, "central": 4})


lb = LabelBinarizer()
X["node-caps"] = lb.fit_transform(X["node-caps"])
# X["node-caps"] = X["node-caps"].map({"no": 0, "yes": 1})

X["breast"] = lb.fit_transform(X["breast"])
# X["breast"] = X["breast"].map({"left": 0, "right": 1})

X["irradiat"] = lb.fit_transform(X["irradiat"])
# X["irradiat"] = X["irradiat"].map({"no": 0, "yes": 1})


X.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,1.0,2.0,5.0,0.0,0,3,0,1.0,0
1,2.0,2.0,3.0,0.0,0,2,1,4.0,0
2,2.0,2.0,3.0,0.0,0,2,0,1.0,0
3,4.0,0.0,2.0,0.0,0,2,1,2.0,0
4,2.0,2.0,0.0,0.0,0,2,1,3.0,0


___

In [8]:
# Alvo esperado
y.value_counts()

Class               
no-recurrence-events    196
recurrence-events        81
Name: count, dtype: int64

In [9]:
k = 2
kmeans = KMeans(n_clusters=k, init="random").fit(X)
# kmeans.labels_
pd.DataFrame(kmeans.labels_, columns=["labels"]).value_counts()

labels
1         214
0          63
Name: count, dtype: int64

In [10]:
ahc = AgglomerativeClustering(n_clusters=k).fit(X)
# ahc.labels_
pd.DataFrame(ahc.labels_, columns=["labels"]).value_counts()

labels
0         215
1          62
Name: count, dtype: int64