In [225]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import Adam
from sklearn.feature_selection import mutual_info_classif
from keras.layers import LeakyReLU

# DOI 10.24432/C5HP4Z

In [226]:
nomes_colunas = ['ID',  'Diagnosis', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1',
       'compactness1', 'concavity1', 'concave_points1', 'symmetry1',
       'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2',
       'smoothness2', 'compactness2', 'concavity2', 'concave_points2',
       'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3',
       'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3',
       'symmetry3', 'fractal_dimension3']

In [227]:
df = pd.read_csv('wdbc.data')
df_original = df
df.columns = nomes_colunas

In [228]:
df.head()

Unnamed: 0,ID,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [229]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  568 non-null    int64  
 1   Diagnosis           568 non-null    object 
 2   radius1             568 non-null    float64
 3   texture1            568 non-null    float64
 4   perimeter1          568 non-null    float64
 5   area1               568 non-null    float64
 6   smoothness1         568 non-null    float64
 7   compactness1        568 non-null    float64
 8   concavity1          568 non-null    float64
 9   concave_points1     568 non-null    float64
 10  symmetry1           568 non-null    float64
 11  fractal_dimension1  568 non-null    float64
 12  radius2             568 non-null    float64
 13  texture2            568 non-null    float64
 14  perimeter2          568 non-null    float64
 15  area2               568 non-null    float64
 16  smoothne

In [230]:
df['Diagnosis'].value_counts()

Diagnosis
B    357
M    211
Name: count, dtype: int64

In [231]:
df.drop(columns=['ID'], inplace=True)
df.drop(columns=['Diagnosis'], inplace=True)

df.replace('?', np.nan, inplace=True)
df = df.astype('float64')
df.dropna(inplace=True)

In [232]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import random

In [233]:
scaler = MinMaxScaler()
df = scaler.fit_transform(df)

In [234]:
melhor_k = 0
melhor_razao = 0
melhor_wss = 0
melhor_bss = 0
seed_salva = 0

for k in range(1, 8):
    for i in range(1, 300):
        seed = random.randint(0, 100000)
        kmeans = KMeans(n_clusters=k, random_state=seed)
        kmeans.fit(df)
        
        wss = kmeans.inertia_
        total_ss = np.sum((df - np.mean(df, axis=0))**2).sum()
        bss = total_ss - wss
        razao = bss/total_ss
    
        if razao > melhor_razao:
            melhor_razao = razao
            melhor_k = k
            melhor_wss = wss
            melhor_bss = bss
            melhor_seed = seed
        
        print(f"razao: {razao:0.5f} | k: {k} | seed: {seed}", end='\r')

print(f"WSS (Within-Cluster Sum of Squares): {melhor_wss}")
print(f"BSS (Between-Cluster Sum of Squares): {melhor_bss}")
print(f"Melhor razao BSS/(WSS + BSS): {melhor_razao} | melhor k: {melhor_k} | seed {melhor_seed}")

WSS (Within-Cluster Sum of Squares): 137.19149902101066
BSS (Between-Cluster Sum of Squares): 214.65179921562515
Melhor razao BSS/(WSS + BSS): 0.6100778394569815 | melhor k: 7 | seed 57094


In [217]:
kmeans = KMeans(n_clusters=melhor_k, random_state=melhor_seed)
kmeans.fit(df)

df = pd.DataFrame(df, columns=df_original.columns)
df['cluster'] = kmeans.fit_predict(df)

In [218]:
cluster_summary = df.groupby('cluster').mean()

most_important_features = cluster_summary.idxmax(axis=1)

In [219]:
cluster_summary

Unnamed: 0_level_0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.362272,0.384286,0.370686,0.222475,0.537361,0.475718,0.405042,0.415603,0.521301,0.421533,...,0.346939,0.478627,0.345355,0.192405,0.615073,0.482246,0.456538,0.676681,0.438185,0.387111
1,0.365506,0.317548,0.357747,0.225645,0.365195,0.249708,0.190055,0.233898,0.343518,0.216711,...,0.31203,0.357246,0.29909,0.167812,0.37741,0.228544,0.234404,0.428956,0.261403,0.177162
2,0.713948,0.445627,0.72997,0.586362,0.528649,0.633188,0.681022,0.741314,0.578929,0.361272,...,0.67132,0.460864,0.671597,0.495783,0.488292,0.454593,0.50101,0.842337,0.368003,0.268041
3,0.581215,0.415429,0.572641,0.433972,0.421371,0.340743,0.365824,0.465915,0.392449,0.183019,...,0.561028,0.46312,0.530368,0.383686,0.446279,0.279033,0.32461,0.635095,0.275857,0.177006
4,0.193047,0.283395,0.201798,0.106742,0.424237,0.425155,0.398198,0.245287,0.497054,0.626041,...,0.148233,0.282235,0.151399,0.068878,0.406114,0.297043,0.363884,0.378772,0.246567,0.358389
5,0.262876,0.297172,0.250119,0.147385,0.28433,0.115071,0.059006,0.0875,0.292687,0.182172,...,0.208006,0.329038,0.190266,0.099732,0.274512,0.101219,0.083755,0.198162,0.201011,0.105739
6,0.185674,0.259304,0.181816,0.095168,0.455497,0.209279,0.099203,0.127112,0.394218,0.351923,...,0.145722,0.290634,0.137513,0.064057,0.460659,0.150132,0.118104,0.2428,0.237681,0.187944


In [220]:
for index, row in cluster_summary.iterrows():
    max_value = row.max()
    max_column = row.idxmax()
    
    print(f'{index} | max {max_value:0.4f} | {max_column}')

0 | max 0.6767 | concave_points3
1 | max 0.4290 | concave_points3
2 | max 0.8423 | concave_points3
3 | max 0.6351 | concave_points3
4 | max 0.6260 | fractal_dimension1
5 | max 0.3290 | texture3
6 | max 0.4607 | smoothness3


### Exemplo de análise de cluster: Cluester 0 'concave_points3'

In [224]:
cluster = df[df['cluster'] == 0]
cluster.head()

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,cluster
2,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711,0
4,0.258839,0.20257,0.267984,0.141506,0.678613,0.461996,0.369728,0.402038,0.518687,0.551179,...,0.312633,0.263908,0.136748,0.712739,0.482784,0.427716,0.598282,0.477035,0.454939,0
6,0.318472,0.376057,0.32071,0.184263,0.598267,0.445126,0.219447,0.297465,0.573737,0.51706,...,0.429638,0.299766,0.174941,0.622268,0.330753,0.213898,0.534708,0.321506,0.393939,0
7,0.284869,0.409537,0.302052,0.159618,0.674099,0.533157,0.435567,0.464861,0.651515,0.504002,...,0.498667,0.277852,0.136183,0.654626,0.497531,0.430511,0.707904,0.554504,0.342123,0
8,0.259312,0.484613,0.277659,0.140997,0.595558,0.67548,0.532568,0.424602,0.489899,0.683867,...,0.763859,0.235271,0.129326,0.753682,1.0,0.882588,0.75945,0.552139,1.0,0


In [221]:
mutual_info = mutual_info_classif(X, y)
info_gains = {X.columns[i]: mutual_info[i] for i in range(len(X.columns))}
info_gains = sorted(info_gains.items(), key=lambda x: x[1], reverse=True)
for col, mi in info_gains:
    print(f'{col}: {mi}')

perimeter3: 0.47683951574905925
area3: 0.4642644670368359
radius3: 0.45241289673249785
concave_points1: 0.44084218169535405
concave_points3: 0.4374492553099376
perimeter1: 0.40330220842467
concavity1: 0.3723325542639735
radius1: 0.36587316834782513
area1: 0.35958159037184245
area2: 0.34053467599470766
concavity3: 0.3163283315696179
perimeter2: 0.27323858798938727
radius2: 0.24665670960182862
compactness3: 0.22596498190127146
compactness1: 0.2126770585703719
concave_points2: 0.1255152837947111
texture3: 0.12104368651027575
concavity2: 0.1162443084646505
texture1: 0.09763774901628852
smoothness3: 0.09745304870307336
symmetry3: 0.09702566451966876
smoothness1: 0.08225785451091583
compactness2: 0.07419776011241197
symmetry1: 0.06837766007100954
fractal_dimension3: 0.06596949897995219
fractal_dimension2: 0.037724751423699576
smoothness2: 0.016579411076591777
symmetry2: 0.012922043254021798
fractal_dimension1: 0.00666289219027294
texture2: 0.0


  y = column_or_1d(y, warn=True)
