In [157]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import Adam
from sklearn.feature_selection import mutual_info_classif
from keras.layers import LeakyReLU

# DOI 10.24432/C5HP4Z

In [158]:
nomes_colunas = ['ID',  'Diagnosis', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1',
       'compactness1', 'concavity1', 'concave_points1', 'symmetry1',
       'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2',
       'smoothness2', 'compactness2', 'concavity2', 'concave_points2',
       'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3',
       'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3',
       'symmetry3', 'fractal_dimension3']

In [159]:
df = pd.read_csv('wdbc.data')
df_original = df
df.columns = nomes_colunas

In [160]:
df.head()

Unnamed: 0,ID,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [161]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  568 non-null    int64  
 1   Diagnosis           568 non-null    object 
 2   radius1             568 non-null    float64
 3   texture1            568 non-null    float64
 4   perimeter1          568 non-null    float64
 5   area1               568 non-null    float64
 6   smoothness1         568 non-null    float64
 7   compactness1        568 non-null    float64
 8   concavity1          568 non-null    float64
 9   concave_points1     568 non-null    float64
 10  symmetry1           568 non-null    float64
 11  fractal_dimension1  568 non-null    float64
 12  radius2             568 non-null    float64
 13  texture2            568 non-null    float64
 14  perimeter2          568 non-null    float64
 15  area2               568 non-null    float64
 16  smoothne

In [162]:
df['Diagnosis'].value_counts()

Diagnosis
B    357
M    211
Name: count, dtype: int64

In [163]:
df.drop(columns=['ID'], inplace=True)
df.drop(columns=['Diagnosis'], inplace=True)

df.replace('?', np.nan, inplace=True)
df = df.astype('float64')
df.dropna(inplace=True)

In [164]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import random

In [165]:
scaler = MinMaxScaler()
df = scaler.fit_transform(df)

In [167]:
melhor_k = 0
melhor_razao = 0
melhor_wss = 0
melhor_bss = 0
seed_salva = 0

for k in range(1, 4):
    for i in range(1, 200):
        seed = random.randint(0, 100000)
        kmeans = KMeans(n_clusters=k, random_state=seed)
        kmeans.fit(df)
        
        wss = kmeans.inertia_
        total_ss = np.sum((df - np.mean(df, axis=0))**2).sum()
        bss = total_ss - wss
        razao = bss/total_ss
    
        if razao > melhor_razao:
            melhor_razao = razao
            melhor_k = k
            melhor_wss = wss
            melhor_bss = bss
            melhor_seed = seed
        
        print(f"razao: {razao:0.5f} | k: {k} | seed: {seed}", end='\r')

print(f"WSS (Within-Cluster Sum of Squares): {melhor_wss}")
print(f"BSS (Between-Cluster Sum of Squares): {melhor_bss}")
print(f"Melhor razao BSS/(WSS + BSS): {melhor_razao} | melhor k: {melhor_k} | seed {melhor_seed}")

WSS (Within-Cluster Sum of Squares): 185.78582195582874
BSS (Between-Cluster Sum of Squares): 166.05747628080707
Melhor razao BSS/(WSS + BSS): 0.47196430090626146 | melhor k: 3 | seed 72577


In [151]:
kmeans = KMeans(n_clusters=melhor_k, random_state=melhor_seed)
kmeans.fit(df)

df = pd.DataFrame(df, columns=df_original.columns)
df['cluster'] = kmeans.fit_predict(df)

In [152]:
cluster_summary = df.groupby('cluster').mean()

most_important_features = cluster_summary.idxmax(axis=1)

In [153]:
cluster_summary

Unnamed: 0_level_0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.607226,0.417442,0.60361,0.463815,0.439405,0.397743,0.426903,0.51843,0.427863,0.216807,...,0.578693,0.454957,0.554683,0.402542,0.450424,0.314969,0.358879,0.672855,0.292834,0.195016
1,0.257248,0.287104,0.248116,0.145335,0.35045,0.169384,0.095004,0.125118,0.334609,0.243606,...,0.206734,0.318399,0.192612,0.100587,0.348234,0.138423,0.121035,0.252886,0.223163,0.144178
2,0.326856,0.357005,0.333893,0.196321,0.508556,0.439947,0.374446,0.361915,0.490468,0.43175,...,0.302482,0.432381,0.302943,0.162088,0.55993,0.415097,0.413111,0.59504,0.377514,0.353381


In [154]:
for index, row in cluster_summary.iterrows():
    max_value = row.max()
    max_column = row.idxmax()
    
    print(f'{index} | max {max_value:0.4f} | {max_column}')

0 | max 0.6729 | concave_points3
1 | max 0.3504 | smoothness1
2 | max 0.5950 | concave_points3


In [155]:
mutual_info = mutual_info_classif(X, y)
info_gains = {X.columns[i]: mutual_info[i] for i in range(len(X.columns))}
info_gains = sorted(info_gains.items(), key=lambda x: x[1], reverse=True)
for col, mi in info_gains:
    print(f'{col}: {mi}')

perimeter3: 0.47738014765285364
area3: 0.4654326671815687
radius3: 0.45371691800027425
concave_points1: 0.4410946291675577
concave_points3: 0.43478373768946565
perimeter1: 0.40192781968391844
concavity1: 0.37441289044800063
radius1: 0.36373550043748204
area1: 0.3622557453857238
area2: 0.3416377883675916
concavity3: 0.31482002733723635
perimeter2: 0.2748828868656725
radius2: 0.24837694480960693
compactness3: 0.22531349266669154
compactness1: 0.21338541582530213
concave_points2: 0.1280150875105286
texture3: 0.12134458817321603
concavity2: 0.1159789290275941
smoothness3: 0.10198971683726521
texture1: 0.09451899072850312
symmetry3: 0.09215682877106435
smoothness1: 0.07658714766960162
compactness2: 0.07342694578711617
symmetry1: 0.07186257698781406
fractal_dimension3: 0.06562451626257704
fractal_dimension2: 0.03646732402391706
smoothness2: 0.016523782498271133
symmetry2: 0.013033692003632025
fractal_dimension1: 0.01051593965052633
texture2: 0.0


  y = column_or_1d(y, warn=True)
