In [186]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import Adam
from sklearn.feature_selection import mutual_info_classif
from keras.layers import LeakyReLU

# DOI 10.24432/C5HP4Z

In [187]:
nomes_colunas = ['Sample_code_number', 'Clump_thickness', 'Uniformity_of_cell_size',
       'Uniformity_of_cell_shape', 'Marginal_adhesion',
       'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin',
       'Normal_nucleoli', 'Mitoses', 'Class']

In [188]:
df = pd.read_csv('breast-cancer-wisconsin.data')
df_original = df
df.columns = nomes_colunas

In [189]:
df.head()

Unnamed: 0,Sample_code_number,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample_code_number           698 non-null    int64 
 1   Clump_thickness              698 non-null    int64 
 2   Uniformity_of_cell_size      698 non-null    int64 
 3   Uniformity_of_cell_shape     698 non-null    int64 
 4   Marginal_adhesion            698 non-null    int64 
 5   Single_epithelial_cell_size  698 non-null    int64 
 6   Bare_nuclei                  698 non-null    object
 7   Bland_chromatin              698 non-null    int64 
 8   Normal_nucleoli              698 non-null    int64 
 9   Mitoses                      698 non-null    int64 
 10  Class                        698 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [191]:
df['Class'].value_counts()

Class
2    457
4    241
Name: count, dtype: int64

In [192]:
df.drop(columns=['Sample_code_number'], inplace=True)
df.replace('?', np.nan, inplace=True)
df = df.astype('float64')
df.dropna(inplace=True)

In [193]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

In [194]:
df.drop(columns=['Class'], inplace=True)
df_original.drop(columns=['Class'], inplace=True)
scaler = MinMaxScaler()
df = scaler.fit_transform(df)

In [195]:
melhor_k = 0
melhor_razao = 0
melhor_wss = 0
melhor_bss = 0

for k in range(1, 9):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df)
    
    wss = kmeans.inertia_
    total_ss = np.sum((df - np.mean(df, axis=0))**2)
    bss = total_ss - wss
    razao = bss/total_ss

    if razao > melhor_razao:
        melhor_razao = razao
        melhor_k = k
        melhor_wss = wss
        melhor_bss = bss
    
    print(f"razao: {razao} | k: {k} | seed: ", end='\r')

print(f"WSS (Within-Cluster Sum of Squares): {melhor_wss}")
print(f"BSS (Between-Cluster Sum of Squares): {melhor_bss}")
print(f"Melhor razao BSS/(WSS + BSS): {melhor_razao} | melhor k: {melhor_k}")

WSS (Within-Cluster Sum of Squares): 147.3324343749311
BSS (Between-Cluster Sum of Squares): 450.41791861735743
Melhor razao BSS/(WSS + BSS): 0.7535217944457964 | melhor k: 8


In [196]:
kmeans = KMeans(n_clusters=8, random_state=42)
kmeans.fit(df)

df = pd.DataFrame(df, columns=df_original.columns)
df['cluster'] = kmeans.fit_predict(df)

In [197]:
cluster_summary = df.groupby('cluster').mean()

most_important_features = cluster_summary.idxmax(axis=1)

In [198]:
cluster_summary

Unnamed: 0_level_0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.598291,0.786325,0.683761,0.794872,0.581197,0.188034,0.521368,0.230769,0.094017
1,0.207059,0.021176,0.033464,0.029542,0.111373,0.025882,0.111111,0.013072,0.006797
2,0.495726,0.700855,0.700855,0.803419,0.514957,0.938034,0.649573,0.745726,0.106838
3,0.590476,0.24127,0.292063,0.174603,0.295238,0.234921,0.320635,0.336508,0.095238
4,0.736111,0.953704,0.912037,0.907407,0.782407,0.819444,0.759259,0.787037,0.712963
5,0.688576,0.391236,0.43349,0.402191,0.322379,0.946792,0.460094,0.267606,0.076682
6,0.903704,0.785185,0.785185,0.233333,0.674074,0.907407,0.574074,0.533333,0.2
7,0.777778,0.777778,0.690972,0.336806,0.486111,0.267361,0.569444,0.902778,0.131944


In [204]:
for index, row in cluster_summary.iterrows():
    max_value = row.max()
    max_column = row.idxmax()
    print(f'{index} | max {max_value:0.4f} | {max_column}')

0 | max 0.7949 | Marginal_adhesion
1 | max 0.2071 | Clump_thickness
2 | max 0.9380 | Bare_nuclei
3 | max 0.5905 | Clump_thickness
4 | max 0.9537 | Uniformity_of_cell_size
5 | max 0.9468 | Bare_nuclei
6 | max 0.9074 | Bare_nuclei
7 | max 0.9028 | Normal_nucleoli


In [200]:
mutual_info = mutual_info_classif(X, y)
info_gains = {X.columns[i]: mutual_info[i] for i in range(len(X.columns))}
info_gains = sorted(info_gains.items(), key=lambda x: x[1], reverse=True)
for col, mi in info_gains:
    print(f'{col}: {mi}')

NameError: name 'X' is not defined