In [None]:
# 1. IMPORTS & SETUP

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from scipy.stats import kruskal, f_oneway, chi2_contingency

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn theme first, then matplotlib sizes applied in one call.
sns.set_theme(style="whitegrid", palette="Set2")
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})

: 

In [None]:
# 2. LOADING THE DATA

# Forward slashes keep the relative path portable across operating systems and
# avoid the invalid "\d" / "\C" escape sequences of a backslash-separated literal.
df = pd.read_csv('./data/CreditCardCustomerData.csv')

# Quick structural overview: row/column counts and the dtype of every column.
print(f"Shape: {df.shape[0]} linhas √ó {df.shape[1]} colunas\n")
print("Colunas:")
for col in df.columns:
    print(f"   ‚Ä¢ {col} ({df[col].dtype})")

In [None]:
# Preview the first five rows of the raw frame
df.head(n=5)

In [None]:
# Descriptive statistics, one row per column, with the 50% percentile
# relabelled as "median" and the main summary columns shaded.
desc_stats = df.describe().T.rename(columns={'50%': 'median'})
desc_stats.style.background_gradient(
    cmap='Blues',
    subset=['mean', 'median', 'std'],
)

In [None]:
# Summary statistics for categorical (object-dtype) columns.
# describe(include='object') raises ValueError when the frame has no object
# columns, so fall back to an empty summary table instead of stopping the run.
object_cols = df.select_dtypes(include='object').columns
(
    df.describe(include='object').T
    if len(object_cols) > 0
    else pd.DataFrame(columns=['count', 'unique', 'top', 'freq'])
)

In [None]:
# 3. PRE-PROCESSING

# --- 3.1 Missing values ---
# Count of nulls per column and the same figure as a percentage of all rows.
nulls = df.isna().sum()
nulls_pct = (nulls / len(df) * 100).round(2)

null_df = pd.concat(
    [nulls.rename('Nulos'), nulls_pct.rename('% do Total')],
    axis=1,
).sort_values('Nulos', ascending=False)

print("Valores nulos por coluna:")
if null_df['Nulos'].sum() > 0:
    # Only list the columns that actually contain nulls
    print(null_df[null_df['Nulos'] > 0].to_string())
else:
    print("Nenhum valor nulo encontrado!")

In [None]:
# --- 3.2 Irrelevant columns & duplicates ---

# Identifier columns that are excluded from clustering
cols_drop = ['Sl_No', 'Customer Key']

# errors='ignore' already skips names that are absent, so no pre-filtering is needed
df_clean = df.drop(columns=cols_drop, errors='ignore')

# Duplicates
dupes = df_clean.duplicated().sum()
print(f"Linhas duplicadas: {dupes}")

# Reset the index after removing rows: the scaled frame and the label arrays
# produced later are positional (0..n-1), so a gapped index on df_clean would
# misalign any index-based combination of the two.
df_clean = df_clean.drop_duplicates().reset_index(drop=True)

print(f"Shape ap√≥s limpeza: {df_clean.shape}")

In [None]:
# --- 3.3 Splitting columns by type ---

# Numeric = int64/float64 columns; categorical = object columns.
numeric_frame = df_clean.select_dtypes(include=['int64', 'float64'])
object_frame = df_clean.select_dtypes(include=['object'])

num_cols = list(numeric_frame.columns)
cat_cols = list(object_frame.columns)

print(f"Num√©ricas ({len(num_cols)}): {num_cols}")
print(f"Categ√≥ricas ({len(cat_cols)}): {cat_cols}")

In [None]:
# --- 3.4 Distribution of the numeric variables ---

n_cols = 3
n_rows = -(-len(num_cols) // n_cols)  # ceiling division: rows needed for a 3-wide grid

fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4))
axes = axes.flatten()

# One histogram per numeric column, x ticks formatted with thousands separators
for ax, col in zip(axes, num_cols):
    ax.hist(
        df_clean[col].dropna(),
        bins=40,
        color='steelblue',
        edgecolor='white',
        alpha=0.85,
    )
    ax.set_title(col)
    ax.xaxis.set_major_formatter(
        mticker.FuncFormatter(lambda value, _pos: f'{value:,.0f}')
    )

# Hide the unused axes at the end of the grid
for spare_ax in axes[len(num_cols):]:
    spare_ax.set_visible(False)

plt.suptitle('Distribui√ß√£o das Vari√°veis Num√©ricas', fontsize=16, y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# Boxplots to spot outliers (reuses the n_rows x n_cols grid computed for the histograms)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4))
axes = axes.flatten()

box_style = {'facecolor': 'steelblue', 'alpha': 0.6}

for ax, col in zip(axes, num_cols):
    ax.boxplot(df_clean[col].dropna(), patch_artist=True, boxprops=box_style)
    ax.set_title(col)

# Hide the unused axes at the end of the grid
for spare_ax in axes[len(num_cols):]:
    spare_ax.set_visible(False)

plt.suptitle('Boxplots ‚Äî Detec√ß√£o de Outliers', fontsize=16, y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# Quantifying outliers with the IQR rule (outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR)
print("Outliers detectados (m√©todo IQR):\n")
for col in num_cols:
    serie = df_clean[col]
    Q1 = serie.quantile(0.25)
    Q3 = serie.quantile(0.75)
    IQR = Q3 - Q1

    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    is_outlier = (serie < lower_fence) | (serie > upper_fence)

    n_outliers = int(is_outlier.sum())
    pct = round(n_outliers / len(df_clean) * 100, 2)
    print(f"{col}: {n_outliers} outliers ({pct}%)")

In [None]:
# --- 3.5 Encoding ---

# Work on a copy so df_clean keeps the original category labels;
# the fitted encoders are stored per column in label_encoders.
df_encoded = df_clean.copy()
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    as_text = df_encoded[col].astype(str)
    df_encoded[col] = le.fit_transform(as_text)
    label_encoders[col] = le

    # Show the class -> integer code mapping learned for this column
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"{col}: {mapping}")

In [None]:
# --- 3.6 Standardisation (StandardScaler) ---

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded)

# Wrap the scaled array back into a frame with the original column names
df_scaled = pd.DataFrame(scaled_values, columns=df_encoded.columns)

print("Dados normalizados!")
# Sanity check: per-column mean and std after scaling
df_scaled.describe().loc[['mean', 'std']].T.round(4)

In [None]:
# --- 3.7 Correlation heatmap ---

corr = df_encoded.corr()
# Mask the upper triangle (diagonal included) so each pair is shown once
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    vmin=-1,
    vmax=1,
    annot_kws={'size': 8},
    ax=ax,
)
ax.set_title('Matriz de Correla√ß√£o ‚Äî Pr√©-Clusteriza√ß√£o', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# --- 3.8 PCA for dimensionality reduction ---

# Fit a full PCA (all components) only to inspect the explained-variance curve
pca_full = PCA()
pca_full.fit(df_scaled)

variancia_acumulada = np.cumsum(pca_full.explained_variance_ratio_)
component_counts = np.arange(1, len(variancia_acumulada) + 1)

# Cumulative explained variance with 90% / 95% reference lines
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(component_counts, variancia_acumulada,
        marker='o', color='steelblue', linewidth=2)
for level, line_color, label in (
    (0.90, 'red', '90% de vari√¢ncia'),
    (0.95, 'orange', '95% de vari√¢ncia'),
):
    ax.axhline(y=level, color=line_color, linestyle='--', label=label)
ax.set_xlabel('N√∫mero de Componentes')
ax.set_ylabel('Vari√¢ncia Acumulada Explicada')
ax.set_title('PCA ‚Äî Vari√¢ncia Explicada Acumulada')
ax.legend()
plt.tight_layout()
plt.show()

# Smallest number of components whose cumulative variance is >= 90%
# (argmax returns the position of the first True in the boolean array)
reaches_target = variancia_acumulada >= 0.90
n_components = np.argmax(reaches_target) + 1
print(f"\n‚úÖ Componentes para 90% de vari√¢ncia: {n_components}")

In [None]:
# Fit the final PCA with the component count selected in the previous cell
# (n_components) and project the standardised data; df_pca is the array that
# every clustering step below is fitted on.
pca = PCA(n_components=n_components)
df_pca = pca.fit_transform(df_scaled)

print(f"Shape ap√≥s PCA: {df_pca.shape}")

In [None]:
# 4. CHOOSING THE BEST K

# --- 4.1 Elbow method ---

k_range = range(2, 12)

# Within-cluster sum of squares (inertia) of a K-Means fit for every candidate K
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(df_pca).inertia_
    for k in k_range
]

# Plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_range, inertias, marker='o', color='steelblue', linewidth=2, markersize=8)
ax.set_xlabel('N√∫mero de Clusters (K)')
ax.set_ylabel('In√©rcia (WCSS)')
ax.set_title('M√©todo do Cotovelo ‚Äî Escolha do K')
ax.set_xticks(k_range)
plt.tight_layout()
plt.show()

In [None]:
# --- 4.2 Silhouette for each K ---

silhouette_scores = []

for k in k_range:
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(df_pca)
    score = silhouette_score(df_pca, labels)
    silhouette_scores.append(score)
    print(f"   K={k} ‚Üí Silhueta: {score:.4f}")

# Plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_range, silhouette_scores,
        marker='s', color='darkorange', linewidth=2, markersize=8)
ax.set_xlabel('N√∫mero de Clusters (K)')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score por K')
ax.set_xticks(k_range)
plt.tight_layout()
plt.show()

# K with the highest silhouette (first one wins on ties, as argmax does)
best_position = np.argmax(silhouette_scores)
best_k_sil = k_range[best_position]
print(f"\nMelhor K pela Silhueta: {best_k_sil} (score={max(silhouette_scores):.4f})")

In [None]:
# --- 4.3 Complementary metrics ---

db_scores = []
ch_scores = []

for k in k_range:
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(df_pca)
    db_scores.append(davies_bouldin_score(df_pca, labels))
    ch_scores.append(calinski_harabasz_score(df_pca, labels))

# Side-by-side plots: Davies-Bouldin (lower = better) and
# Calinski-Harabasz (higher = better)
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

metric_panels = (
    (db_scores, 'tomato', 'Davies-Bouldin Index\n(menor = melhor)', 'DB Score'),
    (ch_scores, 'mediumseagreen', 'Calinski-Harabasz Index\n(maior = melhor)', 'CH Score'),
)
for ax, (scores, line_color, title, y_label) in zip(axes, metric_panels):
    ax.plot(k_range, scores, marker='o', color=line_color, linewidth=2, markersize=8)
    ax.set_title(title)
    ax.set_xlabel('K')
    ax.set_ylabel(y_label)
    ax.set_xticks(k_range)

plt.suptitle('M√©tricas Complementares de Avalia√ß√£o', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# --- 4.4 Consolidated decision panel ---

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# (scores, marker, colour, title) for each panel, in row-major grid order:
# elbow, silhouette, Davies-Bouldin, Calinski-Harabasz
panel_specs = (
    (inertias, 'o', 'steelblue', 'Cotovelo (In√©rcia)'),
    (silhouette_scores, 's', 'darkorange', 'Silhouette Score ‚Üë'),
    (db_scores, 'o', 'tomato', 'Davies-Bouldin ‚Üì'),
    (ch_scores, 'o', 'mediumseagreen', 'Calinski-Harabasz ‚Üë'),
)

for ax, (scores, marker, line_color, title) in zip(axes.flat, panel_specs):
    ax.plot(k_range, scores, marker=marker, color=line_color, linewidth=2)
    ax.set_title(title)
    ax.set_xlabel('K')
    ax.set_xticks(k_range)

plt.suptitle('Painel de Decis√£o ‚Äî Escolha do K Ideal', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# --- 4.5 Silhouette plot for the chosen K ---

best_k = best_k_sil  # or override manually, e.g. best_k = 4

kmeans_final = KMeans(n_clusters=best_k, random_state=42, n_init=10)
labels_final = kmeans_final.fit_predict(df_pca)

from sklearn.metrics import silhouette_samples

sil_samples = silhouette_samples(df_pca, labels_final)
sil_avg = silhouette_score(df_pca, labels_final)

# Score the final labelling directly instead of reading db_scores / ch_scores at
# the offset `best_k - 2`: that offset is only valid while k_range starts at 2
# and best_k lies inside it, so a manual override could report another K's score
# or raise IndexError.
db_final = davies_bouldin_score(df_pca, labels_final)
ch_final = calinski_harabasz_score(df_pca, labels_final)

fig, ax = plt.subplots(figsize=(10, 6))
y_lower = 10  # vertical offset where the current cluster's band starts
colors = plt.cm.Set2(np.linspace(0, 1, best_k))

for i in range(best_k):
    # Sorted per-sample silhouette values of cluster i, drawn as one horizontal band
    sil_vals = np.sort(sil_samples[labels_final == i])
    size = sil_vals.shape[0]
    y_upper = y_lower + size

    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, sil_vals,
                     facecolor=colors[i], edgecolor=colors[i], alpha=0.85)
    ax.text(-0.02, y_lower + 0.5 * size, f'C{i}', fontsize=10)
    y_lower = y_upper + 10  # leave a 10-unit gap before the next band

ax.axvline(x=sil_avg, color='red', linestyle='--', label=f'Score m√©dio: {sil_avg:.3f}')
ax.set_xlabel('Silhouette Coefficient')
ax.set_ylabel('Cluster')
ax.set_title(f'Silhouette Plot ‚Äî K={best_k}')
ax.legend()
plt.tight_layout()
plt.show()

print(f"\nResumo das m√©tricas para K={best_k}:")
print(f"   ‚Ä¢ Silhouette Score  : {sil_avg:.4f}  (‚Üë melhor pr√≥ximo de 1)")
print(f"   ‚Ä¢ Davies-Bouldin    : {db_final:.4f}  (‚Üì melhor pr√≥ximo de 0)")
print(f"   ‚Ä¢ Calinski-Harabasz : {ch_final:.2f}  (‚Üë maior √© melhor)")

In [None]:
# --- 4.6 Comparison table for every K ---

metric_columns = {
    'In√©rcia': inertias,
    'Silhouette ‚Üë': silhouette_scores,
    'Davies-Bouldin ‚Üì': db_scores,
    'Calinski-Harabasz ‚Üë': ch_scores,
}
summary = pd.DataFrame({'K': list(k_range), **metric_columns})

# Green shading for "higher is better" metrics, reversed red for "lower is better"
summary_styler = summary.set_index('K').style
summary_styler = summary_styler.background_gradient(
    cmap='Greens', subset=['Silhouette ‚Üë', 'Calinski-Harabasz ‚Üë']
)
summary_styler = summary_styler.background_gradient(
    cmap='Reds_r', subset=['Davies-Bouldin ‚Üì']
)
summary_styler.format(precision=4)

In [None]:
# 5. CLUSTERING

# Number of clusters used from here on (overrides the value set in section 4.5)
best_k = 5

# --- 5.1 K-Means ---

kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
labels_kmeans = kmeans.fit_predict(df_pca)

# Attach the labels to the cleaned frame (positional assignment of the label array)
df_clean['Cluster_KMeans'] = labels_kmeans

print("‚úÖ K-Means aplicado!")
print(f"\nDistribui√ß√£o dos clusters:")
dist = df_clean['Cluster_KMeans'].value_counts().sort_index()
total_rows = len(df_clean)
for cluster in dist.index:
    count = dist[cluster]
    pct = count / total_rows * 100
    print(f"   Cluster {cluster}: {count} clientes ({pct:.1f}%)")

In [None]:
# --- 5.2 Hierarchical (Agglomerative) ---

hierarquico = AgglomerativeClustering(n_clusters=best_k, linkage='ward')
labels_hier = hierarquico.fit_predict(df_pca)

# Attach the labels to the cleaned frame (positional assignment of the label array)
df_clean['Cluster_Hier'] = labels_hier

print("Clusteriza√ß√£o Hier√°rquica aplicada!")
print(f"\nDistribui√ß√£o dos clusters:")
dist_h = df_clean['Cluster_Hier'].value_counts().sort_index()
total_rows = len(df_clean)
for cluster in dist_h.index:
    count = dist_h[cluster]
    pct = count / total_rows * 100
    print(f"   Cluster {cluster}: {count} clientes ({pct:.1f}%)")

In [None]:
# --- 5.3 Dendrogram ---

from scipy.cluster.hierarchy import dendrogram, linkage

# Plot a sample so the figure stays light.
# - A seeded generator makes the sample (and so the figure) reproducible.
# - The size is capped at the number of rows, because sampling without
#   replacement raises when more points are requested than exist.
sample_size = min(200, len(df_pca))
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(len(df_pca), size=sample_size, replace=False)
df_pca_sample = df_pca[sample_idx]

linked = linkage(df_pca_sample, method='ward')

# Merge distance stored in the (best_k - 1)-th row from the end of the linkage
# matrix; it is used both as the colouring threshold and as the drawn cut line.
cut_height = linked[-best_k + 1, 2]

plt.figure(figsize=(18, 6))
dendrogram(
    linked,
    truncate_mode='lastp',
    p=30,
    leaf_rotation=90,
    leaf_font_size=9,
    show_contracted=True,
    color_threshold=cut_height
)
plt.axhline(
    y=cut_height,
    color='red', linestyle='--',
    label=f'Corte em K={best_k}'
)
# The title reports the real sample size (identical to the former text when >= 200 rows exist)
plt.title(f'üåø Dendrograma ‚Äî Clusteriza√ß√£o Hier√°rquica (amostra {sample_size} pts)', fontsize=14)
plt.xlabel('Amostras')
plt.ylabel('Dist√¢ncia de Ward')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# --- 5.4 Metric comparison ---

# Each metric is evaluated on the same PCA data for both labelings,
# K-Means first and hierarchical second (matching the 'Método' row order).
label_sets = (labels_kmeans, labels_hier)

metricas = {
    'M√©todo': ['K-Means', 'Hier√°rquico'],
    'Silhouette ‚Üë': [silhouette_score(df_pca, lab) for lab in label_sets],
    'Davies-Bouldin ‚Üì': [davies_bouldin_score(df_pca, lab) for lab in label_sets],
    'Calinski-Harabasz ‚Üë': [calinski_harabasz_score(df_pca, lab) for lab in label_sets],
}

df_metricas = pd.DataFrame(metricas).set_index('M√©todo')

print("Compara√ß√£o de M√©tricas:\n")
print(df_metricas.round(4).to_string())

# Green shading for "higher is better" metrics, reversed red for "lower is better"
metricas_styler = df_metricas.style
metricas_styler = metricas_styler.background_gradient(
    cmap='Greens', subset=['Silhouette ‚Üë', 'Calinski-Harabasz ‚Üë']
)
metricas_styler = metricas_styler.background_gradient(
    cmap='Reds_r', subset=['Davies-Bouldin ‚Üì']
)
metricas_styler.format(precision=4)

In [None]:
# Agreement between the two clusterings (Adjusted Rand Index)
from sklearn.metrics import adjusted_rand_score

ari = adjusted_rand_score(labels_kmeans, labels_hier)
print(
    f"\nAdjusted Rand Index (concord√¢ncia K-Means vs Hier√°rquico): {ari:.4f}\n"
    "   ‚Üí Pr√≥ximo de 1.0 = alta concord√¢ncia entre os m√©todos"
)

In [None]:
# --- 5.5 Mean profile per cluster ---

# Uses the original (unscaled) numeric columns so the values stay interpretable
by_cluster = df_clean.groupby('Cluster_KMeans')
perfil = by_cluster[num_cols].mean().round(2)

print("Perfil m√©dio por cluster (valores originais):\n")
# Features as rows, clusters as columns; gradient applied across each row
perfil_styler = perfil.T.style.background_gradient(cmap='coolwarm', axis=1)
perfil_styler.format(precision=2)

In [None]:
# Size of each cluster: absolute count and share of all customers
cluster_counts = df_clean['Cluster_KMeans'].value_counts().sort_index()
tamanhos = cluster_counts.rename('Qtd. Clientes')
tamanhos_pct = (tamanhos / len(df_clean) * 100).round(1).rename('% Total')

tamanhos_table = pd.concat([tamanhos, tamanhos_pct], axis=1)
tamanhos_table.style.background_gradient(cmap='Blues')

In [None]:
# --- 5.6 Naming the clusters ---

# Placeholder profile names, keyed by cluster id 0..4 via enumerate
nomes_clusters = dict(enumerate([
    'Perfil A ‚Äî (ex: Cliente Conservador)',
    'Perfil B ‚Äî (ex: Alto Consumidor)',
    'Perfil C ‚Äî (ex: Cliente Inativo)',
    'Perfil D ‚Äî (ex: Cliente Premium)',
    'Perfil E ‚Äî (ex: Uso Moderado)',
]))

# Translate each K-Means label into its profile name
df_clean['Perfil'] = df_clean['Cluster_KMeans'].map(nomes_clusters)

print("Perfis atribu√≠dos:")
perfil_counts = df_clean['Perfil'].value_counts()
print(perfil_counts.to_string())

In [None]:
# 6. VISUALISATIONS

# Consistent palette shared by all plots (one colour per cluster id)
PALETTE = ['#2196F3', '#FF5722', '#4CAF50', '#9C27B0', '#FF9800']

# Reduce to 2 components for visualisation only
pca_2d = PCA(n_components=2)
coords_2d = pca_2d.fit_transform(df_scaled)

# coords_2d is positional, so the label columns are taken as plain arrays too;
# this keeps every column aligned by position regardless of df_clean's index.
df_plot = pd.DataFrame({
    'PC1': coords_2d[:, 0],
    'PC2': coords_2d[:, 1],
    'Cluster': df_clean['Cluster_KMeans'].astype(str).to_numpy(),
    'Perfil': df_clean['Perfil'].to_numpy()
})

# --- 6.1 Static scatter ---
fig, ax = plt.subplots(figsize=(12, 7))

for cluster, grupo in df_plot.groupby('Cluster'):
    cluster_id = int(cluster)
    ax.scatter(
        grupo['PC1'], grupo['PC2'],
        label=nomes_clusters[cluster_id],
        # Colour by cluster id (not by loop position) so colours match the other plots
        color=PALETTE[cluster_id], alpha=0.6, s=40, edgecolors='white', linewidths=0.3
    )

# Centroids.
# kmeans was fitted on df_pca, so cluster_centers_ live in the PCA space of `pca`,
# not in the original feature space. Map them back to the standardised feature
# space with pca.inverse_transform and only then project with pca_2d. Labelling
# them with the feature columns and re-applying `scaler` fails with a shape error
# whenever n_components differs from the number of features, and would be
# mathematically wrong otherwise.
centroides_scaled = pd.DataFrame(
    pca.inverse_transform(kmeans.cluster_centers_),
    columns=df_scaled.columns
)
centroides_2d = pca_2d.transform(centroides_scaled)

ax.scatter(
    centroides_2d[:, 0], centroides_2d[:, 1],
    c='black', marker='X', s=200, zorder=5, label='Centr√≥ides'
)

ax.set_xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]*100:.1f}% vari√¢ncia)')
ax.set_ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]*100:.1f}% vari√¢ncia)')
ax.set_title('üó∫Ô∏è Clusters no Espa√ßo PCA 2D ‚Äî K-Means', fontsize=15)
ax.legend(bbox_to_anchor=(1.01, 1), loc='upper left', fontsize=9)
plt.tight_layout()
plt.show()

In [None]:
# --- 6.1b Interactive version (Plotly) ---

# Axis labels carry the share of variance explained by each component
pc1_share, pc2_share = pca_2d.explained_variance_ratio_[0], pca_2d.explained_variance_ratio_[1]
axis_labels = {
    'PC1': f'PC1 ({pc1_share*100:.1f}%)',
    'PC2': f'PC2 ({pc2_share*100:.1f}%)',
}

fig = px.scatter(
    df_plot,
    x='PC1',
    y='PC2',
    color='Perfil',
    color_discrete_sequence=PALETTE,
    title='üó∫Ô∏è Clusters no Espa√ßo PCA 2D ‚Äî Interativo',
    labels=axis_labels,
    opacity=0.65,
    hover_data=['Perfil'],
)

fig.update_traces(marker={'size': 5})
fig.update_layout(legend_title_text='Perfil', height=550)
fig.show()

In [None]:
# --- 6.2 Heatmap of the normalised mean profile ---

# Rescale each feature of the profile to 0-1 so colours are comparable across features
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
perfil_norm = perfil.copy()
perfil_norm[num_cols] = mms.fit_transform(perfil[num_cols])

# Column captions: cluster id plus the text after the dash in its profile name
cluster_captions = [
    f'C{i}\n{nomes_clusters[i].split("‚Äî")[1].strip()}' for i in range(best_k)
]

fig, ax = plt.subplots(figsize=(16, 6))
sns.heatmap(
    perfil_norm.T,
    annot=perfil.T,  # colours follow the normalised values, cell text shows the originals
    fmt='.1f',
    cmap='YlOrRd',
    linewidths=0.5,
    linecolor='white',
    cbar_kws={'label': 'Valor Normalizado (0‚Äì1)'},
    annot_kws={'size': 8},
    ax=ax,
)

ax.set_xticks(np.arange(best_k) + 0.5)
ax.set_xticklabels(cluster_captions, rotation=0, fontsize=9)
ax.set_title('Heatmap de Perfis ‚Äî M√©dia por Cluster', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# --- 6.3 Radar chart per cluster ---

# Features shown on the radar (a short list keeps the chart readable)
radar_cols = [
    'Avg_Credit_Limit',
    'Total_Credit_Cards',
    'Total_visits_bank',
    'Total_visits_online',
    'Total_calls_made'
]

# Min-max scale the selected profile columns to 0-1 (per feature, across clusters)
perfil_radar = perfil[radar_cols].copy()
radar_scaled = MinMaxScaler().fit_transform(perfil_radar)
perfil_radar_norm = pd.DataFrame(
    radar_scaled, columns=radar_cols, index=perfil_radar.index
)

# One angle per feature; the first angle is repeated at the end to close the polygon
spoke_angles = np.linspace(0, 2 * np.pi, len(radar_cols), endpoint=False).tolist()
angles = [*spoke_angles, spoke_angles[0]]

fig, axes = plt.subplots(
    1, best_k, figsize=(20, 5),
    subplot_kw={'polar': True}
)

for i, ax in enumerate(axes):
    row_values = perfil_radar_norm.iloc[i].tolist()
    values = [*row_values, row_values[0]]  # close the polygon

    cluster_color = PALETTE[i]
    ax.plot(angles, values, color=cluster_color, linewidth=2)
    ax.fill(angles, values, color=cluster_color, alpha=0.25)

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(radar_cols, size=7)
    ax.set_yticks([0.25, 0.50, 0.75, 1.0])
    ax.set_yticklabels(['25%', '50%', '75%', '100%'], size=6)

    caption = nomes_clusters[i].split("‚Äî")[1].strip()
    ax.set_title(
        f'C{i}\n{caption}',
        size=10, pad=12, color=cluster_color, fontweight='bold'
    )

plt.suptitle('Radar Chart ‚Äî Perfil por Cluster', fontsize=15, y=1.03)
plt.tight_layout()
plt.show()

In [None]:
# --- 6.4 Cluster sizes ---

contagens = df_clean['Cluster_KMeans'].value_counts().sort_index()
# Short label = the text after the dash in each profile name
labels_nomes = [nomes_clusters[i].split('‚Äî')[1].strip() for i in contagens.index]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_bar, ax_pie = axes[0], axes[1]

# Bar chart, annotated with count and share of all customers
total_rows = len(df_clean)
bar_captions = [f'{v}\n({v/total_rows*100:.1f}%)' for v in contagens.values]

bars = ax_bar.bar(labels_nomes, contagens.values,
                  color=PALETTE, edgecolor='white', linewidth=1.2)
ax_bar.bar_label(bars, labels=bar_captions, padding=4, fontsize=10)
ax_bar.set_title('Distribui√ß√£o de Clientes por Cluster')
ax_bar.set_ylabel('Qtd. Clientes')
ax_bar.set_ylim(0, contagens.max() * 1.2)  # headroom for the two-line bar labels

# Pie chart
ax_pie.pie(
    contagens.values,
    labels=labels_nomes,
    colors=PALETTE,
    autopct='%1.1f%%',
    startangle=140,
    wedgeprops={'edgecolor': 'white', 'linewidth': 1.5},
    textprops={'fontsize': 10}
)
ax_pie.set_title('Propor√ß√£o dos Clusters')

plt.suptitle('Tamanho dos Clusters', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# --- 6.5 Boxplots of the main features per cluster ---

features_box = [
    'Avg_Credit_Limit',
    'Total_Credit_Cards',
    'Total_visits_online',
    'Total_calls_made'
]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
axes = axes.flatten()

cluster_ticks = [f'C{k}' for k in range(best_k)]
median_style = {'color': 'black', 'linewidth': 2}

for i, col in enumerate(features_box):
    ax = axes[i]

    # One array of non-null values per cluster id 0..best_k-1
    data_by_cluster = [
        df_clean.loc[df_clean['Cluster_KMeans'] == k, col].dropna().values
        for k in range(best_k)
    ]

    bp = ax.boxplot(
        data_by_cluster,
        patch_artist=True,
        notch=True,
        medianprops=median_style
    )

    # Colour each box with its cluster colour
    for patch, color in zip(bp['boxes'], PALETTE):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)

    ax.set_title(col, fontsize=12)
    ax.set_xticklabels(cluster_ticks, fontsize=9)
    ax.set_ylabel('Valor')

plt.suptitle('Boxplots das Features por Cluster', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# --- 6.6 Pairplot ---

pairplot_cols = features_box + ['Cluster_KMeans']

# Copy of the selected columns with the cluster id cast to str, so seaborn
# treats the hue as categorical and the palette keys ('0', '1', ...) match.
df_pair = df_clean[pairplot_cols].astype({'Cluster_KMeans': str})
hue_palette = {str(cluster_id): PALETTE[cluster_id] for cluster_id in range(best_k)}

g = sns.pairplot(
    df_pair,
    hue='Cluster_KMeans',
    palette=hue_palette,
    diag_kind='kde',
    plot_kws={'alpha': 0.5, 's': 20},
    corner=True
)

g.fig.suptitle('Pairplot ‚Äî Rela√ß√£o entre Features por Cluster', y=1.02, fontsize=14)
plt.show()

In [None]:
# --- 6.7 Distribution of the categorical variables per cluster ---

if not cat_cols:
    # plt.subplots(1, 0) raises, so skip the figure when there is nothing to plot
    print("Dataset sem colunas categoricas; nada para plotar.")
else:
    # squeeze=False always returns a 2-D array of axes, so a single categorical
    # column needs no special-casing
    fig, axes = plt.subplots(
        1, len(cat_cols), figsize=(6 * len(cat_cols), 5), squeeze=False
    )
    axes = axes.ravel()

    for i, col in enumerate(cat_cols):
        # Row-normalised crosstab: share (%) of each category inside every cluster
        ct = pd.crosstab(df_clean['Cluster_KMeans'], df_clean[col], normalize='index') * 100

        ct.plot(
            kind='bar', stacked=True, ax=axes[i],
            colormap='Set2', edgecolor='white', linewidth=0.5
        )
        axes[i].set_title(f'Distribui√ß√£o: {col}', fontsize=12)
        axes[i].set_xlabel('Cluster')
        axes[i].set_ylabel('% dentro do cluster')
        # Label the bars from the crosstab's own index so tick count and labels always agree
        axes[i].set_xticklabels([f'C{k}' for k in ct.index], rotation=0)
        axes[i].legend(title=col, bbox_to_anchor=(1.01, 1), loc='upper left', fontsize=8)

    plt.suptitle('Vari√°veis Categ√≥ricas por Cluster', fontsize=15)
    plt.tight_layout()
    plt.show()

In [None]:
# STATISTICAL VALIDATION

from scipy.stats import kruskal, chi2_contingency

alpha = 0.05

# --- 7.1 Kruskal-Wallis ---

rule = "=" * 60
print(rule)
print("KRUSKAL-WALLIS ‚Äî Vari√°veis Num√©ricas")
print(f"   H0: os clusters t√™m a mesma distribui√ß√£o")
print(f"   H1: ao menos um cluster difere")
print(f"   N√≠vel de signific√¢ncia: {alpha}")
print(rule)

resultados_kw = []

for col in num_cols:
    # Non-null values of this feature, one array per cluster id 0..best_k-1
    grupos = [
        df_clean.loc[df_clean['Cluster_KMeans'] == k, col].dropna().values
        for k in range(best_k)
    ]
    stat, p = kruskal(*grupos)

    if p < alpha:
        significativo = 'Sim'
    else:
        significativo = 'N√£o'

    resultados_kw.append({
        'Feature': col,
        'H-statistic': round(stat, 4),
        'p-value': round(p, 6),
        'Significativo?': significativo
    })

# Most significant features first
df_kw = pd.DataFrame(resultados_kw).sort_values('p-value')

print(df_kw.to_string(index=False))

In [None]:
# Visualising the p-values
fig, ax = plt.subplots(figsize=(12, 6))

# First palette colour for significant features, grey for the rest
is_significant = df_kw['p-value'] < alpha
colors_bar = np.where(is_significant, PALETTE[0], '#BDBDBD').tolist()

bars = ax.barh(df_kw['Feature'], df_kw['p-value'],
               color=colors_bar, edgecolor='white')

ax.axvline(x=alpha, color='red', linestyle='--', linewidth=1.5, label=f'Œ± = {alpha}')
ax.set_xlabel('p-value')
ax.set_title('Kruskal-Wallis ‚Äî p-value por Feature\n(azul = significativo | cinza = n√£o significativo)', fontsize=13)
ax.legend()
plt.tight_layout()
plt.show()

sig = int(is_significant.sum())
print(f"\n{sig} de {len(num_cols)} features com diferen√ßa significativa entre clusters (p < {alpha})")

In [None]:
# --- 7.2 Chi-square ---

print("=" * 60)
print("CHI-QUADRADO ‚Äî Vari√°veis Categ√≥ricas")
print(f"   H0: a vari√°vel √© independente do cluster")
print(f"   H1: existe associa√ß√£o entre a vari√°vel e o cluster")
print(f"   N√≠vel de signific√¢ncia: {alpha}")
print("=" * 60)

resultados_chi = []

for col in cat_cols:
    # Contingency table: cluster id (rows) x category (columns)
    tabela = pd.crosstab(df_clean['Cluster_KMeans'], df_clean[col])
    chi2, p, dof, _ = chi2_contingency(tabela)
    significativo = 'Sim' if p < alpha else 'N√£o'

    resultados_chi.append({
        'Feature': col,
        'Chi¬≤': round(chi2, 4),
        'Graus de Liberdade': dof,
        'p-value': round(p, 6),
        'Significativo?': significativo
    })

# Declaring the columns explicitly keeps 'p-value' and 'Feature' present even
# when cat_cols is empty; without them sort_values('p-value') raises KeyError
# on the column-less frame built from an empty result list.
chi_columns = ['Feature', 'Chi¬≤', 'Graus de Liberdade', 'p-value', 'Significativo?']
df_chi = pd.DataFrame(resultados_chi, columns=chi_columns).sort_values('p-value')

if df_chi.empty:
    print("Dataset sem colunas categoricas; teste Chi-Quadrado nao aplicado.")
else:
    print(df_chi.to_string(index=False))

In [None]:
# --- 7.3 Consolidated summary ---

print("\n" + "=" * 60)
print("RESUMO DAS VALIDA√á√ïES ESTAT√çSTICAS")
print("=" * 60)

# Split each result table into significant / non-significant feature names
kw_significant = df_kw['p-value'] < alpha
kw_not_significant = df_kw['p-value'] >= alpha
chi_significant = df_chi['p-value'] < alpha
chi_not_significant = df_chi['p-value'] >= alpha

sig_kw = df_kw.loc[kw_significant, 'Feature'].tolist()
nsig_kw = df_kw.loc[kw_not_significant, 'Feature'].tolist()
sig_chi = df_chi.loc[chi_significant, 'Feature'].tolist()
nsig_chi = df_chi.loc[chi_not_significant, 'Feature'].tolist()

print(f"\nKruskal-Wallis (num√©ricas):")
print(f"  Significativas : {sig_kw}")
print(f"  N√£o significat.: {nsig_kw}")

print(f"\nChi-Quadrado (categ√≥ricas):")
print(f"   Significativas : {sig_chi}")
print(f"   N√£o significat.: {nsig_chi}")

# Fixed interpretation text printed after the lists
print(f"""
Interpreta√ß√£o:
   ‚Ä¢ Features significativas no Kruskal-Wallis indicam que
     os clusters se comportam de forma estatisticamente
     diferente nessas vari√°veis ‚Äî ou seja, a segmenta√ß√£o
     capturou padr√µes reais de comportamento.

   ‚Ä¢ Features significativas no Chi¬≤ indicam que a
     distribui√ß√£o das categorias n√£o √© aleat√≥ria entre
     clusters ‚Äî refor√ßando a qualidade da segmenta√ß√£o.
""")