In [1]:
# Importando as bibliotecas utilizadas no projeto
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the customer CSV (path relative to the notebook) and preview the first five rows
dados = pd.read_csv(filepath_or_buffer='../Dados/Customer_Data.csv')
dados.head(n=5)

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


In [3]:
# Report how many rows and columns the dataset has
n_linhas, n_colunas = dados.shape
print('O Dataset possui %d linhas e %d colunas' % (n_linhas, n_colunas))

O Dataset possui 8950 linhas e 18 colunas


In [4]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts per column and dtypes
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 18 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   CUST_ID                           8950 non-null   object 
 1   BALANCE                           8950 non-null   float64
 2   BALANCE_FREQUENCY                 8950 non-null   float64
 3   PURCHASES                         8950 non-null   float64
 4   ONEOFF_PURCHASES                  8950 non-null   float64
 5   INSTALLMENTS_PURCHASES            8950 non-null   float64
 6   CASH_ADVANCE                      8950 non-null   float64
 7   PURCHASES_FREQUENCY               8950 non-null   float64
 8   ONEOFF_PURCHASES_FREQUENCY        8950 non-null   float64
 9   PURCHASES_INSTALLMENTS_FREQUENCY  8950 non-null   float64
 10  CASH_ADVANCE_FREQUENCY            8950 non-null   float64
 11  CASH_ADVANCE_TRX                  8950 non-null   int64  
 12  PURCHA

In [5]:
# Count the missing values in each column to find out which ones have gaps
dados.isna().sum()

CUST_ID                               0
BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
TENURE                                0
dtype: int64

In [6]:
# Drop every ROW that contains at least one missing value (dropna() works
# row-wise by default, so no column is removed) and rebuild a contiguous
# 0..n-1 index, discarding the old one.
# Written as a single chained expression instead of two `inplace=True`
# mutations, so the cell reads as one transformation of the frame.
dados = dados.dropna().reset_index(drop=True)
# Confirm that all remaining columns are fully populated
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8636 entries, 0 to 8635
Data columns (total 18 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   CUST_ID                           8636 non-null   object 
 1   BALANCE                           8636 non-null   float64
 2   BALANCE_FREQUENCY                 8636 non-null   float64
 3   PURCHASES                         8636 non-null   float64
 4   ONEOFF_PURCHASES                  8636 non-null   float64
 5   INSTALLMENTS_PURCHASES            8636 non-null   float64
 6   CASH_ADVANCE                      8636 non-null   float64
 7   PURCHASES_FREQUENCY               8636 non-null   float64
 8   ONEOFF_PURCHASES_FREQUENCY        8636 non-null   float64
 9   PURCHASES_INSTALLMENTS_FREQUENCY  8636 non-null   float64
 10  CASH_ADVANCE_FREQUENCY            8636 non-null   float64
 11  CASH_ADVANCE_TRX                  8636 non-null   int64  
 12  PURCHA

Antes de prosseguir para as próximas etapas, é importante compreender qual é a informação presente em cada coluna; por isso, segue abaixo um dicionário de dados simplificado:

- **CUST_ID**: ID do cliente.
- **BALANCE**: Saldo devido pelo cliente.
- **BALANCE_FREQUENCY**: Frequência de atualização do saldo.
- **PURCHASES**: Total em compras.
- **ONEOFF_PURCHASES**: Compras à vista.
- **INSTALLMENTS_PURCHASES**: Compras parceladas.
- **CASH_ADVANCE**: Saques utilizando o cartão (empréstimos).
- **PURCHASES_FREQUENCY**: Frequência de compras.
- **ONEOFF_PURCHASES_FREQUENCY**: Frequência de compras à vista.
- **PURCHASES_INSTALLMENTS_FREQUENCY**: Frequência de compras parceladas.
- **CASH_ADVANCE_FREQUENCY**: Frequência de realização dos saques.
- **CASH_ADVANCE_TRX**: Número de transações relacionadas aos saques.
- **PURCHASES_TRX**: Número de transações relacionadas a compra. 
- **CREDIT_LIMIT**: Limite do cartão de crédito.
- **PAYMENTS**: Valor do pagamento feito pelos clientes.
- **MINIMUM_PAYMENTS**: Valor mínimo dos pagamentos realizados pelo cliente.
- **PRC_FULL_PAYMENT**: Porcentagem de pagamentos totais feitos pelo cliente.
- **TENURE**: Tempo de contrato como cliente.

In [7]:
# Remove the CUST_ID and TENURE columns; CUST_ID is a string identifier
# (object dtype), not a numeric feature
dados = dados.drop(columns=['CUST_ID', 'TENURE'])

In [8]:
# Bring in StandardScaler and create the scaler object
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Standardize the features in two explicit steps: first learn each column's
# mean and standard deviation, then rescale the data with them
scaler.fit(dados)
dados_norm = scaler.transform(dados)

In [9]:
from sklearn.cluster import KMeans

# K-Means starts from randomly chosen centroids, so without a fixed
# random_state the cluster assignments (and any score computed from them)
# can change on every run of the notebook.
# n_init is pinned explicitly so the number of restarts does not depend on
# the installed scikit-learn version (newer releases changed the default).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
# Fit on the standardized features and get one cluster label per row
y_pred = kmeans.fit_predict(dados_norm)

In [10]:
from sklearn import metrics

# Cluster assignment of each row, as stored on the fitted model
labels = kmeans.labels_
# Silhouette score of the clustering on the standardized data,
# using Euclidean distance between samples
silhouette = metrics.silhouette_score(
    X=dados_norm,
    labels=labels,
    metric='euclidean',
)
print(silhouette)

0.2155839424378504
