In [1]:
%pip install plotly -q

Note: you may need to restart the kernel to use updated packages.


In [51]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Base idade e salário

In [3]:
from sklearn.cluster import KMeans

In [4]:
# Toy data set: one (age, salary) pair per person.
age_salary_pairs = [
    (20, 1000), (27, 1200), (21, 2900), (37, 1850), (46, 900),
    (53, 950), (55, 2000), (47, 2100), (52, 3000), (32, 5900),
    (39, 4100), (41, 5100), (39, 7000), (48, 5000), (48, 6500),
]

# Later cells read two parallel lists: x holds the ages, y the salaries.
x = [age for age, _ in age_salary_pairs]
y = [salary for _, salary in age_salary_pairs]

In [5]:
# Quick look at the raw data before clustering. The axis labels are set
# explicitly because, with plain lists as input, plotly titles the axes with
# the bare argument names "x" and "y".
graph = px.scatter(x = x, y = y, labels = {'x': 'Age', 'y': 'Salary'})
graph.show()

In [7]:
# Pair each age with its salary to build the two-column matrix used by the
# scaler and K-Means cells: column 0 is the age, column 1 the salary.
base_salary = np.array(list(zip(x, y)))
base_salary

array([[  20, 1000],
       [  27, 1200],
       [  21, 2900],
       [  37, 1850],
       [  46,  900],
       [  53,  950],
       [  55, 2000],
       [  47, 2100],
       [  52, 3000],
       [  32, 5900],
       [  39, 4100],
       [  41, 5100],
       [  39, 7000],
       [  48, 5000],
       [  48, 6500]])

In [8]:
# Standardise both columns (zero mean, unit variance per column) so that the
# salary column, whose raw values are much larger than the ages, does not
# dominate the distances computed by K-Means.
# NOTE(review): `base_salary` is overwritten with its scaled version, so this
# cell is not idempotent -- re-running it refits the scaler on already-scaled
# data, after which `scaler_salary.inverse_transform` no longer maps back to
# ages and salaries. Re-run the cell that builds `base_salary` first.
scaler_salary = StandardScaler()
scaler_salary.fit(base_salary)
base_salary = scaler_salary.transform(base_salary)

In [9]:
base_salary

array([[-1.87963884, -1.11413572],
       [-1.23255006, -1.01725435],
       [-1.78719758, -0.19376273],
       [-0.30813751, -0.70238991],
       [ 0.52383377, -1.1625764 ],
       [ 1.17092255, -1.13835606],
       [ 1.35580506, -0.62972888],
       [ 0.61627503, -0.5812882 ],
       [ 1.0784813 , -0.14532205],
       [-0.77034379,  1.25945777],
       [-0.12325501,  0.38752547],
       [ 0.0616275 ,  0.8719323 ],
       [-0.12325501,  1.79230528],
       [ 0.70871628,  0.82349162],
       [ 0.70871628,  1.55010187]])

In [10]:
# Group the standardised (age, salary) points into three clusters.
# - random_state pins the centroid initialisation, so the cluster numbering
#   (which the label explanation further down relies on) is the same on
#   every run instead of changing between executions.
# - n_init is set explicitly because scikit-learn changed its default from
#   10 to 'auto' in version 1.4.
# NOTE(review): after pinning the seed, re-check the label explanation against
# the centroids printed below; the numbering may differ from the saved output.
kmeans_salary = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans_salary.fit(base_salary)





In [12]:
# Cluster centres in standardised units (the same space as `base_salary`,
# which is what the model was fitted on). Row i is the centre of cluster i.
centroides = kmeans_salary.cluster_centers_
centroides

array([[ 0.07703438,  1.11413572],
       [ 0.73953003, -0.72661025],
       [-1.63312883, -0.77505093]])

In [13]:
# Undo the standardisation so the centres can be read as (age, salary).
scaler_salary.inverse_transform(kmeans_salary.cluster_centers_)

array([[  41.16666667, 5600.        ],
       [  48.33333333, 1800.        ],
       [  22.66666667, 1700.        ]])

In [14]:
# Cluster index assigned to each row of `base_salary`, in row order.
labels = kmeans_salary.labels_
labels

array([2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int32)

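Explicação dos rótulos (labels), com base nos centróides convertidos de volta para a escala original:

- 2 representa o grupo com idade média de $22.66666667$ anos e salário médio de $1700$
- 1 representa o grupo com idade média de $48.33333333$ anos e salário médio de $1800$
- 0 representa o grupo com idade média de $41.16666667$ anos e salário médio de $5600$

Observação: a numeração dos rótulos é arbitrária (depende da inicialização do K-Means); o que identifica cada grupo é o seu centróide.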

In [16]:
# Standardised points coloured by their assigned cluster.
graph1 = px.scatter(x = base_salary[:, 0], y = base_salary[:, 1], color=labels)

# Cluster centres, drawn with an explicit marker size. The size list is built
# from the number of centroids so it stays valid if n_clusters is changed
# (plotly requires one size entry per plotted point).
graph2 = px.scatter(
    x = centroides[:, 0],
    y = centroides[:, 1],
    size = [12] * len(centroides),
)

# Merge the traces of both figures into a single figure.
graph3 = go.Figure(data = graph1.data + graph2.data)
graph3.show()

# Dados randômicos

In [35]:
from sklearn.datasets import make_blobs

In [40]:
# 200 synthetic points drawn around 5 centres; random_state keeps the sample
# reproducible. X_random holds the coordinates (two columns in the output
# below) and y_random the index of the centre each point was generated from.
X_random, y_random = make_blobs(n_samples=200, centers=5, random_state=42)

In [41]:
X_random

array([[ -8.0799236 ,  -7.21461083],
       [ -2.42215055,   8.71527878],
       [  0.57421589,   2.75398778],
       [ -8.01291141,   8.13703255],
       [  5.1618204 ,   2.27015436],
       [ -8.55733589,   6.7008234 ],
       [  1.30385601,   3.9480044 ],
       [  2.24639272,   4.17404396],
       [ -7.10308998,  -6.1661091 ],
       [  6.04267315,   0.57131862],
       [  3.66519717,   2.76025429],
       [ -4.7356831 ,  -6.24619057],
       [ -6.30873668,  -5.74454395],
       [ -6.81534717,  -7.95785437],
       [ -9.76525823,   7.26399756],
       [  2.00328403,   3.15892219],
       [ -2.1475616 ,   8.36916637],
       [  3.0323956 ,   2.15780354],
       [  2.3450188 ,   3.33422061],
       [ -9.09089591,   6.07573973],
       [ -8.76852567,   6.93820932],
       [ -9.42769251,   8.17312501],
       [  3.44857534,   2.62972329],
       [ -3.05358035,   9.12520872],
       [ -9.65654844,   9.41591019],
       [ -1.4781981 ,   9.94556625],
       [ -3.10983631,   8.72259238],
 

In [42]:
y_random

array([2, 0, 4, 3, 1, 3, 4, 4, 2, 1, 1, 2, 2, 2, 3, 4, 0, 1, 4, 3, 3, 3,
       1, 0, 3, 0, 0, 1, 0, 2, 3, 3, 3, 2, 0, 3, 3, 1, 3, 0, 2, 4, 0, 2,
       2, 3, 3, 4, 4, 4, 4, 1, 3, 0, 4, 1, 0, 0, 1, 3, 1, 4, 2, 1, 3, 4,
       4, 2, 1, 0, 4, 0, 0, 4, 0, 3, 1, 2, 4, 0, 3, 0, 0, 2, 2, 3, 3, 3,
       4, 0, 2, 3, 2, 1, 1, 2, 1, 2, 4, 0, 1, 0, 1, 2, 1, 1, 3, 1, 2, 4,
       2, 3, 0, 1, 3, 0, 4, 1, 4, 4, 0, 0, 0, 3, 3, 1, 4, 4, 2, 2, 0, 3,
       2, 3, 0, 4, 4, 1, 2, 2, 1, 3, 1, 0, 4, 1, 2, 0, 1, 4, 3, 3, 0, 4,
       2, 4, 2, 4, 4, 1, 2, 1, 0, 2, 2, 4, 0, 4, 2, 1, 1, 3, 2, 3, 1, 2,
       3, 1, 3, 1, 0, 1, 3, 0, 2, 4, 4, 3, 0, 1, 2, 2, 0, 0, 2, 4, 4, 0,
       4, 1])

In [50]:
# Reference view: points coloured by the centre that generated them
# (y_random), before any clustering is applied.
px.scatter(x = X_random[:, 0], y = X_random[:, 1], color=y_random)

In [44]:
# Cluster the synthetic points into 5 groups, matching the number of centres
# passed to make_blobs.
# - random_state pins the centroid initialisation so labels and centroids are
#   reproducible across runs (the data itself is already seeded).
# - n_init is set explicitly because scikit-learn changed its default from
#   10 to 'auto' in version 1.4.
kmeans_blobs = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans_blobs.fit(X_random)





In [45]:
# Assign every synthetic point to one of the fitted clusters.
# NOTE(review): this reuses the name `labels` from the age/salary section, so
# those earlier labels are no longer available once this cell has run.
labels = kmeans_blobs.predict(X_random)
labels

array([1, 0, 4, 3, 2, 3, 4, 4, 1, 2, 2, 1, 1, 1, 3, 4, 0, 2, 4, 3, 3, 3,
       2, 0, 3, 0, 0, 2, 0, 1, 3, 3, 3, 1, 0, 3, 3, 2, 3, 0, 1, 4, 0, 1,
       1, 3, 3, 4, 4, 4, 4, 2, 3, 0, 4, 2, 0, 0, 2, 3, 2, 4, 1, 2, 3, 4,
       4, 1, 2, 0, 4, 0, 0, 2, 0, 3, 2, 1, 4, 0, 3, 0, 0, 1, 1, 3, 3, 3,
       4, 0, 1, 3, 1, 2, 2, 1, 2, 1, 4, 0, 2, 0, 2, 1, 2, 2, 3, 2, 1, 4,
       1, 3, 0, 2, 3, 0, 4, 2, 4, 4, 0, 0, 0, 3, 3, 2, 4, 4, 1, 1, 0, 3,
       1, 3, 0, 4, 4, 2, 1, 1, 2, 3, 2, 0, 4, 2, 1, 0, 2, 4, 3, 3, 0, 4,
       1, 4, 1, 4, 4, 2, 1, 2, 0, 1, 1, 2, 0, 4, 1, 2, 2, 3, 1, 3, 2, 1,
       3, 2, 3, 2, 0, 2, 3, 0, 1, 4, 4, 3, 0, 2, 1, 1, 0, 0, 1, 4, 4, 0,
       4, 2], dtype=int32)

In [46]:
# Centres of the five blob clusters, in the same coordinates as X_random.
# NOTE(review): this reuses the name `centroides` from the age/salary section.
centroides = kmeans_blobs.cluster_centers_
centroides

array([[-2.71038846,  8.8833827 ],
       [-6.96417736, -6.7006896 ],
       [ 4.54996635,  2.16341636],
       [-8.56863765,  7.26172299],
       [ 1.88691387,  4.23193965]])

In [49]:
# Synthetic points coloured by the cluster K-Means assigned to them.
graph1 = px.scatter(x = X_random[:, 0], y = X_random[:, 1], color=labels)

# Cluster centres, drawn with an explicit marker size. The size list is built
# from the number of centroids so it stays valid if n_clusters is changed
# (plotly requires one size entry per plotted point).
graph2 = px.scatter(
    x = centroides[:, 0],
    y = centroides[:, 1],
    size = [5] * len(centroides),
)

# Merge the traces of both figures into a single figure.
graph3 = go.Figure(data = graph1.data + graph2.data)
graph3.show()

# Base de dados cartão de crédito

- Fonte: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

In [55]:
# Load the credit-card clients data set.
# header=1: the column names are taken from the second line of the file and
# the line above it is skipped.
# NOTE(review): the path is relative to the notebook's working directory.
base_credit_card = pd.read_csv("../../assets/credit_card_clients.csv", header=1)
base_credit_card

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [56]:
# Total billed amount per client: the six statement columns
# BILL_AMT1 ... BILL_AMT6 added together (left to right) and stored as a new
# BILL_TOTAL column on the same frame.
bill_columns = [f'BILL_AMT{number}' for number in range(1, 7)]
first_bill, *other_bills = (base_credit_card[name] for name in bill_columns)
base_credit_card['BILL_TOTAL'] = sum(other_bills, first_bill)

In [57]:
base_credit_card

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,BILL_TOTAL
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,689,0,0,0,0,1,7704
1,2,120000,2,2,2,26,-1,2,0,0,...,3455,3261,0,1000,1000,1000,0,2000,1,17077
2,3,90000,2,2,2,34,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,0,101653
3,4,50000,2,2,1,37,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,0,231334
4,5,50000,1,2,1,57,-1,0,-1,0,...,19146,19131,2000,36681,10000,9000,689,679,0,109339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,31237,15980,8500,20000,5003,3047,5000,1000,0,725349
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,5190,0,1837,3526,8998,129,0,0,0,21182
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20582,19357,0,0,22000,4200,2000,3100,1,70496
29998,29999,80000,1,3,1,41,1,-1,0,0,...,11855,48944,85900,3409,1178,1926,52964,1804,1,266611


In [58]:
# Feature matrix for the first experiment: credit limit and total billed
# amount. The columns are selected by name rather than by position (1 and 25)
# so the selection does not silently change if the column order does, and a
# missing BILL_TOTAL column fails with a clear KeyError.
X_credit_card = base_credit_card[['LIMIT_BAL', 'BILL_TOTAL']].values
X_credit_card

array([[ 20000,   7704],
       [120000,  17077],
       [ 90000, 101653],
       ...,
       [ 30000,  70496],
       [ 80000, 266611],
       [ 50000, 230874]])

In [60]:
# Standardise each feature column (zero mean, unit variance) before K-Means.
# NOTE(review): `X_credit_card` is overwritten with its scaled version, so
# re-running this cell refits the scaler on already-scaled data; re-run the
# cell that builds `X_credit_card` first.
scaler_credit_card = StandardScaler()
scaler_credit_card.fit(X_credit_card)
X_credit_card = scaler_credit_card.transform(X_credit_card)

In [61]:
X_credit_card

array([[-1.13672015, -0.69069198],
       [-0.3659805 , -0.66599747],
       [-0.59720239, -0.44316987],
       ...,
       [-1.05964618, -0.52525745],
       [-0.67427636, -0.00856436],
       [-0.90549825, -0.10271861]])

In [62]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum
# of squares (the model's inertia_) of each fit.
# n_init is pinned because scikit-learn changed its default from 10 to 'auto'
# in version 1.4, which would otherwise let these values change between
# library versions.
wcss = []

for i in range(1, 11):
  kmeans_credit_card = KMeans(n_clusters=i, n_init=10, random_state=0)
  kmeans_credit_card.fit(X_credit_card)
  wcss.append(kmeans_credit_card.inertia_)























In [63]:
wcss

[59999.999999999985,
 35197.61457982185,
 20128.13107593303,
 14943.801288165881,
 10707.346740388306,
 8603.40240664868,
 7398.2660879170235,
 6354.390259676886,
 5665.834420607418,
 5052.690789379014]

In [64]:
# Elbow curve: WCSS against the number of clusters. The axes are labelled
# explicitly; otherwise plotly titles them with the bare names "x" and "y".
px.line(x = range(1, 11), y = wcss, labels = {'x': 'k', 'y': 'WCSS'})

Pelo gráfico, pode-se concluir que o valor ideal de $k$ é 4 ou 5 agrupamentos (região do cotovelo): a partir desse ponto, a redução do WCSS a cada novo agrupamento passa a ser cada vez menor.

In [65]:
# Final model for the two-feature matrix, using k = 4 from the elbow analysis.
# n_init is pinned (scikit-learn changed its default from 10 to 'auto' in
# version 1.4) so the clustering does not depend on the installed version.
# NOTE(review): `labels` is reused from the previous sections and now holds
# one cluster index per credit-card client.
kmeans_credit_card = KMeans(n_clusters=4, n_init=10, random_state=0)
labels = kmeans_credit_card.fit_predict(X_credit_card)





In [66]:
# Clients in the standardised (LIMIT_BAL, BILL_TOTAL) plane, coloured by
# cluster. The labels name the plotted features; otherwise plotly titles the
# axes and colour bar with the bare names "x", "y" and "color".
px.scatter(
    x = X_credit_card[:, 0],
    y = X_credit_card[:, 1],
    color = labels,
    labels = {
        'x': 'LIMIT_BAL (standardized)',
        'y': 'BILL_TOTAL (standardized)',
        'color': 'cluster',
    },
)

In [67]:
# Append the cluster label of each client as an extra last column to the
# original (unscaled) client data, giving one plain array per client row.
client_list = np.hstack((base_credit_card.values, labels[:, np.newaxis]))
client_list

array([[     1,  20000,      2, ...,      1,   7704,      2],
       [     2, 120000,      2, ...,      1,  17077,      2],
       [     3,  90000,      2, ...,      0, 101653,      2],
       ...,
       [ 29998,  30000,      1, ...,      1,  70496,      2],
       [ 29999,  80000,      1, ...,      1, 266611,      2],
       [ 30000,  50000,      1, ...,      1, 230874,      2]])

In [70]:
# Order the clients by cluster label, which is the last column of
# `client_list` (appended by the column_stack cell above).
# - Indexing the label column as -1 avoids hard-coding the column count (26).
# - kind="stable" keeps the clients of each cluster in their existing order,
#   so re-running this cell leaves the array unchanged.
# NOTE(review): the saved output of this cell does not look like client rows
# sorted by cluster; it was probably produced from stale kernel state. Re-run
# the notebook from a fresh kernel to regenerate it.
cluster_order = client_list[:, -1].argsort(kind="stable")
client_list = client_list[cluster_order]
client_list

array([[11, 25,  8, ...,  0, 26,  7],
       [11, 25,  8, ...,  0, 26,  7],
       [11, 25,  8, ...,  0, 26,  7],
       ...,
       [11, 25,  8, ...,  0, 26,  7],
       [11, 25,  8, ...,  0, 26,  7],
       [11, 25,  8, ...,  0, 26,  7]])

# Base de dados cartão de crédito com mais atributos

In [71]:
base_credit_card.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month', 'BILL_TOTAL'],
      dtype='object')

In [72]:
# Feature matrix for the second experiment: credit limit, demographic columns
# and total billed amount. The columns are selected by name rather than by
# position (1, 2, 3, 4, 5, 25) so the selection does not silently change if
# the column order does.
# NOTE(review): SEX, EDUCATION and MARRIAGE look like categorical codes;
# standardising them and clustering on Euclidean distance treats them as
# numeric quantities -- confirm this is intended.
feature_columns = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'BILL_TOTAL']
X_credit_card = base_credit_card[feature_columns].values
X_credit_card

array([[ 20000,      2,      2,      1,     24,   7704],
       [120000,      2,      2,      2,     26,  17077],
       [ 90000,      2,      2,      2,     34, 101653],
       ...,
       [ 30000,      1,      2,      2,     37,  70496],
       [ 80000,      1,      3,      1,     41, 266611],
       [ 50000,      1,      2,      1,     46, 230874]])

In [73]:
# Standardise each of the six feature columns (zero mean, unit variance)
# before K-Means.
# NOTE(review): `X_credit_card` is overwritten with its scaled version, so
# re-running this cell refits the scaler on already-scaled data; re-run the
# cell that builds `X_credit_card` first.
scaler_credit_card = StandardScaler()
scaler_credit_card.fit(X_credit_card)
X_credit_card = scaler_credit_card.transform(X_credit_card)

In [74]:
X_credit_card

array([[-1.13672015,  0.81016074,  0.18582826, -1.05729503, -1.24601985,
        -0.69069198],
       [-0.3659805 ,  0.81016074,  0.18582826,  0.85855728, -1.02904717,
        -0.66599747],
       [-0.59720239,  0.81016074,  0.18582826,  0.85855728, -0.16115646,
        -0.44316987],
       ...,
       [-1.05964618, -1.23432296,  0.18582826,  0.85855728,  0.16430256,
        -0.52525745],
       [-0.67427636, -1.23432296,  1.45111372, -1.05729503,  0.59824792,
        -0.00856436],
       [-0.90549825, -1.23432296,  0.18582826, -1.05729503,  1.14067961,
        -0.10271861]])

In [75]:
# Elbow method on the six-feature matrix: fit K-Means for k = 1..10 and
# record the within-cluster sum of squares (the model's inertia_) of each fit.
# n_init is pinned because scikit-learn changed its default from 10 to 'auto'
# in version 1.4, which would otherwise let these values change between
# library versions.
wcss = []

for i in range(1, 11):
  kmeans_credit_card = KMeans(n_clusters=i, n_init=10, random_state=0)
  kmeans_credit_card.fit(X_credit_card)
  wcss.append(kmeans_credit_card.inertia_)























In [76]:
# Elbow curve for the six-feature matrix: WCSS against the number of
# clusters. The axes are labelled explicitly; otherwise plotly titles them
# with the bare names "x" and "y".
px.line(x = range(1, 11), y = wcss, labels = {'x': 'k', 'y': 'WCSS'})

In [77]:
# Final model for the six-feature matrix, with k = 4 as in the two-feature
# section. n_init is pinned (scikit-learn changed its default from 10 to
# 'auto' in version 1.4) so the clustering does not depend on the installed
# version.
kmeans_credit_card = KMeans(n_clusters=4, n_init=10, random_state=0)
labels = kmeans_credit_card.fit_predict(X_credit_card)





In [78]:
labels

array([2, 3, 3, ..., 0, 2, 2], dtype=int32)

In [79]:
from sklearn.decomposition import PCA

In [80]:
# Project the six standardised features onto their first two principal
# components, only so the clusters can be drawn on a 2-D scatter plot (the
# clustering itself was done on all six features).
# random_state makes the projection reproducible when scikit-learn's 'auto'
# solver policy selects a randomized SVD for this input.
pca = PCA(n_components=2, random_state=0)
X_credit_card_pca = pca.fit_transform(X_credit_card)

In [82]:
X_credit_card_pca.shape

(30000, 2)

In [83]:
X_credit_card_pca

array([[-0.74082054, -1.13671858],
       [-1.48027121, -0.30100547],
       [-0.94737386, -0.48666789],
       ...,
       [-0.79468657, -0.90012663],
       [ 1.17562376, -1.54746987],
       [ 1.13614987, -1.14039836]])

In [85]:
# Clients in the plane of the first two principal components, coloured by
# cluster. The labels name the plotted quantities; otherwise plotly titles
# the axes and colour bar with the bare names "x", "y" and "color".
px.scatter(
    x = X_credit_card_pca[:, 0],
    y = X_credit_card_pca[:, 1],
    color = labels,
    labels = {'x': 'PC1', 'y': 'PC2', 'color': 'cluster'},
)

In [86]:
# Append the six-feature cluster label of each client as an extra last column
# to the original (unscaled) client data.
client_list = np.hstack((base_credit_card.values, labels[:, np.newaxis]))
client_list

array([[     1,  20000,      2, ...,      1,   7704,      2],
       [     2, 120000,      2, ...,      1,  17077,      3],
       [     3,  90000,      2, ...,      0, 101653,      3],
       ...,
       [ 29998,  30000,      1, ...,      1,  70496,      0],
       [ 29999,  80000,      1, ...,      1, 266611,      2],
       [ 30000,  50000,      1, ...,      1, 230874,      2]])