In [1]:
# Install plotly into the kernel's environment; -q keeps pip's output quiet.
%pip install plotly -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from sklearn.preprocessing import StandardScaler

# Base idade e salário

In [3]:
from sklearn.cluster import KMeans

In [4]:
# Toy dataset: one (age, salary) pair per observation.
age_salary_pairs = [
    (20, 1000), (27, 1200), (21, 2900), (37, 1850), (46, 900),
    (53, 950), (55, 2000), (47, 2100), (52, 3000), (32, 5900),
    (39, 4100), (41, 5100), (39, 7000), (48, 5000), (48, 6500),
]
# Split the pairs into the two parallel lists used by the rest of the notebook.
x = [pair[0] for pair in age_salary_pairs]  # ages
y = [pair[1] for pair in age_salary_pairs]  # salaries

In [5]:
# Quick look at the raw data before clustering: x on the horizontal axis, y on the vertical.
graph = px.scatter(x = x, y = y)
graph.show()

In [7]:
# Build the feature matrix with one row per observation:
# column 0 is the age (x), column 1 is the salary (y).
base_salary = np.column_stack((x, y))
base_salary

array([[  20, 1000],
       [  27, 1200],
       [  21, 2900],
       [  37, 1850],
       [  46,  900],
       [  53,  950],
       [  55, 2000],
       [  47, 2100],
       [  52, 3000],
       [  32, 5900],
       [  39, 4100],
       [  41, 5100],
       [  39, 7000],
       [  48, 5000],
       [  48, 6500]])

In [8]:
# Standardize both columns (zero mean, unit variance) so that age and salary
# contribute on the same scale to the distances K-Means computes.
#
# The scaler is fitted on the raw (age, salary) matrix rebuilt from x and y,
# not on `base_salary` itself. Fitting on `base_salary` made this cell
# non-idempotent: a second run would fit the scaler on already-standardized
# data, and scaler_salary.inverse_transform(...) would then stop returning
# values in the original units.
scaler_salary = StandardScaler()
base_salary = scaler_salary.fit_transform(np.column_stack((x, y)))

In [9]:
# Inspect the standardized matrix (column 0: age, column 1: salary).
base_salary

array([[-1.87963884, -1.11413572],
       [-1.23255006, -1.01725435],
       [-1.78719758, -0.19376273],
       [-0.30813751, -0.70238991],
       [ 0.52383377, -1.1625764 ],
       [ 1.17092255, -1.13835606],
       [ 1.35580506, -0.62972888],
       [ 0.61627503, -0.5812882 ],
       [ 1.0784813 , -0.14532205],
       [-0.77034379,  1.25945777],
       [-0.12325501,  0.38752547],
       [ 0.0616275 ,  0.8719323 ],
       [-0.12325501,  1.79230528],
       [ 0.70871628,  0.82349162],
       [ 0.70871628,  1.55010187]])

In [10]:
# Fit K-Means with 3 clusters on the standardized age/salary matrix.
# random_state pins the centroid initialisation, so the label numbering and the
# order of cluster_centers_ are the same on every run (the notes below refer to
# clusters by label number).
# n_init is set explicitly because its default differs between scikit-learn
# releases.
kmeans_salary = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans_salary.fit(base_salary)





In [12]:
# Cluster centres of the fitted model, expressed in the standardized space it was fitted on.
centroides = kmeans_salary.cluster_centers_
centroides

array([[ 0.07703438,  1.11413572],
       [ 0.73953003, -0.72661025],
       [-1.63312883, -0.77505093]])

In [13]:
# Map the centroids back through the scaler to read them in the original age/salary units.
scaler_salary.inverse_transform(kmeans_salary.cluster_centers_)

array([[  41.16666667, 5600.        ],
       [  48.33333333, 1800.        ],
       [  22.66666667, 1700.        ]])

In [14]:
# Cluster index assigned to each row of the data the model was fitted on.
labels = kmeans_salary.labels_
labels

array([2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int32)

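Explicação dos rótulos (labels), com base nos centroides convertidos de volta para a escala original:

- 2 representa o grupo com idade média de $22.66666667$ anos e salário médio de $1700$
- 1 representa o grupo com idade média de $48.33333333$ anos e salário médio de $1800$
- 0 representa o grupo com idade média de $41.16666667$ anos e salário médio de $5600$

Observação: a numeração dos rótulos é arbitrária — ela depende da inicialização do K-Means e pode mudar se o modelo for reajustado com outra inicialização. O que caracteriza cada grupo é o seu centroide, não o número do rótulo.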

In [16]:
# Overlay the K-Means centroids on the standardized age/salary points.
# Points are coloured by their cluster label.
graph1 = px.scatter(x=base_salary[:, 0], y=base_salary[:, 1], color=labels)
# One equally sized marker per centroid. The size list is derived from
# `centroides` rather than hard-coded to three entries, so the cell keeps
# working if n_clusters changes (px.scatter requires `size` to have the same
# length as x and y).
graph2 = px.scatter(
    x=centroides[:, 0],
    y=centroides[:, 1],
    size=[12] * len(centroides),
)
# go.Figure(data=...) carries over only the traces, not the layout, so the
# title and axis titles are set explicitly.
graph3 = go.Figure(data=graph1.data + graph2.data)
graph3.update_layout(
    title="K-Means: idade x salário (dados padronizados)",
    xaxis_title="Idade (padronizada)",
    yaxis_title="Salário (padronizado)",
)
graph3.show()

# Dados randômicos

In [35]:
from sklearn.datasets import make_blobs

In [40]:
# Synthetic dataset: 200 points generated around 5 centres; random_state pins the draw.
# X_random holds the coordinates, y_random the index of the centre each point came from.
X_random, y_random = make_blobs(n_samples=200, centers=5, random_state=42)

In [41]:
# Inspect the generated coordinates.
X_random

array([[ -8.0799236 ,  -7.21461083],
       [ -2.42215055,   8.71527878],
       [  0.57421589,   2.75398778],
       [ -8.01291141,   8.13703255],
       [  5.1618204 ,   2.27015436],
       [ -8.55733589,   6.7008234 ],
       [  1.30385601,   3.9480044 ],
       [  2.24639272,   4.17404396],
       [ -7.10308998,  -6.1661091 ],
       [  6.04267315,   0.57131862],
       [  3.66519717,   2.76025429],
       [ -4.7356831 ,  -6.24619057],
       [ -6.30873668,  -5.74454395],
       [ -6.81534717,  -7.95785437],
       [ -9.76525823,   7.26399756],
       [  2.00328403,   3.15892219],
       [ -2.1475616 ,   8.36916637],
       [  3.0323956 ,   2.15780354],
       [  2.3450188 ,   3.33422061],
       [ -9.09089591,   6.07573973],
       [ -8.76852567,   6.93820932],
       [ -9.42769251,   8.17312501],
       [  3.44857534,   2.62972329],
       [ -3.05358035,   9.12520872],
       [ -9.65654844,   9.41591019],
       [ -1.4781981 ,   9.94556625],
       [ -3.10983631,   8.72259238],
 

In [42]:
# Inspect the generating-centre index (0-4) of each point.
y_random

array([2, 0, 4, 3, 1, 3, 4, 4, 2, 1, 1, 2, 2, 2, 3, 4, 0, 1, 4, 3, 3, 3,
       1, 0, 3, 0, 0, 1, 0, 2, 3, 3, 3, 2, 0, 3, 3, 1, 3, 0, 2, 4, 0, 2,
       2, 3, 3, 4, 4, 4, 4, 1, 3, 0, 4, 1, 0, 0, 1, 3, 1, 4, 2, 1, 3, 4,
       4, 2, 1, 0, 4, 0, 0, 4, 0, 3, 1, 2, 4, 0, 3, 0, 0, 2, 2, 3, 3, 3,
       4, 0, 2, 3, 2, 1, 1, 2, 1, 2, 4, 0, 1, 0, 1, 2, 1, 1, 3, 1, 2, 4,
       2, 3, 0, 1, 3, 0, 4, 1, 4, 4, 0, 0, 0, 3, 3, 1, 4, 4, 2, 2, 0, 3,
       2, 3, 0, 4, 4, 1, 2, 2, 1, 3, 1, 0, 4, 1, 2, 0, 1, 4, 3, 3, 0, 4,
       2, 4, 2, 4, 4, 1, 2, 1, 0, 2, 2, 4, 0, 4, 2, 1, 1, 3, 2, 3, 1, 2,
       3, 1, 3, 1, 0, 1, 3, 0, 2, 4, 4, 3, 0, 1, 2, 2, 0, 0, 2, 4, 4, 0,
       4, 1])

In [50]:
# Scatter of the synthetic points, coloured by the centre each one was generated from.
px.scatter(x = X_random[:, 0], y = X_random[:, 1], color=y_random)

In [44]:
# Fit K-Means with one cluster per generated blob (5), directly on the
# unscaled synthetic coordinates.
# random_state pins the centroid initialisation so the label numbering and the
# order of cluster_centers_ are the same on every run.
# n_init is set explicitly because its default differs between scikit-learn
# releases.
kmeans_blobs = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans_blobs.fit(X_random)





In [45]:
# Assign every synthetic point to a cluster of the fitted model.
# NOTE: this rebinds `labels`, which held the age/salary cluster labels earlier in the notebook.
labels = kmeans_blobs.predict(X_random)
labels

array([1, 0, 4, 3, 2, 3, 4, 4, 1, 2, 2, 1, 1, 1, 3, 4, 0, 2, 4, 3, 3, 3,
       2, 0, 3, 0, 0, 2, 0, 1, 3, 3, 3, 1, 0, 3, 3, 2, 3, 0, 1, 4, 0, 1,
       1, 3, 3, 4, 4, 4, 4, 2, 3, 0, 4, 2, 0, 0, 2, 3, 2, 4, 1, 2, 3, 4,
       4, 1, 2, 0, 4, 0, 0, 2, 0, 3, 2, 1, 4, 0, 3, 0, 0, 1, 1, 3, 3, 3,
       4, 0, 1, 3, 1, 2, 2, 1, 2, 1, 4, 0, 2, 0, 2, 1, 2, 2, 3, 2, 1, 4,
       1, 3, 0, 2, 3, 0, 4, 2, 4, 4, 0, 0, 0, 3, 3, 2, 4, 4, 1, 1, 0, 3,
       1, 3, 0, 4, 4, 2, 1, 1, 2, 3, 2, 0, 4, 2, 1, 0, 2, 4, 3, 3, 0, 4,
       1, 4, 1, 4, 4, 2, 1, 2, 0, 1, 1, 2, 0, 4, 1, 2, 2, 3, 1, 3, 2, 1,
       3, 2, 3, 2, 0, 2, 3, 0, 1, 4, 4, 3, 0, 2, 1, 1, 0, 0, 1, 4, 4, 0,
       4, 2], dtype=int32)

In [46]:
# Cluster centres of the blobs model, in X_random's own coordinates (the model was fitted on unscaled data).
# NOTE: this rebinds `centroides`, which held the age/salary centroids earlier in the notebook.
centroides = kmeans_blobs.cluster_centers_
centroides

array([[-2.71038846,  8.8833827 ],
       [-6.96417736, -6.7006896 ],
       [ 4.54996635,  2.16341636],
       [-8.56863765,  7.26172299],
       [ 1.88691387,  4.23193965]])

In [49]:
# Overlay the K-Means centroids on the synthetic points.
# Points are coloured by the cluster label the model assigned to them.
graph1 = px.scatter(x=X_random[:, 0], y=X_random[:, 1], color=labels)
# One equally sized marker per centroid. The size list is derived from
# `centroides` rather than hard-coded to five entries, so the cell keeps
# working if n_clusters changes (px.scatter requires `size` to have the same
# length as x and y).
graph2 = px.scatter(
    x=centroides[:, 0],
    y=centroides[:, 1],
    size=[5] * len(centroides),
)
# go.Figure(data=...) carries over only the traces, not the layout, so the
# title and axis titles are set explicitly.
graph3 = go.Figure(data=graph1.data + graph2.data)
graph3.update_layout(
    title="K-Means: dados sintéticos (make_blobs)",
    xaxis_title="Atributo 1",
    yaxis_title="Atributo 2",
)
graph3.show()