### **Clustering**

---
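Clustering groups unlabeled records by similarity: records that lie close to each other end up in the same cluster. Some algorithms (K-Means, hierarchical) take the number of clusters as input; others (DBSCAN) derive it from the density of the data.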

#### K-Means

In [None]:
# General imports
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

##### Base Ages and Salaries

In [5]:
# Sample data set: x holds the ages and y the matching salaries
# (15 records, paired by position).
x = [20, 27, 21, 37, 46, 53, 55, 47, 52, 32, 39, 41, 39, 48, 48]
y = [
    1000, 1200, 2900, 1850, 900,
    950, 2000, 2100, 3000, 5900,
    4100, 5100, 7000, 5000, 6500,
]

In [None]:
# scatter plot of the raw (unscaled) points: x on the horizontal axis, y on the vertical
graphic = px.scatter(x = x, y = y)
graphic.show()

In [None]:
# Convert the two parallel lists into one row per record: [[x0, y0], [x1, y1], ...].
# zip pairs the values by position, so no manual indexing is needed.
# NOTE: zip stops at the shorter list; x and y are expected to have equal length.
base_salary = [[age, salary] for age, salary in zip(x, y)]

In [18]:
# normalize: standardize both columns with StandardScaler.
# The fitted scaler is kept in scaler_salary so that scaled values can be
# converted back to the original units later (see inverse_transform below).
scaler_salary = StandardScaler()
base_salary = scaler_salary.fit_transform(base_salary)
base_salary

In [None]:
# training: fit K-Means with 3 clusters on the standardized data.
# random_state is pinned (as in the credit-card cells) so that the centroids
# and the cluster numbering are reproducible between runs.
kmeans_salary = KMeans(n_clusters=3, random_state=0)
kmeans_salary.fit(base_salary)

In [None]:
# cluster centers in the standardized feature space the model was fitted on
# (one row per cluster, columns in the same order as base_salary)
centroids = kmeans_salary.cluster_centers_
centroids

In [None]:
# cluster centers converted back to the original units with the fitted scaler
scaler_salary.inverse_transform(kmeans_salary.cluster_centers_)

In [None]:
# cluster index assigned to each record by the fitted model
labels = kmeans_salary.labels_
labels

In [None]:
# Overlay two scatter plots in one figure: the records coloured by cluster label,
# and the cluster centers drawn as larger markers.
graphic1 = px.scatter(x=base_salary[:, 0], y=base_salary[:, 1], color=labels)
# One marker size per centroid, derived from the number of centroids so the
# plot keeps working if n_clusters is changed.
graphic2 = px.scatter(x=centroids[:, 0], y=centroids[:, 1], size=[12] * len(centroids))
graphic3 = go.Figure(data=graphic1.data + graphic2.data)
graphic3.show()

##### Base Random

In [8]:
# generate random data: 200 samples drawn around 5 centers, seeded with
# random_state=0 so the same points are produced on every run.
# y_random is the second value returned by make_blobs (presumably the blob each
# sample was generated from — confirm); it is not used by the cells shown here.
from sklearn.datasets import make_blobs
x_random, y_random = make_blobs(n_samples=200, centers=5, random_state=0)

In [None]:
# scatter plot of the generated points (first column vs. second column), before clustering
graphic = px.scatter(x = x_random[:, 0], y = x_random[:, 1])
graphic.show()

In [None]:
# training: fit K-Means with 5 clusters, matching the 5 centers used to generate the data.
# random_state is pinned (as in the credit-card cells) so that the centroids
# and the cluster numbering are reproducible between runs.
kmeans_blobs = KMeans(n_clusters=5, random_state=0)
kmeans_blobs.fit(x_random)

In [None]:
# assign every generated point to one of the learned clusters
labels = kmeans_blobs.predict(x_random)
labels

In [None]:
# cluster centers learned by the model (x_random was not scaled, so these are in the data's own units)
centroids = kmeans_blobs.cluster_centers_
centroids

In [None]:
# Overlay two scatter plots in one figure: the points coloured by cluster label,
# and the cluster centers drawn as separate markers.
graphic1 = px.scatter(x=x_random[:, 0], y=x_random[:, 1], color=labels)
# One marker size per centroid, derived from the number of centroids so the
# plot keeps working if n_clusters is changed.
graphic2 = px.scatter(x=centroids[:, 0], y=centroids[:, 1], size=[5] * len(centroids))
graphic3 = go.Figure(data=graphic1.data + graphic2.data)
graphic3.show()

##### Base Credit Card

In [None]:
# import base: load the credit card clients CSV.
# header = 1 takes the column names from the second line of the file,
# so the first line is not treated as data.
base_card = pd.read_csv('../examples/credit_card_clients.csv', header = 1)
base_card

In [None]:
# Total billed amount per client: row-wise sum of the columns BILL_AMT1 .. BILL_AMT6.
bill_columns = [f'BILL_AMT{month}' for month in range(1, 7)]
# skipna=False keeps the behaviour of chained "+": a missing value in any of
# the six columns yields a missing total instead of being counted as zero.
base_card['BILL_TOTAL'] = base_card[bill_columns].sum(axis=1, skipna=False)
base_card

In [None]:
# Feature matrix for clustering: credit limit and total bill.
# The columns are selected by name instead of by position (previously 1 and 25)
# so the selection does not silently shift if the CSV layout changes.
x_card = base_card[['LIMIT_BAL', 'BILL_TOTAL']].values
x_card

In [None]:
# inspect the two feature columns separately (first column, second column)
x_card[:, 0], x_card[:, 1]

In [45]:
# normalize values: standardize both feature columns with StandardScaler so
# that neither dominates the distance computation because of its scale.
# x_card is replaced by its scaled version.
scaler_card = StandardScaler()
x_card = scaler_card.fit_transform(x_card)
x_card

array([[-1.13672015, -0.69069198],
       [-0.3659805 , -0.66599747],
       [-0.59720239, -0.44316987],
       ...,
       [-1.05964618, -0.52525745],
       [-0.67427636, -0.00856436],
       [-0.90549825, -0.10271861]])

In [None]:
# Elbow method: fit K-Means for 1 to 10 clusters and record the
# within-cluster sum of squares (inertia_) of each fit.
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans_card = KMeans(n_clusters=n_clusters, random_state=0).fit(x_card)
    wcss.append(kmeans_card.inertia_)
# A good cluster count is the point where the curve stops declining sharply.
graphic = px.line(x=cluster_counts, y=wcss)
graphic.show()

In [46]:
# training: fit K-Means with 4 clusters (presumably chosen from the elbow plot
# above — confirm) and get the cluster index of every client in one step.
kmeans_card = KMeans(n_clusters=4, random_state=0)
labels = kmeans_card.fit_predict(x_card)

In [None]:
# x = LIMIT_BAL
# y = BILL_TOTAL
# (both axes show the standardized values, coloured by assigned cluster)
graphic = px.scatter(x = x_card[:, 0], y = x_card[:, 1], color = labels)
graphic.show()

#### Hierarchical

In [2]:
# General imports
import pandas as pd
import matplotlib.pyplot as plt

from scipy.cluster.hierarchy import dendrogram, linkage

import plotly.express as px

from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

##### Base Ages and Salaries

In [None]:
# generate fixed base: x holds the ages and y the matching salaries
x = [20,27,21,37,46,53,55,47,52,32,39,41,39,48,48]
y = [1000,1200,2900,1850,900,950,2000,2100,3000,5900,4100,5100,7000,5000,6500]

# Convert the two parallel lists into one row per record: [[x0, y0], [x1, y1], ...].
# zip pairs the values by position, so no manual indexing is needed.
base_salary = [[age, salary] for age, salary in zip(x, y)]

# normalize: standardize both columns; base_salary is replaced by its scaled version
scaler_salary = StandardScaler()
base_salary = scaler_salary.fit_transform(base_salary)
base_salary

In [None]:
# Dendrogram of the standardized ages/salaries using Ward linkage;
# used to choose the number of clusters visually.
dendgr = dendrogram(linkage(base_salary, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Persons')
plt.ylabel('Distances')

In [None]:
# training: agglomerative (hierarchical) clustering with 3 clusters and Ward
# linkage, the same linkage method used for the dendrogram.
# fit_predict fits the model and returns the cluster index of each record.
hc_salary = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = hc_salary.fit_predict(base_salary)
labels

In [None]:
# scatter plot of the standardized records, coloured by hierarchical cluster label
graphic = px.scatter(x = base_salary[:,0], y = base_salary[:,1], color=labels)
graphic.show()

##### Base Credit Card

In [12]:
# import base (header = 1: column names are taken from the second line of the file)
base_card = pd.read_csv('../examples/credit_card_clients.csv', header = 1)

# total billed amount per client: sum of the columns BILL_AMT1 .. BILL_AMT6
base_card['BILL_TOTAL'] = base_card['BILL_AMT1'] + base_card['BILL_AMT2'] + base_card['BILL_AMT3'] + base_card['BILL_AMT4'] + base_card['BILL_AMT5'] + base_card['BILL_AMT6']

# Feature matrix: credit limit and total bill, selected by name instead of by
# position (previously 1 and 25) so the selection cannot silently shift.
# Only the first 1000 rows are used (presumably to keep the hierarchical
# clustering and its dendrogram tractable — confirm).
x_card = base_card[['LIMIT_BAL', 'BILL_TOTAL']].values[:1000]

# normalize values
scaler_card = StandardScaler()
x_card = scaler_card.fit_transform(x_card)

In [None]:
# Ward-linkage dendrogram of the standardized card data,
# used to identify the best number of clusters.
dendgr = dendrogram(linkage(x_card, method='ward'))
# Author's heuristic: find the longest vertical line in the dendrogram and
# count how many branches a horizontal cut through it intersects.

In [14]:
# training: agglomerative clustering with 3 clusters and Ward linkage;
# fit_predict returns the cluster index of each of the sampled clients.
hc_card = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = hc_card.fit_predict(x_card)

In [None]:
# scatter plot of the standardized card features, coloured by hierarchical cluster label
graphic = px.scatter(x = x_card[:,0], y = x_card[:,1], color=labels)
graphic.show()

#### DBSCAN

In [31]:
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

##### Base Ages and Salaries

In [None]:
# generate fixed base: x holds the ages and y the matching salaries
x = [20,27,21,37,46,53,55,47,52,32,39,41,39,48,48]
y = [1000,1200,2900,1850,900,950,2000,2100,3000,5900,4100,5100,7000,5000,6500]

# Convert the two parallel lists into one row per record: [[x0, y0], [x1, y1], ...].
# zip pairs the values by position, so no manual indexing is needed.
base_salary = [[age, salary] for age, salary in zip(x, y)]

# normalize: standardize both columns; base_salary is replaced by its scaled version
scaler_salary = StandardScaler()
base_salary = scaler_salary.fit_transform(base_salary)
base_salary

In [None]:
# training: density-based clustering on the standardized data.
# eps is the neighbourhood radius (in standardized units) and min_samples the
# number of neighbours a point needs to count as part of a dense region.
# No cluster count is given; DBSCAN derives it from the data.
dbscan_salary = DBSCAN(eps=0.95, min_samples=2)
dbscan_salary.fit(base_salary)

In [None]:
# cluster index of each record; DBSCAN labels noise points (outliers) as -1
labels = dbscan_salary.labels_
labels

In [None]:
# scatter plot of the standardized records, coloured by DBSCAN label
graphic = px.scatter(x = base_salary[:,0], y = base_salary[:,1], color = labels)
graphic.show()

##### Base Credit Card

In [27]:
# import base (header = 1: column names are taken from the second line of the file)
base_card = pd.read_csv('../examples/credit_card_clients.csv', header = 1)

# total billed amount per client: sum of the columns BILL_AMT1 .. BILL_AMT6
base_card['BILL_TOTAL'] = base_card['BILL_AMT1'] + base_card['BILL_AMT2'] + base_card['BILL_AMT3'] + base_card['BILL_AMT4'] + base_card['BILL_AMT5'] + base_card['BILL_AMT6']

# Feature matrix: credit limit and total bill, selected by name instead of by
# position (previously 1 and 25) so the selection cannot silently shift.
# Only the first 1000 rows are used, as in the hierarchical section.
x_card = base_card[['LIMIT_BAL', 'BILL_TOTAL']].values[:1000]

# normalize values
scaler_card = StandardScaler()
x_card = scaler_card.fit_transform(x_card)

In [37]:
# training: DBSCAN on the standardized card features.
# eps is the neighbourhood radius and min_samples the number of neighbours a
# point needs to count as part of a dense region; fit_predict returns the
# label of every sampled client.
dbscan_card = DBSCAN(eps=0.37, min_samples=5)
labels = dbscan_card.fit_predict(x_card)

In [None]:
# distinct labels and how many clients received each one (label -1 is DBSCAN noise)
np.unique(labels, return_counts = True)

In [None]:
# scatter plot of the standardized card features, coloured by DBSCAN label
graphic = px.scatter(x = x_card[:,0], y = x_card[:,1], color=labels)
graphic.show()