<a href="https://colab.research.google.com/github/MpRonald/Machine-Learning/blob/main/K_Means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# K-Means

In [3]:
!pip install -q plotly --upgrade

[K     |████████████████████████████████| 28.8 MB 1.8 MB/s 
[?25h

In [6]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Salary Dataset

In [19]:
# Read the salary CSV directly from GitHub and preview its first rows
salary_url = 'https://raw.githubusercontent.com/MpRonald/datasets/main/Salary.csv'
salary = pd.read_csv(salary_url)
salary.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343
1,1.3,46205
2,1.5,37731
3,2.0,43525
4,2.2,39891


In [20]:
# (rows, columns) of the salary DataFrame
salary.shape

(35, 2)

In [21]:
# Years of experience as a NumPy array (x-axis of the scatter plot below)
X = salary.loc[:, 'YearsExperience'].values
X

array([ 1.1,  1.3,  1.5,  2. ,  2.2,  2.9,  3. ,  3.2,  3.2,  3.7,  3.9,
        4. ,  4. ,  4.1,  4.5,  4.9,  5.1,  5.3,  5.9,  6. ,  6.8,  7.1,
        7.9,  8.2,  8.7,  9. ,  9.5,  9.6, 10.3, 10.5, 11.2, 11.5, 12.3,
       12.9, 13.5])

In [22]:
# Salary as a NumPy array (y-axis of the scatter plot below)
y = salary.loc[:, 'Salary'].values
y

array([ 39343,  46205,  37731,  43525,  39891,  56642,  60150,  54445,
        64445,  57189,  63218,  55794,  56957,  57081,  61111,  67938,
        66029,  83088,  81363,  93940,  91738,  98273, 101302, 113812,
       109431, 105582, 116969, 112635, 122391, 121872, 127345, 126756,
       128765, 135675, 139465])

In [23]:
# Raw data: salary plotted against years of experience, before any scaling or clustering
graph = px.scatter(x = X, y = y)
graph.show()

In [24]:
# Standardize both columns; the fitted scaler is kept so that results can be
# mapped back to the original units later with inverse_transform.
scaler_salary = StandardScaler().fit(salary)
salary_base = scaler_salary.transform(salary)

In [25]:
# Standardized matrix produced by scaler_salary (one row per observation, one column per salary column)
salary_base

array([[-1.46039873, -1.40702756],
       [-1.40432198, -1.19055984],
       [-1.34824523, -1.45787949],
       [-1.20805336, -1.27510276],
       [-1.15197661, -1.38974043],
       [-0.955708  , -0.86131566],
       [-0.92766962, -0.75065276],
       [-0.87159288, -0.93062192],
       [-0.87159288, -0.61516328],
       [-0.73140101, -0.84406007],
       [-0.67532426, -0.65387005],
       [-0.64728589, -0.88806655],
       [-0.64728589, -0.85137871],
       [-0.61924751, -0.84746702],
       [-0.50709402, -0.72033719],
       [-0.39494052, -0.50497357],
       [-0.33886377, -0.56519463],
       [-0.28278703, -0.02705373],
       [-0.11455678, -0.08147035],
       [-0.08651841,  0.31528198],
       [ 0.13778858,  0.24581799],
       [ 0.2219037 ,  0.45197021],
       [ 0.44621069,  0.54752264],
       [ 0.53032581,  0.9421614 ],
       [ 0.67051768,  0.80395897],
       [ 0.7546328 ,  0.68253893],
       [ 0.89482467,  1.04175169],
       [ 0.92286305,  0.90503191],
       [ 1.11913166,

In [26]:
# creating kmeans object
# random_state is pinned (same seed as the KMeans calls on the credit data) so the
# centroid initialisation, and therefore the cluster numbering shown in the
# following cells, is reproducible on Restart & Run All.
kmeans_salary = KMeans(n_clusters=3, random_state=7).fit(salary_base)

In [27]:
# checking centroids
# These are in standardized units, because the model was fit on salary_base.
centroids = kmeans_salary.cluster_centers_
centroids

array([[-0.86829424, -0.92667126],
       [ 0.25305745,  0.431192  ],
       [ 1.3870539 ,  1.31918705]])

In [29]:
# inverse scaler: map the centroids from standardized space back to the original feature units
centroids_inverse = scaler_salary.inverse_transform(kmeans_salary.cluster_centers_)

In [31]:
# Centroids after scaler_salary.inverse_transform, i.e. expressed in the original column units
centroids_inverse

array([[3.21176471e+00, 5.45702353e+04],
       [7.21111111e+00, 9.76143333e+04],
       [1.12555556e+01, 1.25763667e+05]])

In [32]:
# Cluster index assigned by the fitted model to each row of salary_base
labels = kmeans_salary.labels_
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

In [40]:
# Standardized observations coloured by their cluster label
graph1 = px.scatter(x=salary_base[:, 0], y=salary_base[:, 1], color=labels)
# Centroid markers: one size entry per centroid, derived from the array itself
# so the plot keeps working if n_clusters is changed.
graph2 = px.scatter(x=centroids[:, 0], y=centroids[:, 1], size=[12] * len(centroids))
# Overlay both trace sets in a single figure
graph3 = go.Figure(data=graph1.data + graph2.data)
graph3.show()

# Random Dataset

In [41]:
# library to create random dataset
from sklearn.datasets import make_blobs

In [46]:
# Synthetic dataset: 200 samples drawn around 5 centers; random_state makes the draw repeatable.
# y_random is returned by make_blobs but is not used by the clustering cells shown here.
X_random, y_random = make_blobs(n_samples=200, centers=5, random_state=123)

In [47]:
# Unlabelled view of the synthetic points (first feature vs second feature)
graph_random = px.scatter(x=X_random[:,0], y=X_random[:,1])
graph_random.show()

In [49]:
# Fit K-Means with k=5 (the number of centers requested from make_blobs).
# random_state is pinned so labels and centroids are reproducible between runs.
kmeans_blobs = KMeans(n_clusters=5, random_state=7).fit(X_random)
# Cluster index for every synthetic point
labels_blobs = kmeans_blobs.predict(X_random)
# Centroid coordinates in the original (unscaled) blob space
centroids_blobs = kmeans_blobs.cluster_centers_

In [52]:
# Colour the synthetic points by THEIR OWN cluster labels (labels_blobs).
# The salary-section `labels` array has a different length and belongs to a different model.
graph1 = px.scatter(x=X_random[:, 0], y=X_random[:, 1], color=labels_blobs)
# Centroid markers: one size entry per centroid, derived from the array itself
graph2 = px.scatter(
    x=centroids_blobs[:, 0],
    y=centroids_blobs[:, 1],
    size=[5] * len(centroids_blobs),
)
# Overlay points and centroids in a single figure
graph3 = go.Figure(data=graph1.data + graph2.data)
graph3.show()

# Credit Dataset

In [53]:
# Load the credit-card clients CSV; header=1 takes the column names from the file's second row
credit_url = 'https://raw.githubusercontent.com/MpRonald/datasets/main/credit_card_clients.csv'
credit = pd.read_csv(credit_url, header=1)
credit.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [54]:
# (rows, columns) of the credit DataFrame as loaded
credit.shape

(30000, 25)

In [56]:
# Total billed amount per customer: element-wise sum of BILL_AMT1 .. BILL_AMT6
bill_columns = [f'BILL_AMT{month}' for month in range(1, 7)]
credit['TOTAL_BILL'] = sum(credit[column] for column in bill_columns)

In [58]:
# Preview again to confirm the new TOTAL_BILL column was appended
credit.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,TOTAL_BILL
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,689,0,0,0,0,1,7704
1,2,120000,2,2,2,26,-1,2,0,0,...,3455,3261,0,1000,1000,1000,0,2000,1,17077
2,3,90000,2,2,2,34,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,0,101653
3,4,50000,2,2,1,37,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,0,231334
4,5,50000,1,2,1,57,-1,0,-1,0,...,19146,19131,2000,36681,10000,9000,689,679,0,109339


In [60]:
# Two clustering features, selected by name: credit limit and total billed amount
card_columns = ['LIMIT_BAL', 'TOTAL_BILL']
X_card = credit[card_columns].values
X_card

array([[ 20000,   7704],
       [120000,  17077],
       [ 90000, 101653],
       ...,
       [ 30000,  70496],
       [ 80000, 266611],
       [ 50000, 230874]])

In [61]:
# Standardize the two card features; X_card is rebound to the scaled matrix
scaler_card = StandardScaler().fit(X_card)
X_card = scaler_card.transform(X_card)

In [67]:
# Elbow method: record the inertia (within-cluster sum of squares) for k = 1..10
wcss = []
for n_clusters in range(1, 11):
    kmeans_card = KMeans(n_clusters=n_clusters, random_state=7)
    kmeans_card.fit(X_card)
    wcss.append(kmeans_card.inertia_)

In [68]:
# Inertia values for k = 1..10, in that order
wcss

[59999.99999999978,
 35197.10962274106,
 20128.162901796117,
 14943.801288165927,
 10707.257061206496,
 8603.445378576838,
 7397.864309984142,
 6351.592150960087,
 5669.005519689427,
 5053.519290292112]

In [69]:
# Elbow plot: inertia (y) against number of clusters k (x)
grap_line = px.line(x=range(1,11), y=wcss)
grap_line.show()

In [71]:
# Final two-feature model with k=4 (presumably read off the elbow plot, to be confirmed)
kmeans_card = KMeans(n_clusters=4, random_state=7).fit(X_card)
# Labels of the training rows, as stored on the fitted estimator
labels_card = kmeans_card.labels_

In [74]:
# Standardized card features (column 0 vs column 1 of X_card) coloured by cluster label
graph1 = px.scatter(x = X_card[:,0], y = X_card[:,1], color=labels_card)
graph1.show()

In [76]:
# concat dataset and labels: append each row's cluster label as an extra last column.
# The result is a plain NumPy array, so the DataFrame's column names are not carried over.
costumer_list = np.column_stack((credit, labels_card))
costumer_list

array([[     1,  20000,      2, ...,      1,   7704,      0],
       [     2, 120000,      2, ...,      1,  17077,      0],
       [     3,  90000,      2, ...,      0, 101653,      0],
       ...,
       [ 29998,  30000,      1, ...,      1,  70496,      0],
       [ 29999,  80000,      1, ...,      1, 266611,      0],
       [ 30000,  50000,      1, ...,      1, 230874,      0]])

# Credit Dataset Part II
### We'll use more attributes about customers

In [79]:
# Wider feature set, selected by name: credit limit, demographics and total billed amount
plus_columns = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'TOTAL_BILL']
X_card_plus = credit[plus_columns].values

In [80]:
# Standardize the six features; X_card_plus is rebound to the scaled matrix
scale_card_plus = StandardScaler().fit(X_card_plus)
X_card_plus = scale_card_plus.transform(X_card_plus)

In [81]:
# Elbow method on the six-feature matrix: inertia for k = 1..10.
# A scratch name is used for the per-k model so this loop does not overwrite
# `kmeans_card`, the fitted model from the two-feature section.
wcss = []
for n_clusters in range(1, 11):
    kmeans_elbow = KMeans(n_clusters=n_clusters, random_state=7).fit(X_card_plus)
    wcss.append(kmeans_elbow.inertia_)

In [82]:
# Elbow plot for the six-feature data: inertia (y) against number of clusters k (x)
grap_line = px.line(x=range(1,11), y=wcss)
grap_line.show()

In [84]:
# Final six-feature model with k=4
kmeans_card_plus = KMeans(n_clusters=4, random_state=7)
# Fit and label with the estimator created on the line above
# (not with `kmeans_card`, which is a different model object).
labels_card_plus = kmeans_card_plus.fit_predict(X_card_plus)

In [85]:
# PCA: reduce the standardized six-feature matrix to two components so it can be drawn in 2-D
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_card_plus_pca = pca.fit_transform(X_card_plus)

In [86]:
# 2-D PCA view of the six-feature data, coloured by the labels of the six-feature
# clustering (labels_card_plus), not by the earlier two-feature labels.
graph1 = px.scatter(
    x=X_card_plus_pca[:, 0],
    y=X_card_plus_pca[:, 1],
    color=labels_card_plus,
)
graph1.show()

# END