#### Imports

In [85]:
%matplotlib notebook

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.datasets import load_breast_cancer

plt.rcParams['figure.figsize']=(9,8)
plt.style.use('fivethirtyeight')

#### PCA

In [86]:
plt.style.use('ggplot')
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

df = pd.read_csv(url, names=['sepal length','sepal width','petal length','petal width','target'])

from sklearn.preprocessing import StandardScaler
features = ['sepal length', 'sepal width', 'petal length', 'petal width']

x = df.loc[:, features].values

y = df.loc[:,['target']].values

x = StandardScaler().fit_transform(x)


pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, df[['target']]], axis = 1)
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,target
0,-2.264542,0.505704,Iris-setosa
1,-2.086426,-0.655405,Iris-setosa
2,-2.36795,-0.318477,Iris-setosa
3,-2.304197,-0.575368,Iris-setosa
4,-2.388777,0.674767,Iris-setosa


In [87]:
fig = plt.figure(figsize = (9,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf['target'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [62]:
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

In [63]:
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,target
0,-2.264542,0.505704,Iris-setosa
1,-2.086426,-0.655405,Iris-setosa
2,-2.36795,-0.318477,Iris-setosa
3,-2.304197,-0.575368,Iris-setosa
4,-2.388777,0.674767,Iris-setosa


In [64]:
principalDf = principalDf.values

#### K-means

In [65]:
kmeans = KMeans(n_clusters = 4, random_state = 0)
kmeans.fit(principalDf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [66]:
kmeans.cluster_centers_

array([[ 0.56713803, -0.8076751 ],
       [-2.20485567,  0.96383837],
       [ 1.72236912,  0.59990509],
       [-2.23819831, -0.49619109]])

In [67]:
print(kmeans.cluster_centers_[:, 0:1])
print("\n",kmeans.cluster_centers_[:, 1:2])

[[ 0.56713803]
 [-2.20485567]
 [ 1.72236912]
 [-2.23819831]]

 [[-0.8076751 ]
 [ 0.96383837]
 [ 0.59990509]
 [-0.49619109]]


In [68]:
kmeans.cluster_centers_[:, 0]

array([ 0.56713803, -2.20485567,  1.72236912, -2.23819831])

In [69]:
kmeans.cluster_centers_[:, 1]

array([-0.8076751 ,  0.96383837,  0.59990509, -0.49619109])

In [70]:
import matplotlib.pyplot as plt

plt.scatter(principalDf[:,0], principalDf[:,1], s = 100, c = kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s = 300, c = 'red',label = 'Centroids')
plt.title('Iris Clusters and Centroids')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.legend()

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### Method Elbow

In [71]:
wcss = []
 
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'random')
    kmeans.fit(principalDf)
    print(i,kmeans.inertia_)
    wcss.append(kmeans.inertia_)  
plt.plot(range(1, 11), wcss)
plt.title('O Metodo Elbow')
plt.xlabel('Numero de Clusters')
plt.ylabel('WSS') #within cluster sum of squares
plt.show()

1 574.8058521688922
2 198.7068359263738
3 116.10924021401539
4 90.08937759503056
5 66.36791000282973
6 57.44837562364759
7 48.06757232729
8 40.07362197263733
9 32.39181272772188
10 28.60364556238956


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Breast Cancer Dataset

In [72]:
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)

df = pd.DataFrame(data = y_cancer
             , columns = ['target'])
df['target'] = df['target'].map({0: 'malignant', 1: 'benign'})

# Before applying PCA, each feature should be centered (zero mean) and with unit variance
x = StandardScaler().fit_transform(X_cancer)

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf_cancer = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
finalDf_cancer = pd.concat([principalDf_cancer, df[['target']]], axis = 1)
finalDf_cancer.head()

Unnamed: 0,principal component 1,principal component 2,target
0,9.192837,1.948583,malignant
1,2.387802,-3.768172,malignant
2,5.733896,-1.075174,malignant
3,7.122953,10.275589,malignant
4,3.935302,-1.948072,malignant


In [73]:
fig = plt.figure(figsize = (15,10))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('Breast Cancer Dataset PCA (n_components = 2)', fontsize = 20)
targets = ['malignant', 'benign']
colors = ['r', 'b']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf_cancer['target'] == target
    ax.scatter(finalDf_cancer.loc[indicesToKeep, 'principal component 1']
               , finalDf_cancer.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [74]:
principalDf_cancer = principalDf_cancer.values

#### K-means

In [75]:
kmeans = KMeans(n_clusters = 6, random_state = 0)
kmeans.fit(principalDf_cancer)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=6, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [76]:
kmeans.cluster_centers_

array([[-1.75755432,  1.21165013],
       [ 3.51008647,  7.61714952],
       [ 3.18216498, -2.25165731],
       [-2.86454853, -1.22420533],
       [ 8.60785363, -1.20937002],
       [ 2.8829055 ,  2.1090608 ]])

In [77]:
print(kmeans.cluster_centers_[:, 0:1])
print("\n",kmeans.cluster_centers_[:, 1:2])

[[-1.75755432]
 [ 3.51008647]
 [ 3.18216498]
 [-2.86454853]
 [ 8.60785363]
 [ 2.8829055 ]]

 [[ 1.21165013]
 [ 7.61714952]
 [-2.25165731]
 [-1.22420533]
 [-1.20937002]
 [ 2.1090608 ]]


In [78]:
kmeans.cluster_centers_[:, 0]

array([-1.75755432,  3.51008647,  3.18216498, -2.86454853,  8.60785363,
        2.8829055 ])

In [79]:
kmeans.cluster_centers_[:, 1]

array([ 1.21165013,  7.61714952, -2.25165731, -1.22420533, -1.20937002,
        2.1090608 ])

In [80]:
import matplotlib.pyplot as plt

plt.scatter(principalDf_cancer[:,0], principalDf_cancer[:,1], s = 100, c = kmeans.labels_)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1], s = 300, c = 'red',label = 'Centroids')
plt.title('Iris Clusters and Centroids')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')
plt.legend()

plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### Method Elbow

In [81]:
wcss = []
 
for i in range(1, 20):
    kmeans = KMeans(n_clusters = i, init = 'random')
    kmeans.fit(principalDf)
    print(i,kmeans.inertia_)
    wcss.append(kmeans.inertia_)  
plt.plot(range(1, 20), wcss)
plt.title('O Metodo Elbow')
plt.xlabel('Numero de Clusters')
plt.ylabel('WSS') #within cluster sum of squares
plt.show()

1 574.8058521688922
2 198.7068359263738
3 116.10924021401539
4 89.79727963494844
5 66.3739482870062
6 57.435099342876015
7 48.0669164718562
8 40.12456710844639
9 36.92657659605983
10 28.51610234581303
11 25.58846847474551
12 24.28254870705473
13 20.14354338236941
14 19.629810172257116
15 18.562496501697836
16 16.94738855791256
17 14.027845456423726
18 14.602487999581838
19 14.131496074940816


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### fruit_data_with_color

In [82]:
fruits = pd.read_table('../data/fruit_data_with_colors.txt')
X_fruits = fruits[['mass','width','height', 'color_score']].values
y_fruits = (fruits[['fruit_label']] - 1).values


df_fruits = pd.DataFrame(data = y_fruits
             , columns = ['target'])
df_fruits['target'] = df_fruits['target'].map({0: 'apple', 1: 'mandarin',2: 'orange', 3: 'lemon'})

# Before applying PCA, each feature should be centered (zero mean) and with unit variance
x = StandardScaler().fit_transform(X_fruits)

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf_fruits = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
finalDf_fruits = pd.concat([principalDf_fruits, df_fruits[['target']]], axis = 1)
finalDf_fruits.head()

Unnamed: 0,principal component 1,principal component 2,target
0,1.572169,-1.980044,apple
1,0.878417,-1.579373,apple
2,0.529773,-1.732891,apple
3,-2.71261,0.414674,mandarin
4,-2.893412,0.242003,mandarin


In [83]:
fig = plt.figure(figsize = (15,10))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('Fruits Dataset PCA (n_components = 2)', fontsize = 20)
targets = ['apple','mandarin','orange','lemon']
colors = ['r', 'b','black','g']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf_fruits['target'] == target
    ax.scatter(finalDf_fruits.loc[indicesToKeep, 'principal component 1']
               , finalDf_fruits.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>