In [1]:
# Numerical and plotting stack used by the cells below.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='white')
# NOTE(review): two backend magics are issued back to back. The figure outputs
# recorded in this notebook are Javascript objects, so the second magic appears
# to be the one in effect. Consider keeping only one of them.
%matplotlib inline
%matplotlib notebook
#%config InlineBackend.figure_format = 'retina'
from sklearn import decomposition
from sklearn import datasets
# Axes3D is never referenced by name in the visible cells; presumably it is
# imported so that projection='3d' is registered on older Matplotlib - TODO confirm.
from mpl_toolkits.mplot3d import Axes3D


In [5]:
# Load the iris bunch and unpack its feature matrix and integer class targets.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [8]:
# Start from a clean figure 1 holding a single, freshly cleared 3D axes.
fig = plt.figure(1, figsize=(6, 5))
fig.clf()
ax = fig.add_subplot(projection='3d')
ax.cla()

# Put each species name at the mean of its points over the first three
# features, lifted by 1.5 along the second feature.
species_names = ('Setosa', 'Versicolour', 'Virginica')
for label, name in enumerate(species_names):
    in_class = (y == label)
    ax.text3D(
        X[in_class, 0].mean(),
        X[in_class, 1].mean() + 1.5,
        X[in_class, 2].mean(),
        name,
        horizontalalignment='center',
        bbox={'alpha': .5, 'edgecolor': 'w', 'facecolor': 'w'},
    )

# Remap the class ids (0 -> 1, 1 -> 2, 2 -> 0) to floats before colouring.
y_clr = np.array([1, 2, 0], dtype=float)[y]
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y_clr, cmap=plt.cm.nipy_spectral)

# Hide the tick labels on all three axes.
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.set_ticklabels([])

<IPython.core.display.Javascript object>

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Hold out 30% of the samples, stratified on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Depth-2 tree fitted on the training portion.
clf = DecisionTreeClassifier(max_depth=2, random_state=42).fit(X_train, y_train)

# Turn class probabilities into predicted labels via the most probable column.
preds = clf.predict_proba(X_test)
predicted_labels = np.argmax(preds, axis=1)
print(accuracy_score(y_test, predicted_labels))

0.8888888888888888


In [10]:
# Subtract the per-feature mean, then fit a two-component PCA on the result.
X_centred = X - np.mean(X, axis=0)

pca = decomposition.PCA(n_components=2)
pca.fit(X_centred)


In [11]:
# Project the centred features onto the two fitted principal components.
X_pca = pca.transform(X_centred)

In [12]:
# Shape of the feature matrix before dimensionality reduction.
X.shape

(150, 4)

In [13]:
# Shape of the PCA projection (one column per retained component).
X_pca.shape

(150, 2)

In [14]:
# Same split settings and tree as for the raw features, applied to the PCA projection.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
clf = DecisionTreeClassifier(max_depth=2, random_state=42).fit(X_train, y_train)

# Predicted label = index of the most probable class.
preds = clf.predict_proba(X_test)
predicted_labels = np.argmax(preds, axis=1)
print(accuracy_score(y_test, predicted_labels))

0.9111111111111111


In [15]:
# Share of the total variance explained by each of the retained components.
pca.explained_variance_ratio_

array([0.92461872, 0.05306648])

In [16]:
# Load the digits bunch; X and y are rebound here to the digits data and
# targets (they held the iris data in the cells above).
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [17]:
# Show the first ten samples as 8x8 grayscale images on a 2x5 grid.
plt.figure(figsize=(10, 6))
for i, flat_image in enumerate(X[:10]):
    plt.subplot(2, 5, i + 1)
    plt.imshow(flat_image.reshape(8, 8), cmap='gray')

<IPython.core.display.Javascript object>

In [18]:
%%time
pca = decomposition.PCA(n_components=2)
X_reduced = pca.fit_transform(X)

plt.figure(figsize=(10,6))
plt.scatter(X_reduced[:, 0], X_reduced[:,1], c = y, alpha=0.7, cmap = plt.cm.get_cmap('nipy_spectral', 10))
plt.colorbar()



<IPython.core.display.Javascript object>

CPU times: total: 1.48 s
Wall time: 370 ms




<matplotlib.colorbar.Colorbar at 0x1ade842efb0>

In [19]:
%%time

from sklearn.manifold import TSNE

tsne = TSNE(random_state=17)

X_tsne = tsne.fit_transform(X)
plt.figure(figsize=(10,6))
plt.scatter(X_tsne[:, 0], X_tsne[:,1], c = y, alpha=0.7, cmap = plt.cm.get_cmap('nipy_spectral', 10))
plt.colorbar()

<IPython.core.display.Javascript object>

CPU times: total: 28.5 s
Wall time: 6.39 s




<matplotlib.colorbar.Colorbar at 0x1ade8dfdb40>

In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

In [7]:
# Read the mall customer table; the relative path means the CSV is looked up
# in the notebook's working directory.
mall_data = pd.read_csv("Mall_Customers.csv")

In [8]:
# Display the loaded frame (the recorded output is a truncated head/tail view).
mall_data

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
mall_data.describe()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [10]:
# Number of missing values in each column.
mall_data.isna().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [11]:
from sklearn.cluster import KMeans

In [12]:
# Numeric columns used as clustering features (CustomerID and Gender are left out).
X_numerics = mall_data[
    ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
]

In [13]:
!pip install yellowbrick



In [14]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
from yellowbrick.cluster import KElbowVisualizer

# Seeded, unfitted K-Means handed to the elbow visualizer together with the
# candidate-k specification (2, 10).
model = KMeans(random_state=1)
visualizer = KElbowVisualizer(model, k=(2, 10))

# Fit on the numeric customer features, then render the elbow plot.
visualizer.fit(X_numerics)
visualizer.show()

plt.show()

<IPython.core.display.Javascript object>

In [19]:
# Same elbow search as the previous cell, scored with the silhouette metric.
model = KMeans(random_state=1)
visualizer = KElbowVisualizer(
    model,
    k=(2, 10),
    metric='silhouette',
)

visualizer.fit(X_numerics)
visualizer.show()

plt.show()

In [20]:
# Fit the final five-cluster model. The seed is fixed so that the cluster ids
# are reproducible across kernel restarts; it matches the random_state=1 used
# by the KMeans models in the elbow cells.
KM_5_clusters = KMeans(n_clusters=5, random_state=1).fit(X_numerics)

In [21]:
# Cluster id assigned to each row of X_numerics, in row order.
KM_5_clusters.labels_

array([2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
       2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 0,
       2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 4, 0, 4, 1, 4, 1, 4,
       1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
       1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
       1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4,
       1, 4])

In [22]:
# New frame: the clustering features plus a 'Cluster' column holding each
# row's K-Means label (X_numerics itself is left untouched).
KM_clustered = X_numerics.assign(Cluster=KM_5_clusters.labels_)

In [48]:
# Display the features together with their assigned cluster id.
KM_clustered

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Cluster
0,19,15,39,2
1,21,15,81,3
2,20,16,6,2
3,23,16,77,3
4,31,17,40,2
...,...,...,...,...
195,35,120,79,4
196,45,126,28,1
197,32,126,74,4
198,32,137,18,1


In [35]:
# Rebuilds the same three-column feature selection from mall_data that an
# earlier cell already assigned to X_numerics.
X_numerics = mall_data[
    ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
]

In [54]:
fig1, axes = plt.subplots(1, 2, figsize=(12, 5))

# Each panel plots spending score against one feature. The integer is that
# feature's column position in the fitted cluster centres
# (0 = Age, 1 = Annual Income, 2 = Spending Score).
centres = KM_5_clusters.cluster_centers_
panels = [('Annual Income (k$)', 1), ('Age', 0)]

for panel_ax, (feature, centre_col) in zip(axes, panels):
    # Customers coloured by cluster id.
    sns.scatterplot(
        x=feature,
        y='Spending Score (1-100)',
        data=KM_clustered,
        hue='Cluster',
        ax=panel_ax,
        palette='Set1',
        legend='full',
    )
    # Cluster centres overlaid as blue squares.
    panel_ax.scatter(centres[:, centre_col], centres[:, 2], marker='s', s=40, c='blue')

plt.show()


<IPython.core.display.Javascript object>

In [36]:
# Number of customers per cluster, as a one-column frame named "KM_size".
KM_clust_sizes = KM_clustered.groupby("Cluster").size().to_frame(name="KM_size")
KM_clust_sizes

Unnamed: 0_level_0,KM_size
Cluster,Unnamed: 1_level_1
0,79
1,37
2,23
3,22
4,39


In [39]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(projection='3d')

# One point per customer in (age, income, spending score) space, coloured by cluster id.
ax.scatter(
    KM_clustered['Age'],
    KM_clustered['Annual Income (k$)'],
    KM_clustered['Spending Score (1-100)'],
    c=KM_clustered['Cluster'],
    s=35,
    edgecolor='k',
    cmap=plt.cm.Set1,
)

# Hide the tick labels on all three axes; the axis names are still shown.
for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis.set_ticklabels([])

ax.set_xlabel('Age')
ax.set_ylabel('Annual Income (k$)')
ax.set_zlabel('Spending Score (1-100)')
ax.set_title('3D View')


<IPython.core.display.Javascript object>

Text(0.5, 0.92, '3D View')