In [1]:
from keras.datasets import mnist
import numpy as np
import pandas as pd
import os, time
import pickle, gzip

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import matplotlib as mpl

%matplotlib inline

from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [2]:
# loading
(X_train, y_train), (X_test, y_test) = mnist.load_data()

#shape of dataset
print('X_train: ' + str(X_train.shape))
print('Y_train: ' + str(y_train.shape))
print('X_test:  '  + str(X_test.shape))
print('Y_test:  '  + str(y_test.shape))

X_train: (60000, 28, 28)
Y_train: (60000,)
X_test:  (10000, 28, 28)
Y_test:  (10000,)


In [3]:
X_validation = X_train[50000:]
y_validation = y_train[50000:]
X_train = X_train[:50000]
y_train = y_train[:50000]

print('X_train: ' + str(X_train.shape))
print('Y_train: ' + str(y_train.shape))
print('X_validation: ' + str(X_validation.shape))
print('Y_validation: ' + str(y_validation.shape))

X_train = X_train.reshape(-1,28*28)
X_validation = X_validation.reshape(-1, 28*28)
X_test = X_test.reshape(-1, 28*28)

print('X_train: ' + str(X_train.shape))
print('X_validation: ' + str(X_validation.shape))
print('X_test: ' + str(X_test.shape))

X_train: (50000, 28, 28)
Y_train: (50000,)
X_validation: (10000, 28, 28)
Y_validation: (10000,)
X_train: (50000, 784)
X_validation: (10000, 784)
X_test: (10000, 784)


In [4]:
# 데이터셋으로부터 pandas data frame 생성
train_index = range(0, len(X_train))
validation_index = range(len(X_train), len(X_train)+len(X_validation))
test_index = range(len(X_train)+len(X_validation), len(X_train)+len(X_validation)+len(X_test))

X_train = pd.DataFrame(data = X_train, index = train_index)
y_train = pd.Series(data = y_train, index=train_index)

X_validation = pd.DataFrame(data=X_validation, index=validation_index)
y_validation = pd.Series(data=y_validation, index=validation_index)

X_test = pd.DataFrame(data=X_test, index=test_index)
y_test = pd.Series(data=y_test, index=test_index)

In [5]:
from sklearn.decomposition import PCA

n_components = 784
whiten = False
random_state=2018

pca = PCA(n_components=n_components, whiten=whiten, random_state=random_state)

X_train_PCA = pca.fit_transform(X_train)
X_train_PCA = pd.DataFrame(data=X_train_PCA, index=train_index)

In [6]:
def analyzeCluster(clusterDF, labelsDF):
    countByCluster = pd.DataFrame(data=clusterDF['cluster'].value_counts())
    countByCluster.reset_index(inplace=True, drop=False)
    countByCluster.columns = ['cluster', 'clusterCount']
    
    preds = pd.concat([labelsDF, clusterDF], axis=1)
    preds.columns = ['trueLabel', 'cluster']
    
    countByLabel = pd.DataFrame(data=preds.groupby('trueLabel').count())
    
    countMostFreq = pd.DataFrame(data=preds.groupby('cluster').agg(lambda x:x.value_counts().iloc[0])) # 가장 자주 발생하는 숫자의 개수
    countMostFreq.reset_index(inplace=True, drop=False)
    countMostFreq.columns = ['cluster', 'countMostFrequent']
    
    # 군집의 전체 정확도
    accuracyDF = countMostFreq.merge(countByCluster, left_on="cluster", right_on = "cluster")
    overallAccuracy = accuracyDF.countMostFrequent.sum()/accuracyDF.clusterCount.sum()
    # 모든 군집별로 가장 자주 발생하는 관측치 개수 합을 훈련셋의 총 관측치 개수로 나눈 값
    
    accuracyByLabel = accuracyDF.countMostFrequent/accuracyDF.clusterCount
    
    return countByCluster, countByLabel, countMostFreq, accuracyDF, overallAccuracy, accuracyByLabel

### DBSCAN
- 노이즈 응용 밀도 기반 공간 클러스터링(density-based spatial clustering of applications with noise : DBSCAN)
- 데이터 포인트의 밀도에 따라 그룹화한다.
- eps(최대 거리)와 min_samples(최소 샘플) 설정 필요

In [8]:
from sklearn.cluster import DBSCAN

eps = 3
min_samples = 5
leaf_size = 30
n_jobs = 4

db = DBSCAN(eps=eps, min_samples=min_samples, leaf_size=leaf_size, n_jobs=n_jobs)

cutoff=99
X_train_PCA_dbscanClustered = db.fit_predict(X_train_PCA.loc[:, 0:cutoff])
X_train_PCA_dbscanClustered = pd.DataFrame(data=X_train_PCA_dbscanClustered, index=X_train.index, columns=['cluster'])

countByCluster_dbscan, countByLabel_dbscan, countMostFreq_dbscan, accuracyDF_dbscan,\
overallAccuracy_dbscan, accuracyByLabel_dbscan = analyzeCluster(X_train_PCA_dbscanClustered, y_train)

overallAccuracy_dbscan

0.11356

- DBSCAN은 이 데이터셋의 관측치를 클러스터링하는 데 적합하지 않음

In [9]:
countByCluster_dbscan

Unnamed: 0,cluster,clusterCount
0,-1,50000


### HDBSCAN
- 계층적 노이즈 응용 밀도 기반 공간 클러스터링(HDBSCAN) 또는 계층적 DBSCAN으로 알려져 있다. 
- 밀도를 기반으로 그룹화하고 계층적 클러스터링 알고리즘첯럼 거리를 기준으로 밀도 기반 군집을 반복적으로 연결한다.
- min_samples를 none으로 설정하면 min_cluster_size로 기본 설정 된다,

In [17]:
import hdbscan
min_cluster_size = 30
min_samples = None
alpha = 1.0
cluster_selection_method = 'eom'

hdb = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size, min_samples = min_samples,
                      alpha = alpha, cluster_selection_method = cluster_selection_method)

cutoff = 10
X_train_PCA_hdbscanClustered = hdb.fit_predict(X_train_PCA.loc[:, 0:cutoff])

X_train_PCA_hdbscanClustered = pd.DataFrame(data=X_train_PCA_hdbscanClustered, index = X_train.index, columns = ['cluster'])

countByCluster_hdbscan, countByLabel_hdbscan, countMostFreq_hdbscan, accuracyDF_hdbscan,\
overallAccuracy_hdbscan, accuracyByLabel_hdbscan = analyzeCluster(X_train_PCA_hdbscanClustered, y_train)

overallAccuracy_hdbscan

0.24696

In [18]:
countByCluster_hdbscan

Unnamed: 0,cluster,clusterCount
0,-1,42570
1,4,5140
2,7,942
3,0,605
4,6,295
5,3,252
6,1,119
7,5,45
8,2,32


- 포인트 대부분은 클러스터링되지 않았고 그 이후부터는 작은 크기의 군집들이 long tail을 이룬다.