In [1]:
import numpy as np
import pandas as pd
import os
from sklearn import cluster, mixture, metrics
from sklearn.neighbors import kneighbors_graph,KNeighborsClassifier
import warnings
from sklearn.model_selection import train_test_split

Загружаем таблицу с фотометрическими данными галактик

In [2]:
# Directory that holds the input catalogue.
path = '/home/bulat/data/'
import time
_start_time = time.time()  # timestamp taken right before the catalogue is read
# os.path.join keeps the file path valid whether or not `path` ends with a separator.
train = pd.read_csv(os.path.join(path, 'all_phot_gals.csv'))
# Extra feature: absolute difference between the dered_g and dered_r columns.
train['|dered_g-dered_r|'] = np.abs(train['dered_g']-train['dered_r'])
# iGrId column; the clustering cells below score their partitions against it.
labels = train.iGrId
# Feature table: ra, dec and the |dered_g-dered_r| feature.
train_X = train[['ra','dec','|dered_g-dered_r|']]

In [3]:
# Clusterers that can be fitted on the whole feature table.
# NOTE(review): 107 is presumably the number of distinct iGrId groups - confirm.

# MeanShift bandwidth is estimated from the data itself.
bandwidth = cluster.estimate_bandwidth(train_X,quantile=0.006)
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
# MiniBatchKMeans and GaussianMixture are randomly initialised; a fixed
# random_state (the same value train_test_split uses in this notebook) makes
# their scores repeatable from run to run.
two_means = cluster.MiniBatchKMeans(n_clusters=107, random_state=42)
birch = cluster.Birch(n_clusters=107)
gmm = mixture.GaussianMixture(n_components=107, covariance_type='full', random_state=42)
# (display name, estimator) pairs consumed by the scoring loop.
clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('MeanShift', ms),
        ('Birch', birch),
        ('GaussianMixture', gmm)
    )

In [4]:
# Append an 'ignore' filter for UserWarning to the warnings filter list.
warnings.filterwarnings('ignore', category=UserWarning, append=True)
# Fit every clusterer on the feature table and compare its partition with `labels`.
for name, algorithm in clustering_algorithms:
    y_pred = algorithm.fit_predict(train_X)
    fm_score = round(metrics.fowlkes_mallows_score(labels, y_pred), 3)
    vm_score = round(metrics.v_measure_score(labels, y_pred), 3)
    print(name+' : fowlkes_mallows_score = %.3f, v_measure_score = %.3f' % (fm_score, vm_score))

MiniBatchKMeans : fowlkes_mallows_score = 0.314, v_measure_score = 0.822
MeanShift : fowlkes_mallows_score = 0.686, v_measure_score = 0.927
Birch : fowlkes_mallows_score = 0.538, v_measure_score = 0.892
GaussianMixture : fowlkes_mallows_score = 0.357, v_measure_score = 0.855


Так как на полной таблице алгоритмы DBSCAN и AgglomerativeClustering завершаются с ошибкой MemoryError, разобьём датасет с помощью train_test_split на две части. Меньшую часть кластеризуем алгоритмами DBSCAN и AgglomerativeClustering, а затем по полученным кластерам с помощью KNeighborsClassifier отнесём к кластерам все галактики.

In [5]:
# DBSCAN is fitted on a 20% sample (test_size=0.8); a 3-NN classifier trained on
# that sample's cluster ids then assigns a cluster to every row of train_X.
dbscan = cluster.DBSCAN(eps=4.3)
X_train, X_test, y_train, y_test = train_test_split(
    train_X, labels, test_size=0.8, random_state=42, shuffle=True
)
labels_1 = dbscan.fit(X_train).labels_
KN = KNeighborsClassifier(n_neighbors=3).fit(X_train, labels_1)
labels_pred = KN.predict(train_X)
# Score the propagated cluster ids against the reference labels.
fm_score = round(metrics.fowlkes_mallows_score(labels, labels_pred), 3)
vm_score = round(metrics.v_measure_score(labels, labels_pred), 3)
print('dbscan : fowlkes_mallows_score = %.3f, v_measure_score = %.3f' % (fm_score, vm_score))

dbscan : fowlkes_mallows_score = 0.917, v_measure_score = 0.927


In [6]:
# Single-linkage agglomerative clustering is fitted on a 10% sample
# (test_size=0.9); a 3-NN classifier trained on that sample's cluster ids then
# assigns a cluster to every row of train_X.
# NOTE(review): newer scikit-learn releases name this parameter `metric`
# instead of `affinity` - confirm against the installed version.
single = cluster.AgglomerativeClustering(linkage="single", affinity="cityblock",n_clusters=107)
X_train, X_test, y_train, y_test = train_test_split(train_X, labels, test_size=0.9, random_state=42, shuffle=True) 
single.fit(X_train)
labels_1 = single.labels_
KN = KNeighborsClassifier(n_neighbors=3)
KN.fit(X_train,labels_1)
y_pred = KN.predict(train_X)
# Label uses underscores so it matches the recorded output and the ra/dec-only
# run of the same experiment.
print('single_linkage_AgglomerativeClustering : fowlkes_mallows_score = %.3f, v_measure_score = %.3f' % 
          (round(metrics.fowlkes_mallows_score(labels, y_pred),3), 
           round(metrics.v_measure_score(labels, y_pred),3)))

single_linkage_AgglomerativeClustering : fowlkes_mallows_score = 0.939, v_measure_score = 0.947


Повторим эксперимент, используя в качестве признаков только прямое восхождение и склонение (ra, dec)

In [7]:
# From here on the feature table holds only the ra and dec columns.
position_columns = ['ra', 'dec']
train_X = train[position_columns]

In [8]:
# Rebuild the clusterers for the current feature table.
# NOTE(review): 107 is presumably the number of distinct iGrId groups - confirm.

# MeanShift bandwidth is re-estimated from the current train_X.
bandwidth = cluster.estimate_bandwidth(train_X,quantile=0.006)
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
# MiniBatchKMeans and GaussianMixture are randomly initialised; a fixed
# random_state (the same value train_test_split uses in this notebook) makes
# their scores repeatable from run to run.
two_means = cluster.MiniBatchKMeans(n_clusters=107, random_state=42)
birch = cluster.Birch(n_clusters=107)
gmm = mixture.GaussianMixture(n_components=107, covariance_type='full', random_state=42)
# (display name, estimator) pairs consumed by the scoring loop.
clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('MeanShift', ms),
        ('Birch', birch),
        ('GaussianMixture', gmm)
    )

In [9]:
# Append an 'ignore' filter for UserWarning to the warnings filter list.
warnings.filterwarnings('ignore', category=UserWarning, append=True)
# Fit every clusterer on the current feature table and compare its partition with `labels`.
for name, algorithm in clustering_algorithms:
    y_pred = algorithm.fit_predict(train_X)
    fm_score = round(metrics.fowlkes_mallows_score(labels, y_pred), 3)
    vm_score = round(metrics.v_measure_score(labels, y_pred), 3)
    print(name+' : fowlkes_mallows_score = %.3f, v_measure_score = %.3f' % (fm_score, vm_score))

MiniBatchKMeans : fowlkes_mallows_score = 0.308, v_measure_score = 0.825
MeanShift : fowlkes_mallows_score = 0.606, v_measure_score = 0.916
Birch : fowlkes_mallows_score = 0.396, v_measure_score = 0.872
GaussianMixture : fowlkes_mallows_score = 0.366, v_measure_score = 0.862


In [10]:
# DBSCAN is fitted on a 20% sample (test_size=0.8); a 3-NN classifier trained on
# that sample's cluster ids then assigns a cluster to every row of train_X.
dbscan = cluster.DBSCAN(eps=4.3)
X_train, X_test, y_train, y_test = train_test_split(
    train_X, labels, test_size=0.8, random_state=42, shuffle=True
)
labels_1 = dbscan.fit(X_train).labels_
KN = KNeighborsClassifier(n_neighbors=3).fit(X_train, labels_1)
labels_pred = KN.predict(train_X)
# Score the propagated cluster ids against the reference labels.
fm_score = round(metrics.fowlkes_mallows_score(labels, labels_pred), 3)
vm_score = round(metrics.v_measure_score(labels, labels_pred), 3)
print('dbscan : fowlkes_mallows_score = %.3f, v_measure_score = %.3f' % (fm_score, vm_score))

dbscan : fowlkes_mallows_score = 0.917, v_measure_score = 0.927


In [11]:
# Single-linkage agglomerative clustering is fitted on a 10% sample
# (test_size=0.9); a 3-NN classifier trained on that sample's cluster ids then
# assigns a cluster to every row of train_X.
# NOTE(review): newer scikit-learn releases name this parameter `metric`
# instead of `affinity` - confirm against the installed version.
single = cluster.AgglomerativeClustering(linkage="single", affinity="cityblock",n_clusters=107)
X_train, X_test, y_train, y_test = train_test_split(
    train_X, labels, test_size=0.9, random_state=42, shuffle=True
)
labels_1 = single.fit(X_train).labels_
KN = KNeighborsClassifier(n_neighbors=3).fit(X_train, labels_1)
y_pred = KN.predict(train_X)
# Score the propagated cluster ids against the reference labels.
fm_score = round(metrics.fowlkes_mallows_score(labels, y_pred), 3)
vm_score = round(metrics.v_measure_score(labels, y_pred), 3)
print('single_linkage_AgglomerativeClustering : fowlkes_mallows_score = %.3f, v_measure_score = %.3f' % (fm_score, vm_score))

single_linkage_AgglomerativeClustering : fowlkes_mallows_score = 0.978, v_measure_score = 0.990
