In [19]:
import os
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import django
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits


# Point Django at the project's settings module unless the environment
# already names one (``setdefault`` leaves an existing value untouched).
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "multidex.settings")
# Django's documented opt-out of its async-safety check, so synchronous ORM
# calls are allowed while an event loop is running.
# NOTE(review): presumably needed because the notebook kernel runs an event
# loop -- confirm.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Initialise Django; the project imports below are deliberately placed after
# this call.
django.setup()

# NOTE(review): the star imports hide where names come from. Later cells use
# ``ZSpec`` and ``filter_df_from_queryset``, which presumably come from these
# modules -- TODO replace with explicit imports once their origin is confirmed.
from plotter.models import *
# from mastspec.views import *
# from mastspec.forms import *
from plotter.spectrum_ops import *
from plotter_utils import modeldict, model_metadata_df
# Let pandas show up to 200 rows before it truncates DataFrame output.
pd.set_option('display.max_rows', 200)

In [2]:
def bench_k_means(kmeans, name, data, labels, sample_size=300,
                  random_state=None):
    """Benchmark to evaluate the KMeans initialization methods.

    The estimator is fitted as the last step of a
    ``StandardScaler`` -> ``kmeans`` pipeline, the fit is timed, several
    clustering metrics are computed, and everything is printed as a single
    tab-separated row.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance with the initialization
        already set.
    name : str
        Name given to the strategy. It will be used to show the results in a
        table.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        The labels used to compute the clustering metrics which requires some
        supervision.
    sample_size : int or None, default=300
        Forwarded to :func:`~sklearn.metrics.silhouette_score` as the size of
        the subsample the silhouette is computed on.
    random_state : int, RandomState instance or None, default=None
        Forwarded to :func:`~sklearn.metrics.silhouette_score` to seed that
        subsample. Pass an int to make the silhouette column reproducible;
        the default keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The result row is printed, not returned.
    """
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0

    # The fitted KMeans is the last pipeline step.
    fitted_kmeans = estimator[-1]
    predicted = fitted_kmeans.labels_
    results = [name, fit_time, fitted_kmeans.inertia_]

    # Define the metrics which require only the true labels and estimator
    # labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, predicted) for m in clustering_metrics]

    # The silhouette score requires the full dataset.
    # NOTE: it is evaluated on `data` as passed in (unscaled), while the
    # cluster labels come from the scaled pipeline.
    results.append(
        metrics.silhouette_score(
            data,
            predicted,
            metric="euclidean",
            sample_size=sample_size,
            random_state=random_state,
        )
    )

    # Show the results: name, fit time, inertia, five label-based metrics,
    # silhouette -- nine tab-separated fields.
    formatter_result = ("{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}"
                        "\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}")
    print(formatter_result.format(*results))

In [7]:
# Load the digits dataset as a (features, targets) pair and derive its sizes.
data, labels = load_digits(return_X_y=True)
n_samples, n_features = data.shape
n_digits = np.unique(labels).size

print(
    f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
)

# Table frame: a rule, the column headers, one row per strategy, a rule.
rule = 82 * '_'
print(rule)
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

# The two built-in initialisations share every setting except `init`, and the
# strategy name shown in the table is the `init` string itself.
for init_name in ("k-means++", "random"):
    kmeans = KMeans(init=init_name, n_clusters=n_digits, n_init=4,
                    random_state=0)
    bench_k_means(kmeans=kmeans, name=init_name, data=data, labels=labels)

# Third strategy: seed the centroids with the PCA components; with an explicit
# init array a single run (n_init=1) is requested.
pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)

print(rule)

# digits: 10; # samples: 1797; # features 64
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
k-means++	0.476s	69662	0.680	0.719	0.699	0.570	0.695	0.187
random   	0.115s	69707	0.675	0.716	0.694	0.560	0.691	0.190
PCA-based	0.105s	72686	0.636	0.658	0.647	0.521	0.643	0.142
__________________________________________________________________________________


In [12]:
from sklearn.datasets import make_blobs

In [31]:
# Load every ZSpec record into a DataFrame (`r_star=True` is passed straight
# through to filter_df_from_queryset; its meaning is not shown in this
# notebook), then keep only the columns whose name does not contain "err".
# `drop` returns a new frame instead of mutating in place, so the raw frame
# stays available and re-running this cell is idempotent.
spectra_raw_df = filter_df_from_queryset(ZSpec.objects.all(), r_star=True)
data_df = spectra_raw_df.drop(
    columns=[col for col in spectra_raw_df.columns if "err" in col]
)

In [33]:
# Show `data_df` using the notebook's rich DataFrame display.
data_df

Unnamed: 0,L6,L0B,R0B,L5,L0G,R0G,L4,L0R,R0R,L3,L2,L1,R1,R2,R3,R4,R5,R6
1969,0.101825,0.114425,0.106213,0.133006,0.146212,0.137808,0.177851,0.184683,0.175899,0.198943,0.205083,0.200521,0.200521,0.193958,0.195344,0.197105,0.198091,0.203690
1970,0.081421,0.093563,0.086578,0.113543,0.127098,0.121458,0.164440,0.172168,0.167139,0.191289,0.201357,0.198339,0.198339,0.190085,0.192423,0.194240,0.197107,0.200714
1971,0.083117,0.099813,0.095644,0.128166,0.148107,0.142158,0.204586,0.219787,0.214216,0.248697,0.264427,0.261497,0.261497,0.250427,0.250496,0.250859,0.253128,0.257506
1972,0.104663,0.120340,0.115622,0.144159,0.158442,0.152781,0.191970,0.199688,0.192890,0.214434,0.218219,0.215014,0.215014,0.208452,0.209140,0.209909,0.211098,0.214559
1973,0.083878,0.101010,0.094088,0.131209,0.149491,0.146243,0.215988,0.230655,0.223293,0.264556,0.280933,0.276205,0.276205,0.264085,0.263433,0.265659,0.268818,0.273596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2075,0.062777,0.078958,0.078453,0.109899,0.123608,0.121739,0.198729,0.209632,0.213764,0.254119,0.277061,0.267312,0.267312,0.253756,0.253602,0.251357,0.250076,0.256505
2076,0.115250,0.139887,0.143459,0.193085,0.205303,0.213855,0.306264,0.312830,0.326268,0.368743,0.384962,0.377778,0.377778,0.364152,0.368043,0.363777,0.361397,0.353910
2077,0.084241,0.106351,0.107222,0.147424,0.165380,0.167286,0.266395,0.278910,0.286102,0.337792,0.365774,0.357293,0.357293,0.343548,0.345595,0.343269,0.343131,0.347244
2078,0.079946,0.092427,0.094197,0.124090,0.128941,0.129254,0.183635,0.188116,0.194175,0.220414,0.233717,0.223098,0.223098,0.212652,0.215181,0.211340,0.209617,0.212912


array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1], dtype=int32)

In [57]:
# Cluster the rows of `data_df` into 10 groups. `random_state` pins the
# centroid initialisation so the cluster ids are the same on every re-run
# (without it the labels change between executions).
km = KMeans(n_clusters=10, random_state=0)
metadata_df = model_metadata_df(ZSpec, ["observation"])
# `fit_predict` returns a plain array, so this assignment is positional.
# NOTE(review): assumes `metadata_df` rows are in the same order as `data_df`
# rows -- TODO confirm against model_metadata_df / filter_df_from_queryset.
metadata_df['km'] = km.fit_predict(data_df)
metadata_df[['feature', 'name', 'km']]

Unnamed: 0,feature,name,km
1969,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
1970,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
1971,remnant,Santa Cruz (Eastern Remnant) (1 of 2),9
1972,massive,Santa Cruz (Eastern Remnant) (1 of 2),5
1973,remnant,Santa Cruz (Eastern Remnant) (1 of 2),9
1974,massive,Santa Cruz (Eastern Remnant) (1 of 2),8
1975,wheel track,Santa Cruz (Eastern Remnant) (1 of 2),5
1976,massive,Santa Cruz (Eastern Remnant) (1 of 2),5
1977,massive,Santa Cruz (Eastern Remnant) (1 of 2),2
1978,massive,Santa Cruz (Eastern Remnant) (1 of 2),5


Unnamed: 0,feature,name,km
1969,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
1970,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
1971,remnant,Santa Cruz (Eastern Remnant) (1 of 2),0
1972,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
1973,remnant,Santa Cruz (Eastern Remnant) (1 of 2),1
1974,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
1975,wheel track,Santa Cruz (Eastern Remnant) (1 of 2),0
1976,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
1977,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
1978,massive,Santa Cruz (Eastern Remnant) (1 of 2),0
