In [1]:
import os
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import django
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits

# Move the working directory up one level so the project packages imported
# below resolve.
# NOTE(review): this is relative, so re-running the cell in the same kernel
# moves up one more directory each time; run it once per kernel.
os.chdir('..')


# Point Django at the project settings unless the variable is already set.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "multidex.settings")
# NOTE(review): presumably needed because the notebook kernel runs an event
# loop and Django otherwise refuses synchronous ORM calls there; confirm.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

django.setup()

# These imports are deliberately placed after django.setup().
# NOTE(review): presumably the model modules need the app registry to be
# ready; keep them below the setup call.
from plotter.models import MSpec, ZSpec
from plotter.spectrum_ops import filter_df_from_queryset

from marslab.compat.xcam import DERIVED_CAM_DICT

from multidex_utils import modeldict, model_metadata_df
# Let pandas print up to 200 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 200)

In [2]:
# Toy PCA sanity check on six hand-written 2-D points.
import numpy as np
from sklearn.decomposition import PCA

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components=2).fit(X)
# First line: share of variance explained by each component.
# Second line: the singular values of the fitted components.
print(pca.explained_variance_ratio_, pca.singular_values_, sep="\n")


[0.99244289 0.00755711]
[6.30061232 0.54980396]


In [3]:

# Filter definitions for the ZCAM entry of the camera dictionary; the key
# names are collected into a plain list.
filter_info = DERIVED_CAM_DICT['ZCAM']['filters']
filts = [*filter_info.keys()]

# Put the metadata frame and the filter-value frame side by side in one table.
metadata_df = model_metadata_df(ZSpec)
data_df = filter_df_from_queryset(ZSpec.objects.all(), r_star=True)
corpus = pd.concat((metadata_df, data_df), axis="columns")



In [None]:
filter_df_from_queryset?

In [None]:
# Rebuild the filter table, this time passing scale_to=('L1', 'R1'), and
# re-join it with the metadata frame from the earlier cell.
all_spectra = ZSpec.objects.all()
data_df = filter_df_from_queryset(all_spectra, r_star=True, scale_to=('L1', 'R1'))
del all_spectra
corpus = pd.concat((metadata_df, data_df), axis="columns")

# Keep only rows with a value for every filter, then apply the incidence-angle cut.
# NOTE(review): the -40 threshold is unexplained here; confirm it is intended.
search = corpus.copy().dropna(subset=filts)
search = search[search['incidence_angle'] > -40]

In [7]:
%time metadata_df = model_metadata_df(ZSpec)


CPU times: user 2.61 s, sys: 41 µs, total: 2.61 s
Wall time: 2.66 s


In [16]:
# Query ZSpec through .values(); the timing cell that follows indexes each
# result by field name (spec['l2'], spec['l1']), i.e. rows come back dict-like.
specs = ZSpec.objects.values()

In [17]:
%%time 
# Collect the ('l2', 'l1') values of every row as a list of 2-tuples.
[(record['l2'], record['l1']) for record in specs]

CPU times: user 24.7 ms, sys: 54 µs, total: 24.7 ms
Wall time: 22.8 ms


[(0.24141717, 0.23081711),
 (0.24149962, 0.24018946),
 (0.27857652, 0.2790807),
 (0.23707154, 0.23513882),
 (0.23591913, 0.23239164),
 (0.18949856, 0.18993708),
 (0.21870886, 0.21314985),
 (0.29120457, 0.28858602),
 (0.32466516, 0.32357174),
 (0.197741956, 0.191958293),
 (0.231959507, 0.226896018),
 (0.282623708, 0.284218669),
 (0.325634897, 0.329335123),
 (0.324177086, 0.330428749),
 (0.193949938, 0.190844193),
 (0.079289913, 0.077772684),
 (0.166932732, 0.163194954),
 (0.144594193, 0.139957532),
 (0.217636406, 0.218485326),
 (0.237188622, 0.241927981),
 (0.232274979, 0.238997862),
 (0.24107103, 0.24850592),
 (0.233577043, 0.243746832),
 (0.269571066, 0.280177742),
 (0.237135917, 0.247573659),
 (0.252961904, 0.26055187),
 (0.22560069, 0.23312895),
 (None, None),
 (0.2549088, 0.265953302),
 (0.2376827, 0.219601676),
 (0.196651191, 0.183966592),
 (0.197589129, 0.181061357),
 (0.138202399, 0.132859141),
 (0.352282524, 0.336171865),
 (0.25336355, 0.241466165),
 (0.258846134, 0.245556265),

In [None]:
def local_model_metadata_df(model, relation_names=None):
    """Build a DataFrame holding one metadata row per object of ``model``.

    The original cell was a bare function body (top-level ``return`` and
    undefined ``model`` / ``relation_names``), which cannot run as a cell.
    It is wrapped in a function here, under a name that does not shadow the
    imported ``model_metadata_df``.

    Parameters
    ----------
    model
        Class exposing ``objects.all().prefetch_related(...)``. If it has a
        ``metadata_dict`` attribute, that is called on each object;
        otherwise ``modeldict`` is used.
    relation_names : list of str, optional
        Names forwarded to ``prefetch_related``. ``None`` means no relations.

    Returns
    -------
    pandas.DataFrame
        One row per object, indexed by each object's ``id``.
    """
    try:
        dict_function = model.metadata_dict
    except AttributeError:
        # Fall back to the generic converter when the model has no custom one.
        dict_function = modeldict
    if relation_names is None:
        relation_names = []
    value_list = []
    id_list = []
    for obj in model.objects.all().prefetch_related(*relation_names):
        value_list.append(dict_function(obj))
        id_list.append(obj.id)
    return pd.DataFrame(value_list, index=id_list)


In [None]:
# Fit a whitened PCA on the filter values of every row that has a value for
# each filter in `filts`.
search = corpus.dropna(subset=filts)
means = search[filts]
# Take the matrix straight from the frame: one row per row of `search`, in the
# same order. The previous dict round-trip (means.T.to_dict('list')) was keyed
# by index label, so rows sharing a label silently collapsed into one and
# `transform` no longer lined up with `search`.
vectarray = means.to_numpy(dtype=float)
# n_components='mle' asks PCA to estimate the number of components itself.
pca = PCA(n_components='mle', whiten=True)
transform = pca.fit_transform(vectarray)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)


In [None]:
# `n_components` only echoes the constructor argument (here the string 'mle');
# the number of components actually estimated by the fit is `n_components_`.
pca.n_components_

In [None]:
# Display the array returned by pca.fit_transform(vectarray) in the PCA cell.
transform

In [None]:
def bench_k_means(kmeans, name, data, labels):
    """Fit a scaled KMeans pipeline and print one row of benchmark scores.

    The estimator is chained after a ``StandardScaler`` and the fit is timed.
    A single tab-separated line is then printed with the strategy name, the
    fit time, the inertia, five label-agreement scores and a silhouette score.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance whose initialization is
        already configured.
    name : str
        Label for this strategy, shown in the first column of the row.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        Ground-truth labels, needed by the supervised clustering metrics.
    """
    # Time pipeline construction and fitting together.
    started = time()
    fitted_pipeline = make_pipeline(StandardScaler(), kmeans).fit(data)
    elapsed = time() - started

    # The last pipeline step is the fitted KMeans estimator.
    fitted_kmeans = fitted_pipeline[-1]
    predicted = fitted_kmeans.labels_

    # Scores that compare the true labels with the predicted cluster labels.
    supervised_scores = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )

    row = [name, elapsed, fitted_kmeans.inertia_]
    row.extend(score(labels, predicted) for score in supervised_scores)

    # The silhouette score is computed from the data itself, on a sample of 300.
    row.append(
        metrics.silhouette_score(
            data, predicted, metric="euclidean", sample_size=300
        )
    )

    # name, time, inertia, then six scores with three decimals each.
    template = "\t".join(["{:9s}", "{:.3f}s", "{:.0f}"] + ["{:.3f}"] * 6)
    print(template.format(*row))

In [None]:
# Benchmark three KMeans initialisation strategies on the digits dataset.
data, labels = load_digits(return_X_y=True)
n_samples, n_features = data.shape
n_digits = np.unique(labels).size

print(
    f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
)

# Table header.
print('_' * 82)
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

# Strategy 1: k-means++ seeding, 4 initialisations.
kmeans = KMeans(n_clusters=n_digits, init="k-means++", n_init=4,
                random_state=0)
bench_k_means(kmeans, "k-means++", data, labels)

# Strategy 2: random seeding, 4 initialisations.
kmeans = KMeans(n_clusters=n_digits, init="random", n_init=4, random_state=0)
bench_k_means(kmeans, "random", data, labels)

# Strategy 3: start from the PCA components; an explicit array of starting
# centroids is passed, so a single initialisation is run.
pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(n_clusters=n_digits, init=pca.components_, n_init=1)
bench_k_means(kmeans, "PCA-based", data, labels)

print('_' * 82)

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# Reload the filter table, then discard every column whose name contains "err".
data_df = filter_df_from_queryset(ZSpec.objects.all(), r_star=True)
data_df = data_df.drop(
    columns=[column for column in data_df.columns if "err" in column]
)

In [None]:
# Display the filter table; display.max_rows was set to 200 in the setup cell,
# so up to 200 rows are rendered before pandas truncates the view.
data_df

In [None]:
# Cluster the spectra into 10 groups by their filter values and attach the
# cluster id to the metadata table.
# random_state is fixed so the cluster ids are reproducible between runs,
# matching the seeded estimators in the digits benchmark cell.
km = KMeans(n_clusters=10, random_state=0)
metadata_df = model_metadata_df(ZSpec, ["observation"])

# KMeans refuses input containing NaN, so only rows with a value in every
# column are clustered; the remaining rows get a missing (<NA>) cluster id.
# NOTE(review): as before, this pairs data_df and metadata_df rows by
# position, so both must list the spectra in the same order. Confirm.
complete_rows = data_df.notna().all(axis="columns").to_numpy()
cluster_ids = pd.array([pd.NA] * len(data_df), dtype="Int64")
cluster_ids[complete_rows] = km.fit_predict(data_df.loc[complete_rows])
metadata_df['km'] = cluster_ids
metadata_df[['feature', 'name', 'km']]