In [1]:
import os
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import django
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits

# Step up to the parent directory exactly once per kernel session.
# os.chdir('..') is relative, so re-running this cell in a live kernel would
# otherwise climb one directory higher on every execution.
# NOTE(review): presumably the parent directory is the project root that
# makes "multidex.settings" importable -- confirm.
if "_NOTEBOOK_CHDIR_DONE" not in globals():
    os.chdir('..')
    _NOTEBOOK_CHDIR_DONE = True

# setdefault keeps any DJANGO_SETTINGS_MODULE already exported in the shell.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "multidex.settings")
# Django refuses synchronous ORM calls when it detects a running event loop
# (as in a Jupyter kernel) unless this variable is set.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

django.setup()

from plotter.models import MSpec, ZSpec
from plotter.spectrum_ops import filter_df_from_queryset

from marslab.compat.xcam import DERIVED_CAM_DICT

from multidex_utils import modeldict, model_metadata_df
# Show up to 200 rows before pandas truncates a displayed frame.
pd.options.display.max_rows = 200

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Six hand-written 2-D points used to sanity-check PCA before running it on
# real data: fit a 2-component model and print the explained variance ratio
# followed by the singular values.
X = np.array([(-1, -1), (-2, -1), (-3, -2), (1, 1), (2, 1), (3, 2)])
pca = PCA(n_components=2).fit(X)
print(pca.explained_variance_ratio_, pca.singular_values_, sep="\n")


In [None]:

# Names of the ZCAM filters, taken from the keys of the camera definition.
filter_info = DERIVED_CAM_DICT['ZCAM']['filters']
filts = list(filter_info)

# Metadata and filter values for every ZSpec, placed side by side
# (pd.concat along the column axis aligns the two frames on their index).
metadata_df = model_metadata_df(ZSpec)
data_df = filter_df_from_queryset(ZSpec.objects.all(), r_star=True)
corpus = pd.concat([metadata_df, data_df], axis="columns")



In [None]:
# IPython help query (trailing "?") for filter_df_from_queryset; it defines
# no variables, so later cells do not depend on it.
filter_df_from_queryset?

In [None]:
# Rebuild the filter table with r_star disabled and scale_to='L1_R1', then
# join it with the metadata frame defined earlier.
data_df = filter_df_from_queryset(
    ZSpec.objects.all(),
    scale_to='L1_R1',
    r_star=False,
)
corpus = pd.concat([metadata_df, data_df], axis="columns")

# Keep only rows that have a value in every filter column; dropna returns a
# new frame, so `corpus` itself is left unchanged.
search = corpus.dropna(subset=filts)

In [None]:
# Cosine of the 'incidence_angle' column for every corpus row; np.radians is
# applied first, so the column is being treated as degrees.
np.cos(np.radians(corpus['incidence_angle']))

In [None]:
# Unfiltered queryset over all ZSpec objects, used by the cells that follow.
queryset = ZSpec.objects.all()

In [None]:
# Spot-check: incidence_angle of the first object in the queryset.
queryset[0].incidence_angle

In [None]:
from multidex_utils import qlist

In [None]:
%%time
[
    getattr(spec, 'incidence_angle')
    for spec in queryset
]

In [None]:
# Scratch calculation: 0.23 divided by the cosine of -7.922798 (passed
# through np.radians, so the angle is given in degrees).
# NOTE(review): both constants look like values copied from one spectrum --
# confirm their origin or replace them with a lookup.
.23 / np.cos(np.radians(-7.922798))

In [None]:
# Inspect the L1 and R1 columns next to incidence_angle in the joined corpus.
corpus[["L1", "R1", "incidence_angle"]]

In [None]:
# The same L1 / R1 columns, read from data_df alone (no metadata columns).
data_df[['L1', 'R1']]

In [None]:
# Restrict `search` (rows with no NaN in the filter columns) to just the
# columns named in filts.
means = search[filts]

In [None]:
# Transpose so each original row becomes a column, then build a dict that
# maps every row label to the list of that row's filter values.
vectors = means.T.to_dict('list')

In [None]:
# Stack the per-row value lists into one array, one row per dict entry.
# NOTE(review): likely equivalent to means.to_numpy() when the index labels
# are unique -- confirm before simplifying.
vectarray = np.array(tuple(vectors.values()))

In [None]:
pca = PCA(n_components=3, whiten=True)
%time transform = pca.fit_transform(vectarray)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)


In [None]:
# Display the projected coordinates returned by pca.fit_transform.
transform

In [None]:
def bench_k_means(kmeans, name, data, labels):
    """Fit a scaled KMeans pipeline and print one row of benchmark scores.

    The estimator is wrapped in a ``StandardScaler`` -> ``kmeans`` pipeline,
    fitted on ``data`` and timed. A single tab-separated line is printed with:
    name, fit time, inertia, homogeneity, completeness, V-measure, adjusted
    Rand index, adjusted mutual information and silhouette score.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance whose initialization
        strategy is already configured.
    name : str
        Label for this strategy; printed in the first column of the row.
    data : ndarray of shape (n_samples, n_features)
        The samples to cluster.
    labels : ndarray of shape (n_samples,)
        Reference labels, used by the metrics that compare a clustering
        against known classes.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    started = time()
    pipeline = make_pipeline(StandardScaler(), kmeans).fit(data)
    elapsed = time() - started

    fitted_kmeans = pipeline[-1]
    predicted = fitted_kmeans.labels_

    # Scores that only need the reference labels and the predicted labels.
    label_scorers = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )

    row = [name, elapsed, fitted_kmeans.inertia_]
    row.extend(scorer(labels, predicted) for scorer in label_scorers)

    # The silhouette score also needs the samples themselves; it is computed
    # on `data` as passed in, using a 300-sample subsample.
    row.append(
        metrics.silhouette_score(
            data, predicted, metric="euclidean", sample_size=300
        )
    )

    # One tab-separated line per strategy.
    row_template = (
        "{:9s}\t{:.3f}s\t{:.0f}"
        "\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(row_template.format(*row))

In [None]:
# Load the digits data set and derive its dimensions; the number of distinct
# labels is used as the cluster count for every strategy below.
data, labels = load_digits(return_X_y=True)
n_samples, n_features = data.shape
n_digits = np.unique(labels).size

print(
    f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
)

# Table header.
print('_' * 82)
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

# Strategy 1: k-means++ seeding, best of 4 initializations.
kmeans = KMeans(
    n_clusters=n_digits, init="k-means++", n_init=4, random_state=0
)
bench_k_means(kmeans, "k-means++", data, labels)

# Strategy 2: random seeding, best of 4 initializations.
kmeans = KMeans(n_clusters=n_digits, init="random", n_init=4, random_state=0)
bench_k_means(kmeans, "random", data, labels)

# Strategy 3: principal components as the initial centroids; with an explicit
# array for `init`, a single initialization is requested.
pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(n_clusters=n_digits, init=pca.components_, n_init=1)
bench_k_means(kmeans, "PCA-based", data, labels)

print('_' * 82)

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# Reload the filter table with r_star=True and discard every column whose
# name contains "err". drop() returns a new frame, so the object handed back
# by filter_df_from_queryset is not mutated in place.
data_df = filter_df_from_queryset(ZSpec.objects.all(), r_star=True)
data_df = data_df.drop(
    columns=[col for col in data_df.columns if "err" in col]
)

In [None]:
# Display data_df after the "err" columns have been dropped.
data_df

In [None]:
# Cluster the filter table into 10 groups and attach the cluster id to the
# metadata. random_state pins the centroid initialization so the ids are the
# same on every run, matching the seeded KMeans calls in the digits benchmark.
km = KMeans(n_clusters=10, random_state=0)
metadata_df = model_metadata_df(ZSpec, ["observation"])
# fit_predict returns a plain array, so this assignment is positional.
# NOTE(review): assumes metadata_df and data_df have the same rows in the
# same order -- confirm.
metadata_df['km'] = km.fit_predict(data_df)
metadata_df[['feature', 'name', 'km']]