In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# NOTE(review): no other import is visible in this notebook, so the names used
# below and in later cells (kraft, pd, os, make_path_dict) presumably come from
# this star import — confirm against __init__.py.
from __init__ import *

# Project settings, read from "../project.json" (resolved relative to the
# current working directory). Later cells look up their parameters here.
PROJECT_JSON = kraft.read_json("../project.json")

# Path lookup built from the project settings; later cells index it by file or
# directory key (for example PATH["w.tsv"]).
PATH = make_path_dict(PROJECT_JSON)

In [None]:
# Read the W and H matrices (in that order) from their tab-separated files,
# taking the first column of each file as the row index.
w, h = (
    pd.read_csv(PATH[file_key], sep="\t", index_col=0)
    for file_key in ("w.tsv", "h.tsv")
)

# W's columns share their axis name with H's index.
w.columns.name = h.index.name

# H's columns are named with the project's sample alias.
h.columns.name = PROJECT_JSON["sample_alias"]

In [None]:
# Optionally remove high-entropy elements from W (rows) and H (columns).
#
# Each tuple is: (alias kind, matrix name, frame, axis along which entropy is
# computed, axis from which labels are dropped).
#
# NOTE: the frames are modified in place, so re-running this cell without
# re-running the loading cell drops a further fraction each time.
for feature_type, w_or_h, dataframe, apply_axis, drop_axis in (
    ("feature", "w", w, 1, 0),
    ("sample", "h", h, 0, 1),
):
    fraction_to_drop = PROJECT_JSON[
        "gps_map_caotic_{}_element_fraction_to_drop".format(w_or_h)
    ]

    # A null setting disables dropping for this matrix.
    if fraction_to_drop is None:
        continue

    matrix_name = w_or_h.title()
    shape_before = dataframe.shape

    # One entropy value per element (per row of W, per column of H).
    entropies = dataframe.apply(kraft.compute_vector_entropy, axis=apply_axis)

    alias = PROJECT_JSON["{}_alias".format(feature_type)]
    selection_layout = {
        "title": {"text": "Dropping {}".format(alias)},
        "yaxis": {"title": "Entropy in {}".format(matrix_name)},
    }

    # NOTE(review): presumably returns the labels in the top `fraction` by
    # entropy and plots the selection using `layout` — confirm in kraft.
    labels_to_drop = kraft.select_series_indices(
        entropies,
        ">",
        fraction=fraction_to_drop,
        layout=selection_layout,
    )

    dataframe.drop(labels_to_drop, axis=drop_axis, inplace=True)

    # Report how the shape changed.
    print(
        "{} shape: {} ==> {}.".format(matrix_name, shape_before, dataframe.shape)
    )

In [None]:
# Pull parameters for the W and H sides of the map, taken from the project
# settings.
gps_map_pull_settings = {
    "w_n_pull": PROJECT_JSON["gps_map_w_n_pull"],
    "w_pull_power": PROJECT_JSON["gps_map_w_pull_power"],
    "h_n_pull": PROJECT_JSON["gps_map_h_n_pull"],
    "h_pull_power": PROJECT_JSON["gps_map_h_pull_power"],
}

# Build the map without plotting; W is passed transposed so that its index is
# the axis it shares with H's index. Plots are produced in a later cell.
gps_map = kraft.GPSMap(w=w.T, h=h, plot=False, **gps_map_pull_settings)

In [None]:
# Plot each side (W, H) of the map. Where a cluster-by-element file exists for
# that side, label the elements by cluster and plot again into the side's
# "hcc" directory.
for element_type, w_or_h, elements in (
    ("feature", "w", gps_map.w_elements),
    ("sample", "h", gps_map.h_elements),
):
    # Skip a side whose element list is None.
    if elements is None:
        continue

    marker_size = PROJECT_JSON["gps_map_{}_element_marker_size".format(w_or_h)]

    # Unlabeled plot.
    plain_html_file_path = os.path.join(
        PATH["{}|gps_map/".format(w_or_h)], "gps_map.html"
    )
    gps_map.plot(
        w_or_h,
        element_marker_size=marker_size,
        html_file_path=plain_html_file_path,
    )

    # Cluster labels are optional: without the file, only the plot above is made.
    cluster_file_path = PATH["{}|cluster_x_element.tsv".format(w_or_h)]
    if not os.path.isfile(cluster_file_path):
        continue

    # Assign each element (column) the row label at which its value is largest.
    cluster_x_element = pd.read_csv(cluster_file_path, sep="\t", index_col=0)
    element_to_cluster = cluster_x_element.apply(
        lambda memberships: memberships.idxmax()
    )

    # Renumber clusters 0, 1, 2, ... by decreasing size; ties keep the order in
    # which value_counts lists them (rank method "first").
    cluster_sizes = element_to_cluster.value_counts()
    size_ranks = cluster_sizes.rank(method="first", ascending=False).astype(int)
    cluster_to_label = (size_ranks - 1).to_dict()

    bandwidth_factor = PROJECT_JSON["gps_map_{}_bandwidth_factor".format(w_or_h)]
    element_labels = element_to_cluster.map(cluster_to_label)
    gps_map.set_element_label(
        w_or_h, element_labels, bandwidth_factor=bandwidth_factor
    )

    # Labeled plot.
    labeled_html_file_path = os.path.join(
        PATH["{}|hcc|gps_map/".format(w_or_h)], "gps_map.html"
    )
    gps_map.plot(
        w_or_h,
        element_marker_size=marker_size,
        html_file_path=labeled_html_file_path,
    )

In [None]:
# Save the map object, in its current state, to the project's registered path.
# NOTE(review): the ".pickle.gz" key suggests a gzipped pickle — confirm in kraft.
gps_map_file_path = PATH["gps_map.pickle.gz"]
kraft.write_gps_map(gps_map, gps_map_file_path)

In [None]:
# Highlight the features / samples named in the project settings
# ("features_to_peek", "samples_to_peek") on each side of the map, writing
# "gps_map.peek.html" into that side's "hcc" map directory.
for element_type, w_or_h, elements in (
    ("feature", "w", gps_map.w_elements),
    ("sample", "h", gps_map.h_elements),
):

    # A side of the map may have no elements (None); the plotting cell skips
    # such a side, and iterating None below would raise TypeError.
    if elements is None:

        continue

    elements_to_peek = PROJECT_JSON["{}s_to_peek".format(element_type)]

    # A null or empty setting means there is nothing to highlight. (An empty
    # list was already skipped by the any() check below; a null value
    # previously raised TypeError on the membership test.)
    if not elements_to_peek:

        continue

    # Boolean flag per map element: True where the element is to be peeked.
    element_value = pd.Series(
        [element in elements_to_peek for element in elements], index=elements
    )

    # None of the requested elements is on this side of the map.
    if not element_value.any():

        continue

    element_marker_size = PROJECT_JSON["gps_map_{}_element_marker_size".format(w_or_h)]

    gps_map.plot(
        w_or_h,
        element_value=element_value,
        element_value_data_type="binary",
        element_marker_size=element_marker_size,
        element_value_binary_annotation={"font": {"size": 16}},
        html_file_path=os.path.join(
            PATH["{}|hcc|gps_map/".format(w_or_h)], "gps_map.peek.html"
        ),
    )