In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# NOTE(review): this star import is the only import visible in this notebook, so
# it presumably supplies every otherwise-undefined name used below (kraft, pd,
# np, os, make_path_dict) — confirm against the local __init__.py.
from __init__ import *

# Project-wide settings, read from the JSON file one directory above the notebook.
PROJECT_JSON = kraft.read_json("../project.json")

# Path lookup built from the project settings; later cells index it with keys
# such as "w.tsv", "h.tsv" and "infer/".
PATH = make_path_dict(PROJECT_JSON)

In [None]:
# --- Load the new feature-by-sample table -----------------------------------
feature_x_sample_1 = pd.read_csv(
    PROJECT_JSON["new_feature_x_sample_file_path"], sep="\t", index_col=0
)

feature_x_sample_1.index.name = PROJECT_JSON["feature_alias"]

feature_x_sample_1.columns.name = PROJECT_JSON["new_sample_alias"]

# Extra keyword arguments forwarded to kraft.process_feature_x_sample; plotting
# is switched off here.
summarize_feature_x_sample_keyword_arguments = {
    "feature_x_sample_alias": PROJECT_JSON["feature_x_sample_alias"],
    "feature_x_sample_value_name": PROJECT_JSON["feature_x_sample_value_name"],
    "plot_heat_map_max_size": PROJECT_JSON["plot_heat_map_max_size"],
    "plot_histogram_max_size": PROJECT_JSON["plot_histogram_max_size"],
    "plot_rug_max_size": PROJECT_JSON["plot_rug_max_size"],
    "plot": False,
}

# --- Process the new table with the "new_*" project settings ----------------
feature_x_sample_processed_1 = kraft.process_feature_x_sample(
    feature_x_sample_1,
    features_to_drop=PROJECT_JSON["new_features_to_drop"],
    samples_to_drop=PROJECT_JSON["new_samples_to_drop"],
    nanize=PROJECT_JSON["new_nanize"],
    drop_axis=PROJECT_JSON["new_drop_axis"],
    max_na=PROJECT_JSON["new_max_na"],
    min_n_not_na_value=PROJECT_JSON["new_min_n_not_na_value"],
    min_n_not_na_unique_value=PROJECT_JSON["new_min_n_not_na_unique_value"],
    shift_as_necessary_to_achieve_min_before_logging=PROJECT_JSON[
        "new_shift_as_necessary_to_achieve_min_before_logging"
    ],
    log_base=PROJECT_JSON["new_log_base"],
    normalization_axis=PROJECT_JSON["new_normalization_axis"],
    normalization_method=PROJECT_JSON["new_normalization_method"],
    clip_min=PROJECT_JSON["new_clip_min"],
    clip_max=PROJECT_JSON["new_clip_max"],
    **summarize_feature_x_sample_keyword_arguments,
)

# --- Load the outputs of the earlier (training) run; suffix _0 = training ---
feature_x_sample_processed_0 = pd.read_csv(
    PATH["feature_x_sample.processed.tsv"], sep="\t", index_col=0
)

feature_x_sample_processed_0.columns.name = PROJECT_JSON["sample_alias"]

w_0 = pd.read_csv(PATH["w.tsv"], sep="\t", index_col=0)

w_0.columns.name = "Factor"

h_0 = pd.read_csv(PATH["h.tsv"], sep="\t", index_col=0)

h_0.columns.name = PROJECT_JSON["sample_alias"]

gps_map_0 = kraft.read_gps_map(PATH["gps_map.pickle.gz"])

# --- Align the new data to the features of W ---------------------------------
# W rows may carry a "(-) " or "(+) " sign prefix. Remove exactly that prefix
# with an anchored pattern: str.lstrip("(-+) ") strips a *set of characters*, so
# it would also eat the start of feature names that begin with "(", "-", "+",
# ")" or a space.
feature_x_sample_processed_1 = feature_x_sample_processed_1.reindex(
    index=pd.Index(
        w_0.index.str.replace(r"^\([-+]\) ", "", regex=True).unique(),
        name=w_0.index.name,
    )
)

# Features of W that are absent from the new data are all-NaN rows after reindex.
print(
    "{:.2%} feature are missing.".format(
        feature_x_sample_processed_1.isna().all(axis=1).sum()
        / feature_x_sample_processed_1.shape[0]
    )
)

# --- Output directory for everything this notebook writes -------------------
output_directory_path = os.path.join(
    PATH["infer/"], PROJECT_JSON["new_feature_x_sample_alias"]
)

kraft.establish_path(output_directory_path, "directory")

In [None]:
if PROJECT_JSON["signal_type"] == "raw":

    def make_raw_signal(series, signal_normalization_method, using):
        """Turn one row/column of the new data into a signal.

        Parameters
        ----------
        series : pandas.Series
            One slice of ``feature_x_sample_processed_1`` handed over by
            ``DataFrame.apply``.
        signal_normalization_method : str
            Only ``"0-1"`` (min-max scaling) is implemented.
        using : None or str
            ``None`` returns ``series`` untouched. ``"training"`` takes the
            scaling range from the row of ``feature_x_sample_processed_0``
            labelled ``series.name``; ``"testing"`` takes it from ``series``.

        Returns
        -------
        pandas.Series
            The (optionally) scaled series; an all-NaN float series when the
            reference vector has no good value.

        Raises
        ------
        ValueError
            If ``using`` or ``signal_normalization_method`` is not recognized.
        """

        if using is None:

            return series

        elif using == "training":

            vector = feature_x_sample_processed_0.loc[series.name].values

        elif using == "testing":

            vector = series.values

        else:

            # A bare `raise` outside an `except` block only produces an
            # unrelated RuntimeError; report the offending value instead.
            raise ValueError(
                "using must be None, 'training' or 'testing'; got {!r}.".format(using)
            )

        vector_good = vector[~kraft.check_array_for_bad(vector, raise_for_bad=False)]

        if vector_good.size == 0:

            # Explicit dtype keeps this NaN-filled result float on every pandas
            # version.
            return pd.Series(index=series.index, name=series.name, dtype=float)

        if signal_normalization_method == "0-1":

            min_ = vector_good.min()

            max_ = vector_good.max()

            # NOTE: a constant reference vector (max_ == min_) divides by zero.
            return (series - min_) / (max_ - min_)

        # Previously an unknown method fell through and returned None silently.
        raise ValueError(
            "Unknown signal_normalization_method: {!r}.".format(
                signal_normalization_method
            )
        )

    # NOTE(review): using=None makes make_raw_signal return every series
    # unchanged, so no normalization is applied here — confirm this is intended.
    feature_x_sample_signal_1 = feature_x_sample_processed_1.apply(
        make_raw_signal,
        axis=PROJECT_JSON["signal_normalization_axis"],
        signal_normalization_method=PROJECT_JSON["signal_normalization_method"],
        using=None,
    )

In [None]:
if PROJECT_JSON["signal_type"] == "context":

    # Per feature: absolute difference between its median in the new data and
    # its median in the training data. kraft.select_series_indices then picks
    # features from that series using ">" and the configured fraction
    # (presumably the most discordant ones — confirm against kraft).
    inconsistent_features = kraft.select_series_indices(
        feature_x_sample_processed_1.apply(
            lambda feature_values: abs(
                feature_values.median()
                - feature_x_sample_processed_0.loc[feature_values.name].median()
            ),
            axis=1,
        ),
        ">",
        fraction=PROJECT_JSON["new_inconsistent_feature_fraction_to_drop"],
        title={"text": PROJECT_JSON["feature_alias"]},
        yaxis={"title": "Median Difference"},
    ).tolist()

    # W rows are labelled "(-) feature" / "(+) feature"; add both signed labels
    # for every inconsistent feature so they can be dropped from W.
    inconsistent_features += [
        template.format(inconsistent_feature)
        for inconsistent_feature in inconsistent_features
        for template in ("(-) {}", "(+) {}")
    ]

    # Index.intersection replaces the `index & list` set operator, which was
    # deprecated in pandas 1.1 and no longer performs a set intersection in 2.0.
    w_0 = w_0.drop(w_0.index.intersection(inconsistent_features))

    # Float dtype (instead of the default object dtype of an empty frame) so
    # that the filled matrix handed to the solver is numeric.
    feature_x_sample_signal_1 = pd.DataFrame(
        index=w_0.index, columns=feature_x_sample_processed_1.columns, dtype=float
    )

    feature_x_fit_parameter_0 = pd.read_csv(
        PATH["feature_x_fit_parameter.tsv"], sep="\t", index_col=0
    )

    n = w_0.shape[0]

    # At least 1: with fewer than 10 rows `n // 10` is 0 and `i % 0` would raise
    # ZeroDivisionError.
    n_per_print = max(1, n // 10)

    for i, sign_feature in enumerate(w_0.index):

        if i % n_per_print == 0:

            print("{}/{}...".format(i + 1, n))

        # maxsplit=1 keeps feature names that themselves contain whitespace.
        sign, feature = sign_feature.split(maxsplit=1)

        n_data_0, location_0, scale_0, degree_of_freedom_0, shape_0 = feature_x_fit_parameter_0.loc[
            feature
        ]

        # Context of the training values, evaluated with the stored fit
        # parameters; the result exposes a "grid" and the "context" on it.
        context_dict_0 = kraft.compute_vector_context(
            feature_x_sample_processed_0.loc[feature].values,
            n_data=n_data_0,
            location=location_0,
            scale=scale_0,
            degree_of_freedom=degree_of_freedom_0,
            shape=shape_0,
        )

        grid_0 = context_dict_0["grid"]

        context_indices_0 = context_dict_0["context"]

        if sign == "(-)":

            # Negative part of the context, flipped to be non-negative.
            signals_0 = -context_indices_0.clip(max=0)

        elif sign == "(+)":

            # Positive part of the context.
            signals_0 = context_indices_0.clip(min=0)

        else:

            # Previously an unexpected sign silently reused the signals of the
            # preceding row (or raised NameError on the first row).
            raise ValueError(
                "Unexpected sign {!r} in W row label {!r}.".format(sign, sign_feature)
            )

        # Normalize both signs the same way. Before, this call sat inside the
        # "(+)" branch only, leaving "(-)" signals on a different scale.
        # NOTE(review): confirm this matches how the training signal was built.
        signals_0 = kraft.normalize_array(
            signals_0,
            None,
            PROJECT_JSON["signal_normalization_method"],
            raise_for_bad=False,
        )

        values_1 = feature_x_sample_processed_1.loc[feature].values

        is_good_1 = ~kraft.check_array_for_bad(values_1, raise_for_bad=False)

        # For every good new value, take the signal at the nearest grid point.
        feature_x_sample_signal_1.iloc[i, is_good_1] = signals_0[
            [np.absolute(value_1 - grid_0).argmin() for value_1 in values_1[is_good_1]]
        ]

    # Spot-check a few randomly chosen rows. The size is capped at the number of
    # rows because sampling without replacement cannot exceed the population.
    # NOTE: the draw is unseeded, so different rows are plotted on each run.
    n_to_plot = min(8, feature_x_sample_signal_1.shape[0])

    for sign_feature in np.random.choice(
        feature_x_sample_signal_1.index, size=n_to_plot, replace=False
    ):

        feature = sign_feature.split(maxsplit=1)[1]

        n_data_0, location_0, scale_0, degree_of_freedom_0, shape_0 = feature_x_fit_parameter_0.loc[
            feature
        ]

        # Context of the feature in the training data.
        kraft.plot_context(
            feature_x_sample_processed_0.loc[feature],
            n_data=n_data_0,
            location=location_0,
            scale=scale_0,
            degree_of_freedom=degree_of_freedom_0,
            shape=shape_0,
            title=sign_feature,
        )

        values_1 = feature_x_sample_processed_1.loc[feature].sort_values()

        # New values (x) against the signal assigned to them (y).
        kraft.plot_plotly_figure(
            {
                "layout": {
                    "title": {"text": "{} Context in New Data".format(sign_feature)}
                },
                "data": [
                    {
                        "type": "scatter",
                        "x": values_1,
                        "y": feature_x_sample_signal_1.loc[
                            sign_feature, values_1.index
                        ],
                        "text": values_1.index,
                        "mode": "markers",
                    }
                ],
            },
            None,
        )

In [None]:
# Plot title shared by the H-matrix figure and the GPS map: training data alias,
# new data alias, and the number of columns in the new signal matrix.
# NOTE: feature_x_sample_signal_1 only exists when signal_type is "raw" or
# "context" (it is assigned only inside those two cells).
title = "{}<br>Infers<br>{} (n={})".format(
    *(
        PROJECT_JSON[key]
        for key in ("feature_x_sample_alias", "new_feature_x_sample_alias")
    ),
    feature_x_sample_signal_1.shape[1],
)

In [None]:
# Solve for the H matrix of the new data given the trained W; missing signals
# are treated as 0.
h_1 = kraft.solve_for_h(feature_x_sample_signal_1.fillna(0), w_0)

h_1_file_path = os.path.join(output_directory_path, "h.tsv")

h_1.to_csv(h_1_file_path, sep="\t")

# Fewer than 16 columns: bubble map; otherwise: heat map.
function = kraft.plot_bubble_map if h_1.shape[1] < 16 else kraft.plot_heat_map

# Normalized copy used only for plotting; the saved h.tsv keeps the raw values.
dataframe = kraft.normalize_dataframe(h_1, 0, "-0-")

# Reorder rows, then columns, by clustering, but only when the respective axis
# is smaller than the configured limit.
if dataframe.shape[0] < PROJECT_JSON["plot_cluster_max_size"]:

    dataframe = dataframe.iloc[kraft.cluster_matrix(dataframe.values, 0)]

if dataframe.shape[1] < PROJECT_JSON["plot_cluster_max_size"]:

    dataframe = dataframe.iloc[:, kraft.cluster_matrix(dataframe.values, 1)]

function(
    dataframe,
    title_text=title,
    xaxis_title_text=h_1.columns.name,
    yaxis_title_text=h_1.index.name,
    # Swap only the file extension. str.replace(".tsv", ".html") would also
    # rewrite any ".tsv" occurring in a directory name of the path.
    html_file_path="{}.html".format(os.path.splitext(h_1_file_path)[0]),
)

In [None]:
# Positional arguments shared by every gps_map_0.predict call below: the string
# "h" followed by the H matrix solved for the new data.
predict_arguments = ("h", h_1)

# Keyword arguments shared by every gps_map_0.predict call below, taken from the
# project settings.
predict_keyword_arguments = dict(
    n_pull=PROJECT_JSON["gps_map_h_n_pull"],
    pull_power=PROJECT_JSON["gps_map_h_pull_power"],
    element_marker_size=PROJECT_JSON["new_gps_map_h_element_marker_size"],
)

In [None]:
# Run the trained GPS map's predict with the new H matrix and write the figure
# to gps_map.html in the output directory.
gps_map_html_file_path = os.path.join(output_directory_path, "gps_map.html")

gps_map_0.predict(
    *predict_arguments,
    title=title,
    html_file_path=gps_map_html_file_path,
    **predict_keyword_arguments,
)

In [None]:
# Keep only the sample-data entries that declare "indices_to_peek".
new_sample_data_dicts = dict(
    (data_name, data_dict)
    for data_name, data_dict in PROJECT_JSON["new_sample_data"].items()
    if "indices_to_peek" in data_dict
)

# Read each entry's table and store it on the entry itself. The entries are the
# same dict objects held by PROJECT_JSON, so PROJECT_JSON gains a "dataframe"
# key per entry as a side effect.
for data_name, data_dict in new_sample_data_dicts.items():

    print(data_name)

    data_dict.update(
        dataframe=pd.read_csv(data_dict["file_path"], sep="\t", index_col=0)
    )

In [None]:
# For every selected row of every sample-data table, run the GPS map's predict
# with that row passed as an annotation, writing one HTML file per row.
for data_name, data_dict in new_sample_data_dicts.items():

    dataframe_to_peek = data_dict["dataframe"]

    indices_to_peek = data_dict["indices_to_peek"]

    # The string "all" means every row of the table.
    if indices_to_peek == "all":

        indices_to_peek = dataframe_to_peek.index

    data_type = data_dict["type"]

    # The annotation arguments are 1-tuples (a single annotation is passed per
    # call) or None.
    if PROJECT_JSON["plot_std"] is None:

        annotation_std_maxs = None

    else:

        annotation_std_maxs = (PROJECT_JSON["plot_std"],)

    if data_type == "categorical":

        annotation_colorscales = ("Portland",)

    else:

        annotation_colorscales = None

    # Index.intersection replaces the `index & indices` set operator, which was
    # deprecated in pandas 1.1 and no longer performs a set intersection in 2.0.
    # Requested rows that are absent from the table are skipped.
    for index, element_value in dataframe_to_peek.loc[
        dataframe_to_peek.index.intersection(indices_to_peek)
    ].iterrows():

        gps_map_file_name = kraft.normalize_file_name(
            "gps_map.{}.{}.html".format(data_name, index)
        )

        print(gps_map_file_name)

        gps_map_0.predict(
            *predict_arguments,
            # One-row frame holding this row's values.
            annotation_x_element=element_value.to_frame().T,
            annotation_types=(data_type,),
            annotation_std_maxs=annotation_std_maxs,
            annotation_colorscales=annotation_colorscales,
            title="({}) {}".format(data_name, index),
            # Reuse the name printed above instead of computing it a second time.
            html_file_path=os.path.join(output_directory_path, gps_map_file_name),
            **predict_keyword_arguments,
        )