In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
from environment import *

# Notebook-wide configuration, read from setting.json; the cells below look
# values up by key.
SETTING = kraft.read_json("setting.json")

# Mapping from file/directory labels (e.g. "w.tsv", "h.tsv", "infer/") to
# paths, derived from SETTING by the project helper make_path_dict.
PATH = make_path_dict(SETTING)

In [None]:
# Read the new feature-by-sample table (tab-separated, first column is the row
# label) and name both axes from the settings in one chained step.
feature_x_sample_1 = pd.read_csv(
    SETTING["new_feature_x_sample_file_path"], sep="\t", index_col=0
).rename_axis(index=SETTING["feature_alias"], columns=SETTING["new_sample_alias"])

# Labelling / plotting options handed to kraft.process_feature_x_sample as
# extra keyword arguments; plotting is switched on.
summarize_feature_x_sample_keyword_arguments = dict(
    feature_x_sample_alias=SETTING["feature_x_sample_alias"],
    feature_x_sample_value_name=SETTING["feature_x_sample_value_name"],
    plot_heat_map_max_size=SETTING["plot_heat_map_max_size"],
    plot_histogram_max_size=SETTING["plot_histogram_max_size"],
    plot_rug_max_size=SETTING["plot_rug_max_size"],
    plot=True,
)

# Processing options for the new data set; every value comes from the
# "new_"-prefixed entry of SETTING.
new_process_feature_x_sample_keyword_arguments = dict(
    features_to_drop=SETTING["new_features_to_drop"],
    samples_to_drop=SETTING["new_samples_to_drop"],
    nanize=SETTING["new_nanize"],
    drop_axis=SETTING["new_drop_axis"],
    max_na=SETTING["new_max_na"],
    min_n_not_na_value=SETTING["new_min_n_not_na_value"],
    min_n_not_na_unique_value=SETTING["new_min_n_not_na_unique_value"],
    shift_as_necessary_to_achieve_min_before_logging=SETTING[
        "new_shift_as_necessary_to_achieve_min_before_logging"
    ],
    log_base=SETTING["new_log_base"],
    normalization_axis=SETTING["new_normalization_axis"],
    normalization_method=SETTING["new_normalization_method"],
    clip_min=SETTING["new_clip_min"],
    clip_max=SETTING["new_clip_max"],
)

feature_x_sample_processed_1 = kraft.process_feature_x_sample(
    feature_x_sample_1,
    **new_process_feature_x_sample_keyword_arguments,
    **summarize_feature_x_sample_keyword_arguments,
)

def _read_tsv(file_path):
    """Read a tab-separated table whose first column holds the row labels."""

    return pd.read_csv(file_path, sep="\t", index_col=0)


# Previously written tables: the processed feature-by-sample matrix, W and H.
# Only the column axis is (re)named here; the row axis keeps the name stored
# in each file.
feature_x_sample_processed_0 = _read_tsv(
    PATH["feature_x_sample.processed.tsv"]
).rename_axis(columns=SETTING["sample_alias"])

w_0 = _read_tsv(PATH["w.tsv"]).rename_axis(columns="Factor")

h_0 = _read_tsv(PATH["h.tsv"]).rename_axis(columns=SETTING["sample_alias"])

# GPS map object saved earlier; used further down to project the new samples.
gps_map_0 = kraft.read_gps_map(PATH["gps_map.pickle.gz"])

# Row labels of w_0 may carry a "(-) " or "(+) " sign prefix (the context cell
# below builds such labels). Remove exactly that prefix with an anchored
# pattern: str.lstrip("(-+) ") strips a *set of characters* and would also eat
# a leading "-", "+", "(", ")" or space that belongs to the feature name.
signless_features = w_0.index.str.replace(r"^\([-+]\) ", "", regex=True).unique()

# Align the new data to the features W was built on; features absent from the
# new data become all-NaN rows.
feature_x_sample_processed_1 = feature_x_sample_processed_1.reindex(
    index=pd.Index(signless_features, name=w_0.index.name)
)

# Fraction of rows with no value in any new sample.
fraction_missing = feature_x_sample_processed_1.isna().all(axis=1).mean()

print(f"{fraction_missing:.2%} feature are missing.")

# Everything this notebook writes goes under <infer directory>/<alias of the
# new data set>.
output_directory_path = os.path.join(
    PATH["infer/"], SETTING["new_feature_x_sample_alias"]
)

# presumably creates the directory when it does not exist — TODO confirm
# against kraft.establish_path
kraft.establish_path(output_directory_path, "directory")

In [None]:
if SETTING["signal_type"] == "raw":

    def make_raw_signal(series, signal_normalization_method, using):
        """
        Turn one vector of processed values into a signal vector.

        Arguments:
            series (pandas.Series): vector handed over by DataFrame.apply; its
                name is used to look up the training vector when
                using == "training".
            signal_normalization_method (str): only "0-1" (min-max scaling) is
                implemented.
            using (None | str): where the scaling reference comes from.
                None returns series untouched; "training" uses
                feature_x_sample_processed_0.loc[series.name]; "testing" uses
                series itself.

        Returns:
            pandas.Series: series itself, an all-NaN float vector when the
                reference has no usable value, or the scaled series.

        Raises:
            ValueError: on an unknown using or an unsupported
                signal_normalization_method.
        """

        if using is None:

            return series

        if using == "training":

            _1d_array = feature_x_sample_processed_0.loc[series.name].values

        elif using == "testing":

            _1d_array = series.values

        else:

            raise ValueError("using can be None, training, or testing.")

        _1d_array_good = _1d_array[
            ~kraft.check_nd_array_for_bad(_1d_array, raise_for_bad=False)
        ]

        if _1d_array_good.size == 0:

            # Nothing to scale against. An explicit float dtype keeps this NaN
            # vector numeric on every pandas version.
            return pd.Series(index=series.index, name=series.name, dtype=float)

        if signal_normalization_method == "0-1":

            min_ = _1d_array_good.min()

            max_ = _1d_array_good.max()

            # A constant reference gives max_ == min_ and therefore NaN/inf
            # here; this is left as is.
            return (series - min_) / (max_ - min_)

        # Previously an unsupported method fell through and returned None,
        # which DataFrame.apply would propagate silently.
        raise ValueError(
            f"signal_normalization_method {signal_normalization_method!r} is not supported; use '0-1'."
        )

    # using=None, so the processed values are passed through unchanged.
    feature_x_sample_signal_1 = feature_x_sample_processed_1.apply(
        make_raw_signal,
        axis=SETTING["signal_normalization_axis"],
        signal_normalization_method=SETTING["signal_normalization_method"],
        using=None,
    )

In [None]:
if SETTING["signal_type"] == "context":

    # Score each feature by how far its median in the new data sits from its
    # median in the training data, then let kraft choose the features to
    # discard (presumably the top `fraction` by score — confirm against
    # kraft.select_series_indices).
    median_differences = feature_x_sample_processed_1.apply(
        lambda feature_values: abs(
            feature_values.median()
            - feature_x_sample_processed_0.loc[feature_values.name].median()
        ),
        axis=1,
    )

    inconsistent_features = kraft.select_series_indices(
        median_differences,
        ">",
        fraction=SETTING["new_inconsistent_feature_fraction_to_drop"],
        title={"text": SETTING["feature_alias"]},
        xaxis={"title": "Rank"},
        yaxis={"title": "Median Difference"},
    ).tolist()

    # Rows of w_0 are labelled "<sign> <feature>", so add both signed variants
    # of every inconsistent feature to the list.
    inconsistent_features += [
        template.format(inconsistent_feature)
        for inconsistent_feature in inconsistent_features
        for template in ("(-) {}", "(+) {}")
    ]

    # Index.intersection replaces the deprecated `index & list` set operation
    # (elementwise in pandas 2). Rebinding instead of inplace=True leaves the
    # frame loaded above untouched.
    w_0 = w_0.drop(index=w_0.index.intersection(inconsistent_features))

    # Float dtype keeps the signal matrix numeric; cells never assigned below
    # stay NaN.
    feature_x_sample_signal_1 = pd.DataFrame(
        index=w_0.index, columns=feature_x_sample_processed_1.columns, dtype=float
    )

    feature_x_fit_parameter_0 = pd.read_csv(
        PATH["feature_x_fit_parameter.tsv"], sep="\t", index_col=0
    )

    n = w_0.shape[0]

    # At least 1, otherwise `i % n_per_print` divides by zero when n < 10.
    n_per_print = max(1, n // 10)

    for i, sign_feature in enumerate(w_0.index):

        if i % n_per_print == 0:

            print(f"{i + 1}/{n} ...")

        # Split on the first whitespace only so feature names containing
        # spaces survive.
        sign, feature = sign_feature.split(maxsplit=1)

        (
            n_data_0,
            location_0,
            scale_0,
            degree_of_freedom_0,
            shape_0,
        ) = feature_x_fit_parameter_0.loc[feature]

        # Context of the training values, evaluated on a grid, using the
        # stored fit parameters.
        context_dict_0 = kraft.compute_1d_array_context(
            feature_x_sample_processed_0.loc[feature].values,
            n_data=n_data_0,
            location=location_0,
            scale=scale_0,
            degree_of_freedom=degree_of_freedom_0,
            shape=shape_0,
        )

        grid_0 = context_dict_0["grid"]

        context_indices_0 = context_dict_0["context"]

        if sign == "(-)":

            # Keep only the negative side, flipped to be non-negative.
            signals_0 = -context_indices_0.clip(max=0)

        elif sign == "(+)":

            signals_0 = context_indices_0.clip(min=0)

            # NOTE(review): normalization is applied only in this "(+)"
            # branch; "(-)" signals are used unnormalized. This looks like an
            # indentation slip, but it is kept as is because changing it
            # alters the inferred H — confirm against how the training signal
            # was built.
            signals_0 = kraft.normalize_nd_array(
                signals_0,
                None,
                SETTING["signal_normalization_method"],
                raise_for_bad=False,
            )

        else:

            # Without this, an unexpected label silently reused signals_0 from
            # the previous iteration.
            raise ValueError(
                f"Unexpected sign {sign!r} in {sign_feature!r}; expected (-) or (+)."
            )

        values_1 = feature_x_sample_processed_1.loc[feature].values

        is_good_1 = ~kraft.check_nd_array_for_bad(values_1, raise_for_bad=False)

        # For every usable new value, take the signal at the nearest grid
        # point of the training context.
        nearest_grid_indices = [
            np.absolute(value_1 - grid_0).argmin() for value_1 in values_1[is_good_1]
        ]

        feature_x_sample_signal_1.iloc[i, is_good_1] = signals_0[nearest_grid_indices]

    # Spot-check up to 8 random rows; the cap avoids the ValueError that
    # np.random.choice raises when asked for more rows than exist without
    # replacement.
    for sign_feature in np.random.choice(
        feature_x_sample_signal_1.index,
        size=min(8, feature_x_sample_signal_1.shape[0]),
        replace=False,
    ):

        feature = sign_feature.split(maxsplit=1)[1]

        (
            n_data_0,
            location_0,
            scale_0,
            degree_of_freedom_0,
            shape_0,
        ) = feature_x_fit_parameter_0.loc[feature]

        kraft.plot_context(
            feature_x_sample_processed_0.loc[feature],
            n_data=n_data_0,
            location=location_0,
            scale=scale_0,
            degree_of_freedom=degree_of_freedom_0,
            shape=shape_0,
            title=sign_feature,
        )

        values_1 = feature_x_sample_processed_1.loc[feature].sort_values()

        # New values (x) against the signal assigned to them (y).
        kraft.plot_and_save(
            {
                "layout": {"title": {"text": f"{sign_feature} Context in New Data"}},
                "data": [
                    {
                        "type": "scatter",
                        "x": values_1,
                        "y": feature_x_sample_signal_1.loc[
                            sign_feature, values_1.index
                        ],
                        "text": values_1.index,
                        "mode": "markers",
                    }
                ],
            },
            None,
        )

In [None]:
# Shared plot title: training data alias, new data alias and the number of
# new samples (columns of the signal matrix).
title = "{}<br>Infers<br>{} (n={})".format(
    SETTING["feature_x_sample_alias"],
    SETTING["new_feature_x_sample_alias"],
    feature_x_sample_signal_1.shape[1],
)

In [None]:
# Solve for the H of the new samples given the fixed W; missing signals are
# treated as 0.
h_1 = kraft.solve_for_H(feature_x_sample_signal_1.fillna(0), w_0)

h_1_file_path = os.path.join(output_directory_path, "h.tsv")

h_1.to_csv(h_1_file_path, sep="\t")

# Fewer than 16 columns are drawn as a bubble map, otherwise as a heat map.
function = kraft.plot_bubble_map if h_1.shape[1] < 16 else kraft.plot_heat_map

dataframe = kraft.normalize_series_or_dataframe(h_1, 0, "-0-")

# Reorder rows, then columns, by clustering, but only while the axis is below
# the configured size limit.
if dataframe.shape[0] < SETTING["plot_cluster_max_size"]:

    dataframe = dataframe.iloc[kraft.cluster_2d_array(dataframe.values, 0)]

if dataframe.shape[1] < SETTING["plot_cluster_max_size"]:

    dataframe = dataframe.iloc[:, kraft.cluster_2d_array(dataframe.values, 1)]

function(
    dataframe,
    title=title,
    xaxis_title=h_1.columns.name,
    yaxis_title=h_1.index.name,
    # Swap only the file extension. str.replace(".tsv", ".html") would also
    # rewrite a ".tsv" inside the output directory name (it contains the
    # data set alias) and point at a directory that does not exist.
    html_file_path=os.path.splitext(h_1_file_path)[0] + ".html",
)

In [None]:
# Positional and keyword arguments reused by every gps_map_0.predict call in
# the cells below.
predict_arguments = "h", h_1

predict_keyword_arguments = dict(
    n_pull=SETTING["gps_map_h_n_pull"],
    pull_power=SETTING["gps_map_h_pull_power"],
    element_marker_size=SETTING["gps_map_h_element_marker_size"],
)

In [None]:
# Place the new samples on the existing GPS map and write the figure next to
# the other inference outputs.
gps_map_html_file_path = os.path.join(output_directory_path, "gps_map.html")

gps_map_0.predict(
    *predict_arguments,
    title=title,
    html_file_path=gps_map_html_file_path,
    **predict_keyword_arguments,
)

In [None]:
# NOTE(review): hard-coded, user-specific location; consider moving it into
# setting.json.
annotation_x_sample_file_path = "~/garden/data/densely_interconnected_transcriptional_circuits_control_cell_states_in_human_hematopoiesis/binary_information_x_sample.tsv"

annotation_x_sample = pd.read_csv(
    annotation_x_sample_file_path, sep="\t", index_col=0
)

# One GPS map per annotation row, each written to its own HTML file.
for annotation_name, annotation_values in annotation_x_sample.iterrows():

    annotation_html_file_path = os.path.join(
        output_directory_path,
        kraft.normalize_file_name(f"gps_map.{annotation_name}.html"),
    )

    # The title reports the row sum as n.
    annotation_title = f"{annotation_name} (n={annotation_values.sum()})"

    gps_map_0.predict(
        *predict_arguments,
        annotation_x_element=annotation_values.to_frame().T,
        title=annotation_title,
        html_file_path=annotation_html_file_path,
        **predict_keyword_arguments,
    )