In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so that edited
# modules are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
from environment import *

# Read the notebook settings from setting.yaml (relative to the working directory).
# NOTE(review): this used to call yaml.load() without a Loader, which may build
# arbitrary Python objects from the file and is rejected by PyYAML >= 6.
# safe_load only constructs plain YAML types; if setting.yaml ever needs
# Python-specific tags, pass an explicit Loader to yaml.load instead.
with open("setting.yaml") as yaml_file:

    SETTING = yaml.safe_load(yaml_file)

# Path lookup used by the rest of the notebook, derived from the settings.
PATH = make_path_dict(SETTING)

In [None]:
# Processed feature-by-sample table of the original data set: tab-separated,
# first column used as the row index, column axis named after the sample alias.
feature_x_sample_processed_0 = pd.read_csv(
    PATH["feature_x_sample.processed.tsv"], sep="\t", index_col=0
).rename_axis(SETTING["sample_alias"], axis="columns")

In [None]:
# W matrix of the original data set; its column axis is labelled "Factor".
w_0 = pd.read_csv(PATH["w.tsv"], sep="\t", index_col=0).rename_axis(
    "Factor", axis="columns"
)

# H matrix of the original data set; its column axis reuses the name of the
# processed feature-by-sample table's column axis.
h_0 = pd.read_csv(PATH["h.tsv"], sep="\t", index_col=0).rename_axis(
    feature_x_sample_processed_0.columns.name, axis="columns"
)

In [None]:
# Load the GPS map saved for the original data set.
# NOTE(review): the path key suggests a gzipped pickle -- only load files that
# this project produced itself.
gps_map_file_path = PATH["gps_map.pickle.gz"]

gps_map_0 = ccal.read_gps_map(gps_map_file_path)

In [None]:
# Raw feature-by-sample table of the new data set: tab-separated, first column
# used as the row index.
new_feature_x_sample_file_path = SETTING["new_feature_x_sample_file_path"]

feature_x_sample_1 = pd.read_csv(
    new_feature_x_sample_file_path, sep="\t", index_col=0
)

# Rows share the axis name of the original processed table; columns are named
# after the new sample alias.
feature_x_sample_1.index.name, feature_x_sample_1.columns.name = (
    feature_x_sample_processed_0.index.name,
    SETTING["new_sample_alias"],
)

In [None]:
# (keyword argument of ccal.process_feature_x_sample, key in SETTING) pairs.
# The first seven read "new_"-prefixed settings; the last four reuse the
# settings shared with the original data set.
process_keyword_and_setting_key_pairs = (
    (
        "shift_as_necessary_to_achieve_min_before_logging",
        "new_shift_as_necessary_to_achieve_min_before_logging",
    ),
    ("log_base", "new_log_base"),
    ("normalization_axis", "new_normalization_axis"),
    ("normalization_method", "new_normalization_method"),
    ("clip_min", "new_clip_min"),
    ("clip_max", "new_clip_max"),
    ("feature_x_sample_alias", "new_feature_x_sample_alias"),
    ("feature_x_sample_value_name", "feature_x_sample_value_name"),
    ("plot_heat_map_max_size", "plot_heat_map_max_size"),
    ("plot_histogram_max_size", "plot_histogram_max_size"),
    ("plot_rug_max_size", "plot_rug_max_size"),
)

process_keyword_arguments = {
    keyword: SETTING[setting_key]
    for keyword, setting_key in process_keyword_and_setting_key_pairs
}

# Process the new feature-by-sample table with the settings collected above.
feature_x_sample_processed_1 = ccal.process_feature_x_sample(
    feature_x_sample_1, **process_keyword_arguments
)

In [None]:
# Unsigned feature names behind the "(-) feature" / "(+) feature" rows of W.
# dict.fromkeys de-duplicates while keeping first-appearance order (guaranteed
# from Python 3.7), so the row order is the same on every run; building the
# index from a set made the order depend on string hash randomization.
unsigned_w_features = list(
    dict.fromkeys(
        feature.replace("(-) ", "").replace("(+) ", "") for feature in w_0.index
    )
)

# Align the new processed table to those features; features missing from the
# new data become all-NaN rows.
feature_x_sample_processed_1 = feature_x_sample_processed_1.reindex(
    index=pd.Index(unsigned_w_features, name=w_0.index.name)
)

In [None]:
# Per feature: absolute difference between its median in the new processed data
# and its median in the original processed data.
feature_median_difference = feature_x_sample_processed_1.apply(
    lambda feature_values: abs(
        feature_values.median()
        - feature_x_sample_processed_0.loc[feature_values.name].median()
    ),
    axis=1,
)

# Select the features to discard from that ranking.
# NOTE(review): presumably ">" with `fraction` keeps the features with the
# largest differences -- confirm against ccal.select_series_indices.
inconsistent_features = ccal.select_series_indices(
    feature_median_difference,
    ">",
    fraction=SETTING["new_inconsistent_feature_fraction_to_drop"],
    title={"text": "Ranking of Feature Median Difference"},
    xaxis={"title": "Rank"},
    yaxis={"title": "Feature Median Difference"},
).tolist()

# W rows are labelled with a sign prefix, so add both signed spellings of every
# selected feature (the comprehension is fully built before the list is extended).
inconsistent_features += [
    template.format(inconsistent_feature)
    for inconsistent_feature in inconsistent_features
    for template in ("(-) {}", "(+) {}")
]

# Drop the selected rows from W. Index.intersection replaces `index & list`,
# whose set-operation meaning was deprecated and later removed from pandas.
w_0 = w_0.drop(w_0.index.intersection(inconsistent_features))

In [None]:
# Empty (all-NaN) signal table for the new samples: one row per remaining W row,
# one column per new sample. The explicit float dtype keeps the table numeric;
# without it pandas creates object columns.
feature_x_sample_signal_1 = pd.DataFrame(
    index=w_0.index, columns=feature_x_sample_processed_1.columns, dtype=float
)

In [None]:
# Fit parameters of every feature in the original data set (tab-separated, first
# column used as the row index).
feature_x_fit_parameter_0 = pd.read_csv(
    PATH["feature_x_fit_parameter.tsv"], sep="\t", index_col=0
)

n = w_0.shape[0]

# Print progress about every 10% of the rows; at least every row, so that fewer
# than 10 rows do not lead to a modulo by zero.
n_per_print = max(1, n // 10)

for i, sign_feature in enumerate(w_0.index):

    if i % n_per_print == 0:

        print("{}/{} ...".format(i + 1, n))

    # Row labels look like "(-) feature" / "(+) feature". Split once only, so
    # feature names that contain whitespace stay intact.
    sign, feature = sign_feature.split(maxsplit=1)

    # The fit-parameter row is unpacked positionally, so the table must hold
    # exactly these five columns in this order.
    (
        n_data_0,
        location_0,
        scale_0,
        degree_of_freedom_0,
        shape_0,
    ) = feature_x_fit_parameter_0.loc[feature]

    # Context of this feature in the original data, using its stored fit.
    context_dict_0 = ccal.compute_1d_array_context(
        feature_x_sample_processed_0.loc[feature].values,
        n_data=n_data_0,
        location=location_0,
        scale=scale_0,
        degree_of_freedom=degree_of_freedom_0,
        shape=shape_0,
    )

    grid_0 = context_dict_0["grid"]

    # NOTE(review): indexed below by grid position, so "context" is assumed to
    # be aligned with "grid" -- confirm against ccal.compute_1d_array_context.
    context_indices_0 = context_dict_0["context"]

    if sign == "(-)":

        # Negative context, flipped to a non-negative signal.
        signals_0 = -context_indices_0.clip(max=0)

    elif sign == "(+)":

        # Positive context.
        signals_0 = context_indices_0.clip(min=0)

        # NOTE(review): normalization runs only in the "(+)" branch; "(-)"
        # signals stay un-normalized. Behavior kept as found -- confirm that
        # this asymmetry is intended.
        signals_0 = ccal.normalize_nd_array(
            signals_0, None, SETTING["signal_normalization_method"], raise_for_bad=False
        )

    else:

        # Fail loudly instead of silently reusing the previous row's signals.
        raise ValueError(
            "Unknown sign {!r} in W row label {!r}.".format(sign, sign_feature)
        )

    values_1 = feature_x_sample_processed_1.loc[feature].values

    # Mask of usable new values (presumably excludes NaN/inf -- see
    # ccal.check_nd_array_for_bad).
    is_good_1 = ~ccal.check_nd_array_for_bad(values_1, raise_for_bad=False)

    # For every usable new value, take the signal at the nearest grid point of
    # the original context; the other samples keep NaN.
    feature_x_sample_signal_1.iloc[i, is_good_1] = signals_0[
        [np.absolute(value_1 - grid_0).argmin() for value_1 in values_1[is_good_1]]
    ]

In [None]:
# Spot-check up to 8 randomly chosen signal rows. The size is capped at the
# number of rows because sampling without replacement cannot draw more than that.
# NOTE(review): no random seed is set here, so the chosen rows differ per run.
n_sign_feature_to_plot = min(8, feature_x_sample_signal_1.shape[0])

for sign_feature in np.random.choice(
    feature_x_sample_signal_1.index, size=n_sign_feature_to_plot, replace=False
):

    # Drop the sign prefix; split once so names containing whitespace survive.
    feature = sign_feature.split(maxsplit=1)[1]

    # The fit-parameter row is unpacked positionally (five columns, this order).
    (
        n_data_0,
        location_0,
        scale_0,
        degree_of_freedom_0,
        shape_0,
    ) = feature_x_fit_parameter_0.loc[feature]

    # Context of the feature in the original data.
    ccal.plot_context(
        feature_x_sample_processed_0.loc[feature],
        n_data=n_data_0,
        location=location_0,
        scale=scale_0,
        degree_of_freedom=degree_of_freedom_0,
        shape=shape_0,
        title=sign_feature,
    )

    # New values in ascending order, plotted against the signal inferred for them.
    values_1 = feature_x_sample_processed_1.loc[feature].sort_values()

    ccal.plot_and_save(
        {
            "layout": {"title": {"text": "{} (new)".format(sign_feature)}},
            "data": [
                {
                    "type": "scatter",
                    "x": values_1,
                    "y": feature_x_sample_signal_1.loc[sign_feature, values_1.index],
                    "text": values_1.index,
                    "mode": "markers",
                }
            ],
        },
        None,
    )

In [None]:
# Signals that could not be inferred (NaN) count as 0 when solving for H.
feature_x_sample_signal_1_filled = feature_x_sample_signal_1.fillna(value=0)

# Solve for the H matrix of the new samples against the original W ("nnls" method).
h_1 = ccal.solve_for_H(feature_x_sample_signal_1_filled, w_0, method="nnls")

In [None]:
# Shared title: original data set alias, "Infers", new data set alias.
original_feature_x_sample_alias = SETTING["feature_x_sample_alias"]

new_feature_x_sample_alias = SETTING["new_feature_x_sample_alias"]

title = "{}<br>Infers<br>{}".format(
    original_feature_x_sample_alias, new_feature_x_sample_alias
)

# Options for placing the new samples on the original GPS map; the HTML output
# goes into the "infer/" directory.
predict_keyword_arguments = {
    "n_pull": SETTING["gps_map_h_n_pull"],
    "pull_power": SETTING["gps_map_h_pull_power"],
    "element_marker_size": 32,
    "title": title,
    "html_file_path": os.path.join(PATH["infer/"], "gps_map.html"),
}

h_element_states_1 = gps_map_0.predict("h", h_1, **predict_keyword_arguments)

# Heat map of the new H matrix, annotated per column with the predicted states.
heat_map_keyword_arguments = {
    "column_annotation": h_element_states_1,
    "title": title,
    "xaxis_title": h_1.columns.name,
    "yaxis_title": h_1.index.name,
}

ccal.plot_heat_map(h_1, **heat_map_keyword_arguments)