In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from environment import *

# Load the notebook settings from "setting.yaml" in the working directory.
#
# ``yaml.safe_load`` replaces the bare ``yaml.load(yaml_file)``: calling
# ``yaml.load`` without a ``Loader`` is deprecated since PyYAML 5.1 and raises
# ``TypeError`` in PyYAML 6. The safe loader only builds plain YAML data
# (mappings, sequences, scalars) and refuses Python-object tags.
# NOTE(review): if setting.yaml relies on Python-specific YAML tags, switch to
# ``yaml.load(yaml_file, Loader=yaml.FullLoader)`` instead — confirm.
with open("setting.yaml") as yaml_file:

    SETTING = yaml.safe_load(yaml_file)

# Path lookup derived from the settings; later cells index it by file or
# directory key (``make_path_dict`` presumably comes from the ``environment``
# star import — verify).
PATH = make_path_dict(SETTING)

In [None]:
# Read the three tab-separated tables, each with its first column as the
# index. The file keys are listed in the same order as the names they are
# unpacked into.
gene_x_sample, target_x_sample, gene_set_x_information = (
    pd.read_csv(PATH[table_file_name], sep="\t", index_col=0)
    for table_file_name in (
        "gene_x_sample.tsv",
        "target_x_sample.tsv",
        "gene_set_x_information.tsv",
    )
)

# Gene sets are read by ``ccal.read_gmts`` from the paths listed under
# SETTING["gene_set_file_paths"].
gene_sets = ccal.read_gmts(SETTING["gene_set_file_paths"])

In [None]:
# For every target (row of ``target_x_sample``):
#   1. read that target's differential-expression table and take its "Score"
#      column as the per-gene score,
#   2. score the gene sets flagged "Good" with ``ccal.single_sample_gseas``,
#   3. save a scatter plot of the gene-set scores in ascending order,
#      highlighting the gene sets listed in SETTING["gene_sets_to_peek"],
#   4. save one ``ccal.single_sample_gsea`` plot per peeked gene set.

# --- Loop-invariant inputs, computed once instead of once per target ---

# File-name stem shared by the input score table and the output plot.
prefix = "all.log_ratio"

# Only gene sets whose "Good" flag is truthy are scored.
good_gene_sets = gene_sets.loc[
    gene_set_x_information.index[gene_set_x_information["Good"].astype(bool)]
]

# ``None`` (nothing to peek) is normalised to an empty list so the ranking and
# the per-gene-set plots below need no special-casing.
gene_sets_to_peek = SETTING["gene_sets_to_peek"] or []

for target_name, target_values in target_x_sample.iterrows():

    output_directory_path = "{}/{}".format(PATH["gsea/"], target_name)

    ccal.establish_path(output_directory_path, "directory")

    score_moe_p_value_fdr = pd.read_csv(
        "{}/{}/{}.tsv".format(
            PATH["find_differentially_expressed_gene/"], target_name, prefix
        ),
        sep="\t",
        index_col=0,
    )

    gene_score = score_moe_p_value_fdr["Score"]

    # Collapse only the column axis: a bare ``squeeze()`` would also collapse
    # the row axis when exactly one gene set is scored, producing a scalar
    # that has no ``sort_values``.
    # NOTE(review): assumes the result has one row per gene set and a single
    # column, consistent with how its index is used below — confirm with ccal.
    gene_set_scores = (
        ccal.single_sample_gseas(
            gene_score.to_frame(),
            good_gene_sets,
            statistic=SETTING["gsea_statistic"],
            n_job=SETTING["n_job"],
        )
        .squeeze(axis="columns")
        .sort_values()
    )

    print(gene_set_x_information.loc[gene_set_scores.index])

    # Positions (ranks in ascending score order) of the gene sets to highlight.
    ranks = np.nonzero(
        [gene_set in gene_sets_to_peek for gene_set in gene_set_scores.index]
    )[0]

    # ``ranks`` holds integer positions, so select with ``.iloc``; plain
    # ``series[ranks]`` on a label-indexed Series relies on a deprecated
    # positional fallback.
    peeked_gene_set_scores = gene_set_scores.iloc[ranks]

    ccal.plot_and_save(
        dict(
            layout=dict(
                title=dict(text="Gene Sets Ranked by GSEA Score"),
                xaxis=dict(title="Rank"),
                yaxis=dict(title="GSEA Score"),
            ),
            data=[
                dict(
                    type="scatter",
                    name="All",
                    x=tuple(range(gene_set_scores.size)),
                    y=gene_set_scores,
                    text=gene_set_scores.index,
                    mode="markers",
                    marker=dict(color="#d0d0d0"),
                ),
                dict(
                    type="scatter",
                    name="Peek",
                    x=ranks,
                    y=peeked_gene_set_scores,
                    text=peeked_gene_set_scores.index,
                    mode="markers",
                    marker=dict(color="#20d9ba", size=10),
                ),
            ],
        ),
        "{}/{}.html".format(output_directory_path, prefix),
        # Third positional argument is passed as None, unchanged; its meaning
        # is defined by ``ccal.plot_and_save``.
        None,
    )

    # One detailed plot per peeked gene set, saved next to the ranking plot.
    for gene_set_name in gene_sets_to_peek:

        ccal.single_sample_gsea(
            gene_score,
            gene_sets.loc[gene_set_name],
            statistic=SETTING["gsea_statistic"],
            title="{}<br>{}".format(target_name, gene_set_name),
            gene_prefix="Gene Score",
            html_file_path="{}/{}.html".format(
                output_directory_path, gene_set_name
            ),
        )