In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
from pprint import pprint
import sys

import numpy as np
import pandas as pd
import matplotlib as mpl

sys.path.insert(0, os.path.expanduser('~/Jumis/github_kwatme'))
from match.match.make_match_panel import make_match_panel
from file.file.access_gmt import read_gmts
from gsea.gsea.single_sample_gseas import single_sample_gseas
from gsea.gsea.gsea import gsea
from support.support.path import clean_name
from plot.plot.plot_violin_box_or_bar import plot_violin_box_or_bar
from plot.plot.plot_distribution import plot_distribution
from information.information.compute_information_coefficient import compute_information_coefficient
from linear_model.linear_model.correlate import correlate
from nd_array.nd_array.normalize_2d_array import normalize_2d_array

In [None]:
# Forwarded as `n_job=` to every make_match_panel call in this notebook
# (presumably the number of parallel workers -- confirm in make_match_panel).
n_job = 16

In [None]:
# Read the tab-separated expression table; its first column becomes the row
# index. The variable name suggests rows are genes and columns are samples.
max_tpm_log__gene_x_sample = pd.read_table(
    '../output/max_tpm_log__gene_x_sample.tsv',
    index_col=0,
)

# Quick size check before showing the table itself.
print(max_tpm_log__gene_x_sample.shape)

max_tpm_log__gene_x_sample

In [None]:
# Build the list of gene-set (.gmt) files to load: everything in the MSigDB
# v6.0 directory plus a few individually listed files.
directory_path = '../../../data/gene_set/msigdb_v6.0'

# os.listdir returns entries in arbitrary, file-system-dependent order and may
# include non-GMT entries (e.g. hidden files). Sorting makes the "keep the
# first duplicate" step below reproducible across machines, and the suffix
# filter keeps stray files away from read_gmts.
gmt_file_paths = [
    os.path.join(directory_path, name)
    for name in sorted(os.listdir(directory_path))
    if name.endswith('.gmt')
]

gmt_file_paths += [
    '../../../data/gene_set/affymetrix.gmt',
    '../../../data/gene_set/ipa_regulator.gmt',
    '../../../data/gene_set/isogenic_signature.gmt',
    '../../../data/gene_set/not_in_msigdb_yet.gmt',
    '../../../data/gene_set/yap_signature.gmt',
]

pprint(gmt_file_paths)

# One row per gene set (index = gene set name); row values are its genes,
# with missing values as padding.
gmt = read_gmts(gmt_file_paths)
print(gmt.shape)

# When the same gene set name appears more than once, keep the first.
gmt = gmt.loc[~gmt.index.duplicated()]
print(gmt.shape)

# Keep gene sets with at least 5 non-missing genes. count(axis=1) is the
# vectorized equivalent of counting dropna() entries row by row.
gmt = gmt.loc[5 <= gmt.count(axis=1)]
print(gmt.shape)

In [None]:
# Passed as `statistic=` to both single_sample_gseas and gsea in later cells.
statistic = 'auc'

In [None]:
# Gene-set-by-sample scores, cached on disk.
# NOTE(review): the cache is reused whenever the file exists, regardless of
# whether `gmt`, `statistic` or the expression matrix changed since it was
# written -- delete the file to force a recompute.
gene_set_x_sample_file_path = '../output/gene_set/gene_set_x_sample.tsv'

if not os.path.isfile(gene_set_x_sample_file_path):
    # No cache yet: compute the scores. `file_path` is presumably where
    # single_sample_gseas saves its result -- confirm in that function.
    gene_set_x_sample = single_sample_gseas(
        max_tpm_log__gene_x_sample,
        gmt,
        normalization_method='0-1',
        statistic=statistic,
        file_path=gene_set_x_sample_file_path,
    )
else:
    gene_set_x_sample = pd.read_table(
        gene_set_x_sample_file_path,
        index_col=0,
    )
print(gene_set_x_sample.shape)

gene_set_x_sample

In [None]:
# Binary label per sample, assigned by column position: the first three
# columns get 0 and the last three get 1. pd.Series raises if
# gene_set_x_sample does not have exactly six columns.
target = pd.Series(
    [0] * 3 + [1] * 3,
    index=gene_set_x_sample.columns,
)

# Display names for the two target values.
target_int_to_str = {0: 'C', 1: 'VS'}

In [None]:
# Match panel of every gene set in gene_set_x_sample against the binary
# target. `file_path_prefix` is presumably the prefix of the files the panel
# writes -- confirm in make_match_panel.
feature_score_moe_p_value_fdr = make_match_panel(
    target,
    gene_set_x_sample,
    target_type='binary',
    target_int_to_str=target_int_to_str,
    target_ascending=True,
    n_top_feature=0.98,
    max_n_feature=None,
    n_job=n_job,
    title='Gene Set Expression',
    plot_column_names=True,
    file_path_prefix=(
        '../output/gene_set/find_differentially_expressed_gene_sets'),
)

feature_score_moe_p_value_fdr

In [None]:
# Pick the gene sets whose name contains 'yap', ignoring case.
is_yap_gene_set = gmt.index.str.lower().str.contains('yap')
selected_gene_sets = gmt.index[is_yap_gene_set]

selected_gene_sets

In [None]:
# Same match panel, restricted to the selected gene sets.
# Note: this rebinds feature_score_moe_p_value_fdr, replacing the result of
# the all-gene-set panel; the score comparison further down reads this
# restricted result.
feature_score_moe_p_value_fdr = make_match_panel(
    target,
    gene_set_x_sample.loc[selected_gene_sets],
    target_type='binary',
    target_int_to_str=target_int_to_str,
    target_ascending=True,
    n_top_feature=0.5,
    max_n_feature=None,
    n_job=n_job,
    title='Gene Set Expression',
    plot_column_names=True,
    file_path_prefix=(
        '../output/gene_set/selected_gene_sets/find_differentially_expressed_gene_sets'
    ),
)

feature_score_moe_p_value_fdr

In [None]:
# Run gsea on the gene-level expression matrix against the same target,
# limited to the selected gene sets and using the shared `statistic`.
gene_set_score_p_value_fdr = gsea(
    max_tpm_log__gene_x_sample,
    target,
    gmt.loc[selected_gene_sets],
    statistic=statistic,
    method='ic',
    normalization_method=None,
    directory_path='../output/gene_set/selected_gene_sets/gsea',
)

gene_set_score_p_value_fdr

In [None]:
# Compare the two per-gene-set scores: x is the match-panel 'Score' (sorted
# ascending), y is the gsea 'Score' re-ordered to the same gene sets.
x = feature_score_moe_p_value_fdr['Score'].sort_values()
y = gene_set_score_p_value_fdr.loc[x.index, 'Score']

# Information coefficient on the raw scores, then on their signs only.
for x_values, y_values in ((x, y), (np.sign(x), np.sign(y))):
    print(compute_information_coefficient(x_values, y_values))

correlate(x, y, xlabel='Match IC', ylabel='GSEA Score', n_permutation=10)

In [None]:
# For each selected gene set, draw (1) a match panel of its member genes plus
# the gene set's own score row, and (2) a violin/box/bar plot of the gene set
# score by target class.
for gene_set in selected_gene_sets:

    gene_set_clean = clean_name(gene_set)

    sample_gene_set_expression = gene_set_x_sample.loc[gene_set]

    # Member genes of this gene set that are present in the expression matrix.
    # Filtering first is required because .loc with a list containing missing
    # labels raises KeyError on pandas >= 1.0; older versions returned all-NaN
    # rows, which the original code then removed with .dropna().
    gene_set_genes = gmt.loc[gene_set].dropna()
    gene_set_genes = gene_set_genes[gene_set_genes.isin(
        max_tpm_log__gene_x_sample.index)]

    # Gene rows (dropping any with missing values) followed by the gene set
    # row. pd.concat with a one-row frame replaces DataFrame.append, which was
    # removed in pandas 2.0; the appended row keeps the gene set name as its
    # index label.
    gene_and_gene_set_expression = pd.concat([
        max_tpm_log__gene_x_sample.loc[gene_set_genes].dropna(),
        sample_gene_set_expression.to_frame().T,
    ])

    make_match_panel(
        target,
        gene_and_gene_set_expression,
        target_ascending=True,
        n_job=n_job,
        n_top_feature=0.5,
        max_n_feature=None,
        title='Expression of Gene Set and its Genes',
        target_type='binary',
        target_int_to_str=target_int_to_str,
        plot_column_names=True,
        file_path_prefix=
        '../output/gene_set/selected_gene_sets/gene_set_and_its_genes/{}'.
        format(gene_set_clean))

    plot_violin_box_or_bar(
        target,
        sample_gene_set_expression,
        violin_box_bar_or_swarm_kwargs={
            'palette': (
                '#20D9BA',
                '#9017E6', ),
        },
        decorate_ax_kwargs={
            'title': gene_set,
        },
        file_path='../output/gene_set/selected_gene_sets/violin_plot/{}.png'.
        format(gene_set_clean))

    # Render this gene set's figures before moving on to the next one.
    mpl.pyplot.show()