# STEAP
## Post-Processing Notebook 

1.    [Identifying significant cell types](#1)
2.    [Overlap  with enriched brain related cell types (using upsetplot)](#2)
3.    [Gene-Set Enrichment Analysis (GSEA)](#3)
4.    [Cell type correlation between the phenotype and brain related traits](#4)
5.    [Expression Specificity (ES) gene correlation between cell types](#5)

In [None]:
#%%appyter init
# Initialise the appyter magics used by the templated cells in this notebook.
# The callable handed to magic.init returns this notebook's globals() when invoked.
from appyter import magic
magic.init(lambda _=globals: _())

First, we need to import the necessary packages and scripts.

In [3]:
# Debug aid: echo the path of the Python interpreter that runs this kernel.
import sys
sys.executable

'/home/insilicode/miniconda3/envs/appyter/bin/python'

In [1]:
import pandas as pd
import constants # file which consists of constant for the analysis
from scripts import convert_output_to_dataframe
from scripts import gene_set_enrichment_analysis
from scripts import calculate_beta_correlation
from scripts import calculate_es_correlation
from scripts import upsetplot
from scripts import circosplot

from bokeh.io import output_notebook
from utils import display_bokeh_df, make_caption, display_link, create_download_link, create_zip_file
output_notebook()

In [None]:
%%appyter hide_code_exec
{% do SectionField(
    name='Data_Section',
    title='Load your Data'
) %}

{% set gwas_name = StringField(
    name='gwas_name', 
    label='Name of the phenotype used as GWAS_SUMSTATS in the config.yml file (e.g. SCZ_PGC3_2020)',
    default='SCZ_PGC3_2020',
    description='Give a name to the phenotype analysed in the prioritization files.',
    section='Data_Section'
) %}

{% set pheno_name = StringField(
    name='pheno_name', 
    label='Name of the phenotype (e.g. Schizophrenia)', 
    default='Schizophrenia',
    description='Give a name to the phenotype analysed in the prioritization files.',
    section='Data_Section'
) %}

{% set hmagma_file = FileField(
    name='hmagma_file', 
    label='The H-MAGMA priotitization.csv file', 
    examples={'h-magma.csv': 'https://github.com/erwinerdem/appyter_steap/raw/master/h-magma.csv'},
    section='Data_Section'
) %}

{% set ldsc_file = FileField(
    name='ldsc_file', 
    label='The S-LDSC priotitization.csv file', 
    examples={'ldsc.csv': 'https://github.com/erwinerdem/appyter_steap/raw/master/ldsc.csv'},
    section='Data_Section'
) %}

{% set magma_file = FileField(
    name='magma_file', 
    label='The MAGMA priotitization.csv file', 
    examples={'magma.csv': 'https://github.com/erwinerdem/appyter_steap/raw/master/magma.csv'},
    section='Data_Section'
) %}

{% set df_all = FileField(
    name='df_all', 
    label='The .h5 data containing the enrichment analysis of brain related phenotypes.', 
    description='Load the example enrichment_data.h5 file.',
    examples={'enrichment_data.h5': 'https://github.com/erwinerdem/STEAP/raw/master/data/enrichment_data.h5'},
    section='Data_Section'
) %}

In [None]:
%%appyter hide_code_exec
# Form section in which the user picks the comparison groups and optional analyses.
{% do SectionField(
    name='Analysis_Section',
    title='Choose your Analysis'
) %}


# Phenotype groups to compare against; every available choice is pre-selected.
{% set other_gwas = MultiChoiceField(
    name='other_gwas',
    label='Select which other brain related phenotypes you wish to compare against (default is all)',
    choices=[
        'Structural MRI',
        'DTI Tracts',
        'rs-fMRI Network',
        'Schizophrenia',
        "Alzheimer's disease",
        "Bipolar",
        "Depression",
        "Autism spectrum disorder"
    ],
    default=[
        'Structural MRI',
        'DTI Tracts',
        'rs-fMRI Network',
        'Schizophrenia',
        "Alzheimer's disease",
        "Bipolar",
        "Depression",
        "Autism spectrum disorder"
    ],
    section='Analysis_Section'
) %}


# Toggle for the gene-set enrichment analysis section.
# NOTE(review): the default is the string 'false' rather than a boolean - confirm BoolField parses it as intended.
{% set gsea = BoolField(
    name='gsea', 
    label='Perform gene-set enrichment analysis?', 
    default='false', 
    section='Analysis_Section'
) %}

{% set genesetlist = MultiChoiceField(
    name='genesetlist',
    label='Select which gene-list(s) to use for gene-set enrichment analysis',
    choices=[
        'ARCHS4_Cell-lines',
        'ARCHS4_IDG_Coexp',
        'ARCHS4_Kinases_Coexp',
        'ARCHS4_TFs_Coexp',
        'ARCHS4_Tissues',
        'Achilles_fitness_decrease',
        'Achilles_fitness_increase',
        'Aging_Perturbations_from_GEO_down',
        'Aging_Perturbations_from_GEO_up',
        'Allen_Brain_Atlas_down',
        'Allen_Brain_Atlas_up',
        'BioCarta_2013',
        'BioCarta_2015',
        'BioCarta_2016',
        'BioPlanet_2019',
        'BioPlex_2017',
        'CCLE_Proteomics_2020',
        'CORUM',
        'COVID-19_Related_Gene_Sets',
        'Cancer_Cell_Line_Encyclopedia',
        'ChEA_2013',
        'ChEA_2015',
        'ChEA_2016',
        'Chromosome_Location',
        'Chromosome_Location_hg19',
        'ClinVar_2019',
        'DSigDB',
        'Data_Acquisition_Method_Most_Popular_Genes',
        'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
        'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
        'DisGeNET',
        'Disease_Perturbations_from_GEO_down',
        'Disease_Perturbations_from_GEO_up',
        'Disease_Signatures_from_GEO_down_2014',
        'Disease_Signatures_from_GEO_up_2014',
        'DrugMatrix',
        'Drug_Perturbations_from_GEO_2014',
        'Drug_Perturbations_from_GEO_down',
        'Drug_Perturbations_from_GEO_up',
        'ENCODE_Histone_Modifications_2013',
        'ENCODE_Histone_Modifications_2015',
        'ENCODE_TF_ChIP-seq_2014',
        'ENCODE_TF_ChIP-seq_2015',
        'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'ESCAPE',
        'Elsevier_Pathway_Collection',
        'Enrichr_Libraries_Most_Popular_Genes',
        'Enrichr_Submissions_TF-Gene_Coocurrence',
        'Enrichr_Users_Contributed_Lists_2020',
        'Epigenomics_Roadmap_HM_ChIP-seq',
        'GO_Biological_Process_2013',
        'GO_Biological_Process_2015',
        'GO_Biological_Process_2017',
        'GO_Biological_Process_2017b',
        'GO_Biological_Process_2018',
        'GO_Cellular_Component_2013',
        'GO_Cellular_Component_2015',
        'GO_Cellular_Component_2017',
        'GO_Cellular_Component_2017b',
        'GO_Cellular_Component_2018',
        'GO_Molecular_Function_2013',
        'GO_Molecular_Function_2015',
        'GO_Molecular_Function_2017',
        'GO_Molecular_Function_2017b',
        'GO_Molecular_Function_2018',
        'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
        'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
        'GWAS_Catalog_2019',
        'GeneSigDB',
        'Gene_Perturbations_from_GEO_down',
        'Gene_Perturbations_from_GEO_up',
        'Genes_Associated_with_NIH_Grants',
        'Genome_Browser_PWMs',
        'HMDB_Metabolites',
        'HMS_LINCS_KinomeScan',
        'HomoloGene',
        'HumanCyc_2015',
        'HumanCyc_2016',
        'Human_Gene_Atlas',
        'Human_Phenotype_Ontology',
        'InterPro_Domains_2019',
        'Jensen_COMPARTMENTS',
        'Jensen_DISEASES',
        'Jensen_TISSUES',
        'KEA_2013',
        'KEA_2015',
        'KEGG_2013',
        'KEGG_2015',
        'KEGG_2016',
        'KEGG_2019_Human',
        'KEGG_2019_Mouse',
        'Kinase_Perturbations_from_GEO_down',
        'Kinase_Perturbations_from_GEO_up',
        'L1000_Kinase_and_GPCR_Perturbations_down',
        'L1000_Kinase_and_GPCR_Perturbations_up',
        'LINCS_L1000_Chem_Pert_down',
        'LINCS_L1000_Chem_Pert_up',
        'LINCS_L1000_Ligand_Perturbations_down',
        'LINCS_L1000_Ligand_Perturbations_up',
        'Ligand_Perturbations_from_GEO_down',
        'Ligand_Perturbations_from_GEO_up',
        'MCF7_Perturbations_from_GEO_down',
        'MCF7_Perturbations_from_GEO_up',
        'MGI_Mammalian_Phenotype_2013',
        'MGI_Mammalian_Phenotype_2017',
        'MGI_Mammalian_Phenotype_Level_3',
        'MGI_Mammalian_Phenotype_Level_4',
        'MGI_Mammalian_Phenotype_Level_4_2019',
        'MSigDB_Computational',
        'MSigDB_Hallmark_2020',
        'MSigDB_Oncogenic_Signatures',
        'Microbe_Perturbations_from_GEO_down',
        'Microbe_Perturbations_from_GEO_up',
        'Mouse_Gene_Atlas',
        'NCI-60_Cancer_Cell_Lines',
        'NCI-Nature_2015',
        'NCI-Nature_2016',
        'NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions',
        'NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions',
        'NIH_Funded_PIs_2017_Human_AutoRIF',
        'NIH_Funded_PIs_2017_Human_GeneRIF',
        'NURSA_Human_Endogenous_Complexome',
        'OMIM_Disease',
        'OMIM_Expanded',
        'Old_CMAP_down',
        'Old_CMAP_up',
        'PPI_Hub_Proteins',
        'Panther_2015',
        'Panther_2016',
        'Pfam_Domains_2019',
        'Pfam_InterPro_Domains',
        'PheWeb_2019',
        'Phosphatase_Substrates_from_DEPOD',
        'ProteomicsDB_2020',
        'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
        'Rare_Diseases_AutoRIF_ARCHS4_Predictions',
        'Rare_Diseases_AutoRIF_Gene_Lists',
        'Rare_Diseases_GeneRIF_ARCHS4_Predictions',
        'Rare_Diseases_GeneRIF_Gene_Lists',
        'Reactome_2013',
        'Reactome_2015',
        'Reactome_2016',
        'SILAC_Phosphoproteomics',
        'SubCell_BarCode',
        'SysMyo_Muscle_Gene_Sets',
        'TF-LOF_Expression_from_GEO',
        'TF_Perturbations_Followed_by_Expression',
        'TG_GATES_2020',
        'TRANSFAC_and_JASPAR_PWMs',
        'TRRUST_Transcription_Factors_2019',
        'Table_Mining_of_CRISPR_Studies',
        'TargetScan_microRNA',
        'TargetScan_microRNA_2017',
        'Tissue_Protein_Expression_from_Human_Proteome_Map',
        'Tissue_Protein_Expression_from_ProteomicsDB',
        'Transcription_Factor_PPIs',
        'UK_Biobank_GWAS_v1',
        'Virus-Host_PPI_P-HIPSTer_2020',
        'VirusMINT',
        'Virus_Perturbations_from_GEO_down',
        'Virus_Perturbations_from_GEO_up',
        'WikiPathways_2013',
        'WikiPathways_2015',
        'WikiPathways_2016',
        'WikiPathways_2019_Human',
        'WikiPathways_2019_Mouse',
        'dbGaP',
        'huMAP',
        'lncHUB_lncRNA_Co-Expression',
        'miRTarBase_2017',
    ],
    default = [
        'ARCHS4_Tissues',
        'Aging_Perturbations_from_GEO_down',
        'Aging_Perturbations_from_GEO_up',
        'Allen_Brain_Atlas_down',
        'Allen_Brain_Atlas_up',
        'ClinVar_2019',
        'Disease_Signatures_from_GEO_down_2014',
        'Disease_Signatures_from_GEO_up_2014',
        'ENCODE_Histone_Modifications_2013',
        'ENCODE_Histone_Modifications_2015',
        'Epigenomics_Roadmap_HM_ChIP-seq',
        'GO_Biological_Process_2018',
        'GO_Cellular_Component_2018',
        'GO_Molecular_Function_2018',
        'GTEx_Tissue_Sample_Gene_Expression_Profiles_down',
        'GTEx_Tissue_Sample_Gene_Expression_Profiles_up',
        'GWAS_Catalog_2019',
        'HMDB_Metabolites',
        'Human_Gene_Atlas',
        'Jensen_COMPARTMENTS',
        'Jensen_DISEASES',
        'Jensen_TISSUES',
        'KEGG_2019_Human',
        'Reactome_2016',
        'Tissue_Protein_Expression_from_ProteomicsDB',
        'WikiPathways_2019_Human',
        'miRTarBase_2017'
    ],
    section='Analysis_Section'
) %}

# Toggles for the optional correlation sections further down the notebook.
# NOTE(review): each default is the string 'false' rather than a boolean - confirm BoolField parses it as intended.
{% set beta_corr = BoolField(
    name='beta_corr', 
    label='Perform cell type correlation?', 
    default='false', 
    section='Analysis_Section'
) %}

# The cell that draws the circosplot is rendered only when beta_corr is enabled as well.
{% set circos_plot = BoolField(
    name='circos_plot', 
    label='Visualize cell type correlation with a circosplot?', 
    default='false', 
    section='Analysis_Section'
) %}

{% set esmu_corr = BoolField(
    name='esmu_corr', 
    label='Perform ES gene correlation?', 
    default='false', 
    section='Analysis_Section'
) %}



In [None]:
%%appyter code_exec
# Phenotype identifiers entered in the form: the GWAS identifier and the display name.
# NOTE(review): the values are pasted between double quotes, so a value that itself
# contains a double quote would break the rendered assignment - confirm inputs are constrained.
gwas_name = "{{gwas_name.value}}"
name = "{{pheno_name.value}}"

# Values of the file fields: the three prioritization files and the reference .h5 data.
hmagma_file = "{{hmagma_file.value}}"
ldsc_file = "{{ldsc_file.value}}"
magma_file = "{{magma_file.value}}"
datah5 = "{{df_all.value}}"

# Phenotype groups selected for comparison.
other_gwas = {{other_gwas.value}}

# Running numbers handed to, and returned by, the table/figure caption helpers.
table_counter = 1
figure_counter = 1

# Gene-set libraries selected for the enrichment analysis.
genesetlist = {{genesetlist.value}}
# Restrict the predefined group dictionary to the groups the user selected.
gwas_group_dict = {k: v for k,v in constants.GWAS_GROUP_DICT.items() if k in other_gwas}

In [None]:
%%appyter hide_code_exec
# Expose the analysis toggles to the kernel.
# NOTE(review): each value is wrapped in quotes, so these variables hold the rendered
# text of the flag, not a bool; a non-empty string is always truthy - confirm nothing
# tests them directly.
gsea = "{{gsea.value}}"
beta_corr = "{{beta_corr.value}}"
esmu_corr = "{{esmu_corr.value}}"
circos_plot = "{{circos_plot.value}}"

The input files will be converted to a pandas dataframe to easily analyse the output.

In [None]:
# Prioritization file of each enrichment method, grouped under the GWAS identifier.
file_dict = {
    gwas_name: {
        'H-MAGMA': hmagma_file,
        'LDSC': ldsc_file,
        'MAGMA': magma_file
    }
}

# Columns that together identify one cell type of one dataset for one GWAS.
celltype_keys = ['gwas', 'specificity_id', 'annotation']

# Read every file, tag its rows with the method that produced them and
# order the rows by the identifying columns.
method_frames = [
    pd.read_csv(csv_path).assign(method=method_name).sort_values(by=celltype_keys)
    for files_per_method in file_dict.values()
    for method_name, csv_path in files_per_method.items()
]
df = pd.concat(method_frames, ignore_index=True)

# count the number of methods each cell type occurs in (not used atm)
methods_per_celltype = df.groupby(celltype_keys).size().to_frame('n_methods')
df = df.merge(methods_per_celltype, on=celltype_keys, how='left')

# final ordering: by GWAS, then by method, with a fresh 0..n-1 index
df = df.sort_values(by=['gwas', 'method']).reset_index(drop=True)

## 1.    Find Significant Cell-Types <a id='1'></a>

The pvalue of the output must first be corrected.
By default it uses bonferroni correction, but other methods can also be applied
(see [`statsmodels.stats.multitest.multipletests`](https://www.statsmodels.org/stable/generated/statsmodels.stats.multitest.multipletests.html)
documentation).

In [None]:
# Correct the p-values with the method configured in constants.PVAL_CORRECTION.
# NOTE(review): df is rebound to the returned frame, so re-running this cell feeds
# the already corrected frame back in - confirm that is harmless.
df = convert_output_to_dataframe.pvalue_correction(df, method=constants.PVAL_CORRECTION) # corrects pval

# bokeh display df
# display_bokeh_df receives the running table counter and returns its new value.
table_counter = display_bokeh_df(df, table_counter, f"Results of the enrichment analysis on the {name} phenotype.")
create_download_link(df, f"{name}_enrichment_analysis.xlsx") 

To identify the enriched celltypes, we can simply filter out all non-significant hits. In the STEAP pipeline we used three different methods (S-LDSC, MAGMA and H-MAGMA) to determine whether a cell type is enriched or not.
Here, we define a cell type to be actually enriched if it is significant in at least 2 methods.

In [None]:
# Rows whose corrected p-value passes the 5% significance level.
is_significant = df[f'pvalue_{constants.PVAL_CORRECTION}'] <= 0.05
# For every (gwas, dataset, cell type) combination: in how many methods it is significant.
significant_counts = df[is_significant].value_counts(['gwas', 'specificity_id', 'annotation'])

# only get enriched cell-types in all but one of the methods (>=2 of the three methods)
significant_counts = significant_counts[
    (significant_counts>=len(constants.METHODS)-1)
]

# bokeh display df
# The count column is named explicitly through reset_index(name=...). The former
# rename of column 0 only works while value_counts returns an unnamed Series;
# pandas >= 2.0 names the result 'count', which left the column un-renamed.
bokeh_df = significant_counts.sort_index(level=1).reset_index(name='significant_method_count')
table_counter = display_bokeh_df(bokeh_df, table_counter, f"Enriched cell types in the {name} phenotype. Only cell types significant in two or more methods are shown.")
create_download_link(bokeh_df, f"significant_celltype_counts_{name}.xlsx")

## 2.    Overlap of Enriched Cell-Types Between Phenotypes <a id='2'></a>

Let's compare which enriched cell types in the input phenotype are also enriched in other gwas (groups) using the upsetplot. 
To do this we need data of the other phenotypes as well. 
Instead of running the pipeline with 500+ different GWAS, we can just load a pre-computed dataframe.

In [None]:
# Load the pre-computed enrichment results of the reference phenotypes
# (stored under the 'df_all' key of the .h5 file).
df_all = pd.read_hdf(datah5, 'df_all')

# bokeh display df
# Preview at most 150 random rows as the full frame is too large to display.
# The size is capped at the number of available rows because DataFrame.sample
# raises a ValueError when asked for more rows than the frame holds.
bokeh_df = df_all.sample(min(150, len(df_all)))
table_counter = display_bokeh_df(bokeh_df, table_counter, f"Enrichment analysis on a large number of phenotypes. These include structural MRI measurements, white matter DTI tracts and rs-fMRI network phenotypes.")

This data contains 538 unique GWAS phenotypes.
We can group most of these GWASes in four main groups.
1.    Structural MRI Measurements
2.    White Matter DTI Tracts
3.    Resting State fMRI Network
4.    Brain related diseases


The `GWAS_GROUP_DICT` variable contains information about these groups:

In [None]:
# Show the phenotype groups that remain after restricting to the selected groups.
display(gwas_group_dict)
# GWAS group name : [(regex) keywords used to match to gwas]

This dictionary contains the GWAS group names as keys and keywords as values.
These keywords can be used to search for the GWAS in the dataframe (using regex).
For instance, `volume` is a keyword used to include the `volume: thalamus` phenotype in the `Structural MRI` group.

To analyze the phenotype of interest, let's add it to the dictionary as well.

In [None]:
# Register the phenotype of interest as its own group: keyed by its display name,
# with its GWAS identifier as the single matching keyword.
# NOTE(review): if a group with the same name is already present (the form offers
# 'Schizophrenia' both as a comparison group and as the default phenotype name),
# this assignment replaces its keywords - confirm that is intended.
gwas_group_dict[name] = [gwas_name]
display(gwas_group_dict)

Now we can visualize the overlap of cell types between phenotypes using the upsetplot.

In [None]:
filename = f'upsetplot_{name}.png'

# Pre-loaded reference results and the input phenotype's results in one frame.
all_phenotypes_df = pd.concat([df_all, df])
# A cell type counts as enriched when significant in all but one of the methods (>=2).
min_significant_methods = len(constants.METHODS) - 1

upsetplot.plot_upset(
    all_phenotypes_df,
    gwas_group_dict,
    sign_threshold=min_significant_methods,
    element_size=56,
    save=True,
    filename=filename,
)

# Caption the figure (make_caption returns the new figure counter) and link the saved image.
upset_caption = f"Upsetplot of overlapping cell types between {name} and the brain related phenotypes."
figure_counter = make_caption(figure_counter, upset_caption, 'Figure')
display_link(filename)

We can investigate the actual cell types which overlap in these phenotypes:

In [None]:
# Tabulate the overlapping cell types for the same input and the same
# significance threshold (all but one of the methods) as the upsetplot.
shared_celltypes_df = upsetplot.get_shared_celltypes(
    pd.concat([df_all, df]),
    gwas_group_dict,
    sign_threshold=len(constants.METHODS) - 1,
    save_to_excel=False,
)

And to see which cell types are overlapping with only the input phenotype:

In [None]:
# bokeh display df
bokeh_df = shared_celltypes_df[
    shared_celltypes_df['group'].str.contains(name)
].reset_index(drop=True)
table_counter = display_bokeh_df(bokeh_df, table_counter, f"Cell types overlapping with the three brain measurement groups and {name} phenotype.")
create_download_link(bokeh_df, f"shared_celltypes_{name}.xlsx") 

%%appyter markdown
{% if gsea.value == True %}
## 3.    Gene-Set Enrichment Analysis <a id='3'></a>


We can get additional information about the enriched cell types using GSEA.
GSEA uses a gene-list as input. 
Here, we select the top 1% of genes with the highest ES value calculated using CELLEX in enriched cell types.
This percentage can be edited in the constants.py file, but doing so is not recommended.
[A range of 15 to 500 genes is recommended by the original GSEA developers](https://www.gsea-msigdb.org/gsea/doc/GSEAUserGuideFrame.html). 
During testing, the top 1% always contained a number of genes within this range.

GSEA compares the gene-list to a gene-set. Gene-sets used in the analysis can be found and edited in the constants.py file.
The gene-set libraries are from [Enrichr](https://maayanlab.cloud/Enrichr/#stats).
The gene-sets used here are:

{% endif %}

In [None]:
%%appyter code_exec
{% if gsea.value == True %}
# List the selected gene-set libraries, one per line.
for gene_set in genesetlist:
    print(gene_set)
{% endif %}

In [None]:
%%appyter code_exec
{% if gsea.value == True %}
# Run the enrichment analysis with a group dictionary that holds only the
# phenotype of interest, using the selected gene-set libraries.
gsea_dict = gene_set_enrichment_analysis.gsea(
    df,
    gwas_group_dict={name:[gwas_name]},
    gene_set_list=genesetlist,
)
# since only one gwas is within the gwas group all cell types enriched have rank 1
# thus all enriched cell types will be analysed
{% endif %}

%%appyter markdown
{% if gsea.value == True %}

To get the gsea results for all cell types analysed download the .zip file below

{% endif %}

In [None]:
%%appyter code_exec
{% if gsea.value == True %}
# One workbook name per key of gsea_dict, with ', ' in the key replaced by '-'.
# NOTE(review): assumes the gsea step already wrote files under exactly these
# names - confirm against gene_set_enrichment_analysis.
inputfiles = [f"gsea_{celltype.replace(', ','-')}.xlsx" for celltype in gsea_dict.keys()]
# Bundle the workbooks into a single archive and link to it.
outfile = f'gsea_{name}.zip'
create_zip_file(outfile, inputfiles)
display_link(outfile)
{% endif %}

%%appyter markdown
{% if gsea.value == True %}

For phenotypes with large number of enriched cell types it can be quite daunting to go through all these gsea files.
To quickly see which terms occur the most often in all of the enriched cell types just use the `summarize_gsea` function. This does not take into account the values of the 'Combined Score' gsea calculates.

{% endif %}

In [None]:
%%appyter code_exec
{% if gsea.value == True %}
# Summarise the per-cell-type gsea results into one frame; save_to_excel=True
# together with the filename requests an Excel copy, which is linked below.
filename = f'gsea_summary_{name}.xlsx'
gsea_summary_df = gene_set_enrichment_analysis.summarize_gsea(
    gsea_dict,
    save_to_excel=True,
    filename=filename
)

# The caption is one f-string continued over several lines with backslashes,
# so the indentation of the continued lines is part of the caption text.
table_counter = display_bokeh_df(
    gsea_summary_df,
    table_counter,
    f"Summary of gsea analysis on enriched cell types for {name}.\
    It shows the terms in the gene-sets which are significant in the {name} enriched cell types.\
    The last columns displays the number of cell types which the term is significant in."
    )
display_link(filename)
{% endif %}

%%appyter markdown
{% if beta_corr.value == True %}

## 4.    Cell Type Correlation <a id='4'></a>

We can calculate the similarity between phenotypes using the enrichment values acquired from the pipeline. 
Here, we first concatenate our data to the pre-loaded data, which contains 500+ unique phenotypes.

These phenotypes were analysed with the same scRNA-seq datasets, which is important for reliable calculation of the cell type correlation.
{% endif %}

In [None]:
%%appyter code_exec
{% if beta_corr.value == True %}
# Stack the pre-loaded reference results and the input phenotype's results.
df_concat = pd.concat([df_all, df])
{% endif %}

%%appyter markdown
{% if beta_corr.value == True %}

Then we calculate the correlation between all of these phenotypes.

{% endif %}

In [None]:
%%appyter code_exec
{% if beta_corr.value == True %}

corr_df = calculate_beta_correlation.calculate_celltype_corr(df_concat)
table_counter = display_bokeh_df(corr_df.sample(150), table_counter,
                                 "Cell type correlation between all the phenotypes analysed.\
                                 Here only a sample of the full data is shown.\
                                 The full data can be downloaded below.")
create_download_link(corr_df, "corr_df.xlsx") 
{% endif %}

%%appyter markdown
{% if beta_corr.value == True %}

To only select for significant correlations with our phenotype of interest, we can do:

{% endif %}

In [None]:
%%appyter code_exec
{% if beta_corr.value == True %}

bokeh_df = corr_df[
    # get only correlations with phenotype
    ((corr_df['gwasx'].str.contains(gwas_name))|(corr_df['gwasy'].str.contains(gwas_name)))
    &
    # get only significant correlations
    (corr_df['pval'] <= (0.05/corr_df.shape[0])) # bonferroni correction
].pivot(index='method',columns=['gwasx','gwasy'])['corr'].T.reset_index() # where NaN implies not significant correlation
table_counter = display_bokeh_df(bokeh_df, table_counter,
                                 f"Significant cell type correlation between {name} and the other analysed phenotypes.\
                                 NaN values are non-significant correlations")
create_download_link(bokeh_df, f"corr_df_{name}.xlsx") 

{% endif %}

%%appyter markdown
{# Shown only when the circosplot is actually drawn: same condition as the plotting cell that follows. #}
{% if beta_corr.value == True and circos_plot.value == True %}

To visualize the correlation between the phenotypes and our phenotype of interest we can use the circosplot.
Again we use the `gwas_group_dict` as input. The other inputs are the correlation dataframe calculated previously and the name of the gwas of interest.


{% endif %}

In [None]:
%%appyter code_exec
{% if beta_corr.value == True and circos_plot.value == True %}

filename = f'circosplot_{name}.png'

# showing all correlations in the chord plot would get messy so we can limit it to
# for example 75 quantile correlations in both positive and negative direction
corrs = corr_df[(corr_df['pval'])<=(0.05/corr_df.shape[0])]['corr']
corr_limit = (corrs[corrs<0].quantile(.75),corrs[corrs>0].quantile(.75))
# plot
circosplot.plot(corr_df, gwas_group_dict, gwas_name, 
                corr_limit=corr_limit,
                save=True,
                filename=filename
               )
figure_counter =  make_caption(figure_counter,
                               f"Circosplot {name} and the brain related phenotypes.\
                               The circle itself represents the gwas groups. \
                               The red lines inside are the mean correlations between the gwases inside these groups.\
                               The brigher the red, the higher the correlation.\
                               The three outer barplot circles represent the correlation between {name} and the traits in the gwas groups per methods.\
                               From inside to outside the methods are H-MAGMA. LDSC, MAGMA.\
                               Red bars denote positive correlation and blue negative.\
                               The big bars spanning the three outer circles denote whether the correlation is significant or not in all methods.",
                               'Figure')
display_link(filename)
{% endif %}

%%appyter markdown
{% if esmu_corr.value == True %}

## 5.     ES Gene Correlation <a id='5'></a>


We calculated the correlations between phenotypes, but we can also do this between cell types.
This can be useful to double-check an enriched cell type and see what other cell types it is correlated with.
We give as input the datasets (specificity_id). 

{% endif %}

In [None]:
%%appyter code_exec
{% if esmu_corr.value == True %}

# Distinct specificity_id values present in the input results.
datasets = df['specificity_id'].unique() # get all unique datasets
print(datasets)

{% endif %}

%%appyter markdown
{% if esmu_corr.value == True %}

We can calculate the ES gene correlation using:

{% endif %}

In [None]:
%%appyter code_exec
{% if esmu_corr.value == True %}

es_corr_df = calculate_es_correlation.calculate_es_corr(datasets) # calculates correlation

table_counter = display_bokeh_df(es_corr_df.sample(150), table_counter,
                                 "ES gene correlation between all the cell types in the scRNA-seq datasets.\
                                 Here only a sample of the full data is shown.\
                                 The full data can be downloaded below.")
create_download_link(es_corr_df, "es_corr_df.xlsx") 

{% endif %}

%%appyter markdown
{% if esmu_corr.value == True %}

To only select for the correlations with our enriched cell types we can first select these:

{% endif %}

In [None]:
%%appyter code_exec
{% if esmu_corr.value == True %}

significant_celltypes = significant_counts.reset_index()[
    ['specificity_id','annotation']].drop_duplicates().values.tolist()
significant_celltypes = [f'{d}, {c}' for d,c in significant_celltypes]
display(significant_celltypes)

{% endif %}

%%appyter markdown
{% if esmu_corr.value == True %}

Then we only show these correlations:

{% endif %}

In [None]:
%%appyter code_exec
{% if esmu_corr.value == True %}

bokeh_df = es_corr_df[
    (
        # get only celltypes significant in phenotype
        (es_corr_df['celltypex'].isin(significant_celltypes))
        |
        (es_corr_df['celltypey'].isin(significant_celltypes))
    )
    &
    # get only significant correlations
    (es_corr_df['pval_bonferroni']<=0.05)
].sort_values('corr', ascending=False)

table_counter = display_bokeh_df(bokeh_df.sample(150), table_counter,
                                 f"ES gene correlation between all the cell types\
                                 in the scRNA-seq datasets and enriched celltypes in {name}.\
                                 Here only a sample of the full data is shown.\
                                 The full data can be downloaded below.")
create_download_link(bokeh_df, f"es_corr_df_{name}.xlsx") 

{% endif %}