# Examine extreme partials

In [None]:
import os, errno
import functools
import pandas as pd
import collections as cx
from pybiomart import Dataset
# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

## Functions

### Cached functions

In [None]:
@functools.lru_cache()
def get_database():
    """
    Fetch the human gene ID mapping table from Ensembl BioMart.

    Output: table with the columns ensembl_gene_id, external_gene_name
    and entrezgene_id, restricted to rows that have an Entrez gene ID.

    The result is memoized by lru_cache, so repeated calls return the
    same object; callers should treat it as read-only.
    """
    wanted = ["ensembl_gene_id", "external_gene_name", "entrezgene_id"]
    biomart = Dataset(name="hsapiens_gene_ensembl",
                      host="http://www.ensembl.org",
                      use_cache=True)
    mapping = biomart.query(attributes=wanted, use_attr_names=True)
    # Rows without an Entrez ID cannot be used for the GO lookup
    return mapping.dropna(subset=['entrezgene_id'])

### Simple functions

In [None]:
def mkdir_p(directory):
    """
    Make a directory (and any missing parents) if it does not already exist.

    Input: Directory name

    Calling this on an existing directory is a no-op. If the path exists
    but is not a directory, FileExistsError (an OSError) is raised instead
    of being silently ignored, so the problem surfaces here rather than
    when output files are written later.
    """
    os.makedirs(directory, exist_ok=True)
            
            
def extract_top_bottom(tissue, ml_df, percent=0.05):
    """
    Select the highest and lowest Partial_R2 rows for one tissue.

    Input:
      tissue:  value matched against the "Tissue" column
      ml_df:   table with "Tissue", "Geneid" and "Partial_R2" columns
      percent: fraction of the tissue's rows to keep at each end

    Output: (top, bottom) tables, both sorted by descending Partial_R2
    and carrying an extra "ensemblID" column (Geneid with everything
    from the first '.' removed). ml_df itself is not modified.
    """
    ranked = ml_df.loc[ml_df["Tissue"] == tissue].sort_values(
        "Partial_R2", ascending=False)
    ranked = ranked.assign(
        ensemblID=ranked.Geneid.str.replace("\\..*", "", regex=True))
    # Same number of rows is taken from each end of the ranking
    n_keep = round(len(ranked) * percent)
    return ranked.head(n_keep), ranked.tail(n_keep)


def extract_extremes(tissue, ml_df, val1, val2):
    """
    Select rows of one tissue by absolute Partial_R2 cut-offs.

    Input:
      tissue: value matched against the "Tissue" column
      ml_df:  table with "Tissue", "Geneid" and "Partial_R2" columns
      val1:   rows with Partial_R2 strictly above this form the first output
      val2:   rows with Partial_R2 strictly below this form the second output

    Output: (high, low) tables sorted by descending Partial_R2, each with
    an added "ensemblID" column (Geneid with everything from the first
    '.' removed).
    """
    tissue_rows = ml_df.loc[ml_df["Tissue"] == tissue]
    ranked = tissue_rows.sort_values("Partial_R2", ascending=False)
    ranked = ranked.assign(
        ensemblID=ranked.Geneid.str.replace("\\..*", "", regex=True))
    high = ranked.loc[ranked["Partial_R2"] > val1]
    low = ranked.loc[ranked["Partial_R2"] < val2]
    return high, low


def old_convert2entrez(tissue, ml_df, percent):
    """
    Percent-based variant: pick the top/bottom rows for a tissue with
    extract_top_bottom and attach the BioMart gene ID columns.

    Input: tissue name, metrics table, fraction of rows to keep per end
    Output: (top, bottom) tables joined to get_database() on
            ensemblID == ensembl_gene_id (pandas' default merge type,
            so rows without a match are dropped)
    """
    high, low = extract_top_bottom(tissue, ml_df, percent)
    gene_map = get_database()
    join_keys = {"left_on": "ensemblID", "right_on": "ensembl_gene_id"}
    annotated_high = high.merge(gene_map, **join_keys)
    annotated_low = low.merge(gene_map, **join_keys)
    return annotated_high, annotated_low


def convert2entrez(top, bottom):
    """
    Attach the BioMart gene ID columns to two gene tables.

    Input: two tables that each carry an "ensemblID" column
    Output: the two tables joined to get_database() on
            ensemblID == ensembl_gene_id (pandas' default merge type,
            so rows without a match are dropped), in the same order
    """
    gene_map = get_database()
    annotated = [
        frame.merge(gene_map, left_on="ensemblID", right_on="ensembl_gene_id")
        for frame in (top, bottom)
    ]
    return annotated[0], annotated[1]


@functools.lru_cache()
def obo_annotation(alpha=0.05):
    """
    Build the GO enrichment study object for human genes.

    Input: alpha, the significance cut-off handed to GOEnrichmentStudyNS
    Output: a GOEnrichmentStudyNS whose population is the entrezgene_id
            column of get_database(), using the 'fdr_bh' correction

    Loading the ontology and the gene2go associations is expensive and
    does not depend on the study genes, so the result is memoized per
    alpha (same pattern as get_database). As a consequence the
    per-namespace counts below are printed only the first time a given
    alpha is requested, and callers share one study object.
    """
    # database annotation
    fn_obo = download_go_basic_obo()
    # original author's note: the associations file must be gunzipped to work
    fn_gene2go = download_ncbi_associations()
    obodag = GODag(fn_obo)  # parse the ontology file fetched above
    anno_hs = Gene2GoReader(fn_gene2go, taxids=[9606])
    # get associations, keyed by namespace
    ns2assoc = anno_hs.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
    goeaobj = GOEnrichmentStudyNS(
        get_database()['entrezgene_id'], # population: human genes with entrez IDs
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = alpha, # default significance cut-off
        methods = ['fdr_bh'])
    return goeaobj


def run_goea(tissue, top, bottom, dname, label='', alpha=0.05):
    """
    Run GO enrichment on the top and bottom gene sets of one tissue and
    write the significant results to disk.

    Input:
      tissue: tissue name; lower-cased with spaces replaced by '_' for
              the output file names
      top:    table of high-end genes (needs an "ensemblID" column)
      bottom: table of low-end genes (needs an "ensemblID" column)
      dname:  existing directory that receives the output files
      label:  optional tag appended to the file names
      alpha:  FDR cut-off used both to build the study object and to
              filter results (default 0.05, the previous fixed value)

    Output: none. For each of "Top" and "Bottom" an .xlsx and a .txt file
    named <dname>/<tissue>_GO_analysis_<study>[_<label>] are written, and
    a summary of significant terms per namespace is printed.
    """
    df1, df2 = convert2entrez(top, bottom)
    t_name = tissue.lower().replace(" ", "_")
    suffix = '' if label == '' else "_%s" % label
    # The study object does not depend on the study genes, so build it
    # once instead of once per gene set.
    goeaobj = obo_annotation(alpha)
    for study, df in (("Top", df1), ("Bottom", df2)):
        print(study)
        # entrez ID -> gene symbol for the genes in this set
        geneids_study = dict(zip(df['entrezgene_id'], df['external_gene_name']))
        goea_results_all = goeaobj.run_study(geneids_study)
        goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < alpha]
        ctr = cx.Counter(r.NS for r in goea_results_sig)
        print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
            TOTAL=len(goea_results_sig),
            BP=ctr['BP'], MF=ctr['MF'], CC=ctr['CC']))
        out_base = "%s/%s_GO_analysis_%s%s" % (dname, t_name, study, suffix)
        goeaobj.wr_xlsx(out_base + ".xlsx", goea_results_sig)
        goeaobj.wr_txt(out_base + ".txt", goea_results_sig)

## Extract by tissue by Pst

### Elastic net estimated Pst

In [None]:
# Name of the output directory for the elastic net results; create it if missing
model = "enet"
mkdir_p(model)

In [None]:
# Load the tab-separated elastic net partial R2 metrics; later cells use its
# "Tissue", "Geneid" and "Partial_R2" columns
enet = pd.read_csv("../../partial_r2/enet_partial_r2_metrics.tsv", sep='\t')
print(enet.shape)
# Number of rows per tissue
enet.groupby("Tissue").size()

In [None]:
# Rows per tissue with Partial_R2 above 0.25 (the val1 cut-off used in the enrichment cell)
enet[(enet["Partial_R2"] > 0.25)].groupby("Tissue").size()

In [None]:
# Rows per tissue with Partial_R2 below 0.025 (the val2 cut-off used in the enrichment cell)
enet[(enet["Partial_R2"] < 0.025)].groupby("Tissue").size()

#### Enrichment and extraction

In [None]:
# Cut-offs for the elastic net extremes. `percent` is not used by the
# threshold-based extraction in this cell; it is kept as a notebook variable.
percent = 0.05; val1 = 0.25; val2 = 0.025
# Collect the per-tissue tables and concatenate once after the loop, rather
# than growing a DataFrame (seeded with an empty frame) on every iteration.
top_frames = []
bottom_frames = []
for tissue in ["Caudate", "DLPFC", "Dentate Gyrus", "Hippocampus"]:
    top, bottom = extract_extremes(tissue, enet, val1, val2)
    run_goea(tissue, top, bottom, model)
    top_frames.append(top)
    bottom_frames.append(bottom)
top_df = pd.concat(top_frames, axis=0)
bottom_df = pd.concat(bottom_frames, axis=0)

#### Save extremes

In [None]:
# Tag each set, stack high and low rows into one table and write it as a
# tab-separated file (without the index) inside the model directory
top_df["Variation_Explained"] = "High"
bottom_df["Variation_Explained"] = "Low"
dt = pd.concat([top_df, bottom_df], axis=0)
dt.to_csv("%s/extremes_partial_r2_enet.tsv" % model, sep='\t', index=False)
dt.head()

### Random forest estimated Pst

In [None]:
# Name of the output directory for the random forest results; create it if missing
model = "rf"
mkdir_p(model)

In [None]:
# Load the tab-separated random forest partial R2 metrics; later cells use its
# "Tissue", "Geneid" and "Partial_R2" columns
rf = pd.read_csv("../../partial_r2/rf_partial_r2_metrics.tsv", sep='\t')
print(rf.shape)
# Number of rows per tissue
rf.groupby("Tissue").size()

In [None]:
# Rows per tissue with Partial_R2 above 0.25 (the val1 cut-off used in the enrichment cell)
rf[(rf["Partial_R2"] > 0.25)].groupby("Tissue").size()

In [None]:
# Rows per tissue with Partial_R2 below 0.01 (the val2 cut-off used in the enrichment cell)
rf[(rf["Partial_R2"] < 0.01)].groupby("Tissue").size()

#### Enrichment and extraction

In [None]:
# Cut-offs for the random forest extremes. `percent` is not used by the
# threshold-based extraction in this cell; it is kept as a notebook variable.
percent = 0.05; val1 = 0.25; val2 = 0.01
# Collect the per-tissue tables and concatenate once after the loop, rather
# than growing a DataFrame (seeded with an empty frame) on every iteration.
top_frames = []
bottom_frames = []
for tissue in ["Caudate", "DLPFC", "Dentate Gyrus", "Hippocampus"]:
    top, bottom = extract_extremes(tissue, rf, val1, val2)
    run_goea(tissue, top, bottom, model)
    top_frames.append(top)
    bottom_frames.append(bottom)
top_df = pd.concat(top_frames, axis=0)
bottom_df = pd.concat(bottom_frames, axis=0)

#### Save extremes

In [None]:
# Tag each set, stack high and low rows into one table and write it as a
# tab-separated file (without the index) inside the model directory
top_df["Variation_Explained"] = "High"
bottom_df["Variation_Explained"] = "Low"
dt = pd.concat([top_df, bottom_df], axis=0)
dt.to_csv("%s/extremes_partial_r2_rf.tsv" % model, sep='\t', index=False)
dt.head()

## Enrichment of top and bottom 100

In [None]:
# Elastic net: for each tissue, rank rows by Partial_R2 and run GO enrichment
# on the 100 highest and 100 lowest; output files carry the 'n100' label.
# NOTE(review): head(100) and tail(100) overlap if a tissue has fewer than
# 200 rows -- confirm this cannot happen with the input tables.
model = "enet"
for tissue in ["Caudate", "DLPFC", "Dentate Gyrus", "Hippocampus"]:
    df = enet[(enet["Tissue"] == tissue)].sort_values("Partial_R2", ascending=False)
    # Drop everything from the first '.' in Geneid to get the ID used for the BioMart join
    df["ensemblID"] = df.Geneid.str.replace("\\..*", "", regex=True)
    top = df.head(100)
    bottom = df.tail(100)
    run_goea(tissue, top, bottom, model, 'n100')

### Random forest

In [None]:
# Random forest: for each tissue, rank rows by Partial_R2 and run GO enrichment
# on the 100 highest and 100 lowest; output files carry the 'n100' label.
# NOTE(review): head(100) and tail(100) overlap if a tissue has fewer than
# 200 rows -- confirm this cannot happen with the input tables.
model = "rf"
for tissue in ["Caudate", "DLPFC", "Dentate Gyrus", "Hippocampus"]:
    df = rf[(rf["Tissue"] == tissue)].sort_values("Partial_R2", ascending=False)
    # Drop everything from the first '.' in Geneid to get the ID used for the BioMart join
    df["ensemblID"] = df.Geneid.str.replace("\\..*", "", regex=True)
    top = df.head(100)
    bottom = df.tail(100)
    run_goea(tissue, top, bottom, model, 'n100')