# Summary of prediction analysis for DE genes

In [None]:
import os, errno
import pandas as pd
import seaborn as sns
from venn import venn
import matplotlib.pyplot as plt

## Functions

In [None]:
def mkdir_p(directory):
    """
    Make a directory (and any missing parents) if it does not already exist.

    Input: Directory name

    Calling this on an existing directory is a no-op. If the path exists
    but is NOT a directory (e.g. a regular file with the same name), the
    ``FileExistsError`` (a subclass of ``OSError``) is propagated instead
    of being silently ignored, so the problem surfaces here rather than
    later when something tries to write into the "directory".
    """
    # exist_ok=True only suppresses the error when the target is a directory.
    os.makedirs(directory, exist_ok=True)

## Summary of features

In [None]:
# DEG annotation table (tab-separated); its first column is used as the index.
degs = pd.read_csv(
    "../../../../_m/degs_annotation.txt",
    sep="\t",
    index_col=0,
)
# Differential-splicing summary (tab-separated), one "Tissue" label per row.
dtu = pd.read_csv(
    "../../../../../differential_analysis/tissue_comparison/ds_summary/_m/diffSplicing_ancestry_FDR05_4regions.tsv",
    sep="\t",
)
# Row count per tissue in the DTU table.
dtu.groupby(by="Tissue").size()

In [None]:
# Per tissue: how many gene names appear in both the DEG and the DTU tables.
for tissue in ["Caudate", "Dentate Gyrus", "DLPFC", "Hippocampus"]:
    deg_genes = set(degs.loc[degs["Tissue"] == tissue, "gene_name"])
    dtu_genes = set(dtu.loc[dtu["Tissue"] == tissue, "gene"])
    overlap = len(deg_genes.intersection(dtu_genes))
    print("There are {} overlapping DTU in DEGs for {}!".format(overlap, tissue))

## Load and prep summary files

### Load files

In [None]:
# Read the random-forest and elastic-net summary tables (tab-separated),
# in that order.
rf0, enet0 = (
    pd.read_csv(summary_file, sep="\t")
    for summary_file in (
        "../../rf/summary_10Folds_allTissues.tsv",
        "../../enet/summary_10Folds_allTissues.tsv",
    )
)

### Group, select, and clean summary results

In [None]:
def _median_fold_metrics(folds, model_name):
    """
    Collapse a per-fold summary into one row per (tissue, feature).

    Input: `folds` - summary table with "tissue", "feature", "n_features"
           and "test_score_r2" columns; `model_name` - label written to
           the "Model" column.
    Output: DataFrame holding the median "n_features" and "test_score_r2"
            of each (tissue, feature) group, with "_" in "feature"
            replaced by "." and a constant "Model" column.
    """
    # Pick the two metrics BEFORE aggregating: only they are kept anyway,
    # and any non-numeric column elsewhere in the table can then no longer
    # make `median()` raise (pandas >= 2.0 does not drop such columns).
    metrics = folds.groupby(["tissue", "feature"])[["n_features", "test_score_r2"]]\
                   .median().reset_index()
    # Plain character substitution; no regular expression is needed.
    metrics["feature"] = metrics["feature"].str.replace("_", ".", regex=False)
    metrics["Model"] = model_name
    return metrics

## Extract median of model metrics over 10 folds
rf = _median_fold_metrics(rf0, "Random Forest")
enet = _median_fold_metrics(enet0, "Elastic Net")

# Stack both models; the "Model" column tells the rows apart.
df = pd.concat([rf, enet], axis=0)
df.head(2)

### Overlap with DTU

In [None]:
# Attach "gene_name" to the model summary by matching "feature" against the
# index of `degs`, then remove exact duplicate rows created by the join.
gene_symbols = degs[["gene_name"]]
dx = df.merge(gene_symbols, left_on="feature", right_index=True).drop_duplicates()
# Per tissue: gene names present in both the annotated summary and the DTU table.
for tissue in ["Caudate", "Dentate Gyrus", "DLPFC", "Hippocampus"]:
    modeled_genes = set(dx.loc[dx["tissue"] == tissue, "gene_name"])
    dtu_genes = set(dtu.loc[dtu["Tissue"] == tissue, "gene"])
    overlap = len(modeled_genes.intersection(dtu_genes))
    print("There are {} overlapping DTU in DEGs for {}!".format(overlap, tissue))

### Add partial r2 results

In [None]:
# Load the partial R2 table and reshape it to the same columns as `df`:
# "Geneid" becomes "feature", all headers are lower-cased, and the partial
# R2 value is exposed under "test_score_r2" with its own "Model" label.
partial = (
    pd.read_csv("../../partial_r2/rf_partial_r2_metrics.tsv", sep="\t")
    .rename(columns={"Geneid": "Feature"})
    .rename(columns=str.lower)
    .assign(test_score_r2=lambda tbl: tbl["partial_r2"], Model="Partial R2")
    .loc[:, ["tissue", "feature", "n_features", "test_score_r2", "Model"]]
)
partial.head(2)

In [None]:
# Append the partial R2 rows below the two model summaries (row-wise concat).
df2 = pd.concat((df, partial))
# Row count for every tissue / model combination.
df2.groupby(by=["tissue", "Model"]).size()

## Summary of results

### Histogram of R2 (median test R2 score)

In [None]:
# One histogram panel per tissue (two panels per row), coloured by model.
grid = (
    sns.FacetGrid(df, col="tissue", col_wrap=2, hue="Model")
    .map(sns.histplot, "test_score_r2")
    .set_axis_labels("R2 (Test Score)", "Count (Genes)")
    .set_titles(col_template="{col_name}")
)
grid.add_legend()
grid.tight_layout()
# Write the same figure in all three formats.
for out_file in ("histogram_test_r2.pdf",
                 "histogram_test_r2.png",
                 "histogram_test_r2.svg"):
    grid.savefig(out_file)

In [None]:
# Same layout as the test-R2 histogram, drawn from `df2` so that the
# "Partial R2" rows are included as an additional hue level.
grid = (
    sns.FacetGrid(df2, col="tissue", col_wrap=2, hue="Model")
    .map(sns.histplot, "test_score_r2")
    .set_axis_labels("R2 {Test and Partial}", "Count (Genes)")
    .set_titles(col_template="{col_name}")
)
grid.add_legend()
grid.tight_layout()
# Write the same figure in all three formats.
for out_file in ("histogram_test_N_partial_r2.pdf",
                 "histogram_test_N_partial_r2.png",
                 "histogram_test_N_partial_r2.svg"):
    grid.savefig(out_file)

### How many DEGs do not have any SNPs within 20 Kbp of the gene body?

In [None]:
# For each tissue, report the DE genes that have no entry in the model
# summary `df` (the message below describes these as lacking nearby SNPs).
for tissue in ["Caudate", "DLPFC", "Hippocampus", "Dentate Gyrus"]:
    modeled = set(df[(df["tissue"] == tissue)].feature)
    de_genes = set(degs[(degs["Tissue"] == tissue)].index)
    # Use a real set difference: subtracting the two sizes is only correct
    # when every modeled feature is also a DE gene for this tissue.
    missing = de_genes - modeled
    # A tissue without DE genes would otherwise raise ZeroDivisionError.
    fraction = len(missing) / len(de_genes) if de_genes else 0.0
    # Prefix the tissue so the four output lines can be told apart.
    txt = "{}: {} of {} ({:.1%}) of DE genes do not have SNPs within 20Kbp."
    print(txt.format(tissue, len(missing), len(de_genes), fraction))

### Number of ancestry DE genes whose expression can be predicted from SNPs

In [None]:
# Rows per tissue / model whose median test R2 is at least 0.5.
df.loc[df["test_score_r2"].ge(0.5)].groupby(by=["tissue", "Model"]).size()

In [None]:
# Rows per tissue / model whose median test R2 is at least 0.75.
df.loc[df["test_score_r2"].ge(0.75)].groupby(by=["tissue", "Model"]).size()

In [None]:
# Rows with median test R2 of at least 0.85: print the per tissue / model
# counts first, then display the rows themselves.
r2_ge_085 = df.loc[df["test_score_r2"].ge(0.85)]
print(r2_ge_085.groupby(by=["tissue", "Model"]).size().reset_index())
r2_ge_085

In [None]:
# Unique feature IDs reaching a median test R2 of at least 0.85.
set(df.loc[df["test_score_r2"].ge(0.85), "feature"])

- **ENSG00000166435.15** is *XRRA1*, one of the most significant eQTLs in the brain
- **ENSG00000013573.16** is *DDX11*
- **ENSG00000226278.1** is *PSPHP1*, a pseudogene
- **ENSG00000256274.1** is *TAS2R64P*, another pseudogene

In [None]:
# Rows with median test R2 of at least 0.9: print the per tissue / model
# counts first, then display the rows themselves.
r2_ge_090 = df.loc[df["test_score_r2"].ge(0.9)]
print(r2_ge_090.groupby(by=["tissue", "Model"]).size().reset_index())
r2_ge_090

### Overlapping with DTU

In [None]:
# Join the annotated model summary with the DTU table on gene name and
# tissue (the two tables spell these columns differently).
df3 = pd.merge(
    dx,
    dtu,
    left_on=["gene_name", "tissue"],
    right_on=["gene", "Tissue"],
)
# Rows per tissue / model with test R2 of at least 0.5 after the join.
df3.loc[df3["test_score_r2"].ge(0.5)].groupby(by=["Tissue", "Model"]).size()

In [None]:
# Rows per tissue / model in the DTU-joined table with test R2 of at least 0.75.
df3.loc[df3["test_score_r2"].ge(0.75)].groupby(by=["Tissue", "Model"]).size()

In [None]:
# Display the DTU-joined rows with test R2 of at least 0.75.
df3.loc[df3["test_score_r2"].ge(0.75)]

### What is the overlap between models?

In [None]:
# For every tissue and R2 cut-off, compare the features that pass the
# cut-off (inclusive) under elastic net with those passing under random forest.
for tissue in ["Caudate", "DLPFC", "Hippocampus", "Dentate Gyrus"]:
    print(tissue)
    for r2 in [0, 0.2, 0.5, 0.6, 0.7, 0.75, 0.8, 0.825]:
        enet_pass = (enet["tissue"] == tissue) & (enet["test_score_r2"] >= r2)
        rf_pass = (rf["tissue"] == tissue) & (rf["test_score_r2"] >= r2)
        enet_features = set(enet.loc[enet_pass, "feature"])
        rf_features = set(rf.loc[rf_pass, "feature"])
        shared = enet_features.intersection(rf_features)
        txt = "There is {} out of {} and {} genes overlapping between enet and rf - at R2 > {}"
        print(txt.format(len(shared), len(enet_features), len(rf_features), r2))
    print("")

In [None]:
# Draw one Venn diagram per tissue and R2 cut-off comparing the features
# that pass the cut-off (inclusive) under each model, and save each diagram
# as PNG, PDF and SVG inside `dirname`.
dirname = "model_venn_diagrams"
mkdir_p(dirname)
for tissue in ["Caudate", "DLPFC", "Hippocampus", "Dentate Gyrus"]:
    # File-name friendly tissue label, e.g. "Dentate Gyrus" -> "dentate_gyrus".
    tt = tissue.lower().replace(" ", "_")
    for r2 in [0, 0.2, 0.5, 0.6, 0.7, 0.75, 0.8]:
        ee = enet[(enet["tissue"] == tissue) & (enet["test_score_r2"] >= r2)].copy()
        rr = rf[(rf["tissue"] == tissue) & (rf["test_score_r2"] >= r2)].copy()
        model_set = {"Elastic Net": set(ee.feature), "Random Forest": set(rr.feature),}
        venn(model_set, fmt="{percentage:.1f}%", fontsize=12)
        for ext in ("png", "pdf", "svg"):
            plt.savefig("{}/venn_diagram_modelOverlap_{}_r2_{}.{}".format(dirname, tt, r2, ext))
        # 4 tissues x 7 cut-offs = 28 figures; without closing them they all
        # stay open until the cell ends (matplotlib warns past 20 open
        # figures). NOTE: with the inline backend a closed figure is
        # typically not rendered below the cell - the saved files are the
        # output of this cell.
        plt.close()

### What is the overlap between brain regions?

In [None]:
# Draw one four-way Venn diagram per model and R2 cut-off comparing, across
# tissues, the features that pass the cut-off (inclusive); each diagram is
# saved as PNG, PDF and SVG inside `dirname`.
dirname = "tissue_venn_diagrams"
mkdir_p(dirname)
for modeln in ["Elastic Net", "Random Forest"]:
    dft = df[(df['Model'] == modeln)].copy()
    # File-name friendly model label, e.g. "Elastic Net" -> "elastic_net".
    mm = modeln.lower().replace(" ", "_")
    for r2 in [0, 0.2, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8]:
        passing = dft[dft["test_score_r2"] >= r2]
        # Insertion order is kept deliberately: it fixes the order in which
        # the sets are handed to `venn`.
        tissues = {
            name: set(passing.loc[passing["tissue"] == name, "feature"])
            for name in ["Caudate", "DLPFC", "Hippocampus", "Dentate Gyrus"]
        }
        venn(tissues, fmt="{percentage:.1f}%", fontsize=12)
        for ext in ("png", "pdf", "svg"):
            plt.savefig("{}/venn_diagram_tissueOverlap_{}_r2_{}.{}".format(dirname, mm, r2, ext))
        # Release each figure once it is on disk so that the 16 figures do
        # not accumulate in memory. NOTE: with the inline backend a closed
        # figure is typically not rendered below the cell - the saved files
        # are the output of this cell.
        plt.close()

## Examining partial R2 results using most predictive SNPs

In [None]:
# Per-tissue descriptive statistics of the partial R2 table, transposed so
# that tissues become the columns.
partial.groupby(by="tissue").describe().transpose()

In [None]:
# Rows of the partial R2 table whose value is strictly above 0.88.
partial.loc[partial["test_score_r2"].gt(0.88)]

- *GLP2R* (ENSG00000065325) Glucagon Like Peptide 2 Receptor

In [None]:
# Individual partial R2 metrics (tab-separated); preview the first two rows.
idv_partial = pd.read_csv(
    "../../partial_r2/individual_partial_r2_metrics.tsv",
    sep="\t",
)
idv_partial.head(n=2)

In [None]:
# Per-tissue descriptive statistics of the individual partial R2 values,
# transposed so that tissues become the columns.
idv_partial.loc[:, ["Partial_R2", "Tissue", "Geneid"]].groupby(by="Tissue").describe().transpose()

#### The vast majority of SNPs do not hold a lot of information (partial r2 < 0.01), with 25% close to 0.

In [None]:
# Rows with an individual partial R2 of at least 0.8, counted per tissue.
high_partial = idv_partial.loc[idv_partial["Partial_R2"].ge(0.8),
                               ["Tissue", "Partial_R2", "Geneid"]]
high_partial.groupby(by="Tissue").size()

In [None]:
# Rows with an individual partial R2 of at least 0.8, counted per gene ID.
high_partial = idv_partial.loc[idv_partial["Partial_R2"].ge(0.8),
                               ["Tissue", "Partial_R2", "Geneid"]]
high_partial.groupby(by="Geneid").size()

In [None]:
# Rows with an individual partial R2 of at least 0.8, counted per
# gene ID / tissue pair.
high_partial = idv_partial.loc[idv_partial["Partial_R2"].ge(0.8),
                               ["Tissue", "Partial_R2", "Geneid"]]
high_partial.groupby(by=["Geneid", "Tissue"]).size()