# Summary of prediction analysis for random genes

In [1]:
import os, errno
import pandas as pd
import seaborn as sns
from venn import venn
import matplotlib.pyplot as plt

## Functions

In [2]:
def mkdir_p(directory):
    """
    Make a directory (and any missing parents) if it does not already exist.

    Input: Directory name

    An already-existing directory is left untouched. If the path exists but
    is not a directory, ``os.makedirs`` raises ``FileExistsError`` (a subclass
    of ``OSError``) rather than the collision being silently ignored, so a
    later attempt to write into the "directory" cannot fail mysteriously.
    """
    os.makedirs(directory, exist_ok=True)

## Summary of features

In [3]:
# Random-gene annotation table; its first column is used as the row index.
degs = pd.read_table(
    "../../../../_m/randomGenes_annotation.txt",
    index_col=0,
)
# Differential splicing results, one row per cluster with a "Tissue" column.
dtu = pd.read_table(
    "../../../../../differential_analysis/tissue_comparison/ds_summary/_m/diffSplicing_ancestry_FDR05_4regions.tsv",
)
# Number of DTU rows per tissue.
dtu.groupby("Tissue").size()

Tissue
Caudate          1901
DLPFC            1345
Dentate Gyrus     655
Hippocampus      1332
dtype: int64

In [4]:
# For each tissue, count gene names found in both the annotation and the DTU table.
for tissue in ["Caudate", "Dentate Gyrus", "DLPFC", "Hippocampus"]:
    annotated_names = set(degs.loc[degs["Tissue"] == tissue, "gene_name"])
    dtu_names = set(dtu.loc[dtu["Tissue"] == tissue, "gene"])
    overlap = len(annotated_names.intersection(dtu_names))
    print("There are {} overlapping DTU in DEGs for {}!".format(overlap, tissue))

There are 167 overlapping DTU in DEGs for Caudate!
There are 16 overlapping DTU in DEGs for Dentate Gyrus!
There are 105 overlapping DTU in DEGs for DLPFC!
There are 109 overlapping DTU in DEGs for Hippocampus!


## Load and prep summary files

### Load files

In [5]:
# Per-fold summary tables for the random forest and elastic net runs.
rf0, enet0 = [
    pd.read_table(summary_path)
    for summary_path in ("../../rf/summary_10Folds_allTissues.tsv",
                         "../../enet/summary_10Folds_allTissues.tsv")
]

### Group, select, and clean summary results

In [6]:
## Extract median of model metrics over 10 folds
def _median_over_folds(summary, model_name):
    """
    Collapse a per-fold summary table to one row per (tissue, feature).

    Input:
      summary    - DataFrame with at least the columns "tissue", "feature",
                   "n_features" and "test_score_r2"
      model_name - label written to the "Model" column
    Output: DataFrame with columns tissue, feature, n_features,
            test_score_r2, Model
    """
    metrics = ["n_features", "test_score_r2"]
    # Select the metric columns *before* aggregating. Taking the median of the
    # whole frame and slicing afterwards raises TypeError on pandas >= 2.0 as
    # soon as the summary holds any non-numeric column, and wastes work on
    # columns that are discarded anyway.
    collapsed = (summary.groupby(["tissue", "feature"])[metrics]
                        .median()
                        .reset_index())
    # Literal replacement: "_" has no special meaning as a regex either, so
    # regex=False yields the same identifiers without regex machinery.
    collapsed["feature"] = collapsed["feature"].str.replace("_", ".", regex=False)
    collapsed["Model"] = model_name
    return collapsed


rf = _median_over_folds(rf0, "Random Forest")
enet = _median_over_folds(enet0, "Elastic Net")

# Row-wise stack; each model keeps its own 0..n-1 index, as before.
df = pd.concat([rf, enet], axis=0)
df.head(2)

Unnamed: 0,tissue,feature,n_features,test_score_r2,Model
0,Caudate,ENSG00000001084.10,19.5,-0.097849,Random Forest
1,Caudate,ENSG00000001630.15,3.0,-0.110874,Random Forest


### Overlap with DTU

In [7]:
# Attach gene names to the model results by matching "feature" against the
# annotation index, then drop the repeated rows the join produces.
gene_names = degs[["gene_name"]]
dx = pd.merge(df, gene_names, left_on="feature", right_index=True).drop_duplicates()
for tissue in ["Caudate", "Dentate Gyrus", "DLPFC", "Hippocampus"]:
    modeled_names = set(dx.loc[dx["tissue"] == tissue, "gene_name"])
    dtu_names = set(dtu.loc[dtu["Tissue"] == tissue, "gene"])
    overlap = len(modeled_names.intersection(dtu_names))
    print("There are {} overlapping DTU in DEGs for {}!".format(overlap, tissue))

There are 166 overlapping DTU in DEGs for Caudate!
There are 16 overlapping DTU in DEGs for Dentate Gyrus!
There are 103 overlapping DTU in DEGs for DLPFC!
There are 108 overlapping DTU in DEGs for Hippocampus!


### Add partial r2 results

In [8]:
# Load the partial R2 metrics and reshape them to the same column layout as
# the per-model summary so the tables can be stacked.
partial = (
    pd.read_csv("../../partial_r2/rf_partial_r2_metrics.tsv", sep='\t')
    .rename(columns={"Geneid": "Feature"})
    .rename(columns=str.lower)
)
# Expose the partial R2 under the shared metric column name.
partial["test_score_r2"] = partial["partial_r2"]
partial["Model"] = "Partial R2"
partial = partial[['tissue', 'feature', 'n_features', 'test_score_r2', 'Model']]
partial.head(2)

Unnamed: 0,tissue,feature,n_features,test_score_r2,Model
0,Caudate,ENSG00000001084.10,18,0.173658,Partial R2
1,Caudate,ENSG00000001630.15,2,0.006768,Partial R2


In [9]:
# Stack the model medians and the partial R2 rows, then count rows per tissue and model.
df2 = pd.concat((df, partial))
df2.groupby(by=["tissue", "Model"]).size()

tissue         Model        
Caudate        Elastic Net      2947
               Partial R2       2789
               Random Forest    2947
DLPFC          Elastic Net      2745
               Partial R2       2624
               Random Forest    2745
Dentate Gyrus  Elastic Net       780
               Partial R2        772
               Random Forest     780
Hippocampus    Elastic Net      2938
               Partial R2       2745
               Random Forest    2938
dtype: int64

## Summary of results

### Histogram of R2 (median test R2 score)

In [None]:
# One histogram panel per tissue (two per row), coloured by model.
grid = sns.FacetGrid(data=df, col="tissue", hue="Model", col_wrap=2)
grid.map(sns.histplot, "test_score_r2")
grid.set_axis_labels("R2 (Test Score)", "Count (Genes)")
grid.set_titles(col_template="{col_name}")
grid.add_legend()
grid.tight_layout()
# Save the same figure in each of the three formats.
for outfile in ("histogram_test_r2.pdf",
                "histogram_test_r2.png",
                "histogram_test_r2.svg"):
    grid.savefig(outfile)

In [None]:
# Same layout as the test-R2 histogram, drawn from the table that also
# contains the partial R2 rows.
grid = sns.FacetGrid(data=df2, col="tissue", hue="Model", col_wrap=2)
grid.map(sns.histplot, "test_score_r2")
grid.set_axis_labels("R2 {Test and Partial}", "Count (Genes)")
grid.set_titles(col_template="{col_name}")
grid.add_legend()
grid.tight_layout()
# Save the same figure in each of the three formats.
for outfile in ("histogram_test_N_partial_r2.pdf",
                "histogram_test_N_partial_r2.png",
                "histogram_test_N_partial_r2.svg"):
    grid.savefig(outfile)

### How many random genes have no SNPs within 20 Kbp of the gene body?

In [10]:
# For each tissue, report how many annotated random genes are absent from the
# model results.
for tissue in ["Caudate", "DLPFC", "Hippocampus", "Dentate Gyrus"]:
    modeled = set(df[(df["tissue"] == tissue)].feature)
    annotated = set(degs[(degs["Tissue"] == tissue)].index)
    # Use a real set difference: subtracting the two sizes is only correct if
    # every modelled feature also appears in the annotation, and would
    # under-count silently otherwise.
    missing = len(annotated - modeled)
    total = len(annotated)
    # Avoid ZeroDivisionError for a tissue with no annotated genes.
    fraction = missing / total if total else 0.0
    txt = "{} of {} ({:.1%}) of random genes do not have SNPs within 20Kbp."
    print(txt.format(missing, total, fraction))

23 of 2970 (0.8%) of random genes do not have SNPs within 20Kbp.
15 of 2760 (0.5%) of random genes do not have SNPs within 20Kbp.
18 of 2956 (0.6%) of random genes do not have SNPs within 20Kbp.
6 of 786 (0.8%) of random genes do not have SNPs within 20Kbp.


### Number of ancestry random genes whose expression can be predicted from SNPs

In [11]:
# Rows with a median test R2 of at least 0.5, counted per tissue and model.
df.query("test_score_r2 >= 0.5").groupby(["tissue", "Model"]).size()

tissue         Model        
Caudate        Elastic Net      17
               Random Forest    15
DLPFC          Elastic Net      15
               Random Forest    10
Dentate Gyrus  Elastic Net       2
               Random Forest     2
Hippocampus    Elastic Net      11
               Random Forest     5
dtype: int64

In [12]:
# Rows with a median test R2 of at least 0.75, counted per tissue and model.
df.query("test_score_r2 >= 0.75").groupby(["tissue", "Model"]).size()

tissue   Model        
Caudate  Elastic Net      1
         Random Forest    2
dtype: int64

In [13]:
# Show the rows with a median test R2 of at least 0.75.
df.query("test_score_r2 >= 0.75")

Unnamed: 0,tissue,feature,n_features,test_score_r2,Model
947,Caudate,ENSG00000144115.16,41.5,0.819213,Random Forest
2134,Caudate,ENSG00000230795.3,30.5,0.762742,Random Forest
947,Caudate,ENSG00000144115.16,40.0,0.813873,Elastic Net


### Overlapping with DTU

In [23]:
# Join model results to DTU rows on gene name and tissue together.
df3 = pd.merge(
    dx, dtu,
    left_on=["gene_name", "tissue"],
    right_on=["gene", "Tissue"],
)
df3.loc[df3["test_score_r2"] >= 0.5].groupby(["Tissue", "Model"]).size()

Tissue       Model        
Caudate      Elastic Net      1
DLPFC        Elastic Net      4
             Random Forest    1
Hippocampus  Elastic Net      1
             Random Forest    1
dtype: int64

In [24]:
# Show the DTU-matched rows with a median test R2 of at least 0.5.
df3.loc[df3["test_score_r2"] >= 0.5]

Unnamed: 0,tissue,feature,n_features,test_score_r2,Model,gene_name,clusterID,N,coord,gene,annotation,FDR,chr,Type,Tissue
325,DLPFC,ENSG00000170074.19,17.5,0.504119,Elastic Net,FAM153A,clu_26850_-,8,chr5:177713916-177717163,FAM153A,cryptic,8.93e-11,chr5,DTU,DLPFC
409,Caudate,ENSG00000234585.6,44.0,0.540765,Elastic Net,CCT6P3,clu_185708_+,2,chr7:65073514-65078624,CCT6P3,cryptic,2.46e-07,chr7,DTU,Caudate
410,DLPFC,ENSG00000234585.6,100.5,0.689523,Random Forest,CCT6P3,clu_176176_+,2,chr7:65073514-65078624,CCT6P3,cryptic,4.53e-08,chr7,DTU,DLPFC
411,DLPFC,ENSG00000234585.6,63.5,0.588457,Elastic Net,CCT6P3,clu_176176_+,2,chr7:65073514-65078624,CCT6P3,cryptic,4.53e-08,chr7,DTU,DLPFC
412,Hippocampus,ENSG00000234585.6,44.0,0.624863,Random Forest,CCT6P3,clu_164468_+,2,chr7:65073514-65078624,CCT6P3,cryptic,2.56e-07,chr7,DTU,Hippocampus
413,Hippocampus,ENSG00000234585.6,71.0,0.5871,Elastic Net,CCT6P3,clu_164468_+,2,chr7:65073514-65078624,CCT6P3,cryptic,2.56e-07,chr7,DTU,Hippocampus
553,DLPFC,ENSG00000148290.9,36.0,0.537059,Elastic Net,SURF1,clu_43950_-,2,chr9:133352142-133352446,SURF1,cryptic,0.00293,chr9,DTU,DLPFC
601,DLPFC,ENSG00000177984.6,30.0,0.501914,Elastic Net,LCN15,clu_44147_-,19,chr9:136763965-136770598,LCN15,cryptic,0.0115,chr9,DTU,DLPFC


### What is the overlap between models?

In [None]:
# For each tissue and R2 threshold, report how many features pass the
# threshold in both models, alongside the per-model counts.
for tissue in ["Caudate", "DLPFC", "Hippocampus", "Dentate Gyrus"]:
    print(tissue)
    for r2 in [0, 0.2, 0.5, 0.6, 0.7, 0.75]:
        enet_hits = set(enet.loc[(enet["tissue"] == tissue) &
                                 (enet["test_score_r2"] >= r2), "feature"])
        rf_hits = set(rf.loc[(rf["tissue"] == tissue) &
                             (rf["test_score_r2"] >= r2), "feature"])
        shared = len(enet_hits & rf_hits)
        # The filter above is inclusive, so the message reports ">=" and not ">".
        txt = "There is {} out of {} and {} genes overlapping between enet and rf - at R2 >= {}"
        print(txt.format(shared, len(enet_hits), len(rf_hits), r2))
    print("")

In [None]:
# Draw one Elastic Net vs Random Forest Venn diagram per tissue and R2
# threshold, and save each as PNG, PDF and SVG under `dirname`.
dirname = "model_venn_diagrams"
mkdir_p(dirname)
for tissue in ["Caudate", "DLPFC", "Hippocampus", "Dentate Gyrus"]:
    for r2 in [0, 0.2, 0.5, 0.6, 0.7, 0.75]:
        ee = enet[(enet["tissue"] == tissue) & (enet["test_score_r2"] >= r2)]
        rr = rf[(rf["tissue"] == tissue) & (rf["test_score_r2"] >= r2)]
        model_set = {"Elastic Net": set(ee.feature), "Random Forest": set(rr.feature),}
        try:
            venn(model_set, fmt="{percentage:.1f}%", fontsize=12)
            tt = tissue.lower().replace(" ", "_")
            for ext in ("png", "pdf", "svg"):
                plt.savefig("{}/venn_diagram_modelOverlap_{}_r2_{}.{}".format(dirname, tt, r2, ext))
            # Release the figure once it is on disk: this loop creates 24
            # figures, which otherwise all stay open and trip matplotlib's
            # "more than 20 figures" warning.
            # NOTE(review): a closed figure is presumably no longer rendered
            # under the cell by an inline backend; the saved files are the output.
            plt.close()
        except ZeroDivisionError:
            # Raised when both sets are empty at this threshold.
            print("There are no features to plot!")

### What is the overlap between brain regions?

In [None]:
# Draw one four-tissue Venn diagram per model and R2 threshold, and save each
# as PNG, PDF and SVG under `dirname`.
dirname = "tissue_venn_diagrams"
mkdir_p(dirname)
for modeln in ["Elastic Net", "Random Forest"]:
    dft = df[(df['Model'] == modeln)].copy()
    for r2 in [0, 0.2, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8]:
        passing = dft[dft["test_score_r2"] >= r2]
        # One feature set per tissue; the key order fixes the diagram order.
        tissues = {
            name: set(passing.loc[passing["tissue"] == name, "feature"])
            for name in ["Caudate", "DLPFC", "Hippocampus", "Dentate Gyrus"]
        }
        try:
            venn(tissues, fmt="{percentage:.1f}%", fontsize=12)
            mm = modeln.lower().replace(" ", "_")
            for ext in ("png", "pdf", "svg"):
                plt.savefig("{}/venn_diagram_tissueOverlap_{}_r2_{}.{}".format(dirname, mm, r2, ext))
            # Release the figure once it is on disk so the 16 figures created
            # here do not all stay open at the same time.
            # NOTE(review): a closed figure is presumably no longer rendered
            # under the cell by an inline backend; the saved files are the output.
            plt.close()
        except ZeroDivisionError:
            # Raised when every set is empty at this threshold.
            print("There are no features to plot!")

## Examining partial R2 results using most predictive SNPs

In [17]:
# Per-tissue descriptive statistics, transposed so tissues become columns.
partial.groupby(by="tissue").describe().transpose()

Unnamed: 0,tissue,Caudate,DLPFC,Dentate Gyrus,Hippocampus
n_features,count,2789.0,2624.0,772.0,2745.0
n_features,mean,7.690929,8.452363,10.274611,7.286703
n_features,std,15.34009,18.184341,15.633189,15.029555
n_features,min,1.0,1.0,1.0,1.0
n_features,25%,2.0,2.0,3.0,2.0
n_features,50%,3.0,3.0,5.0,3.0
n_features,75%,6.0,7.0,11.0,6.0
n_features,max,292.0,566.0,170.0,311.0
test_score_r2,count,2789.0,2624.0,772.0,2745.0
test_score_r2,mean,0.054626,0.058206,0.122879,0.045879


In [18]:
# Show the rows whose partial R2 is strictly greater than 0.8.
partial.query("test_score_r2 > 0.8")

Unnamed: 0,tissue,feature,n_features,test_score_r2,Model
893,Caudate,ENSG00000144115.16,41,0.910508,Partial R2
1484,Caudate,ENSG00000179344.16,16,0.805721,Partial R2
4764,DLPFC,ENSG00000234585.6,100,0.853312,Partial R2
5058,DLPFC,ENSG00000259520.5,64,0.825024,Partial R2
6633,Hippocampus,ENSG00000164308.16,267,0.940874,Partial R2
8416,Dentate Gyrus,ENSG00000149679.11,110,0.986161,Partial R2
8490,Dentate Gyrus,ENSG00000166441.12,104,0.818569,Partial R2
8633,Dentate Gyrus,ENSG00000203279.3,66,0.865353,Partial R2
8659,Dentate Gyrus,ENSG00000213694.3,115,0.87732,Partial R2
8714,Dentate Gyrus,ENSG00000231515.1,84,0.916175,Partial R2


- *GLP2R* (ENSG00000065325) Glucagon Like Peptide 2 Receptor

In [2]:
# Per-SNP partial R2 metrics (tab-separated).
idv_partial = pd.read_table("../../partial_r2/individual_partial_r2_metrics.tsv")
idv_partial.head(2)

Unnamed: 0,SNP,Partial_R2,Full_R2,Reduced_R2,Tissue,Geneid
0,chr4_46016437_C_G_0,0.0,237.957568,237.957568,Caudate,ENSG00000163285.7
1,chr4_46016437_C_G_1,0.035396,229.534884,237.957568,Caudate,ENSG00000163285.7


In [3]:
# Per-tissue descriptive statistics of the per-SNP partial R2.
idv_partial.loc[:, ["Partial_R2", "Tissue", "Geneid"]].groupby(by="Tissue").describe().transpose()

Unnamed: 0,Tissue,Caudate,DLPFC,Dentate Gyrus,Hippocampus
Partial_R2,count,1655003.0,1564008.0,442840.0,1680743.0
Partial_R2,mean,0.006729416,0.006555777,0.012156,0.005680127
Partial_R2,std,0.02134155,0.01839053,0.027188,0.01663068
Partial_R2,min,0.0,0.0,0.0,0.0
Partial_R2,25%,0.0,0.0,0.0,0.0
Partial_R2,50%,0.0008106873,0.0008096635,0.001384,0.0007594009
Partial_R2,75%,0.005699962,0.005958581,0.012475,0.005274174
Partial_R2,max,0.7354638,0.6980796,0.829183,0.7105518


In [13]:
# Number of SNP rows with partial R2 of at least 0.7, per tissue.
idv_partial.query("Partial_R2 >= 0.7")[["Tissue", "Partial_R2", "Geneid"]].groupby("Tissue").size()

Tissue
Caudate          9
Dentate Gyrus    5
Hippocampus      4
dtype: int64

In [14]:
# Number of SNP rows with partial R2 of at least 0.7, per gene.
idv_partial.query("Partial_R2 >= 0.7")[["Tissue", "Partial_R2", "Geneid"]].groupby("Geneid").size()

Geneid
ENSG00000144115.16    8
ENSG00000164308.16    4
ENSG00000247828.7     1
ENSG00000273295.1     5
dtype: int64

In [15]:
# Number of SNP rows with partial R2 of at least 0.7, per gene and tissue.
idv_partial.query("Partial_R2 >= 0.7")[["Tissue", "Partial_R2", "Geneid"]].groupby(["Geneid", "Tissue"]).size()

Geneid              Tissue       
ENSG00000144115.16  Caudate          8
ENSG00000164308.16  Hippocampus      4
ENSG00000247828.7   Caudate          1
ENSG00000273295.1   Dentate Gyrus    5
dtype: int64