In [1]:
import pandas as pd
from glob import glob
import csv

In [2]:
# Load the tab-separated feature annotation table; its gene_id / gene_name
# columns are used below to translate feature IDs into gene names.
feature_df = pd.read_csv(
    "analysis/transcript-rna-seq/feature-annot/TRNSCRPT_FEATURE_ANNOT.txt",
    delimiter="\t",
)
feature_df.head(5)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_version,gene_name,gene_source,gene_biotype
0,AABR07022258.1,ensembl,gene,405,848,.,-,.,ENSRNOG00000055633,1,LOC100910067,ensembl,protein_coding
1,AABR07022620.1,ensembl,gene,122,427,.,-,.,ENSRNOG00000058846,1,AABR07022620.1,ensembl,protein_coding
2,AABR07022926.1,ensembl,gene,18,85,.,+,.,ENSRNOG00000055717,1,AABR07022926.1,ensembl,miRNA
3,AABR07024031.1,ensembl,gene,18673,58717,.,-,.,ENSRNOG00000017648,7,Vom2r8,ensembl,protein_coding
4,AABR07024032.1,ensembl,gene,17425,17528,.,-,.,ENSRNOG00000056404,1,RF00026,ensembl,snRNA


In [3]:
# Lookup table: gene_id -> gene_name, taken from the feature annotation.
# Zipping the two columns builds the same dict as a row-by-row iterrows()
# loop (a later duplicate gene_id still overwrites an earlier one) without
# materialising a Series per row or leaving loop variables in the namespace.
gene_mapper = dict(zip(feature_df["gene_id"], feature_df["gene_name"]))

## Sex and Timepoint Comparison
* Training group (timepoint) vs sex-matched controls
* We keep only genes with an adjusted p-value < 0.05. For each tissue, sex, and timepoint combination we then split those genes into an up-regulated set (logFC > 0) and a down-regulated set (logFC < 0); a set is written to the GMT file only if it contains at least 5 genes.

In [4]:
# Preview a single timewise DEA table (spleen) to see which columns it has.
filename = "analysis/transcript-rna-seq/dea/pass1b-06_t62-spleen_transcript-rna-seq_timewise-dea-fdr_20211008.txt"
tw = pd.read_csv(filename, delimiter="\t")
tw.head(n=10)

Unnamed: 0,feature_ID,tissue,sex,comparison_group,assay,covariates,removed_samples,logFC,logFC_se,shrunk_logFC,shrunk_logFC_se,zscore,p_value,comparison_average_intensity,comparison_average_intensity_se,reference_average_intensity,reference_average_intensity_se,adj_p_value,tissue_abbreviation,selection_fdr
0,ENSRNOG00000000001,t62-spleen,male,1w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,0.106894,0.526792,0.0002387733,0.024905,0.202916,0.839201,9.466459,1.499481,8.785202,2.113512,1.0,SPLEEN,1.0
1,ENSRNOG00000000001,t62-spleen,male,2w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,-0.00961,0.577451,-2.057732e-05,0.026721,-0.016642,0.986722,8.348006,1.503956,8.785202,2.113512,1.0,SPLEEN,1.0
2,ENSRNOG00000000001,t62-spleen,male,4w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,-0.05696,0.523416,-9.578498e-05,0.02147,-0.108823,0.913343,9.995616,2.264256,8.785202,2.113512,1.0,SPLEEN,1.0
3,ENSRNOG00000000001,t62-spleen,male,8w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,0.627636,0.503587,0.02131641,0.094573,1.246331,0.212643,14.9853,2.621802,8.785202,2.113512,1.0,SPLEEN,1.0
4,ENSRNOG00000000001,t62-spleen,female,1w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,0.198305,0.56063,0.0006254192,0.031551,0.353718,0.72355,12.607312,1.186143,9.859512,0.574047,1.0,SPLEEN,1.0
5,ENSRNOG00000000001,t62-spleen,female,2w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,-0.331535,0.645398,-0.000557016,0.026494,-0.51369,0.607468,8.048467,0.590803,9.859512,0.574047,1.0,SPLEEN,1.0
6,ENSRNOG00000000001,t62-spleen,female,4w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,0.000136,0.481579,6.983907e-09,0.003456,0.000282,0.999775,10.175191,2.228992,9.859512,0.574047,1.0,SPLEEN,1.0
7,ENSRNOG00000000001,t62-spleen,female,8w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,0.058231,0.440509,0.0004784217,0.039949,0.132191,0.894834,10.417179,1.967804,9.859512,0.574047,1.0,SPLEEN,1.0
8,ENSRNOG00000000008,t62-spleen,male,1w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,-0.646087,0.491763,-0.001675895,0.025519,-1.313817,0.188908,7.980267,1.389427,12.579924,0.896894,1.0,SPLEEN,1.0
9,ENSRNOG00000000008,t62-spleen,male,2w,transcript-rna-seq,"pct_globin,rin,pct_umi_dup,median_5_3_bias",,-0.600313,0.536082,-0.001506895,0.027309,-1.119814,0.262793,6.443191,1.643377,12.579924,0.896894,1.0,SPLEEN,1.0


In [5]:
# Same preview, restricted to the columns the gene-set export relies on
# plus a few identifying ones.
tw.head().loc[
    :,
    [
        'feature_ID',
        'tissue',
        'sex',
        'comparison_group',
        'logFC',
        'p_value',
        'adj_p_value',
        'tissue_abbreviation',
        'selection_fdr',
    ],
]

Unnamed: 0,feature_ID,tissue,sex,comparison_group,logFC,p_value,adj_p_value,tissue_abbreviation,selection_fdr
0,ENSRNOG00000000001,t62-spleen,male,1w,0.106894,0.839201,1.0,SPLEEN,1.0
1,ENSRNOG00000000001,t62-spleen,male,2w,-0.00961,0.986722,1.0,SPLEEN,1.0
2,ENSRNOG00000000001,t62-spleen,male,4w,-0.05696,0.913343,1.0,SPLEEN,1.0
3,ENSRNOG00000000001,t62-spleen,male,8w,0.627636,0.212643,1.0,SPLEEN,1.0
4,ENSRNOG00000000001,t62-spleen,female,1w,0.198305,0.72355,1.0,SPLEEN,1.0


In [6]:
# Write one GMT file with up-/down-regulated gene sets for every
# tissue x sex x timepoint combination found in the timewise DEA tables.
# Each row is: <tissue>_<sex>_<timepoint>_<up|down>, empty description, genes...
#
# newline="" is required by the csv module for files passed to csv.writer;
# without it rows end in "\r\r\n" on platforms that translate "\n".
with open("MoTrPAC_timewise_2023.gmt", "w", newline="") as o:
    csv_writer = csv.writer(o, delimiter="\t")
    # glob() returns paths in arbitrary order; sort so reruns produce the
    # same file.
    for filename in sorted(glob("analysis/transcript-rna-seq/dea/*timewise*.txt")):
        print(filename)
        df = pd.read_csv(filename, sep="\t")
        tissues = df.tissue.unique()
        sexes = df.sex.unique()
        groups = df.comparison_group.unique()

        # The significance cut does not depend on the combination, so apply
        # it once per file instead of once per inner-loop iteration.
        significant = df[df.adj_p_value < 0.05]

        for tissue in tissues:
            for sex in sexes:
                for timepoint in groups:
                    label = "%s_%s_%s"%(tissue, sex, timepoint)
                    d = significant[
                        (significant.tissue == tissue)
                        & (significant.sex == sex)
                        & (significant.comparison_group == timepoint)
                    ]
                    # Direction is decided by the sign of logFC; rows with
                    # logFC == 0 belong to neither set.
                    for suffix, hits in (("_up", d[d.logFC > 0]), ("_down", d[d.logFC < 0])):
                        # gene_mapper[...] raises KeyError for a feature_ID
                        # that is missing from the annotation table.
                        genes = {gene_mapper[i] for i in hits.feature_ID}
                        # Gene sets with fewer than 5 members are not written.
                        if len(genes) >= 5:
                            # Sets iterate in hash order, which can differ
                            # between kernels; sort for reproducible rows.
                            # key=str keeps the sort well-defined even if a
                            # gene_name is missing (NaN) in the annotation.
                            csv_writer.writerow([
                                label + suffix,
                                "",
                                *sorted(genes, key=str)
                            ])
            

analysis/transcript-rna-seq/dea/pass1b-06_t61-colon_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t56-vastus-lateralis_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t55-gastrocnemius_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t62-spleen_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t58-heart_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t59-kidney_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t66-lung_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t53-cortex_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t60-adrenal_transcript-rna-seq_timewise-dea-fdr_20211008.txt
analysis/transcript-rna-seq/dea/pass1b-06_t68-liver_transcript-rna-s

## Training Response Significance
* Genes that changed over the training time course were identified using a likelihood ratio test.
* We filter the dataframe for genes that have an adjusted training p-value < 0.05. These are the genes that significantly responded over the course of the training time.

In [7]:
# Preview a single training-response DEA table (spleen) to see its columns.
filename = "analysis/transcript-rna-seq/dea/pass1b-06_t62-spleen_transcript-rna-seq_training-dea-fdr_20211008.txt"
tr = pd.read_csv(filename, delimiter="\t")
tr.head(n=10)

Unnamed: 0,feature_ID,assay,tissue,removed_samples,lrt_male,p_value_male,full_model_male,reduced_model_male,lrt_female,p_value_female,full_model_female,reduced_model_female,p_value,lrt,full_model,reduced_model,adj_p_value,tissue_abbreviation
0,ENSRNOG00000000001,transcript-rna-seq,t62-spleen,,2.651426,0.617741,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,1.071593,0.898753,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.881892,,,,1.0,SPLEEN
1,ENSRNOG00000000008,transcript-rna-seq,t62-spleen,,2.211815,0.696867,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.914729,0.922438,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.926875,,,,1.0,SPLEEN
2,ENSRNOG00000000010,transcript-rna-seq,t62-spleen,,1.14943,0.886352,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,1.891004,0.755797,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.938281,,,,1.0,SPLEEN
3,ENSRNOG00000000012,transcript-rna-seq,t62-spleen,,11.889917,0.018189,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,8.93328,0.062789,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.008879,,,,0.076572,SPLEEN
4,ENSRNOG00000000017,transcript-rna-seq,t62-spleen,,2.377714,0.666658,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,3.039546,0.551229,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.735361,,,,1.0,SPLEEN
5,ENSRNOG00000000021,transcript-rna-seq,t62-spleen,,2.92989,0.569626,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,6.986178,0.13662,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.276529,,,,0.616488,SPLEEN
6,ENSRNOG00000000024,transcript-rna-seq,t62-spleen,,4.319273,0.364518,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,6.980177,0.136939,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.199538,,,,0.51802,SPLEEN
7,ENSRNOG00000000033,transcript-rna-seq,t62-spleen,,6.408972,0.170617,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,15.018139,0.004664,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.006474,,,,0.072793,SPLEEN
8,ENSRNOG00000000034,transcript-rna-seq,t62-spleen,,18.942355,0.000807,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,2.166775,0.705116,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.004819,,,,0.049617,SPLEEN
9,ENSRNOG00000000036,transcript-rna-seq,t62-spleen,,10.579231,0.031723,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,2.925815,0.570316,~pct_globin+rin+pct_umi_dup+median_5_3_bias+group,~pct_globin+rin+pct_umi_dup+median_5_3_bias,0.090683,,,,0.325494,SPLEEN


In [8]:
# Same preview, restricted to the identifying columns and the test statistics.
tr.head().loc[
    :,
    [
        'feature_ID',
        'tissue',
        'lrt_male',
        'p_value_male',
        'lrt_female',
        'p_value_female',
        'p_value',
        'lrt',
        'adj_p_value',
        'tissue_abbreviation',
    ],
]

Unnamed: 0,feature_ID,tissue,lrt_male,p_value_male,lrt_female,p_value_female,p_value,lrt,adj_p_value,tissue_abbreviation
0,ENSRNOG00000000001,t62-spleen,2.651426,0.617741,1.071593,0.898753,0.881892,,1.0,SPLEEN
1,ENSRNOG00000000008,t62-spleen,2.211815,0.696867,0.914729,0.922438,0.926875,,1.0,SPLEEN
2,ENSRNOG00000000010,t62-spleen,1.14943,0.886352,1.891004,0.755797,0.938281,,1.0,SPLEEN
3,ENSRNOG00000000012,t62-spleen,11.889917,0.018189,8.93328,0.062789,0.008879,,0.076572,SPLEEN
4,ENSRNOG00000000017,t62-spleen,2.377714,0.666658,3.039546,0.551229,0.735361,,1.0,SPLEEN


In [9]:
# Write one GMT file with, per tissue, the set of genes whose training
# response is significant (adj_p_value < 0.05) in the training DEA tables.
# Each row is: <tissue>, empty description, genes...
#
# newline="" is required by the csv module for files passed to csv.writer;
# without it rows end in "\r\r\n" on platforms that translate "\n".
with open("MoTrPAC_training_significance_2023.gmt", "w", newline="") as o:
    csv_writer = csv.writer(o, delimiter="\t")
    # glob() returns paths in arbitrary order; sort so reruns produce the
    # same file.
    for filename in sorted(glob("analysis/transcript-rna-seq/dea/*training*.txt")):
        df = pd.read_csv(filename, sep="\t")
        tissues = df.tissue.unique()
        for tissue in tissues:
            d = df[(df.tissue == tissue) & (df.adj_p_value < 0.05)]
            # gene_mapper[...] raises KeyError for a feature_ID that is
            # missing from the annotation table.
            genes = {gene_mapper[i] for i in d.feature_ID}

            # Gene sets with fewer than 5 members are not written.
            if len(genes) >= 5:
                print(filename, len(genes))
                # Sets iterate in hash order, which can differ between
                # kernels; sort for reproducible rows. key=str keeps the
                # sort well-defined even if a gene_name is missing (NaN).
                csv_writer.writerow([
                    tissue,
                    "",
                    *sorted(genes, key=str)
                ])

analysis/transcript-rna-seq/dea/pass1b-06_t62-spleen_transcript-rna-seq_training-dea-fdr_20211008.txt 1099
analysis/transcript-rna-seq/dea/pass1b-06_t61-colon_transcript-rna-seq_training-dea-fdr_20211008.txt 2515
analysis/transcript-rna-seq/dea/pass1b-06_t56-vastus-lateralis_transcript-rna-seq_training-dea-fdr_20211008.txt 764
analysis/transcript-rna-seq/dea/pass1b-06_t55-gastrocnemius_transcript-rna-seq_training-dea-fdr_20211008.txt 565
analysis/transcript-rna-seq/dea/pass1b-06_t66-lung_transcript-rna-seq_training-dea-fdr_20211008.txt 961
analysis/transcript-rna-seq/dea/pass1b-06_t59-kidney_transcript-rna-seq_training-dea-fdr_20211008.txt 338
analysis/transcript-rna-seq/dea/pass1b-06_t58-heart_transcript-rna-seq_training-dea-fdr_20211008.txt 720
analysis/transcript-rna-seq/dea/pass1b-06_t67-small-intestine_transcript-rna-seq_training-dea-fdr_20211008.txt 746
analysis/transcript-rna-seq/dea/pass1b-06_t69-brown-adipose_transcript-rna-seq_training-dea-fdr_20211008.txt 1631
analysis/trans

In [10]:
# s1 = pd.read_csv("s1.tsv", sep="\t")
# s1.head()

In [11]:
# s1 = s1[s1.assay == "TRNSCRPT"]
# s1.head()

In [12]:
# tr[tr.adj_p_value < 0.05].head()

In [13]:
# feature_id = "ENSRNOG00000000034"
# s1[(s1.tissue == "SPLEEN") & (s1.feature_ID == feature_id)]

In [14]:
# tw[tw.feature_ID == feature_id]