# Generate supplemental data for TWAS, caudate, across all features

In [1]:
import pandas as pd

## With MHC

In [2]:
genes = pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'+\
                    'gene_weights/fusion/summary_stats/_m/fusion_associations.txt', sep='\t')
annot = pd.read_csv('../../../../differential_expression/_m/genes/diffExpr_szVctl_full.txt', sep='\t')
genes = annot[['ensemblID']].merge(genes, left_on='ensemblID', right_on='FILE')
genes = genes[['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID', 
               'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
genes['Type'] = 'Gene'
genes.rename(columns={'FILE': 'Feature'}, inplace=True)
genes.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
254,ENSG00000244731,ENSG00000244731,C4A,0.330293,chr6:32219860:G:T,chr6:31902549:G:A,10.971295,5.2514850000000005e-28,2.668805e-24,2.668805e-24,Gene
4439,ENSG00000219891,ENSG00000219891,ZSCAN12P1,0.219714,chr6:27837477:A:C,chr6:27883095:G:A,10.68975,1.1367719999999999e-26,2.888538e-23,5.777075000000001e-23,Gene


In [3]:
trans = pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'+\
                    'transcript_weights/fusion/summary_stats/_m/fusion_associations.txt', sep='\t')
annot = pd.read_csv('../../../../differential_expression/_m/transcripts/diffExpr_szVctl_full.txt', sep='\t')
annot['ensemblID'] = annot.gene_id.str.replace('\\..*', '', regex=True)
annot['FILE'] = annot.transcript_id.str.replace('\\..*', '', regex=True)
trans = annot[['ensemblID', 'FILE']].merge(trans, on='FILE')
trans = trans[['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID', 
               'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
trans['Type'] = 'Transcript'
trans.rename(columns={'FILE': 'Feature'}, inplace=True)
trans.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
3106,ENST00000496659,ENSG00000244731,C4A,0.208279,chr6:32219860:G:T,chr6:31942164:A:G,11.008695,3.469941e-28,3.078185e-24,3.078185e-24,Transcript
291,ENST00000428956,ENSG00000244731,C4A,0.313207,chr6:32219860:G:T,chr6:31902549:G:A,10.781964,4.1884610000000005e-27,1.8577920000000003e-23,3.7155840000000005e-23,Transcript


In [4]:
exons = pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'+\
                    'exon_weights/fusion/summary_stats/_m/fusion_associations.txt', sep='\t')
annot = pd.read_csv('../../../../differential_expression/_m/exons/diffExpr_szVctl_full.txt', 
                    sep='\t', index_col=0)
exons = annot[['ensemblID']].merge(exons, left_index=True, right_on='FILE')
exons = exons[['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID', 
               'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
exons['Type'] = 'Exon'
exons.rename(columns={'FILE': 'Feature'}, inplace=True)
exons.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
34301,e377846,ENSG00000186470,BTN3A2,0.13659,chr6:26463347:G:T,chr6:26336344:G:A,11.162478,6.223219e-29,2.397931e-24,2.397931e-24,Exon
34737,e384610,ENSG00000244731,C4A,0.387274,chr6:32219860:G:T,chr6:31902549:G:A,11.064041,1.8745690000000001e-28,3.611545e-24,7.22309e-24,Exon


In [5]:
annot = pd.read_csv('../../_m/jxn_annotation.tsv', sep='\t', index_col=1)
annot["gene_id"] = annot.index
juncs = pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'+\
                    'junction_weights/fusion/summary_stats/_m/fusion_associations.txt', sep='\t')
juncs = pd.merge(annot, juncs, left_on='JxnID', right_on='FILE')
juncs = juncs[['gene_id', 'ensemblID', 'Symbol', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID', 
               'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
juncs['Type'] = 'Junction'
juncs.rename(columns={'Symbol': 'ID', 'gene_id': 'Feature'}, inplace=True)
juncs.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
10530,chr6:31996601-31996828(+),ENSG00000244731,C4A,0.375134,chr6:32219860:G:T,chr6:31902549:G:A,10.811784,3.027249e-27,2.0302e-23,3.9723560000000005e-23,Junction
10528,chr6:31996112-31996206(+),ENSG00000244731,C4A,0.410304,chr6:32219860:G:T,chr6:31902549:G:A,10.809773,3.094346e-27,2.0302e-23,4.0604010000000004e-23,Junction


In [6]:
df = pd.concat([genes, trans, exons, juncs], axis=0)
print(df.shape)
df.head(2)

(65607, 11)


Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
0,ENSG00000248587,ENSG00000248587,GDNF-AS1,0.182975,chr5:37821119:C:A,chr5:37981540:A:C,-0.345122,0.730003,0.889658,1.0,Gene
1,ENSG00000185052,ENSG00000185052,SLC24A3,0.255074,chr20:19648189:C:T,chr20:19138350:A:G,0.400819,0.688553,0.871279,1.0,Gene


In [7]:
df.to_csv('BrainSeq_Phase3_Caudate_TWAS_associations_allFeatures.txt.gz', index=False, header=True, sep='\t')

## Without MHC

In [8]:
genes = pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'+\
                    'gene_weights/fusion/summary_stats/_m/fusion_associations_noMHC.txt', sep='\t')
annot = pd.read_csv('../../../../differential_expression/_m/genes/diffExpr_szVctl_full.txt', sep='\t')
genes = annot[['ensemblID']].merge(genes, left_on='ensemblID', right_on='FILE')
genes = genes[['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID', 
               'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
genes['Type'] = 'Gene'
genes.rename(columns={'FILE': 'Feature'}, inplace=True)
genes.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
4665,ENSG00000163938,ENSG00000163938,GNL3,0.410009,chr3:52781889:T:C,chr3:52588070:G:A,9.415273,4.718536e-21,2.348887e-17,2.348887e-17,Gene
3609,ENSG00000166159,ENSG00000166159,LRTM2,0.239365,chr12:2221292:C:T,chr12:2224318:C:T,-9.064394,1.2529839999999998e-19,3.118677e-16,6.237353e-16,Gene


In [9]:
trans = pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'+\
                    'transcript_weights/fusion/summary_stats/_m/fusion_associations_noMHC.txt', sep='\t')
annot = pd.read_csv('../../../../differential_expression/_m/transcripts/diffExpr_szVctl_full.txt', sep='\t')
annot['ensemblID'] = annot.gene_id.str.replace('\\..*', '', regex=True)
annot['FILE'] = annot.transcript_id.str.replace('\\..*', '', regex=True)
trans = annot[['ensemblID', 'FILE']].merge(trans, on='FILE')
trans = trans[['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID', 
               'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
trans['Type'] = 'Transcript'
trans.rename(columns={'FILE': 'Feature'}, inplace=True)
trans.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
4596,ENST00000394799,ENSG00000163938,GNL3,0.122586,chr3:52781889:T:C,chr3:52799789:C:A,8.983223,2.62948e-19,2.283966e-15,2.283966e-15,Transcript
1396,ENST00000315580,ENSG00000182196,ARL6IP4,0.482633,chr12:123148383:G:A,chr12:122973072:C:T,-8.699604,3.330436e-18,1.446408e-14,2.892817e-14,Transcript


In [10]:
exons = pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'+\
                    'exon_weights/fusion/summary_stats/_m/fusion_associations.txt', sep='\t')
annot = pd.read_csv('../../../../differential_expression/_m/exons/diffExpr_szVctl_full.txt', 
                    sep='\t', index_col=0)
exons = annot[['ensemblID']].merge(exons, left_index=True, right_on='FILE')
exons = exons[['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID', 
               'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
exons['Type'] = 'Exon'
exons.rename(columns={'FILE': 'Feature'}, inplace=True)
exons.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
34301,e377846,ENSG00000186470,BTN3A2,0.13659,chr6:26463347:G:T,chr6:26336344:G:A,11.162478,6.223219e-29,2.397931e-24,2.397931e-24,Exon
34737,e384610,ENSG00000244731,C4A,0.387274,chr6:32219860:G:T,chr6:31902549:G:A,11.064041,1.8745690000000001e-28,3.611545e-24,7.22309e-24,Exon


In [11]:
annot = pd.read_csv('../../_m/jxn_annotation.tsv', sep='\t', index_col=1)
annot["gene_id"] = annot.index
juncs = pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'+\
                    'junction_weights/fusion/summary_stats/_m/fusion_associations_noMHC.txt', sep='\t')
juncs = pd.merge(annot, juncs, left_on='JxnID', right_on='FILE')
juncs = juncs[['gene_id', 'ensemblID', 'Symbol', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID', 
               'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
juncs['Type'] = 'Junction'
juncs.rename(columns={'Symbol': 'ID', 'gene_id': 'Feature'}, inplace=True)
juncs.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
8293,chr3:52690705-52690944(+),ENSG00000163938,GNL3,0.062694,chr3:52781889:T:C,chr3:52507237:A:G,9.632857,5.809151e-22,7.466501e-18,7.466501e-18,Junction
8295,chr3:52693808-52694036(+),ENSG00000163938,GNL3,0.114983,chr3:52781889:T:C,chr3:52594040:C:A,9.401695,5.369125e-21,3.450468e-17,6.900936e-17,Junction


In [12]:
df = pd.concat([genes, trans, exons, juncs], axis=0)
print(df.shape)
df.head(2)

(65049, 11)


Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
0,ENSG00000248587,ENSG00000248587,GDNF-AS1,0.182975,chr5:37821119:C:A,chr5:37981540:A:C,-0.345122,0.730003,0.892644,1.0,Gene
1,ENSG00000185052,ENSG00000185052,SLC24A3,0.255074,chr20:19648189:C:T,chr20:19138350:A:G,0.400819,0.688553,0.875014,1.0,Gene


In [13]:
df.to_csv('BrainSeq_Phase3_Caudate_TWAS_associations_allFeatures_noMHC.txt.gz', 
          index=False, header=True, sep='\t')