# Generate supplemental data for TWAS, caudate, across all features

In [1]:
import pandas as pd

## With MHC

In [2]:
# Gene-level TWAS associations (MHC region included).
genes = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas/'
    'gene_weights/fusion_pgc2/summary_stats/_m/fusion_associations.txt',
    sep='\t',
)
annot = pd.read_csv(
    '../../../../differential_expression/_m/genes/diffExpr_szVctl_full.txt',
    sep='\t',
)
# Join the annotation's `ensemblID` against the association table's `FILE`
# column (default inner join), keep the reporting columns, tag the feature
# type and expose `FILE` under the shared name `Feature`.
genes = (
    annot[['ensemblID']]
    .merge(genes, left_on='ensemblID', right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Gene')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two strongest associations.
genes.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
1004,ENSG00000204338,ENSG00000204338,CYP21A1P,0.201177,rs497309,rs1150753,11.946468,6.774336e-33,2.77951e-29,2.77951e-29,Gene
146,ENSG00000244731,ENSG00000244731,C4A,0.269548,rs497309,rs497309,11.509381,1.1832529999999999e-30,2.4274440000000003e-27,4.8548880000000005e-27,Gene


In [3]:
# Transcript-level TWAS associations (MHC region included).
trans = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas/'
    'transcript_weights/fusion_pgc2/summary_stats/_m/fusion_associations.txt',
    sep='\t',
)
# The regex drops everything from the first "." onward in gene_id and
# transcript_id; the stripped transcript ID is stored as `FILE` so it can be
# used as the join key against the association table.
annot = pd.read_csv(
    '../../../../differential_expression/_m/transcripts/diffExpr_szVctl_full.txt',
    sep='\t',
).assign(
    ensemblID=lambda d: d.gene_id.str.replace('\\..*', '', regex=True),
    FILE=lambda d: d.transcript_id.str.replace('\\..*', '', regex=True),
)
# Join on `FILE` (default inner join), keep the reporting columns, tag the
# feature type and rename `FILE` to the shared `Feature` column.
trans = (
    annot[['ensemblID', 'FILE']]
    .merge(trans, on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Transcript')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two strongest associations.
trans.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
3228,ENST00000252211,ENSG00000189298,ZKSCAN3,0.092937,rs2232423,rs2071965,10.011222,1.3606130000000002e-23,5.137673e-20,5.137673e-20,Transcript
1224,ENST00000487376,ENSG00000137312,FLOT1,0.13458,rs3130557,rs2233956,-9.882642,4.9509920000000007e-23,9.347473e-20,1.8694949999999997e-19,Transcript


In [4]:
# Exon-level TWAS associations (MHC region included).
exons = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas/'
    'exon_weights/fusion_pgc2/summary_stats/_m/fusion_associations.txt',
    sep='\t',
)
# The annotation is indexed by its first column, which is matched against the
# association table's `FILE` column below.
annot = pd.read_csv(
    '../../../../differential_expression/_m/exons/diffExpr_szVctl_full.txt',
    sep='\t',
    index_col=0,
)
# Join annotation index to `FILE` (default inner join), keep the reporting
# columns, tag the feature type and rename `FILE` to `Feature`.
exons = (
    annot[['ensemblID']]
    .merge(exons, left_index=True, right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Exon')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two strongest associations.
exons.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
2132,e106885,ENSG00000117020,AKT3,0.109771,rs14403,rs3006917,-7.333341,2.244847e-13,2.197032e-09,2.197032e-09,Exon
2127,e106939,ENSG00000117020,AKT3,0.080285,rs14403,rs3006917,-7.121541,1.067268e-12,3.570908e-09,1.044535e-08,Exon


In [5]:
# Junction-level TWAS associations (MHC region included).

# Differential-expression table, indexed by its first column; only the gene
# symbol and Ensembl gene ID are needed.
dj_file = '../../../../differential_expression/_m/junctions/diffExpr_szVctl_full.txt'
dj = pd.read_csv(dj_file, sep='\t', index_col=0)[['Symbol', 'ensemblID']]

# Junction annotation, indexed by its second column; `JxnID` is the identifier
# that is later matched against the association table's `FILE` column.
jannot_file = '/ceph/projects/v4_phase3_paper/analysis/twas/_m/junctions/jxn_annotation.tsv'
jannot = pd.read_csv(jannot_file, sep='\t', index_col=1)[['JxnID']]

# Index-on-index join attaches Symbol/ensemblID to each JxnID.
annot = jannot.merge(dj, left_index=True, right_index=True)

juncs = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas/'
    'junction_weights/fusion_pgc2/summary_stats/_m/fusion_associations.txt',
    sep='\t',
)
# Join on JxnID == FILE (default inner join), keep the reporting columns, tag
# the feature type, and rename so the columns line up with the other feature
# tables: `Symbol` becomes `ID` and `FILE` becomes `Feature`.
juncs = (
    annot
    .merge(juncs, left_on='JxnID', right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'Symbol', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Junction')
    .rename(columns={'Symbol': 'ID', 'FILE': 'Feature'})
)
# Preview the two strongest associations.
juncs.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
2726,j102706,ENSG00000163728,TTC14,0.070112,rs1805579,rs1806190,-6.58773,4.46601e-11,1.262094e-07,1.262094e-07,Junction
2727,j102707,ENSG00000163728,TTC14,0.094196,rs1805579,rs1806190,-5.841756,5.165335e-09,7.298618e-06,1.459724e-05,Junction


In [6]:
# Stack the four per-feature tables row-wise (gene, transcript, exon, junction
# order) into one long table; the original row labels are kept as-is.
df = pd.concat(
    [
        genes,
        trans,
        exons,
        juncs,
    ],
    axis=0,
)
print(df.shape)
df.head(2)

(20492, 11)


Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
0,ENSG00000188730,ENSG00000188730,VWC2,0.449575,rs6971984,rs10245006,0.761704,0.446237,0.73827,1.0,Gene
1,ENSG00000075303,ENSG00000075303,SLC25A40,0.052929,rs7779623,rs7794263,-0.860432,0.389551,0.695832,1.0,Gene


In [None]:
# Write the combined table as a tab-separated file with a header row and no
# index column; pandas infers gzip compression from the ".gz" suffix.
df.to_csv(
    'BrainSeq_Phase3_Caudate_TWAS_associations_allFeatures.txt.gz',
    sep='\t',
    header=True,
    index=False,
)

## Without MHC

In [2]:
# Gene-level TWAS associations from the `_noMHC` summary file.
genes = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas/'
    'gene_weights/fusion_pgc2/summary_stats/_m/fusion_associations_noMHC.txt',
    sep='\t',
)
annot = pd.read_csv(
    '../../../../differential_expression/_m/genes/diffExpr_szVctl_full.txt',
    sep='\t',
)
# Join the annotation's `ensemblID` against the association table's `FILE`
# column (default inner join), keep the reporting columns, tag the feature
# type and expose `FILE` under the shared name `Feature`.
genes = (
    annot[['ensemblID']]
    .merge(genes, left_on='ensemblID', right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Gene')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two strongest associations.
genes.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
1004,ENSG00000204338,ENSG00000204338,CYP21A1P,0.201177,rs497309,rs1150753,11.946468,6.774336e-33,2.77951e-29,2.77951e-29,Gene
146,ENSG00000244731,ENSG00000244731,C4A,0.269548,rs497309,rs497309,11.509381,1.1832529999999999e-30,2.4274440000000003e-27,4.8548880000000005e-27,Gene


In [3]:
# Transcript-level TWAS associations from the `_noMHC` summary file.
trans = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas/'
    'transcript_weights/fusion_pgc2/summary_stats/_m/fusion_associations_noMHC.txt',
    sep='\t',
)
# The regex drops everything from the first "." onward in gene_id and
# transcript_id; the stripped transcript ID is stored as `FILE` so it can be
# used as the join key against the association table.
annot = pd.read_csv(
    '../../../../differential_expression/_m/transcripts/diffExpr_szVctl_full.txt',
    sep='\t',
).assign(
    ensemblID=lambda d: d.gene_id.str.replace('\\..*', '', regex=True),
    FILE=lambda d: d.transcript_id.str.replace('\\..*', '', regex=True),
)
# Join on `FILE` (default inner join), keep the reporting columns, tag the
# feature type and rename `FILE` to the shared `Feature` column.
trans = (
    annot[['ensemblID', 'FILE']]
    .merge(trans, on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Transcript')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two strongest associations.
trans.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
3228,ENST00000252211,ENSG00000189298,ZKSCAN3,0.092937,rs2232423,rs2071965,10.011222,1.3606130000000002e-23,5.137673e-20,5.137673e-20,Transcript
1224,ENST00000487376,ENSG00000137312,FLOT1,0.13458,rs3130557,rs2233956,-9.882642,4.9509920000000007e-23,9.347473e-20,1.8694949999999997e-19,Transcript


In [4]:
# Exon-level TWAS associations from the `_noMHC` summary file.
# FIX: this cell previously read `fusion_associations.txt` (MHC included),
# unlike the gene, transcript and junction cells of this section, so exon rows
# in the `_noMHC` output were not MHC-filtered.
# NOTE(review): assumes the exon `_noMHC` file sits next to the full file, as
# it does for the other three feature types -- confirm it exists.
exons = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas/'
    'exon_weights/fusion_pgc2/summary_stats/_m/fusion_associations_noMHC.txt',
    sep='\t',
)
# The annotation is indexed by its first column, which is matched against the
# association table's `FILE` column below.
annot = pd.read_csv(
    '../../../../differential_expression/_m/exons/diffExpr_szVctl_full.txt',
    sep='\t',
    index_col=0,
)
# Join annotation index to `FILE` (default inner join), keep the reporting
# columns, tag the feature type and rename `FILE` to `Feature`. Chaining
# `.assign`/`.rename` returns new frames instead of mutating a column subset
# in place.
exons = (
    annot[['ensemblID']]
    .merge(exons, left_index=True, right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Exon')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two strongest associations.
exons.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
2132,e106885,ENSG00000117020,AKT3,0.109771,rs14403,rs3006917,-7.333341,2.244847e-13,2.197032e-09,2.197032e-09,Exon
2127,e106939,ENSG00000117020,AKT3,0.080285,rs14403,rs3006917,-7.121541,1.067268e-12,3.570908e-09,1.044535e-08,Exon


In [5]:
# Junction-level TWAS associations from the `_noMHC` summary file.

# Differential-expression table, indexed by its first column; only the gene
# symbol and Ensembl gene ID are needed.
dj_file = '../../../../differential_expression/_m/junctions/diffExpr_szVctl_full.txt'
dj = pd.read_csv(dj_file, sep='\t', index_col=0)[['Symbol', 'ensemblID']]

# Junction annotation, indexed by its second column; `JxnID` is the identifier
# that is later matched against the association table's `FILE` column.
jannot_file = '/ceph/projects/v4_phase3_paper/analysis/twas/_m/junctions/jxn_annotation.tsv'
jannot = pd.read_csv(jannot_file, sep='\t', index_col=1)[['JxnID']]

# Index-on-index join attaches Symbol/ensemblID to each JxnID.
annot = jannot.merge(dj, left_index=True, right_index=True)

juncs = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas/'
    'junction_weights/fusion_pgc2/summary_stats/_m/fusion_associations_noMHC.txt',
    sep='\t',
)
# Join on JxnID == FILE (default inner join), keep the reporting columns, tag
# the feature type, and rename so the columns line up with the other feature
# tables: `Symbol` becomes `ID` and `FILE` becomes `Feature`.
juncs = (
    annot
    .merge(juncs, left_on='JxnID', right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'Symbol', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Junction')
    .rename(columns={'Symbol': 'ID', 'FILE': 'Feature'})
)
# Preview the two strongest associations.
juncs.sort_values('TWAS.P').head(2)

Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
2726,j102706,ENSG00000163728,TTC14,0.070112,rs1805579,rs1806190,-6.58773,4.46601e-11,1.262094e-07,1.262094e-07,Junction
2727,j102707,ENSG00000163728,TTC14,0.094196,rs1805579,rs1806190,-5.841756,5.165335e-09,7.298618e-06,1.459724e-05,Junction


In [6]:
# Stack the four per-feature tables row-wise (gene, transcript, exon, junction
# order) into one long table; the original row labels are kept as-is.
df = pd.concat(
    [
        genes,
        trans,
        exons,
        juncs,
    ],
    axis=0,
)
print(df.shape)
df.head(2)

(20492, 11)


Unnamed: 0,Feature,ensemblID,ID,HSQ,BEST.GWAS.ID,EQTL.ID,TWAS.Z,TWAS.P,FDR,Bonferroni,Type
0,ENSG00000188730,ENSG00000188730,VWC2,0.449575,rs6971984,rs10245006,0.761704,0.446237,0.73827,1.0,Gene
1,ENSG00000075303,ENSG00000075303,SLC25A40,0.052929,rs7779623,rs7794263,-0.860432,0.389551,0.695832,1.0,Gene


In [None]:
# Write the combined table as a tab-separated file with a header row and no
# index column; pandas infers gzip compression from the ".gz" suffix.
df.to_csv(
    'BrainSeq_Phase3_Caudate_TWAS_associations_allFeatures_noMHC.txt.gz',
    sep='\t',
    header=True,
    index=False,
)