# Generate supplemental data for TWAS, caudate, across all features

In [None]:
import pandas as pd

## With MHC

In [None]:
# Gene-level FUSION TWAS association table (this section keeps the MHC region).
genes = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
    'gene_weights/fusion/summary_stats/_m/fusion_associations.txt',
    sep='\t',
)
# Gene differential-expression table; only its `ensemblID` column is used below.
annot = pd.read_csv(
    '../../../../differential_expression/_m/genes/diffExpr_szVctl_full.txt',
    sep='\t',
)
# Inner-join annotation `ensemblID` to TWAS `FILE`, keep the reporting columns,
# tag every row as a gene and expose `FILE` under the shared name `Feature`.
genes = (
    annot[['ensemblID']]
    .merge(genes, left_on='ensemblID', right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Gene')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two rows with the smallest TWAS p-value.
genes.sort_values('TWAS.P').head(2)

In [None]:
# Transcript-level FUSION TWAS association table (this section keeps the MHC region).
trans = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
    'transcript_weights/fusion/summary_stats/_m/fusion_associations.txt',
    sep='\t',
)
# Transcript differential-expression table, used here as the ID annotation.
annot = pd.read_csv(
    '../../../../differential_expression/_m/transcripts/diffExpr_szVctl_full.txt',
    sep='\t',
)
# Drop everything from the first '.' onward in both IDs (presumably the
# version suffix) so the transcript ID can be matched against TWAS `FILE`.
annot = annot.assign(
    ensemblID=annot['gene_id'].str.replace('\\..*', '', regex=True),
    FILE=annot['transcript_id'].str.replace('\\..*', '', regex=True),
)
# Inner-join on `FILE`, keep the reporting columns, tag every row as a
# transcript and expose `FILE` under the shared name `Feature`.
trans = (
    annot[['ensemblID', 'FILE']]
    .merge(trans, on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Transcript')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two rows with the smallest TWAS p-value.
trans.sort_values('TWAS.P').head(2)

In [None]:
# Exon-level FUSION TWAS association table (this section keeps the MHC region).
exons = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
    'exon_weights/fusion/summary_stats/_m/fusion_associations.txt',
    sep='\t',
)
# Exon differential-expression table; its first column becomes the index and
# is the key matched against TWAS `FILE` below.
annot = pd.read_csv(
    '../../../../differential_expression/_m/exons/diffExpr_szVctl_full.txt',
    sep='\t',
    index_col=0,
)
# Inner-join annotation index to TWAS `FILE`, keep the reporting columns,
# tag every row as an exon and expose `FILE` under the shared name `Feature`.
exons = (
    annot[['ensemblID']]
    .merge(exons, left_index=True, right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Exon')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two rows with the smallest TWAS p-value.
exons.sort_values('TWAS.P').head(2)

In [None]:
# Junction annotation; the file's second column becomes the index and is also
# copied into a regular `gene_id` column, because a column-on-column merge
# ignores the index.
annot = pd.read_csv('../../_m/jxn_annotation.tsv', sep='\t', index_col=1)
annot = annot.assign(gene_id=annot.index)
# Junction-level FUSION TWAS association table (this section keeps the MHC region).
juncs = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
    'junction_weights/fusion/summary_stats/_m/fusion_associations.txt',
    sep='\t',
)
# Inner-join annotation `JxnID` to TWAS `FILE`, keep the reporting columns,
# tag every row as a junction, then align the column names with the other
# feature tables (`Symbol` -> `ID`, `gene_id` -> `Feature`).
# NOTE(review): unlike the other feature types, `Feature` here comes from the
# annotation's second column rather than from `FILE` — confirm this is intended.
juncs = (
    annot
    .merge(juncs, left_on='JxnID', right_on='FILE')
    .loc[:, ['gene_id', 'ensemblID', 'Symbol', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Junction')
    .rename(columns={'Symbol': 'ID', 'gene_id': 'Feature'})
)
# Preview the two rows with the smallest TWAS p-value.
juncs.sort_values('TWAS.P').head(2)

In [None]:
# Stack the gene, transcript, exon and junction tables row-wise into one frame.
df = pd.concat([genes, trans, exons, juncs], axis='index')
# Report the combined (rows, columns) and preview the first two rows.
print(df.shape)
df.head(2)

In [None]:
# Write the combined with-MHC table as tab-separated text with a header row
# and without the index column; pandas infers gzip from the `.gz` suffix.
df.to_csv(
    'BrainSeq_Phase3_Caudate_TWAS_associations_allFeatures.txt.gz',
    sep='\t',
    header=True,
    index=False,
)

## Without MHC

In [None]:
# Gene-level FUSION TWAS association table with the MHC region excluded.
genes = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
    'gene_weights/fusion/summary_stats/_m/fusion_associations_noMHC.txt',
    sep='\t',
)
# Gene differential-expression table; only its `ensemblID` column is used below.
annot = pd.read_csv(
    '../../../../differential_expression/_m/genes/diffExpr_szVctl_full.txt',
    sep='\t',
)
# Inner-join annotation `ensemblID` to TWAS `FILE`, keep the reporting columns,
# tag every row as a gene and expose `FILE` under the shared name `Feature`.
genes = (
    annot[['ensemblID']]
    .merge(genes, left_on='ensemblID', right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Gene')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two rows with the smallest TWAS p-value.
genes.sort_values('TWAS.P').head(2)

In [None]:
# Transcript-level FUSION TWAS association table with the MHC region excluded.
trans = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
    'transcript_weights/fusion/summary_stats/_m/fusion_associations_noMHC.txt',
    sep='\t',
)
# Transcript differential-expression table, used here as the ID annotation.
annot = pd.read_csv(
    '../../../../differential_expression/_m/transcripts/diffExpr_szVctl_full.txt',
    sep='\t',
)
# Drop everything from the first '.' onward in both IDs (presumably the
# version suffix) so the transcript ID can be matched against TWAS `FILE`.
annot = annot.assign(
    ensemblID=annot['gene_id'].str.replace('\\..*', '', regex=True),
    FILE=annot['transcript_id'].str.replace('\\..*', '', regex=True),
)
# Inner-join on `FILE`, keep the reporting columns, tag every row as a
# transcript and expose `FILE` under the shared name `Feature`.
trans = (
    annot[['ensemblID', 'FILE']]
    .merge(trans, on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Transcript')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two rows with the smallest TWAS p-value.
trans.sort_values('TWAS.P').head(2)

In [None]:
# Exon-level FUSION TWAS association table with the MHC region excluded.
# This cell belongs to the "Without MHC" section, so it reads the `_noMHC`
# table like the gene, transcript and junction cells of this section do;
# reading `fusion_associations.txt` here would put MHC-region exon
# associations into the `_noMHC` output file.
# NOTE(review): assumes the exon `_noMHC` table exists next to the full one — confirm.
exons = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
    'exon_weights/fusion/summary_stats/_m/fusion_associations_noMHC.txt',
    sep='\t',
)
# Exon differential-expression table; its first column becomes the index and
# is the key matched against TWAS `FILE` below.
annot = pd.read_csv(
    '../../../../differential_expression/_m/exons/diffExpr_szVctl_full.txt',
    sep='\t',
    index_col=0,
)
# Inner-join annotation index to TWAS `FILE`, keep the reporting columns,
# tag every row as an exon and expose `FILE` under the shared name `Feature`.
exons = (
    annot[['ensemblID']]
    .merge(exons, left_index=True, right_on='FILE')
    .loc[:, ['FILE', 'ensemblID', 'ID', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Exon')
    .rename(columns={'FILE': 'Feature'})
)
# Preview the two rows with the smallest TWAS p-value.
exons.sort_values('TWAS.P').head(2)

In [None]:
# Junction annotation; the file's second column becomes the index and is also
# copied into a regular `gene_id` column, because a column-on-column merge
# ignores the index.
annot = pd.read_csv('../../_m/jxn_annotation.tsv', sep='\t', index_col=1)
annot = annot.assign(gene_id=annot.index)
# Junction-level FUSION TWAS association table with the MHC region excluded.
juncs = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
    'junction_weights/fusion/summary_stats/_m/fusion_associations_noMHC.txt',
    sep='\t',
)
# Inner-join annotation `JxnID` to TWAS `FILE`, keep the reporting columns,
# tag every row as a junction, then align the column names with the other
# feature tables (`Symbol` -> `ID`, `gene_id` -> `Feature`).
# NOTE(review): unlike the other feature types, `Feature` here comes from the
# annotation's second column rather than from `FILE` — confirm this is intended.
juncs = (
    annot
    .merge(juncs, left_on='JxnID', right_on='FILE')
    .loc[:, ['gene_id', 'ensemblID', 'Symbol', 'HSQ', 'BEST.GWAS.ID', 'EQTL.ID',
             'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']]
    .assign(Type='Junction')
    .rename(columns={'Symbol': 'ID', 'gene_id': 'Feature'})
)
# Preview the two rows with the smallest TWAS p-value.
juncs.sort_values('TWAS.P').head(2)

In [None]:
# Stack the gene, transcript, exon and junction tables row-wise into one frame.
df = pd.concat([genes, trans, exons, juncs], axis='index')
# Report the combined (rows, columns) and preview the first two rows.
print(df.shape)
df.head(2)

In [None]:
# Write the combined no-MHC table as tab-separated text with a header row
# and without the index column; pandas infers gzip from the `.gz` suffix.
df.to_csv(
    'BrainSeq_Phase3_Caudate_TWAS_associations_allFeatures_noMHC.txt.gz',
    sep='\t',
    header=True,
    index=False,
)