# Summary of TWAS

In [7]:
import numpy as np
import pandas as pd
from glob import iglob
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

## Load PGC2

In [2]:
pgc2_file = '../../../../../../inputs/gwas/PGC2_CLOZUK/map_phase3/_m/libd_hg38_pggc2sz_snps.tsv'
pgc2_df = pd.read_csv(pgc2_file, sep='\t', low_memory=False, index_col=0)

  mask |= (ar1 == a)


In [10]:
li = []

for filename in iglob("../../_m/*.dat*"):
    li.append(pd.read_csv(filename, sep='\t'))

df = pd.concat(li, axis=0, ignore_index=True).drop(['PANEL'], axis=1)
df['FILE'] = df.FILE.str.replace('../../_m/WEIGHTS/twas_gene_expression.', '')
df['FILE'] = df.FILE.str.replace('\\..*', '')
df.shape

(4150, 22)

In [11]:
df.iloc[0:2, 0:5]

Unnamed: 0,FILE,ID,CHR,P0,P1
0,ENSG00000000457,SCYL3,1,169849631,169894267.0
1,ENSG00000000460,C1orf112,1,169662007,169854080.0


In [12]:
df = df[~(df['TWAS.P'].isna())].copy() # Drop NAs
df.shape

(4103, 22)

In [13]:
pv1 = multipletests(df.loc[:, 'TWAS.P'], method='fdr_bh')
pv2 = multipletests(df.loc[:, 'TWAS.P'], method='bonferroni')
df['FDR'] = pv1[1]
df['Bonferroni'] = pv2[1]
df.head()

Unnamed: 0,FILE,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,...,MODEL,MODELCV.R2,MODELCV.PV,TWAS.Z,TWAS.P,PERM.PV,PERM.N,PERM.ANL_PV,FDR,Bonferroni
0,ENSG00000000457,SCYL3,1,169849631,169894267.0,0.101393,rs3917423,2.782,rs12128308:169876994:G:A,0.017842,...,bslmm,0.032921,0.0001730705,0.665158,0.505949,0.0,0,0.0,0.784871,1.0
1,ENSG00000000460,C1orf112,1,169662007,169854080.0,0.210605,rs3917423,2.782,rs1062976,0.121327,...,enet,0.133235,4.407639e-14,-0.319099,0.749652,0.0,0,0.0,0.907054,1.0
2,ENSG00000000971,CFH,1,196651878,196747504.0,0.077378,rs7524776,-4.151,rs3855964,0.089495,...,lasso,0.083702,5.0472e-09,1.16,0.246049,0.0,0,0.0,0.564313,1.0
3,ENSG00000001460,STPG1,1,24356999,24416934.0,0.328272,rs12401982,3.549,rs10489442,0.157484,...,bslmm,0.235684,6.604533e-24,1.438387,0.150324,0.0,0,0.0,0.431315,1.0
4,ENSG00000001461,NIPAL3,1,24415794,24472976.0,0.219004,rs12401982,3.549,rs576096,-0.000903,...,bslmm,0.04641,1.254019e-05,1.612549,0.106843,0.0,0,0.0,0.353528,1.0


In [5]:
df.to_csv("fusion_associations.txt", sep="\t", index=False)

  reject = pvals <= alphacBonf
  pvals_corrected[pvals_corrected>1] = 1


In [23]:
print("There are %d transcripts with significant p-values." % np.sum(df['TWAS.P'] <= 0.05))
print("There are %d transcripts with significant FDR." % np.sum(df['FDR'] <= 0.05))
print("There are %d transcripts with significant Bonferroni." % np.sum(df['Bonferroni'] <= 0.05))

There are 905 transcripts with significant p-values.
There are 389 transcripts with significant FDR.
There are 101 transcripts with significant Bonferroni.


In [15]:
dft = pd.merge(df, pgc2_df, left_on='BEST.GWAS.ID', right_on='our_snp_id', suffixes=['_TWAS', '_PGC2'])
dft.shape

(4103, 46)

In [16]:
table =  [[np.sum((dft['P']<5e-8) & ((dft['TWAS.P']<.05))),
           np.sum((dft['P']<5e-8) & ((dft['TWAS.P']>=.05)))],
          [np.sum((dft['P']>=5e-8) & ((dft['TWAS.P']<.05))),
           np.sum((dft['P']>=5e-8) & ((dft['TWAS.P']>=.05)))]]
print(table)
fisher_exact(table)

[[189, 168], [716, 3030]]


(4.760824022346369, 5.120710830232514e-41)

In [17]:
table =  [[np.sum((dft['P']<5e-8) & ((dft['FDR']<.05))),
           np.sum((dft['P']<5e-8) & ((dft['FDR']>=.05)))],
          [np.sum((dft['P']>=5e-8) & ((dft['FDR']<.05))),
           np.sum((dft['P']>=5e-8) & ((dft['FDR']>=.05)))]]
print(table)
fisher_exact(table)

[[140, 217], [249, 3497]]


(9.060759165695039, 3.666137567103334e-59)

In [18]:
table =  [[np.sum((dft['P']<5e-8) & ((dft['Bonferroni']<.05))),
           np.sum((dft['P']<5e-8) & ((dft['Bonferroni']>=.05)))],
          [np.sum((dft['P']>=5e-8) & ((dft['Bonferroni']<.05))),
           np.sum((dft['P']>=5e-8) & ((dft['Bonferroni']>=.05)))]]
print(table)
fisher_exact(table)

[[85, 272], [16, 3730]]


(72.8515625, 1.7079536902471683e-77)

In [19]:
twas = df[(df['TWAS.P'] <= 0.05)].copy()

snps_twas = set(pgc2_df['our_snp_id']) & set(twas['BEST.GWAS.ID'])
snps_pgc2 = set(pgc2_df[(pgc2_df['P']<5e-8)].loc[:,'our_snp_id']) & set(twas['BEST.GWAS.ID'])
overlap = len(snps_twas)
overlap_sig = len(snps_pgc2)

print('There are %0.2f%% overlap between PGC2 and TWAS.' % 
      (overlap/len(twas.loc[:, 'BEST.GWAS.ID'].unique()) * 100))
print('There are %0.2f%% overlap between significant PGC2 and TWAS.' % 
      (overlap_sig/len(twas.loc[:, 'BEST.GWAS.ID'].unique())* 100))
print("There are %d novel unique SNPs associations with SZ." % 
      (overlap - overlap_sig))

There are 100.00% overlap between PGC2 and TWAS.
There are 11.90% overlap between significant PGC2 and TWAS.
There are 459 novel unique SNPs associations with SZ.


In [20]:
new_df = twas.set_index('BEST.GWAS.ID').loc[snps_twas - snps_pgc2, ['ID', 'FILE', 'CHR', 'TWAS.Z', 'TWAS.P', 'FDR', 'Bonferroni']].reset_index()
new_df.head()

Unnamed: 0,BEST.GWAS.ID,ID,FILE,CHR,TWAS.Z,TWAS.P,FDR,Bonferroni
0,rs2106246,EARS2,ENSG00000103356,16,2.024992,0.042868,0.208152,1.0
1,rs2106246,,ENSG00000260136,16,-2.46337,0.013764,0.101387,1.0
2,rs8072032,DHX33,ENSG00000005100,17,-2.843833,0.004457,0.047752,1.0
3,rs8072032,RABEP1,ENSG00000029725,17,-3.798113,0.000146,0.00367,0.598224
4,rs8072032,ENO3,ENSG00000108515,17,-2.421235,0.015468,0.1098,1.0


In [21]:
new_df[(new_df['Bonferroni'] <= 0.05)].head()

Unnamed: 0,BEST.GWAS.ID,ID,FILE,CHR,TWAS.Z,TWAS.P,FDR,Bonferroni
8,rs4888256,GLG1,ENSG00000090863,16,4.403143,1.066939e-05,0.000447,0.043777
40,rs1428122,FAM114A2,ENSG00000055147,5,-5.218479,1.803981e-07,1.2e-05,0.00074
52,rs6695327,ANKRD45,ENSG00000183831,1,4.803447,1.559575e-06,8.6e-05,0.006399
252,rs200448,DNAJC11,ENSG00000007923,1,-5.084607,3.683884e-07,2.4e-05,0.001511
253,rs200448,KLHL21,ENSG00000162413,1,4.597465,4.276621e-06,0.000206,0.017547


## Joint analysis

In [22]:
li = []

for filename in iglob("../../_m/sig_analysis/*included.dat"):
    li.append(pd.read_csv(filename, sep='\t'))
    
df2 = pd.concat(li, axis=0, ignore_index=True)
df2['FILE'] = df2.FILE.str.replace('../../_m/WEIGHTS/twas_gene_expression.', '')
df2['FILE'] = df2.FILE.str.replace('\\..*', '')
df2.shape

(93, 8)

In [24]:
print("There are %d transcripts with TWAS associations." % np.sum(df2['JOINT.P'] <= 0.05))

There are 93 transcripts with TWAS associations.


In [15]:
df2.to_csv("fusion_twas_significant_assoc.txt", sep="\t", index=False)
df2.head(2)

Unnamed: 0,FILE,ID,TWAS.Z,TWAS.P,JOINT.BETA,JOINT.BETA.SE,JOINT.Z,JOINT.P
0,ENST00000262096,ZDHHC2,-3.6,0.00032,-3.6,1,-3.6,0.00032
1,ENST00000519476,LETM2,-6.1,1.1e-09,-6.1,1,-6.1,1.1e-09


In [25]:
df2.drop(['TWAS.Z', 'TWAS.P'], axis=1, inplace=True)

dft2 = pd.merge(df2, dft, on=['FILE', 'ID'], how='right').fillna(1)
dft2.shape

(4103, 50)

In [26]:
table =  [[np.sum((dft2['P']<5e-8) & ((dft2['JOINT.P']<.05))),
           np.sum((dft2['P']<5e-8) & ((dft2['JOINT.P']>=.05)))],
          [np.sum((dft2['P']>=5e-8) & ((dft2['JOINT.P']<.05))),
           np.sum((dft2['P']>=5e-8) & ((dft2['JOINT.P']>=.05)))]]
print(table)
fisher_exact(table)

[[30, 327], [63, 3683]]


(5.363331877093345, 6.43110777830765e-11)

In [27]:
twas2 = pd.merge(df2, df, on=['FILE', 'ID'])

snps_twas = set(pgc2_df['our_snp_id']) & set(twas2['BEST.GWAS.ID'])
snps_pgc2 = set(pgc2_df[(pgc2_df['P']<5e-8)].loc[:,'our_snp_id']) & set(twas2['BEST.GWAS.ID'])
overlap = len(snps_twas)
overlap_sig = len(snps_pgc2)

print('There are %0.2f%% overlap between PGC2 and TWAS.' % 
      (overlap/len(twas2.loc[:, 'BEST.GWAS.ID'].unique()) * 100))
print('There are %0.2f%% overlap between significant PGC2 and TWAS.' % 
      (overlap_sig/len(twas2.loc[:, 'BEST.GWAS.ID'].unique())* 100))
print("There are %d novel unique SNPs associations with SZ." % 
      (overlap - overlap_sig))

There are 100.00% overlap between PGC2 and TWAS.
There are 30.23% overlap between significant PGC2 and TWAS.
There are 60 novel unique SNPs associations with SZ.


In [29]:
new_df2 = pgc2_df.set_index('our_snp_id').loc[snps_twas - snps_pgc2, :].reset_index()
new_df2 = new_df2[['our_snp_id', 'CHR', 'BP', 'A1', 'A2', 'OR', 'P', 'is_index_snp']]
new_df2.head()

Unnamed: 0,our_snp_id,CHR,BP,A1,A2,OR,P,is_index_snp
0,rs6656:44606098:C:T,7,44606098,C,T,0.95243,6.96e-07,False
1,rs8137258,22,20135961,T,C,0.94503,9.87e-07,False
2,rs2242000,1,205031769,G,A,0.94441,2.8e-06,False
3,rs41468646:21622246:C:T,16,21622246,C,T,1.0618,1.38e-06,False
4,rs8072032,17,4955206,G,T,0.92748,2.54e-05,False


In [None]:
new_df2.to_csv('pgc2_clozuk_twas_significant_snps.txt', sep='\t', index=False)