# Comparison with public TWAS associations for schizophrenia (SCZ)

In [None]:
import urllib
import urllib.request
from html.entities import name2codepoint
from html.parser import HTMLParser

import numpy as np
import pandas as pd
from scipy.stats import fisher_exact

In [None]:
class MyHTMLParser(HTMLParser):
    """Collect the gene part of every ``href`` that starts with ``/genes/``.

    After ``feed()`` has been called, ``self.genes`` holds, in document order
    and with duplicates preserved, whatever follows the ``/genes/`` prefix in
    each matching ``href`` attribute value.
    """

    def __init__(self):
        super().__init__()
        # Accumulates the extracted identifiers across all fed documents.
        self.genes = []

    def handle_starttag(self, tag, attrs):
        """Record the gene referenced by an ``href`` attribute, if any.

        ``attrs`` is the list of ``(name, value)`` pairs passed in by
        ``HTMLParser``. The value is ``None`` for an attribute written
        without a value (e.g. ``<a href>``), so it is checked before any
        string method is called on it.
        """
        prefix = '/genes/'
        for name, value in attrs:
            if name != 'href' or value is None:
                continue
            if value.startswith(prefix):
                self.genes.append(value[len(prefix):])

## MHC genes

In [None]:
# Read the tab-separated ".dat.MHC" table and keep its ID column.
mhc_dir = '/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
mhc_file = "gene_weights/fusion_pgc2/_m/PGC2.SCZ.Caudate.6.dat.MHC"
mhc_table = pd.read_csv(mhc_dir + mhc_file, sep='\t')
mhc_genes = mhc_table.ID
# Number of distinct IDs in that column.
len(set(mhc_genes))

## Caudate-specific TWAS genes among BrainSeq brain regions

In [None]:
# Load the tab-separated caudate-only gene list and show (rows, columns).
fn = "../../libd_twas_comparison/_m/caudate_only_twasList_genes.txt"
caudate = pd.read_csv(filepath_or_buffer=fn, sep='\t')
caudate.shape

In [None]:
# Count the distinct caudate IDs that do not appear among the MHC IDs.
caudate_ids = set(caudate.ID)
len(caudate_ids.difference(set(mhc_genes)))

In [None]:
# Keep only the caudate rows whose ID is absent from the MHC ID list.
non_mhc_ids = list(set(caudate.ID) - set(mhc_genes))
is_non_mhc = caudate["ID"].isin(non_mhc_ids)
caudate_noMHC = caudate.loc[is_non_mhc].copy()
# Preview: first two rows, first five columns.
caudate_noMHC.iloc[:2, :5]

## TWAS Hub comparison

### Schizophrenia 2014

In [None]:
# Download the TWAS hub "Schizophrenia" trait page and collect its gene links.
parser = MyHTMLParser()
# Open the response as a context manager so the underlying connection is
# closed as soon as the page has been read, even if decoding fails.
with urllib.request.urlopen('http://twas-hub.org/traits/Schizophrenia/') as response:
    html_str = response.read().decode()
parser.feed(html_str)
# Sorted array of the distinct identifiers the parser extracted.
scz_2014 = np.unique(parser.genes)

In [None]:
# Shape of the array of unique gene identifiers stored in scz_2014.
scz_2014.shape

### Schizophrenia 2018

In [None]:
# Download the TWAS hub "SCZ_2018" trait page and collect its gene links.
# A fresh parser is used so identifiers from earlier pages are not carried over.
parser = MyHTMLParser()
# Open the response as a context manager so the underlying connection is
# closed as soon as the page has been read, even if decoding fails.
with urllib.request.urlopen('http://twas-hub.org/traits/SCZ_2018/') as response:
    html_str = response.read().decode()
parser.feed(html_str)
# Sorted array of the distinct identifiers the parser extracted.
scz_2018 = np.unique(parser.genes)

### TWAS hub comparison

In [None]:
# Pool the two scraped gene arrays into one sorted array without duplicates.
twas_hub = np.union1d(scz_2014, scz_2018)
len(twas_hub)

In [None]:
# Display the combined array of unique TWAS hub gene identifiers.
twas_hub

In [None]:
# IDs found both in the caudate list and among the TWAS hub genes.
hub_overlap = set(caudate.ID) & set(twas_hub)
n_hub_overlap = len(hub_overlap)
print("There are %d caudate only genes present in the TWAS hub" % n_hub_overlap)
hub_overlap

In [None]:
# IDs found both in the MHC-filtered caudate list and among the TWAS hub genes.
# The set is computed once from caudate_noMHC so that the listing shown below
# is the same set whose size is printed (the listing previously used the
# unfiltered caudate table and could include MHC genes).
noMHC_hub_overlap = set(caudate_noMHC.ID) & set(twas_hub)
print("There are %d caudate only genes (no MHC) present in the TWAS hub" %
      len(noMHC_hub_overlap))
noMHC_hub_overlap

## Gandal comparison

In [None]:
# Load the "SCZ.TWAS" sheet of the Gandal supplementary workbook.
gandal = pd.read_excel("../_h/aat8127_Table_S4.xlsx", sheet_name="SCZ.TWAS")
# Rows with TWAS.Bonferroni at or below 0.05; later cells intersect against these.
gandal_twas = gandal[(gandal['TWAS.Bonferroni'] <= 0.05)].copy()
# Count with the same inclusive threshold as the filter above so the number
# shown is the number of rows in gandal_twas (a strict "<" would miss values
# exactly equal to 0.05).
np.sum(gandal.loc[:, 'TWAS.Bonferroni'] <= 0.05)

In [None]:
# Subset of the caudate table with Bonferroni at or below 0.05, plus its row count.
caudate_passes = caudate['Bonferroni'] <= 0.05
caudate_bonferroni = caudate.loc[caudate_passes].copy()
np.sum(caudate_passes)

In [None]:
# Same Bonferroni (<= 0.05) subset, taken from the MHC-filtered caudate table.
noMHC_passes = caudate_noMHC['Bonferroni'] <= 0.05
caudate_noMHC_bonferroni = caudate_noMHC.loc[noMHC_passes].copy()
np.sum(noMHC_passes)

In [None]:
# Overlap is computed on caudate.FILE versus gandal_twas.GeneID.
gandal_gene_ids = set(gandal_twas.GeneID)

# Restricted to the Bonferroni-filtered caudate rows.
shared_bonferroni = set(caudate_bonferroni.FILE) & gandal_gene_ids
print("There are %d caudate only genes present in the Gandal at Bonferroni < 0.05." % len(shared_bonferroni))

# Using every row of the caudate table.
shared_all = set(caudate.FILE) & gandal_gene_ids
print("There are %d caudate only genes present in the Gandal." % len(shared_all))
np.array(list(shared_all))

In [None]:
# Same overlap as for the full caudate table, but on the MHC-filtered tables
# (FILE column versus gandal_twas.GeneID).
gandal_gene_ids_noMHC = set(gandal_twas.GeneID)

# Restricted to the Bonferroni-filtered, MHC-filtered caudate rows.
shared_noMHC_bonferroni = set(caudate_noMHC_bonferroni.FILE) & gandal_gene_ids_noMHC
print("There are %d caudate only genes (no MHC) present in the Gandal at Bonferroni < 0.05." % len(shared_noMHC_bonferroni))

# Using every row of the MHC-filtered caudate table.
shared_noMHC_all = set(caudate_noMHC.FILE) & gandal_gene_ids_noMHC
print("There are %d caudate only genes (no MHC) present in the Gandal." % len(shared_noMHC_all))
np.array(list(shared_noMHC_all))

#### MHC is present within Gandal analysis

In [None]:
# Dimensions of the Gandal rows with TWAS.Bonferroni strictly below 0.05.
gandal_is_significant = gandal["TWAS.Bonferroni"] < 0.05
gandal.loc[gandal_is_significant].shape

In [None]:
## Not MHC
# Dimensions of the significant Gandal rows whose gene_name is not an MHC ID.
gandal_in_mhc = gandal['gene_name'].isin(list(set(mhc_genes)))
gandal_sig = gandal["TWAS.Bonferroni"] < 0.05
gandal[~gandal_in_mhc & gandal_sig].shape

In [None]:
# gene_name values of significant Gandal rows that are also in the MHC ID list.
mhc_id_list = list(set(mhc_genes))
mhc_and_significant = gandal['gene_name'].isin(mhc_id_list) & (gandal["TWAS.Bonferroni"] < 0.05)
gandal_mhc_names = gandal[mhc_and_significant].gene_name
print("There are {} MHC genes within Gandal significant TWAS (Bonferroni < 0.05)!".format(len(gandal_mhc_names)))
gandal_mhc_names

### Calculate enrichment with Gandal

In [None]:
# (rows, columns) of the caudate table, for reference before merging.
caudate.shape

In [None]:
# Inner-join the caudate and Gandal tables on caudate.FILE == gandal.GeneID,
# keeping only the identifier and Bonferroni columns from each side.
caudate_cols = caudate.loc[:, ['FILE', 'ID', 'Bonferroni']]
gandal_cols = gandal.loc[:, ['GeneID', 'gene_name', 'TWAS.Bonferroni']]
dft = pd.merge(caudate_cols, gandal_cols,
               left_on='FILE', right_on='GeneID',
               suffixes=["_Benjamin", "_Gandal"])
dft.shape

In [None]:
# 2x2 contingency table over the merged genes:
#   rows    -> caudate Bonferroni  (< 0.05, >= 0.05)
#   columns -> Gandal TWAS.Bonferroni (< 0.05, >= 0.05)
# Each side is tested explicitly (rather than negated) so rows with a missing
# value fall into neither category, exactly as comparisons on NaN behave.
ours_sig = dft['Bonferroni'] < 0.05
ours_not_sig = dft['Bonferroni'] >= 0.05
gandal_sig_merged = dft['TWAS.Bonferroni'] < .05
gandal_not_sig_merged = dft['TWAS.Bonferroni'] >= .05
table = [[np.sum(ours_sig & gandal_sig_merged),
          np.sum(ours_sig & gandal_not_sig_merged)],
         [np.sum(ours_not_sig & gandal_sig_merged),
          np.sum(ours_not_sig & gandal_not_sig_merged)]]
print(table)
fisher_exact(table)

### Extract and save overlapping genes

#### Bonferroni < 0.05

In [None]:
# IDs of Bonferroni-filtered caudate genes already reported elsewhere:
# first those whose FILE matches a gandal_twas GeneID, then those whose ID is
# among the TWAS hub genes. The two parts are concatenated as-is, so an ID
# present in both appears twice.
files_in_gandal = list(set(caudate_bonferroni.FILE) & set(gandal_twas.GeneID))
rows_in_gandal = caudate_bonferroni['FILE'].isin(files_in_gandal)
ids_in_gandal = np.array(caudate_bonferroni[rows_in_gandal].ID)
ids_in_hub = np.array(list(set(caudate_bonferroni.ID) & set(twas_hub)))
overlapping_twas = np.append(ids_in_gandal, ids_in_hub)
len(overlapping_twas)

In [None]:
# Write the Bonferroni-filtered caudate rows whose ID is not in
# overlapping_twas to a tab-separated file, then show their dimensions.
not_overlapping = ~caudate_bonferroni['ID'].isin(overlapping_twas)
caudate_only_bonferroni = caudate_bonferroni[not_overlapping]
caudate_only_bonferroni.to_csv("caudate_only_twasList_genes_bonferroni.txt",
                               sep='\t', index=False)
caudate_only_bonferroni.shape

In [None]:
# Non-overlapping Bonferroni-filtered caudate rows, then the dimensions of
# those with P above 5e-8.
# NOTE(review): 5e-8 is presumably the genome-wide significance cutoff - confirm.
keep_rows = ~caudate_bonferroni['ID'].isin(overlapping_twas)
drop_caudate = caudate_bonferroni.loc[keep_rows].copy()
drop_caudate.loc[drop_caudate['P'] > 5e-8].shape

In [None]:
# Rows of drop_caudate with P above 5e-8, ordered by ascending FDR and
# reduced to the columns of interest.
report_columns = ['ID', 'our_snp_id', 'CHR_TWAS', 'FDR', 'P', 'FILE']
p_above_cutoff = drop_caudate[drop_caudate['P'] > 5e-8]
p_above_cutoff.sort_values('FDR').loc[:, report_columns]

#### FDR < 0.05

In [None]:
# IDs of caudate genes (whole table) already reported elsewhere: first those
# whose FILE matches a gandal_twas GeneID, then those whose ID is among the
# TWAS hub genes. The two parts are concatenated as-is, so an ID present in
# both appears twice. This rebinds overlapping_twas.
files_in_gandal_all = list(set(caudate.FILE) & set(gandal_twas.GeneID))
rows_in_gandal_all = caudate['FILE'].isin(files_in_gandal_all)
ids_in_gandal_all = np.array(caudate[rows_in_gandal_all].ID)
ids_in_hub_all = np.array(list(set(caudate.ID) & set(twas_hub)))
overlapping_twas = np.append(ids_in_gandal_all, ids_in_hub_all)
len(overlapping_twas)

In [None]:
# Write the caudate rows whose ID is not in overlapping_twas to a
# tab-separated file in the working directory, then show their dimensions.
not_overlapping_all = ~caudate['ID'].isin(overlapping_twas)
caudate_only_all = caudate[not_overlapping_all]
caudate_only_all.to_csv("caudate_only_twasList_genes.txt",
                        sep='\t', index=False)
caudate_only_all.shape

In [None]:
# Non-overlapping caudate rows (whole table), then the dimensions of those
# with P above 5e-8. This rebinds drop_caudate.
# NOTE(review): 5e-8 is presumably the genome-wide significance cutoff - confirm.
keep_rows_all = ~caudate['ID'].isin(overlapping_twas)
drop_caudate = caudate.loc[keep_rows_all].copy()
drop_caudate.loc[drop_caudate['P'] > 5e-8].shape

In [None]:
# First 25 rows of drop_caudate with P above 5e-8, ordered by ascending FDR
# and reduced to the columns of interest.
report_columns_all = ['ID', 'our_snp_id', 'CHR_TWAS', 'FDR', 'P', 'FILE']
p_above_cutoff_all = drop_caudate[drop_caudate['P'] > 5e-8]
by_fdr = p_above_cutoff_all.sort_values('FDR')
by_fdr.loc[:, report_columns_all].head(25)

In [None]:
# Look up the row(s) of drop_caudate whose ID is "MIAT".
is_miat = drop_caudate["ID"] == "MIAT"
drop_caudate.loc[is_miat, ['ID', 'our_snp_id', 'CHR_TWAS', 'FDR', 'P', 'FILE']]

## TWAS tissue summary

In [None]:
# Load the comma-separated per-gene tissue summary and show (rows, columns).
summary_path = "../../libd_twas_comparison/_m/TWAS_gene_tissue_summary.csv"
brainseq = pd.read_csv(summary_path)
brainseq.shape

In [None]:
# Preview the first two rows of the tissue summary table.
brainseq.head(2)

In [None]:
# Annotate the tissue summary with two indicator columns:
#   inTWAS_HUB -> 1 when Symbol is among the TWAS hub genes
#   inGandal   -> 1 when Geneid is among the gandal_twas GeneIDs
# Left joins keep every brainseq row; rows without a match get NaN.
hub_flags = pd.DataFrame({'Symbol': twas_hub, 'inTWAS_HUB': 1})
# De-duplicate both lookup tables on their join key: a repeated key on the
# right side of a left join would otherwise emit the matching brainseq row
# once per repeat and inflate the summary.
hub_flags = hub_flags.drop_duplicates(subset='Symbol')
gandal_flags = pd.DataFrame({'Geneid': gandal_twas.GeneID.drop_duplicates(),
                             'inGandal': 1})
bb = brainseq.merge(hub_flags, on='Symbol', how='left')\
             .merge(gandal_flags, on="Geneid", how='left')

# Written to the working directory (not back over the input file read above).
bb.to_csv('TWAS_gene_tissue_summary.csv', index=False)