In [None]:
# Leiden and igraph
import pandas as pd
import numpy as np
from sklearn.neighbors import kneighbors_graph
import igraph as ig
import leidenalg
import re
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests


# Ensembl ID to Entrez ID conversion
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter


# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS


# Word cloud
import wordcloud
import math
import random

# GTF parser for Ensembl ID to gene symbol conversion
from gtfparse import read_gtf

In [None]:
def latent_variables_to_leiden(df, n_neighbors, directed, seed):
    """Cluster the rows of ``df`` by running Leiden on a k-nearest-neighbour graph.

    Every neighbour relation found by ``kneighbors_graph`` becomes an edge
    whose weight is the inverse of the squared distance between the two rows.
    The weighted graph is then partitioned with
    ``leidenalg.ModularityVertexPartition``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per item to cluster.
    n_neighbors : int
        Number of neighbours per row in the kNN graph.
    directed : bool
        Passed to ``ig.Graph`` to choose a directed or undirected graph.
    seed : int
        Random seed handed to ``leidenalg.find_partition``.

    Returns
    -------
    tuple
        ``(membership_df, part)`` where ``membership_df`` is indexed like
        ``df`` and has a single ``cluster_id`` column, and ``part`` is the
        partition object returned by leidenalg.
    """
    knn_options = dict(include_self=False)
    connectivity = kneighbors_graph(df, n_neighbors, mode='connectivity', **knn_options)
    distances = kneighbors_graph(df, n_neighbors, mode='distance', **knn_options)
    rows, cols = connectivity.nonzero()

    # Edge weight = 1 / distance^2.
    # NOTE(review): two identical rows have distance 0, which yields an
    # infinite weight here — confirm the input never contains duplicates.
    edge_weights = 1 / distances[rows, cols].A1 ** 2

    graph = ig.Graph(directed=directed)
    graph.add_vertices(connectivity.shape[0])  # one vertex per row of df
    graph.add_edges(list(zip(rows, cols)))
    graph.es['weight'] = edge_weights

    partition = leidenalg.find_partition(
        graph,
        leidenalg.ModularityVertexPartition,
        weights='weight',
        seed=seed,
    )
    membership_df = pd.DataFrame({'cluster_id': partition.membership}, index=df.index)
    return (membership_df, partition)

In [None]:
# Ensembl to Entrez

class EnsemblIDToEntrezIDConverter():
    """Translate Ensembl gene IDs into Entrez gene IDs.

    The Ensembl/Entrez table is read once, at construction time, from the
    R package ``org.Hs.eg.db`` through rpy2. When an Ensembl ID maps to
    several Entrez IDs only the first one listed is kept.
    """

    def __init__(self):
        with localconverter(ro.default_converter + pandas2ri.converter):
            df_a = ro.conversion.rpy2py(ro.r('''
                                             library(org.Hs.eg.db)
                                             as.data.frame(org.Hs.egENSEMBL)
                                            '''))
            # Indexed by 'ensembl_id'; the 'gene_id' column holds the Entrez ID.
            self.entrez_ensembl_df = df_a.groupby(['ensembl_id']).first()

    def convert(self, gene_list_or_set):
        """Return the set of Entrez IDs (ints) for the given Ensembl IDs.

        Parameters
        ----------
        gene_list_or_set : iterable of str
            Ensembl gene IDs, with or without a version suffix
            (``ENSG00000123456.7``); everything from the first dot on is
            dropped before the lookup.

        Returns
        -------
        set of int
            Entrez IDs of the inputs that are present in the mapping table.
            IDs without a mapping are silently skipped.
        """
        versionless_ids = {re.sub(r"\..*$", "", x) for x in gene_list_or_set}
        # Look the IDs up directly in the mapping index. The previous
        # implementation built an empty DataFrame with a *set* as index,
        # which recent pandas versions refuse.
        is_requested = self.entrez_ensembl_df.index.isin(list(versionless_ids))
        matched = self.entrez_ensembl_df.loc[is_requested, 'gene_id']
        return set(matched.astype(int))
    
# Gene Ontology
    
def get_entrez_ensembl_df():
    """Load the Entrez/Ensembl mapping of ``org.Hs.eg.db`` from R.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``ensembl_id``, holding the first row found for each
        Ensembl ID in ``org.Hs.egENSEMBL``.
    """
    r_code = '''
                                         library(org.Hs.eg.db)
                                         as.data.frame(org.Hs.egENSEMBL)
                                        '''
    converter = ro.default_converter + pandas2ri.converter
    with localconverter(converter):
        mapping = ro.conversion.rpy2py(ro.r(r_code))
        return mapping.groupby(['ensembl_id']).first()


class MyGeneOntologyAnalysis():
    """Gene Ontology enrichment of human gene sets using goatools.

    Construction fetches the GO ontology and the NCBI gene2go associations
    through goatools' download helpers and prepares a
    ``GOEnrichmentStudyNS`` whose background population is every human
    (taxid 9606) gene that has at least one annotation.
    """

    def __init__(self):
        obo_fname = download_go_basic_obo()
        file_gene2go = download_ncbi_associations()
        # Load the file the download helper reports instead of a hard-coded
        # name, so the two cannot drift apart.
        obodag = GODag(obo_fname)

        # Read NCBI's gene2go. Store annotations in a list of namedtuples
        objanno = Gene2GoReader(file_gene2go, taxids=[9606])

        # Get associations for each branch of the GO DAG (BP, MF, CC)
        ns2assoc = objanno.get_ns2assc()

        for nspc, id2gos in ns2assoc.items():
            print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))

        # Background population: union of the annotated genes of every namespace.
        genes_with_annotation = set.union(*(set(x.keys()) for x in ns2assoc.values()))
        self.alpha = 0.05
        self.goeaobj = GOEnrichmentStudyNS(
            genes_with_annotation,  # population of human genes
            ns2assoc,  # geneid/GO associations
            obodag,  # ontologies
            propagate_counts=False,
            alpha=self.alpha,  # significance cut-off
            methods=['fdr_bh'])  # multiple-testing correction method

    def goea_results_all(self, gene_set):
        """Run the enrichment study on ``gene_set`` and return every result record."""
        return self.goeaobj.run_study(gene_set)

    def goea_results_significant(self, gene_set):
        """Return only the records whose BH-adjusted p-value is below ``self.alpha``."""
        all_results = self.goea_results_all(gene_set)
        return [r for r in all_results if r.p_fdr_bh < self.alpha]
    
# Word cloud
    
class GeneOntologyWordCloud():
    """Render GO enrichment results as a word cloud.

    Each enriched GO term name is repeated in a synthetic text a number of
    times proportional to ``-log2`` of its uncorrected p-value; the text is
    then handed to ``wordcloud.WordCloud``.
    """

    # Smallest positive double. P-values are floored to it so that log2 stays
    # defined when a very small p-value underflows to exactly 0.0.
    _MIN_PVALUE = 5e-324

    def __init__(self):
        self.wc = wordcloud.WordCloud(colormap='rainbow',
                                      stopwords=['integral', 'component', 'of', 'process', 'activity', 'to'],
                                      collocations=True,
                                      ranks_only=True,
                                      )

    def gen_random_text(self):
        """Return three random 150-letter 'words' used as a spacer between GO term names."""
        return ' '.join(''.join((random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(150))) for _ in range(3))

    def _repetitions(self, p_value):
        """Number of times a term is repeated: ``int(-10 * log2(p))`` with ``p`` floored above zero."""
        return int((-10) * math.log2(max(p_value, self._MIN_PVALUE)))

    def goea_to_text(self, goea_results):
        """Build the word-cloud input text from enrichment records.

        Only records with ``enrichment == 'e'`` are used. Every repetition of
        a term name is preceded by a fresh random spacer.
        """
        return ' '.join(
            ''.join(' ' + self.gen_random_text() + ' ' + x.name
                    for _ in range(self._repetitions(x.p_uncorrected)))
            for x in goea_results if x.enrichment == 'e')

    def generate_image(self, goea_results):
        """Generate the word cloud for ``goea_results`` and return it as an image."""
        return self.wc.generate(self.goea_to_text(goea_results)).to_image()
        
    

In [None]:
def get_gtf_genes_df(gtf_path="/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf",
                     cache_path="gtf_df_genes.csv"):
    """Return the ``gene_id`` / ``gene_name`` table of the GTF annotation.

    The table is cached as a CSV file: when ``cache_path`` can be read it is
    returned directly, otherwise the GTF is parsed, the ``gene`` features are
    kept and the result is written to ``cache_path``.

    Parameters
    ----------
    gtf_path : str
        GTF annotation file parsed when no usable cache exists.
    cache_path : str
        CSV cache location.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene_id`` and ``gene_name``.
    """
    try:
        return pd.read_csv(cache_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Missing or empty cache: rebuild it below. Any other failure
        # (permissions, interrupt, ...) is no longer silently swallowed.
        pass

    gtf_df = read_gtf(gtf_path)
    # Reset the index so a fresh build returns the same frame a cached read would.
    gtf_df_genes = gtf_df[gtf_df["feature"] == "gene"][['gene_id', 'gene_name']].reset_index(drop=True)
    gtf_df_genes.to_csv(cache_path, index=False)
    return gtf_df_genes
        

In [None]:
class ClusterGOWordCloud():
    """Cluster genes on their latent variables and annotate each cluster.

    Construction reads the latent-variable table, keeps the columns whose
    name contains ``'mu'``, clusters the rows with
    ``latent_variables_to_leiden`` and prepares the helpers needed for the
    per-cluster GO enrichment and word cloud.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbours for the kNN graph.
    directed : bool
        Whether the kNN graph is directed.
    seed : int
        Random seed for the Leiden partitioning.
    latent_variables_path : str, optional
        CSV file with the latent variables (first column is the index).
    """

    def __init__(self, n_neighbors, directed, seed,
                 latent_variables_path='../../_m/latent_variables.csv'):
        self.n_neighbors = n_neighbors
        expression_df = pd.read_csv(latent_variables_path, index_col=0)
        mucols = [x for x in expression_df.columns if 'mu' in x]
        self.expression_df = expression_df[mucols]
        (self.mdf, self.part) = latent_variables_to_leiden(self.expression_df, n_neighbors, directed, seed)
        self.gtf_df_genes = get_gtf_genes_df()
        self.e2e = EnsemblIDToEntrezIDConverter()
        self.mygoa = MyGeneOntologyAnalysis()
        self.gowc = GeneOntologyWordCloud()

    def cluster_df(self, cluster_id):
        """Return the members of ``cluster_id`` as a frame indexed by ``gene_id`` with their ``gene_name``.

        Members without an entry in the GTF gene table are kept (left join)
        with a missing ``gene_name``.
        """
        # Use this instance's gene table; the previous code reached for the
        # notebook-level global ``cgowc``.
        members = self.expression_df.iloc[self.part[cluster_id]][[]]
        return members\
            .merge(self.gtf_df_genes, left_index=True, right_on='gene_id', how='left')\
            .set_index('gene_id')

    def pipeline(self, cluster_id, filename_prefix):
        """Write the gene list, GO enrichment table and word cloud of one cluster.

        Produces ``<prefix>_genes.csv`` and ``<prefix>_go_enrichment.tsv``;
        ``<prefix>_go_wordcloud.png`` is written only when at least one GO
        term is significant.
        """
        self.cluster_df(cluster_id).to_csv("%s_genes.csv" % filename_prefix)

        nn = set((self.expression_df.index[x] for x in self.part[cluster_id]))

        go_r = self.mygoa.goea_results_significant(self.e2e.convert(nn))
        self.mygoa.goeaobj.wr_tsv("%s_go_enrichment.tsv" % filename_prefix, go_r)

        if len(go_r) > 0:
            p = self.gowc.generate_image(go_r)
            p.save("%s_go_wordcloud.png" % filename_prefix)
        

In [None]:
# Build the analysis object: 8 nearest neighbours, undirected graph, fixed Leiden seed.
cgowc = ClusterGOWordCloud(8, False, 1092333)

In [None]:
# Number of modules found by the Leiden partitioning.
len(cgowc.part)

In [None]:
# Modules holding rows 1 and 0 of the latent-variable table (the two D2
# junctions, going by the labels printed here).
a = [module for module in range(len(cgowc.part)) if 1 in cgowc.part[module]][0]
print("D2 junction 5-6 is in module", a)

a = [module for module in range(len(cgowc.part)) if 0 in cgowc.part[module]][0]
print("D2 junction 5-7 is in module", a)

# gene symbol -> gene_id lookup built from the GTF gene table
d = dict(zip(cgowc.gtf_df_genes['gene_name'], cgowc.gtf_df_genes['gene_id']))

setd1a_id = d['SETD1A']
a = cgowc.mdf.loc[setd1a_id, 'cluster_id']
print("SETD1A is in module", a)

drd2_id = d['DRD2']
a = cgowc.mdf.loc[drd2_id, 'cluster_id']
print("DRD2 is in module", a)

# GWAS, TWAS, SMR, and DE enrichment

In [None]:
# Differentially expressed genes: only the first column of the table is read
# and its values (used as the index) form the gene set.
de_genes = set(pd.read_csv('../../../../../../differential_expression/_m/genes/diffExpr_szVctl_FDR05.txt',
                      sep='\t', usecols=[0], index_col=0).index)
len(de_genes)

In [None]:
# Gene annotation; "Feature" is the gene_id with everything from the first
# dot on (the version suffix) removed.
annot = pd.read_csv(
    "/ceph/projects/v4_phase3_paper/inputs/counts/text_files_counts/_m/caudate/gene.bed",
    sep='\t',
    index_col=0,
).assign(Feature=lambda bed: bed.gene_id.str.replace("\\..*", "", regex=True))

# TWAS associations with FDR < 0.05, joined to the annotation to recover the
# versioned gene_id.
twas = (
    pd.read_csv('/ceph/projects/v4_phase3_paper/analysis/twas_ea/'
                'gene_weights/fusion/summary_stats/_m/fusion_associations.txt',
                sep='\t')
    .loc[lambda assoc: assoc["FDR"] < 0.05]
    .merge(annot, left_on="FILE", right_on="Feature")
)

twas_genes = set(twas['gene_id'])
len(twas_genes)

In [None]:
## Prioritized genes from PGC3 (FINEMAP or SMR evidence), joined to the
## annotation on the version-less Ensembl ID.
gwas_fn = (
    '/ceph/users/jbenja13/resources/gwas/pgc3/_m/'
    'nature_submission_11.08.2021/Supplementary Tables/'
    'Supplementary Table 12 - Prioritized Genes UPDATED.xlsx'
)
gwas_df = (
    pd.read_excel(gwas_fn, sheet_name="Prioritised")
    .merge(annot, left_on="Ensembl.ID", right_on="Feature")
)
gwas_genes = set(gwas_df['gene_id'])
len(gwas_genes)

In [None]:
# gene_ids listed in the MHC-region gene table; used below to repeat the
# enrichment with these genes removed.
mhc_genes = set(pd.read_csv('/ceph/projects/v4_phase3_paper/inputs/counts/mhc_region_genes/_m/mhc_genes.csv')['gene_id'])
len(mhc_genes)

In [None]:
# SMR hits: FDR below 0.05 and HEIDI p-value above 0.01.
smr_fn = "../../../../../../smr/_m/eqtl_genes.eqtl_p1e-04.gwas_p5e-08.csv"
smr_df = pd.read_csv(smr_fn)
smr_genes = set(
    smr_df.loc[
        lambda res: (res["FDR"] < 0.05) & (res["p_HEIDI"] > 0.01),
        "probeID",
    ]
)
len(smr_genes)

In [None]:
def fet(a, b, u):
    """Fisher's exact test for the overlap of two sets within a universe.

    Parameters
    ----------
    a, b : set
        The two sets to compare; members outside ``u`` are ignored.
    u : set
        The universe of items.

    Returns
    -------
    The result of ``scipy.stats.fisher_exact`` on the 2x2 table
    ``[[a and b, not-a and b], [a and not-b, not-a and not-b]]``.
    """
    in_a = u.intersection(a)
    in_b = u.intersection(b)
    out_a = u - a
    out_b = u - b
    table = [
        [len(in_a & in_b), len(out_a & in_b)],
        [len(in_a & out_b), len(out_a & out_b)],
    ]
    return stats.fisher_exact(table)

In [None]:
def enrichment_rows():
    """Yield one enrichment record per module of ``cgowc.part``.

    Each record is ``(module_id, n_members, gwas_or, gwas_p, twas_or, twas_p,
    smr_or, smr_p, de_or, de_p)``, the odds ratio / p-value pairs coming from
    ``fet`` against the GWAS, TWAS, SMR and DE gene sets.
    """
    modules = cgowc.part
    row_labels = cgowc.expression_df.index
    # The universe leaves out the first three rows of the latent-variable
    # table. NOTE(review): presumably those are non-gene rows (e.g. the
    # junctions looked up earlier in the notebook) — confirm.
    universe = set(row_labels[3:])
    gene_sets = (gwas_genes, twas_genes, smr_genes, de_genes)

    for module_id in range(len(modules)):
        members = modules[module_id]
        module_genes = {row_labels[pos] for pos in members}
        record = [module_id, len(members)]
        for gene_set in gene_sets:
            record.extend(fet(module_genes, gene_set, universe))
        yield tuple(record)
    

In [None]:
# Per-module enrichment table, one row per module.
edf1 = pd.DataFrame.from_records(
    enrichment_rows(),
    columns=['module_id', 'n_genes', 'gwas_or', 'gwas_p', 'twas_or', 'twas_p',
             'smr_or', "smr_p", 'de_or', 'de_p'],
    index='module_id',
)
# Benjamini-Hochberg adjustment across modules, one new column per gene set.
edf1 = edf1.assign(**{
    '%s_fdr_bh' % trait: multipletests(edf1['%s_p' % trait], method='fdr_bh')[1]
    for trait in ('twas', 'gwas', 'smr', 'de')
})
edf1[['n_genes', 'gwas_or', 'gwas_p', 'gwas_fdr_bh', 'twas_or', 'twas_p', 'twas_fdr_bh',
      'smr_or', "smr_p", "smr_fdr_bh", 'de_or', 'de_p', 'de_fdr_bh']].to_csv('module_enrichment.csv')
edf1

In [None]:
def enrichment_rows_nomhc():
    """Yield one enrichment record per module with the MHC-region genes removed.

    Same record layout as ``enrichment_rows``: ``(module_id, n_members,
    gwas_or, gwas_p, twas_or, twas_p, smr_or, smr_p, de_or, de_p)``.
    ``mhc_genes`` is subtracted from the universe, from each module's gene set
    and from every reference gene set before the Fisher tests. ``n_members``
    is still the full module size, MHC genes included.
    """
    part = cgowc.part
    df = cgowc.expression_df
    # The first three rows of the table are left out of the universe.
    # NOTE(review): presumably non-gene rows — confirm.
    u = set(df.index[3:]) - mhc_genes
    # These differences do not depend on the module: compute them once instead
    # of once per module.
    reference_sets = [genes - mhc_genes
                      for genes in (gwas_genes, twas_genes, smr_genes, de_genes)]
    for ii in range(len(part)):
        a = set((df.index[x] for x in part[ii])) - mhc_genes
        stats_pairs = []
        for reference in reference_sets:
            stats_pairs.extend(fet(a, reference, u))
        yield (ii, len(part[ii]), *stats_pairs)
        

In [None]:
# Per-module enrichment table with the MHC-region genes excluded.
edf2 = pd.DataFrame.from_records(
    enrichment_rows_nomhc(),
    columns=['module_id', 'n_genes', 'gwas_or', 'gwas_p', 'twas_or', 'twas_p',
             'smr_or', "smr_p", 'de_or', 'de_p'],
    index='module_id',
)
# Benjamini-Hochberg adjustment across modules, one new column per gene set.
edf2 = edf2.assign(**{
    '%s_fdr_bh' % trait: multipletests(edf2['%s_p' % trait], method='fdr_bh')[1]
    for trait in ('twas', 'gwas', 'smr', 'de')
})
edf2[['n_genes', 'gwas_or', 'gwas_p', 'gwas_fdr_bh', 'twas_or', 'twas_p', 'twas_fdr_bh',
      'smr_or', "smr_p", "smr_fdr_bh", 'de_or', 'de_p', 'de_fdr_bh']].to_csv('module_enrichment_excluding_mhc_region.csv')
edf2

# GO enrichment for each cluster

In [None]:
# Run the pipeline for every module. Each call writes module<N>_genes.csv and
# module<N>_go_enrichment.tsv, plus module<N>_go_wordcloud.png when at least
# one GO term is significant.
for cluster_id in range(len(cgowc.part)):
    cgowc.pipeline(cluster_id, 'module%d' % cluster_id)

In [None]:
# IPython automagic: show the working directory that the relative paths
# used in this notebook resolve against.
pwd