In [1]:
# Leiden and igraph
import pandas as pd
import numpy as np
from sklearn.neighbors import kneighbors_graph
import igraph as ig
import leidenalg
import re
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests


# Ensembl ID to Entrez ID conversion
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter


# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS


# Word cloud
import wordcloud
import math
import random

# GTF parser for Ensembl ID to gene symbol conversion
from gtfparse import read_gtf



In [2]:
def latent_variables_to_leiden(df, n_neighbors, directed, seed):
    """Cluster the rows of ``df`` with Leiden on a weighted k-nearest-neighbour graph.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per item to cluster; its index labels the returned membership table.
    n_neighbors : int
        Number of nearest neighbours used to build the graph.
    directed : bool
        Passed to ``ig.Graph`` to choose a directed or undirected graph.
    seed : int
        Random seed handed to ``leidenalg.find_partition``.

    Returns
    -------
    tuple
        ``(membership_df, part)``: a DataFrame with a single ``cluster_id`` column
        indexed like ``df``, and the partition object returned by leidenalg.
    """
    # The connectivity graph supplies the edge list; the distance graph supplies
    # the matching neighbour distances for those same edges.
    knn_connectivity = kneighbors_graph(df, n_neighbors, mode='connectivity', include_self=False)
    knn_distance = kneighbors_graph(df, n_neighbors, mode='distance', include_self=False)

    edge_sources, edge_targets = knn_connectivity.nonzero()

    # Edge weight is the inverse squared distance.
    # NOTE(review): two identical rows (distance 0) would give an infinite weight here.
    edge_distances = knn_distance[edge_sources, edge_targets].A1
    edge_weights = 1 / edge_distances ** 2

    graph = ig.Graph(directed=directed)
    graph.add_vertices(knn_connectivity.shape[0])  # one vertex per row of df
    graph.add_edges(list(zip(edge_sources, edge_targets)))
    graph.es['weight'] = edge_weights

    part = leidenalg.find_partition(
        graph,
        leidenalg.ModularityVertexPartition,
        weights='weight',
        seed=seed,
    )

    membership_df = pd.DataFrame({'cluster_id': part.membership}, index=df.index)

    return (membership_df, part)

In [3]:
# Ensembl to Entrez

class EnsemblIDToEntrezIDConverter():
    """Translate Ensembl gene IDs into Entrez gene IDs.

    The mapping table is read once, at construction time, from the
    ``org.Hs.egENSEMBL`` map of the R package ``org.Hs.eg.db`` (via rpy2).
    When an Ensembl ID maps to several Entrez IDs, only the first one is kept.
    """

    # Matches the version suffix of an Ensembl ID: the first '.' and everything after it.
    _VERSION_SUFFIX = re.compile(r"\..*$")

    def __init__(self):
        """Load the Ensembl -> Entrez table from R into ``self.entrez_ensembl_df``."""
        with localconverter(ro.default_converter + pandas2ri.converter):
            df_a = ro.conversion.rpy2py(ro.r('''
                                             library(org.Hs.eg.db)
                                             as.data.frame(org.Hs.egENSEMBL)
                                            '''))
            # One row per Ensembl ID (index), keeping the first Entrez 'gene_id'.
            self.entrez_ensembl_df = df_a.groupby(['ensembl_id']).first()

    def convert(self, gene_list_or_set):
        """Return the set of Entrez IDs (as integers) for the given Ensembl IDs.

        Parameters
        ----------
        gene_list_or_set : iterable of str
            Ensembl gene IDs, with or without a version suffix (text from the
            first '.' onwards is stripped before the lookup).

        Returns
        -------
        set
            Entrez IDs of the inputs found in the mapping table. IDs with no
            mapping are silently dropped; an empty input yields an empty set.
        """
        unversioned_ids = {self._VERSION_SUFFIX.sub("", x) for x in gene_list_or_set}
        # The table index is unique, so a membership filter is equivalent to an inner join.
        is_requested = self.entrez_ensembl_df.index.isin(unversioned_ids)
        entrez_ids = set(self.entrez_ensembl_df.loc[is_requested, 'gene_id'].astype(int))
        return entrez_ids
        
    
    
    
# Gene Ontology
    
def get_entrez_ensembl_df():
    """Return the Ensembl -> Entrez mapping table from R's ``org.Hs.eg.db``.

    The ``org.Hs.egENSEMBL`` map is converted to a pandas DataFrame through rpy2
    and collapsed to one row per ``ensembl_id`` (the first row of each group).
    """
    with localconverter(ro.default_converter + pandas2ri.converter):
        full_mapping = ro.conversion.rpy2py(ro.r('''
                                         library(org.Hs.eg.db)
                                         as.data.frame(org.Hs.egENSEMBL)
                                        '''))
        first_per_ensembl_id = full_mapping.groupby(['ensembl_id']).first()
        return first_per_ensembl_id


class MyGeneOntologyAnalysis():
    """Gene Ontology enrichment for sets of human Entrez gene IDs, using goatools.

    Construction downloads (if needed) the GO ontology and NCBI's gene2go
    annotations, then prepares a ``GOEnrichmentStudyNS`` whose background
    population is every human gene that has at least one GO annotation.
    """

    def __init__(self):
        obo_fname = download_go_basic_obo()
        file_gene2go = download_ncbi_associations()
        # Load the ontology from the path the downloader returned rather than
        # from a second, hard-coded copy of the file name.
        obodag = GODag(obo_fname)

        # Read NCBI's gene2go. Store annotations in a list of namedtuples
        objanno = Gene2GoReader(file_gene2go, taxids=[9606])

        # Get associations for each branch of the GO DAG (BP, MF, CC)
        ns2assoc = objanno.get_ns2assc()

        for nspc, id2gos in ns2assoc.items():
            print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))

        # Background population: union of annotated genes over all namespaces.
        genes_with_annotation = set.union(*(set(x.keys()) for x in ns2assoc.values()))

        # Significance cut-off shared by the study object and goea_results_significant().
        self.alpha = 0.05

        self.goeaobj = GOEnrichmentStudyNS(
            genes_with_annotation,  # List of human genes
            ns2assoc,  # geneid/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=self.alpha,  # default significance cut-off
            methods=['fdr_bh'])  # default multipletest correction method

    def goea_results_all(self, gene_set):
        """Run the enrichment study on ``gene_set`` and return every result record."""
        return self.goeaobj.run_study(gene_set)

    def goea_results_significant(self, gene_set):
        """Return only the records whose Benjamini-Hochberg FDR is below ``self.alpha``."""
        all_results = self.goea_results_all(gene_set)
        return [r for r in all_results if r.p_fdr_bh < self.alpha]
    
    

# Word cloud
    
class GeneOntologyWordCloud():
    """Render GO enrichment results as a word cloud of enriched term names.

    Each enriched term name is repeated in the input text in proportion to
    ``-log2`` of its uncorrected p-value.
    """

    # Floor applied to p-values before taking the logarithm: a p-value that has
    # underflowed to exactly 0.0 would otherwise make math.log2 raise ValueError.
    _MIN_PVALUE = 1e-300

    def __init__(self):
        self.wc = wordcloud.WordCloud(colormap='rainbow',
                             stopwords=['integral', 'component', 'of', 'process', 'activity', 'to'],
                             collocations = True,
                             # NOTE(review): ranks_only appears to be deprecated in wordcloud — confirm before removing.
                             ranks_only=True,
                         )

    def gen_random_text(self):
        """Return three space-separated random 150-letter lowercase words.

        Used as a spacer between gene ontology terms.
        """
        return ' '.join(''.join((random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(150))) for _ in range(3))

    def _repeat_count(self, p_value):
        """Number of times a term is repeated: ``int(-10 * log2(p))``, with p floored at ``_MIN_PVALUE``."""
        return int((-10) * math.log2(max(p_value, self._MIN_PVALUE)))

    def goea_to_text(self, goea_results):
        """Build the word-cloud input text from enrichment records.

        Only records with ``enrichment == 'e'`` are used. Each contributes its
        ``name`` (preceded by a random spacer) ``_repeat_count(p_uncorrected)``
        times; the per-record chunks are joined with single spaces.
        """
        chunks = []
        for x in goea_results:
            if x.enrichment != 'e':
                continue
            repeats = self._repeat_count(x.p_uncorrected)
            chunks.append(''.join(' ' + self.gen_random_text() + ' ' + x.name for _ in range(repeats)))
        return ' '.join(chunks)

    def generate_image(self, goea_results):
        """Return the word-cloud image generated from ``goea_results``."""
        return self.wc.generate(self.goea_to_text(goea_results)).to_image()
        
    

In [4]:
def get_gtf_genes_df(cache_path="gtf_df_genes.csv",
                     gtf_path="/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf"):
    """Return a table of ``gene_id`` / ``gene_name`` pairs from the GTF annotation.

    The table is read from ``cache_path`` when that CSV exists. Otherwise the
    GTF at ``gtf_path`` is parsed, its ``feature == "gene"`` rows are reduced to
    the two columns, and the result is written to ``cache_path`` for next time.

    Parameters
    ----------
    cache_path : str
        CSV file used as the cache.
    gtf_path : str
        GTF annotation parsed on a cache miss.

    Returns
    -------
    pandas.DataFrame
    """
    try:
        gtf_df_genes = pd.read_csv(cache_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing or empty cache triggers the rebuild; any other error
        # (including an interrupt) propagates instead of being silently swallowed.
        gtf_df = read_gtf(gtf_path)
        gtf_df_genes = gtf_df[gtf_df["feature"] == "gene"][['gene_id', 'gene_name']]
        gtf_df_genes.to_csv(cache_path, index=None)

    return gtf_df_genes
        

In [5]:
class ClusterGOWordCloud():
    """Leiden-cluster the latent variables, then annotate each cluster with GO terms.

    Construction loads the latent-variable table, clusters its rows with
    ``latent_variables_to_leiden`` and sets up the helpers used by ``pipeline``:
    the gene_id/gene_name table, the Ensembl -> Entrez converter, the GO
    enrichment study and the word-cloud renderer.
    """

    def __init__(self, n_neighbors, directed, seed,
                 latent_variables_path='../../_m/latent_variables.csv'):
        """
        Parameters
        ----------
        n_neighbors : int
            Number of nearest neighbours for the clustering graph.
        directed : bool
            Whether the clustering graph is directed.
        seed : int
            Random seed for the Leiden algorithm.
        latent_variables_path : str
            CSV whose first column is the row index; only columns whose name
            contains 'mu' are kept.
        """
        self.n_neighbors = n_neighbors
        expression_df = pd.read_csv(latent_variables_path, index_col=0)
        mucols = [x for x in expression_df.columns if 'mu' in x]
        self.expression_df = expression_df[mucols]

        (self.mdf, self.part) = latent_variables_to_leiden(self.expression_df, n_neighbors, directed, seed)

        self.gtf_df_genes = get_gtf_genes_df()

        self.e2e = EnsemblIDToEntrezIDConverter()
        self.mygoa = MyGeneOntologyAnalysis()
        self.gowc = GeneOntologyWordCloud()

    def cluster_df(self, cluster_id):
        """Return the members of one cluster, indexed by ``gene_id``.

        The rows of ``expression_df`` at the positions listed in
        ``self.part[cluster_id]`` are left-joined to ``self.gtf_df_genes`` on
        ``gene_id``, so the result carries the remaining annotation columns
        (e.g. ``gene_name``) and no latent-variable columns.
        """
        # Use this instance's annotation table, not a notebook-level global, so the
        # method works for any instance and on a fresh kernel.
        return self.expression_df.iloc[self.part[cluster_id]][[]]\
        .merge(self.gtf_df_genes, left_index=True, right_on='gene_id', how='left')\
        .set_index('gene_id')

    def pipeline(self, cluster_id, filename_prefix):
        """Write the gene list, GO enrichment table and word cloud for one cluster.

        Files written:
          * ``<filename_prefix>_genes.csv`` -- output of ``cluster_df``.
          * ``<filename_prefix>_go_enrichment.tsv`` -- significant GO terms.
          * ``<filename_prefix>_go_wordcloud.png`` -- only when at least one
            GO term is significant.
        """
        self.cluster_df(cluster_id).to_csv("%s_genes.csv" % filename_prefix)

        # Row labels of the cluster members (positions -> index values).
        cluster_gene_ids = set((self.expression_df.index[x] for x in self.part[cluster_id]))

        go_r = self.mygoa.goea_results_significant(self.e2e.convert(cluster_gene_ids))
        self.mygoa.goeaobj.wr_tsv("%s_go_enrichment.tsv" % filename_prefix, go_r)

        if len(go_r) > 0:
            p = self.gowc.generate_image(go_r)
            p.save("%s_go_wordcloud.png" % filename_prefix)
        
    
    


In [6]:
# Shared pipeline object used by the cells below.
# Arguments are (n_neighbors, directed, seed): 8 nearest neighbours, an undirected graph, fixed Leiden seed.
cgowc = ClusterGOWordCloud(8, False, 1092333)




  result = parse_gtf(


  result = parse_gtf(
INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_status', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_status', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']




Attaching package: ‘BiocGenerics’



    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB



    IQR, mad, sd, var, xtabs



    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min




   

requests.get(http://purl.obolibrary.org/obo/go/go-basic.obo, stream=True)
  WROTE: go-basic.obo

FTP RETR ftp.ncbi.nlm.nih.gov gene/DATA gene2go.gz -> gene2go.gz
  gunzip gene2go.gz
go-basic.obo: fmt(1.2) rel(2021-09-01) 47,191 GO Terms
HMS:0:00:04.667423 330,404 annotations, 20,688 genes, 18,642 GOs, 1 taxids READ: gene2go 
MF 18,194 annotated human genes
CC 19,433 annotated human genes
BP 18,501 annotated human genes

Load BP Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 90% 18,501 of 20,652 population items found in association

Load CC Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 94% 19,433 of 20,652 population items found in association

Load MF Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 88% 18,194 of 20,652 population items found in association


In [7]:
# Number of modules in the Leiden partition
len(cgowc.part)

20

In [8]:
def _module_containing(row_position):
    """Return the id of the first module whose member list contains ``row_position``."""
    return [m for m in range(len(cgowc.part)) if row_position in cgowc.part[m]][0]


# The two junctions are looked up by their row position in the latent-variable table.
a = _module_containing(1)
print("D2 junction 5-6 is in module", a)

a = _module_containing(0)
print("D2 junction 5-7 is in module", a)

# Genes are looked up by symbol: gene_name -> gene_id, then gene_id -> cluster_id.
d = dict(zip(cgowc.gtf_df_genes['gene_name'], cgowc.gtf_df_genes['gene_id']))

a = cgowc.mdf.loc[d['SETD1A'], 'cluster_id']
print("SETD1A is in module", a)

a = cgowc.mdf.loc[d['DRD2'], 'cluster_id']
print("DRD2 is in module", a)




D2 junction 5-6 is in module 0
D2 junction 5-7 is in module 0
SETD1A is in module 5
DRD2 is in module 0


# GWAS, TWAS and DE enrichment


In [9]:
# Differentially expressed genes: the gene IDs are the first column of the table.
# NOTE(review): the file name suggests an FDR < 0.05 SZ-vs-control list — confirm.
de_table = pd.read_csv(
    '/ceph/projects/v4_phase3_paper/analysis/differential_expression/_m/genes/diffExpr_szVctl_FDR05.txt',
    sep='\t',
    usecols=[0],
    index_col=0,
)
de_genes = set(de_table.index)
len(de_genes)


2696

In [10]:
def get_twas_genes(path='/ceph/projects/v4_phase3_paper/analysis/twas/results_tables/pgc2/_m/gene_twas_results_including_MHC.csv',
                   fdr_threshold=.05):
    """Return the set of ``gene_id`` values whose TWAS FDR is below the threshold.

    Parameters
    ----------
    path : str
        CSV with at least the columns ``gene_id`` and ``TWAS.P.fdr_bh``.
    fdr_threshold : float
        Rows with ``TWAS.P.fdr_bh`` strictly below this value are kept.

    Returns
    -------
    set
    """
    df = pd.read_csv(path)
    is_significant = df['TWAS.P.fdr_bh'] < fdr_threshold
    return set(df.loc[is_significant, 'gene_id'])


# TWAS genes passing the FDR cut-off, loaded once for the enrichment tests
twas_genes = get_twas_genes()
len(twas_genes)

684

In [11]:
# GWAS gene set: every value of the table's gene_id column
gwas_table = pd.read_csv('/ceph/projects/v4_phase3_paper/inputs/sz_gwas/pgc2_clozuk/table_s3/hg38/genes/_m/gwas_genes.csv')
gwas_genes = set(gwas_table['gene_id'])
len(gwas_genes)

2000

In [12]:
# MHC-region gene set: every value of the table's gene_id column
mhc_table = pd.read_csv('/ceph/projects/v4_phase3_paper/inputs/sz_gwas/pgc2_clozuk/table_s3/hg38/mhc_region_genes/_m/mhc_genes.csv')
mhc_genes = set(mhc_table['gene_id'])
len(mhc_genes)

383

In [13]:
def fet(a, b, u):
    """Fisher's exact test for the overlap of sets ``a`` and ``b`` within universe ``u``.

    Elements of ``a`` and ``b`` that are not in ``u`` are ignored. The 2x2
    contingency table passed to ``scipy.stats.fisher_exact`` is::

                      in a          not in a
        in b       |a & b|        |b only|
        not in b   |a only|       |neither|

    Parameters
    ----------
    a, b, u : set
        ``u`` is the universe.

    Returns
    -------
    The result of ``stats.fisher_exact`` (odds ratio, p-value).
    """
    a_inside = u & a
    b_inside = u & b
    a_outside = u - a
    b_outside = u - b

    contingency = [
        [len(a_inside & b_inside), len(a_outside & b_inside)],
        [len(a_inside & b_outside), len(a_outside & b_outside)],
    ]
    return stats.fisher_exact(contingency)

In [14]:
def enrichment_rows():
    """Yield one enrichment record per module of ``cgowc.part``.

    Each record is ``(module_id, n_genes, gwas_or, gwas_p, twas_or, twas_p,
    de_or, de_p)``: the module's member count followed by the Fisher's exact
    test results (from ``fet``) for its overlap with the GWAS, TWAS and DE
    gene sets.
    """
    modules = cgowc.part
    row_labels = cgowc.expression_df.index
    # The universe skips the first three rows of the table.
    # NOTE(review): presumably those rows are non-gene features — confirm.
    universe = set(row_labels[3:])

    for module_id in range(len(modules)):
        members = modules[module_id]
        module_genes = {row_labels[position] for position in members}

        gwas_or, gwas_p = fet(module_genes, gwas_genes, universe)
        twas_or, twas_p = fet(module_genes, twas_genes, universe)
        de_or, de_p = fet(module_genes, de_genes, universe)

        yield (module_id, len(members), gwas_or, gwas_p, twas_or, twas_p, de_or, de_p)
    
# One row per module, indexed by module_id
edf1 = pd.DataFrame.from_records(
    enrichment_rows(),
    columns=['module_id', 'n_genes', 'gwas_or', 'gwas_p', 'twas_or', 'twas_p', 'de_or', 'de_p'],
    index='module_id',
)

# Benjamini-Hochberg correction across modules, separately for each gene set
for p_col, fdr_col in (('twas_p', 'twas_fdr_bh'),
                       ('gwas_p', 'gwas_fdr_bh'),
                       ('de_p', 'de_fdr_bh')):
    edf1[fdr_col] = multipletests(edf1[p_col], method='fdr_bh')[1]

# Column order of the exported table: each gene set's OR, p-value and FDR side by side
export_columns = ['n_genes', 'gwas_or', 'gwas_p', 'gwas_fdr_bh', 'twas_or', 'twas_p', 'twas_fdr_bh', 'de_or', 'de_p', 'de_fdr_bh']
edf1[export_columns].to_csv('module_enrichment.csv')

edf1

Unnamed: 0_level_0,n_genes,gwas_or,gwas_p,twas_or,twas_p,de_or,de_p,twas_fdr_bh,gwas_fdr_bh,de_fdr_bh
module_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4932,1.19538,0.017789,1.51528,7e-06,2.970279,1.148426e-124,7.3e-05,0.08524,2.296851e-123
1,3681,0.977286,0.827135,0.569069,2.1e-05,0.343346,3.497782e-46,0.000137,0.919039,3.497782e-45
2,3047,1.04877,0.602387,1.686867,1e-06,1.204706,0.002526951,3e-05,0.919039,0.003158689
3,2688,1.110855,0.270985,1.294817,0.032309,0.957064,0.5414012,0.080773,0.677462,0.5414012
4,1533,1.046916,0.697807,0.839276,0.401404,0.908283,0.3126862,0.573434,0.919039,0.3474292
5,1488,1.170352,0.211855,1.185547,0.269657,1.55955,3.047559e-08,0.414857,0.605301,5.541017e-08
6,1353,0.471844,1.4e-05,0.564796,0.009668,1.069158,0.4564584,0.038673,0.000275,0.4804825
7,1157,0.72288,0.054457,0.494499,0.00397,0.183562,3.232025e-27,0.01985,0.181522,1.616013e-26
8,1103,0.545154,0.001089,0.556236,0.018219,0.201529,1.279795e-24,0.060729,0.007257,4.265982e-24
9,1047,1.116042,0.438109,0.860136,0.547137,0.160597,9.165784000000001e-27,0.729516,0.836868,3.6663139999999996e-26


In [15]:
def enrichment_rows_nomhc():
    """Yield one enrichment record per module, with MHC-region genes excluded.

    Same record layout as ``enrichment_rows``, but ``mhc_genes`` is removed
    from the universe, from each module's gene set and from the GWAS, TWAS and
    DE gene sets before testing. ``n_genes`` is still the module's full member
    count.
    """
    modules = cgowc.part
    row_labels = cgowc.expression_df.index
    # The universe skips the first three rows of the table.
    # NOTE(review): presumably those rows are non-gene features — confirm.
    universe = set(row_labels[3:]) - mhc_genes

    gwas_nomhc = gwas_genes - mhc_genes
    twas_nomhc = twas_genes - mhc_genes
    de_nomhc = de_genes - mhc_genes

    for module_id in range(len(modules)):
        members = modules[module_id]
        module_genes = {row_labels[position] for position in members} - mhc_genes

        gwas_or, gwas_p = fet(module_genes, gwas_nomhc, universe)
        twas_or, twas_p = fet(module_genes, twas_nomhc, universe)
        de_or, de_p = fet(module_genes, de_nomhc, universe)

        yield (module_id, len(members), gwas_or, gwas_p, twas_or, twas_p, de_or, de_p)
    
# One row per module, indexed by module_id (MHC-region genes excluded)
edf2 = pd.DataFrame.from_records(
    enrichment_rows_nomhc(),
    columns=['module_id', 'n_genes', 'gwas_or', 'gwas_p', 'twas_or', 'twas_p', 'de_or', 'de_p'],
    index='module_id',
)

# Benjamini-Hochberg correction across modules, separately for each gene set
for p_col, fdr_col in (('twas_p', 'twas_fdr_bh'),
                       ('gwas_p', 'gwas_fdr_bh'),
                       ('de_p', 'de_fdr_bh')):
    edf2[fdr_col] = multipletests(edf2[p_col], method='fdr_bh')[1]

# Column order of the exported table: each gene set's OR, p-value and FDR side by side
export_columns = ['n_genes', 'gwas_or', 'gwas_p', 'gwas_fdr_bh', 'twas_or', 'twas_p', 'twas_fdr_bh', 'de_or', 'de_p', 'de_fdr_bh']
edf2[export_columns].to_csv('module_enrichment_excluding_mhc_region.csv')

edf2

Unnamed: 0_level_0,n_genes,gwas_or,gwas_p,twas_or,twas_p,de_or,de_p,twas_fdr_bh,gwas_fdr_bh,de_fdr_bh
module_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4932,1.270454,0.003104,1.549287,5.86786e-06,2.961945,2.549525e-123,3.9e-05,0.020695,5.09905e-122
1,3681,1.020123,0.812521,0.519681,3.62671e-06,0.345171,1.176488e-45,3.6e-05,0.931078,1.176488e-44
2,3047,1.08928,0.381779,1.766923,3.520444e-07,1.19541,0.004129949,7e-06,0.702226,0.005162436
3,2688,1.114344,0.300997,1.381112,0.009880629,0.951603,0.4958298,0.039523,0.670483,0.4958298
4,1533,0.838594,0.288888,0.693205,0.0783874,0.915592,0.3534987,0.174194,0.670483,0.3927763
5,1488,1.166873,0.252847,1.245411,0.1818538,1.565331,2.673936e-08,0.30309,0.670483,4.861702e-08
6,1353,0.524344,0.000442,0.585507,0.02007082,1.070443,0.4560327,0.057345,0.00442,0.4800344
7,1157,0.768958,0.148966,0.46705,0.003603034,0.184092,3.015561e-27,0.018015,0.595863,1.507781e-26
8,1103,0.45482,0.000199,0.53259,0.01788755,0.202944,2.5634199999999998e-24,0.057345,0.003987,8.544732999999999e-24
9,1047,0.977653,1.0,0.821551,0.4609709,0.162616,2.728412e-26,0.65853,1.0,1.091365e-25


# GO enrichment for each cluster

In [16]:
# For every module, write its gene list, GO enrichment table and (if any term is significant) word cloud
n_modules = len(cgowc.part)
for cluster_id in range(n_modules):
    output_prefix = 'module%d' % cluster_id
    cgowc.pipeline(cluster_id, output_prefix)


Run BP Gene Ontology Analysis: current study set of 4806 IDs ... 94%  4,412 of  4,685 study items found in association
 97%  4,685 of  4,806 study items found in population(20652)
Calculating 12,417 uncorrected p-values using fisher_scipy_stats
  12,417 GO terms are associated with 18,501 of 20,652 population items
   7,819 GO terms are associated with  4,412 of  4,806 study items
  METHOD fdr_bh:
     387 GO terms found significant (< 0.05=alpha) (361 enriched +  26 purified): statsmodels fdr_bh
   3,261 study items associated with significant GO IDs (enriched)
     583 study items associated with significant GO IDs (purified)

Run CC Gene Ontology Analysis: current study set of 4806 IDs ... 98%  4,583 of  4,685 study items found in association
 97%  4,685 of  4,806 study items found in population(20652)
Calculating 1,754 uncorrected p-values using fisher_scipy_stats
   1,754 GO terms are associated with 19,433 of 20,652 population items
   1,296 GO terms are associated with  4,583 o

In [17]:
pwd

'/ceph/projects/v4_phase3_paper/analysis/gnvae/disvae/model/embedding/leiden/_m'