In [1]:
# Leiden and igraph
import pandas as pd
import numpy as np
from sklearn.neighbors import kneighbors_graph
import igraph as ig
import leidenalg
import re
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests


# Ensembl ID to Entrez ID conversion
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter


# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS


# Word cloud
import wordcloud
import math
import random

# GTF parser for Ensembl ID to gene symbol conversion
from gtfparse import read_gtf



  from pandas.core.index import Index as PandasIndex


In [2]:
def latent_variables_to_leiden(df, n_neighbors, directed, seed):
    """Cluster the rows of `df` with the Leiden algorithm on a k-nearest-neighbour graph.

    Parameters
    ----------
    df : table whose rows are the points to cluster; its index labels the result.
    n_neighbors : number of nearest neighbours used to build the graph.
    directed : whether the igraph graph is created as directed.
    seed : seed forwarded to `leidenalg.find_partition`.

    Returns
    -------
    (membership_df, part) : a DataFrame with a single 'cluster_id' column indexed
    like `df`, and the partition object returned by leidenalg.
    """
    # Same neighbourhood structure computed twice: once as 0/1 links, once as distances
    knn_links = kneighbors_graph(df, n_neighbors, mode='connectivity', include_self=False)
    knn_dist = kneighbors_graph(df, n_neighbors, mode='distance', include_self=False)

    edge_from, edge_to = knn_links.nonzero()

    # Edge weight is the inverse of the squared distance between the two end points
    edge_weights = 1 / (knn_dist[edge_from, edge_to].A1 ** 2)

    graph = ig.Graph(directed=directed)
    graph.add_vertices(knn_links.shape[0])  # one vertex per row of df
    graph.add_edges(list(zip(edge_from, edge_to)))
    graph.es['weight'] = edge_weights

    part = leidenalg.find_partition(
        graph,
        leidenalg.ModularityVertexPartition,
        weights='weight',
        seed=seed,
    )

    membership_df = pd.DataFrame({'cluster_id': part.membership}, index=df.index)
    return (membership_df, part)

In [3]:
# Ensembl to Entrez

class EnsemblIDToEntrezIDConverter():
    """Convert Ensembl gene IDs to Entrez gene IDs via Bioconductor's org.Hs.eg.db.

    The Ensembl/Entrez table is fetched once, at construction time, through rpy2
    and kept in `self.entrez_ensembl_df` (indexed by 'ensembl_id').
    """

    def __init__(self):
        with localconverter(ro.default_converter + pandas2ri.converter):
            df_a = ro.conversion.rpy2py(ro.r('''
                                             library(org.Hs.eg.db)
                                             as.data.frame(org.Hs.egENSEMBL)
                                            '''))
            # Keep only the first row listed for each Ensembl ID
            self.entrez_ensembl_df = df_a.groupby(['ensembl_id']).first()

    def convert(self, gene_list_or_set):
        """Return the set of Entrez IDs (as ints) matching the given Ensembl IDs.

        Version suffixes are removed first (everything from the first '.' on),
        so 'ENSG00000000003.14' is looked up as 'ENSG00000000003'. IDs without a
        match in the mapping table are silently dropped.
        """
        # Raw string: "\." in a normal string literal is an invalid escape sequence.
        # Sorting gives the lookup frame a deterministic row order.
        unversioned_ids = sorted({re.sub(r"\..*$", "", x) for x in gene_list_or_set})
        ensembl_id_df = pd.DataFrame(index=unversioned_ids)
        matched = ensembl_id_df.merge(self.entrez_ensembl_df, left_index=True, right_index=True)
        return set(matched['gene_id'].astype(int))
        
    
    
    
# Gene Ontology
    
def get_entrez_ensembl_df():
    """Load the org.Hs.egENSEMBL table from R and return it indexed by 'ensembl_id'.

    When an Ensembl ID appears on several rows, only the first one is kept.
    """
    conversion_rules = ro.default_converter + pandas2ri.converter
    with localconverter(conversion_rules):
        r_table = ro.r('''
                                         library(org.Hs.eg.db)
                                         as.data.frame(org.Hs.egENSEMBL)
                                        ''')
        mapping_df = ro.conversion.rpy2py(r_table)
        first_per_ensembl_id = mapping_df.groupby(['ensembl_id']).first()
    return first_per_ensembl_id


class MyGeneOntologyAnalysis():
    """Gene Ontology enrichment analysis for human genes, built on goatools.

    The background population is every gene that has at least one annotation
    in any of the GO namespaces read from NCBI's gene2go (taxid 9606).
    """

    def __init__(self):
        obo_fname = download_go_basic_obo()
        file_gene2go = download_ncbi_associations()
        # Load the file that was actually downloaded instead of a hard-coded name
        obodag = GODag(obo_fname)

        # Read NCBI's gene2go. Store annotations in a list of namedtuples
        objanno = Gene2GoReader(file_gene2go, taxids=[9606])

        # Get associations for each branch of the GO DAG (BP, MF, CC)
        ns2assoc = objanno.get_ns2assc()

        for nspc, id2gos in ns2assoc.items():
            print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))

        # Union of the annotated gene IDs over all namespaces
        genes_with_annotation = set.union(*(set(x.keys()) for x in ns2assoc.values()))

        # Significance cut-off, also used by goea_results_significant()
        self.alpha = 0.05

        self.goeaobj = GOEnrichmentStudyNS(
            genes_with_annotation,  # background population of human genes
            ns2assoc,  # geneid/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=self.alpha,  # default significance cut-off
            methods=['fdr_bh'])  # default multiple-test correction method

    def goea_results_all(self, gene_set):
        """Run the enrichment study on `gene_set` and return every result record."""
        return self.goeaobj.run_study(gene_set)

    def goea_results_significant(self, gene_set):
        """Return only the results whose Benjamini-Hochberg FDR is below `self.alpha`."""
        all_results = self.goea_results_all(gene_set)
        return [r for r in all_results if r.p_fdr_bh < self.alpha]
    
    

# Word cloud
    
class GeneOntologyWordCloud():
    """Render GO enrichment results as a word-cloud image."""

    def __init__(self):
        # One WordCloud generator, configured once and reused for every image
        self.wc = wordcloud.WordCloud(
            colormap='rainbow',
            stopwords=['integral', 'component', 'of', 'process', 'activity', 'to'],
            collocations=True,
            ranks_only=True,
        )

    def gen_random_text(self):
        """Return three random 150-letter lowercase words joined by single spaces.

        The result is used as a spacer between gene ontology term names.
        """
        words = []
        for _ in range(3):
            letters = (random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(150))
            words.append(''.join(letters))
        return ' '.join(words)

    def goea_to_text(self, goea_results):
        """Concatenate the names of enriched GO terms into one text.

        Only results with `enrichment == 'e'` are used. Each name is repeated
        int(-10 * log2(p_uncorrected)) times, and every repetition is preceded
        by a fresh random spacer.
        """
        per_term_text = []
        for term in goea_results:
            if term.enrichment != 'e':
                continue
            n_repeats = int((-10) * math.log2(term.p_uncorrected))
            repeated = []
            for _ in range(n_repeats):
                repeated.append(' ' + self.gen_random_text() + ' ' + term.name)
            per_term_text.append(''.join(repeated))
        return ' '.join(per_term_text)

    def generate_image(self, goea_results):
        """Build the word cloud from `goea_results` and return it as an image."""
        cloud_text = self.goea_to_text(goea_results)
        return self.wc.generate(cloud_text).to_image()
        
    

In [4]:
def get_gtf_genes_df(cache_path="gtf_df_genes.csv",
                     gtf_path="/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf"):
    """Return a DataFrame with the 'gene_id' and 'gene_name' of every gene in the GTF.

    The table is read from `cache_path` when that CSV exists. Otherwise the GTF
    at `gtf_path` is parsed, reduced to its "gene" features, written to
    `cache_path` for the next call, and returned.

    Parameters
    ----------
    cache_path : CSV file used as a cache of the gene table.
    gtf_path : annotation GTF parsed when the cache is missing or empty.
    """
    try:
        return pd.read_csv(cache_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Cache missing or empty: rebuild it from the GTF below. Other errors
        # (and KeyboardInterrupt) are no longer swallowed by a bare `except`.
        pass

    gtf_df = read_gtf(gtf_path)
    gtf_df_genes = gtf_df[gtf_df["feature"] == "gene"][['gene_id', 'gene_name']]
    gtf_df_genes.to_csv(cache_path, index=False)
    return gtf_df_genes
        

In [5]:
class ClusterGOWordCloud():
    """Leiden clustering of latent variables plus per-cluster GO enrichment and word clouds.

    Construction loads the latent variables, clusters them, and prepares the
    gene-name table, the Ensembl-to-Entrez converter, the GO analysis and the
    word-cloud generator.
    """

    def __init__(self, n_neighbors, directed, seed,
                 latent_variables_path='../../_m/latent_variables.csv'):
        """
        Parameters
        ----------
        n_neighbors, directed, seed : forwarded to `latent_variables_to_leiden`.
        latent_variables_path : CSV whose first column is the row index; only
            columns whose name contains 'mu' are used for clustering.
        """
        self.n_neighbors = n_neighbors
        expression_df = pd.read_csv(latent_variables_path, index_col=0)
        mucols = [x for x in expression_df.columns if 'mu' in x]
        self.expression_df = expression_df[mucols]

        (self.mdf, self.part) = latent_variables_to_leiden(self.expression_df, n_neighbors, directed, seed)

        self.gtf_df_genes = get_gtf_genes_df()

        self.e2e = EnsemblIDToEntrezIDConverter()
        self.mygoa = MyGeneOntologyAnalysis()
        self.gowc = GeneOntologyWordCloud()

    def cluster_df(self, cluster_id):
        """Return the members of `cluster_id`, indexed by 'gene_id', with their gene names.

        Members without an entry in the gene table are kept (left join) and
        get a missing gene name.
        """
        # Column-less frame: only the index labels of the cluster members are needed
        member_rows = self.expression_df.iloc[self.part[cluster_id]][[]]
        # Use this instance's gene table rather than the notebook-global `cgowc`
        return (member_rows
                .merge(self.gtf_df_genes, left_index=True, right_on='gene_id', how='left')
                .set_index('gene_id'))

    def pipeline(self, cluster_id, filename_prefix):
        """Write the gene list, GO enrichment table and word cloud of one cluster.

        Files written:
          <filename_prefix>_genes.csv          members of the cluster
          <filename_prefix>_go_enrichment.tsv  significant GO enrichment results
          <filename_prefix>_go_wordcloud.png   only when there is a significant result
        """
        self.cluster_df(cluster_id).to_csv("%s_genes.csv" % filename_prefix)

        member_ids = set((self.expression_df.index[x] for x in self.part[cluster_id]))

        go_r = self.mygoa.goea_results_significant(self.e2e.convert(member_ids))
        self.mygoa.goeaobj.wr_tsv("%s_go_enrichment.tsv" % filename_prefix, go_r)

        if len(go_r) > 0:
            p = self.gowc.generate_image(go_r)
            p.save("%s_go_wordcloud.png" % filename_prefix)
        
    
    


In [6]:
# Build the clustering / GO pipeline: 8 nearest neighbours, undirected graph, Leiden seed 1
cgowc = ClusterGOWordCloud(n_neighbors=8, directed=False, seed=1)

#1,5,6
#8






Attaching package: 'BiocGenerics'



    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB



    IQR, mad, sd, var, xtabs



    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which, which.max, which.min




    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.




Attaching package: 'S4Vectors'



    expand.grid





  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-04-23) 47,239 GO Terms
HMS:0:00:03.522364 330,518 annotations, 20,562 genes, 18,403 GOs, 1 taxids READ: gene2go 
MF 17,537 annotated human genes
BP 18,499 annotated human genes
CC 19,334 annotated human genes

Load BP Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 90% 18,499 of 20,517 population items found in association

Load CC Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 94% 19,334 of 20,517 population items found in association

Load MF Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 85% 17,537 of 20,517 population items found in association


In [7]:
# Number of modules (communities) in the Leiden partition
len(cgowc.part)

20

In [8]:
# Module holding the row at position 1 of the latent-variable table
# (labelled "D2 junction 5-6" below -- presumably a junction feature; confirm against the input file)
a = [module_id for module_id in range(len(cgowc.part)) if 1 in cgowc.part[module_id]][0]
print("D2 junction 5-6 is in module", a)

# Same lookup for the row at position 0 (labelled "D2 junction 5-7")
a = [module_id for module_id in range(len(cgowc.part)) if 0 in cgowc.part[module_id]][0]
print("D2 junction 5-7 is in module", a)

# gene_name -> gene_id lookup; when a name occurs more than once the last gene_id wins
d = dict(zip(cgowc.gtf_df_genes['gene_name'], cgowc.gtf_df_genes['gene_id']))
a = cgowc.mdf.loc[d['SETD1A'], 'cluster_id']
print("SETD1A is in module", a)

D2 junction 5-6 is in cluster 0
D2 junction 5-7 is in cluster 6
SETD1A is in cluster 3


# GWAS, TWAS and DE enrichment


In [9]:
# Differentially expressed gene IDs, taken from the first column of a tab-separated file
# (file name suggests SZ vs control at FDR < 0.05 -- TODO confirm)
de_genes = set(pd.read_csv('/ceph/projects/v3_phase3_paper/analysis/differential_expression/_m/genes/diffExpr_szVctl_FDR05.txt',
                      sep='\t', usecols=[0], index_col=0).index)
len(de_genes)


2699

In [10]:
# TWAS gene IDs: the 'gene_id' column of the significant-genes CSV
twas_genes = set(pd.read_csv('/ceph/users/apua/projects/caudate_twas_reader/genes/_m/twas_significant_genes.csv')['gene_id'])
len(twas_genes)

389

In [11]:
# GWAS gene IDs: the 'gene_id' column of the GWAS genes CSV
# (path suggests PGC2+CLOZUK table S3 lifted to hg38 -- TODO confirm)
gwas_genes = set(pd.read_csv('/ceph/projects/v3_phase3_paper/inputs/gwas/PGC2_CLOZUK/table_s3/hg38/genes/_m/gwas_genes.csv')['gene_id'])
len(gwas_genes)

2000

In [12]:
# Gene IDs located in the MHC region; removed from every set in the "excluding MHC" enrichment below
mhc_genes = set(pd.read_csv('/ceph/projects/v3_phase3_paper/inputs/gwas/PGC2_CLOZUK/table_s3/hg38/mhc_region_genes/_m/mhc_genes.csv')['gene_id'])
len(mhc_genes)

383

In [13]:
def fet(a, b, u):
    """Fisher's exact test for the overlap of sets `a` and `b` within universe `u`.

    All three arguments are sets; members of `a` or `b` outside `u` are ignored.
    The 2x2 table is laid out as

        [[ in a and in b,   not in a and in b  ],
         [ in a and not b,  not in a and not b ]]

    Returns whatever `scipy.stats.fisher_exact` returns for that table
    (odds ratio and p-value).
    """
    in_a, in_b = u & a, u & b
    out_a, out_b = u - a, u - b

    table = [
        [len(in_a & in_b), len(out_a & in_b)],
        [len(in_a & out_b), len(out_a & out_b)],
    ]
    return stats.fisher_exact(table)

In [14]:
def enrichment_rows():
    """Yield one record per module with its GWAS, TWAS and DE enrichment.

    Each record is (module_id, n_members, gwas_or, gwas_p, twas_or, twas_p, de_or, de_p),
    where every (odds ratio, p-value) pair comes from `fet`.
    """
    part = cgowc.part
    row_labels = cgowc.expression_df.index
    # The universe skips the first three rows of the latent-variable table
    # (presumably non-gene features such as the junctions looked up earlier -- confirm)
    universe = set(row_labels[3:])
    for module_id in range(len(part)):
        members = part[module_id]
        module_ids = {row_labels[node] for node in members}
        record = [module_id, len(members)]
        for gene_set in (gwas_genes, twas_genes, de_genes):
            record.extend(fet(module_ids, gene_set, universe))
        yield tuple(record)


# One row per module, then Benjamini-Hochberg FDR across modules for each test family
edf1 = pd.DataFrame.from_records(
    enrichment_rows(),
    columns=['module_id', 'n_genes', 'gwas_or', 'gwas_p', 'twas_or', 'twas_p', 'de_or', 'de_p'],
    index='module_id',
).assign(
    twas_fdr_bh=lambda t: multipletests(t['twas_p'], method='fdr_bh')[1],
    gwas_fdr_bh=lambda t: multipletests(t['gwas_p'], method='fdr_bh')[1],
    de_fdr_bh=lambda t: multipletests(t['de_p'], method='fdr_bh')[1],
)

edf1[[
    'n_genes',
    'gwas_or', 'gwas_p', 'gwas_fdr_bh',
    'twas_or', 'twas_p', 'twas_fdr_bh',
    'de_or', 'de_p', 'de_fdr_bh',
]].to_csv('module_enrichment.csv')

edf1

Unnamed: 0_level_0,n_genes,gwas_or,gwas_p,twas_or,twas_p,de_or,de_p,twas_fdr_bh,gwas_fdr_bh,de_fdr_bh
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4917,1.266284,0.001428,1.391645,0.008004,2.569408,6.466662e-92,0.026679,0.028564,6.466662e-91
1,2877,0.778369,0.017285,0.592902,0.007999,0.24638,5.581596e-53,0.026679,0.057615,3.7210640000000004e-52
2,2288,0.89915,0.360731,1.449815,0.021982,0.859366,0.04910687,0.055053,0.515329,0.05777278
3,1949,1.189048,0.117726,1.688713,0.002093,1.784156,5.959732e-17,0.010467,0.19621,1.191946e-16
4,1891,1.216091,0.077965,1.745942,0.001264,0.910389,0.2657709,0.008424,0.141754,0.2797588
5,1869,1.320908,0.011018,0.837988,0.481794,0.548429,1.04391e-10,0.759806,0.044072,1.73985e-10
6,1285,0.720704,0.041116,1.13884,0.548444,2.17139,4.011151e-22,0.783491,0.082233,1.33705e-21
7,1095,1.03944,0.761506,0.681277,0.244801,0.202418,2.688646e-24,0.544002,0.801585,1.075458e-23
8,1024,0.682996,0.040966,0.193611,0.000414,0.318141,1.880132e-15,0.004141,0.082233,3.418421e-15
9,1020,0.662454,0.027436,0.194404,0.000409,0.182406,1.948498e-24,0.004141,0.07839,9.742489e-24


In [15]:
def enrichment_rows_nomhc():
    """Yield one record per module with its enrichment after removing MHC-region genes.

    Same record layout as the plain enrichment table:
    (module_id, n_members, gwas_or, gwas_p, twas_or, twas_p, de_or, de_p).
    MHC genes are removed from the universe, the module and each tested gene set;
    the reported member count is the full module size.
    """
    part = cgowc.part
    row_labels = cgowc.expression_df.index
    # The universe skips the first three rows of the latent-variable table
    # (presumably non-gene features -- confirm) and every MHC-region gene
    universe = set(row_labels[3:]) - mhc_genes
    tested_sets = (
        gwas_genes - mhc_genes,
        twas_genes - mhc_genes,
        de_genes - mhc_genes,
    )
    for module_id in range(len(part)):
        members = part[module_id]
        module_ids = {row_labels[node] for node in members} - mhc_genes
        record = [module_id, len(members)]
        for gene_set in tested_sets:
            record.extend(fet(module_ids, gene_set, universe))
        yield tuple(record)


# One row per module, then Benjamini-Hochberg FDR across modules for each test family
edf2 = pd.DataFrame.from_records(
    enrichment_rows_nomhc(),
    columns=['module_id', 'n_genes', 'gwas_or', 'gwas_p', 'twas_or', 'twas_p', 'de_or', 'de_p'],
    index='module_id',
).assign(
    twas_fdr_bh=lambda t: multipletests(t['twas_p'], method='fdr_bh')[1],
    gwas_fdr_bh=lambda t: multipletests(t['gwas_p'], method='fdr_bh')[1],
    de_fdr_bh=lambda t: multipletests(t['de_p'], method='fdr_bh')[1],
)

edf2[[
    'n_genes',
    'gwas_or', 'gwas_p', 'gwas_fdr_bh',
    'twas_or', 'twas_p', 'twas_fdr_bh',
    'de_or', 'de_p', 'de_fdr_bh',
]].to_csv('module_enrichment_excluding_mhc_region.csv')

edf2

Unnamed: 0_level_0,n_genes,gwas_or,gwas_p,twas_or,twas_p,de_or,de_p,twas_fdr_bh,gwas_fdr_bh,de_fdr_bh
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,4917,1.324797,0.000445,1.44421,0.004375,2.551013,7.972072e-90,0.01739,0.008896,7.972072e-89
1,2877,0.870493,0.225448,0.6274,0.024133,0.246658,7.371443e-53,0.05763,0.375747,4.914295e-52
2,2288,1.047981,0.682872,1.609621,0.005217,0.859728,0.04902329,0.01739,0.951527,0.05767446
3,1949,1.01228,0.899541,1.802268,0.000816,1.79146,5.286182e-17,0.005545,0.99949,1.057236e-16
4,1891,1.273429,0.041003,1.797865,0.001047,0.910112,0.2644619,0.005545,0.117152,0.278381
5,1869,1.048663,0.698717,0.554064,0.025933,0.559047,4.70538e-10,0.05763,0.951527,7.842299e-10
6,1285,0.646181,0.017297,0.944156,1.0,2.179576,3.0041320000000003e-22,1.0,0.082011,1.001377e-21
7,1095,1.034995,0.804356,0.682027,0.278153,0.203759,3.62071e-24,0.505732,0.99949,1.4482840000000002e-23
8,1024,0.627987,0.026124,0.213462,0.001107,0.320125,3.54382e-15,0.005545,0.087081,6.443309e-15
9,1020,0.738109,0.125045,0.213244,0.001109,0.182561,2.849244e-24,0.005545,0.303429,1.424622e-23


# GO enrichment for each cluster

In [16]:
# Run the pipeline for each cluster: every module N gets its output files
# prefixed with "moduleN" (gene list, GO enrichment table and, when there is
# a significant GO result, a word-cloud image)
for cluster_id in range(len(cgowc.part)):
    cgowc.pipeline(cluster_id, 'module%d' % cluster_id)


Run BP Gene Ontology Analysis: current study set of 4688 IDs ...
 94%  4,269 of  4,560 study items found in association
 97%  4,560 of  4,688 study items found in population(20517)
Calculating 12,372 uncorrected p-values using fisher_scipy_stats
  12,372 GO terms are associated with 18,499 of 20,517 population items
   7,554 GO terms are associated with  4,269 of  4,688 study items
  METHOD fdr_bh:
     385 GO terms found significant (< 0.05=alpha) (351 enriched +  34 purified): statsmodels fdr_bh
   3,145 study items associated with significant GO IDs (enriched)
     631 study items associated with significant GO IDs (purified)

Run CC Gene Ontology Analysis: current study set of 4688 IDs ...
 97%  4,444 of  4,560 study items found in association
 97%  4,560 of  4,688 study items found in population(20517)
Calculating 1,756 uncorrected p-values using fisher_scipy_stats
   1,756 GO terms are associated with 19,334 of 20,517 population items
   1,279 GO terms are associated with  4,444

In [17]:
# Show the working directory; the relative input and output paths above are resolved against it
pwd

'/ceph/users/apua/projects/v2_caudate_gene_vae_394/disvae/model/embedding/leiden/_h'