In [None]:
import pandas as pd
import re
import functools
import scipy.stats as stats


# Gene neighborhood
from sklearn.neighbors import DistanceMetric

# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS


# Ensembl ID to Entrez ID conversion
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

# GTF parser for Ensembl ID to gene symbol conversion
from gtfparse import read_gtf


# Word cloud
import wordcloud
import math
import random


## Gene Neighborhood

In [None]:
class GeneNeighborhood():
    """Nearest-neighbour queries between the rows of an expression table.

    The full pairwise Euclidean distance matrix between rows is computed once
    in the constructor; the query methods only sort a single column of it.
    """

    def __init__(self, expression_df, gtf_df_genes):
        """
        Parameters
        ----------
        expression_df : pandas.DataFrame
            One row per feature; its index labels both axes of the distance
            matrix.
        gtf_df_genes : pandas.DataFrame
            Annotation table with (at least) 'gene_id' and 'gene_name' columns.
        """
        self.dist = DistanceMetric.get_metric('euclidean')
        self.df_dist = pd.DataFrame(self.dist.pairwise(expression_df),
                                    index=expression_df.index,
                                    columns=expression_df.index)
        self.gtf_df_genes = gtf_df_genes

    def _ranked_distances(self, gene, n_neighbors):
        """Distances from `gene`, ascending, cut to the `n_neighbors` closest.

        `n_neighbors=None` keeps every row.
        """
        ranked = self.df_dist[gene].sort_values()
        if n_neighbors is None:
            return ranked
        return ranked.head(n_neighbors)

    def neighbors(self, gene, n_neighbors):
        """Return the identifiers of the `n_neighbors` rows closest to `gene` as a set.

        The ranking is taken from the distance matrix column of `gene`, so the
        query itself (distance 0) is part of the ranking.
        """
        return set(self._ranked_distances(gene, n_neighbors).index)

    def neighbors_df(self, gene, n_neighbors):
        """Return the `n_neighbors` closest rows as a table, nearest first.

        The result is indexed by 'gene_id' and has a 'distance' column plus the
        'gene_name' looked up in the annotation table (missing when the
        identifier has no annotation, because the lookup is a left join).
        """
        # Previously `n_neighbors` was ignored here and every row was returned;
        # both query methods now share the same truncated ranking.
        nearest = self._ranked_distances(gene, n_neighbors)
        annotation = self.gtf_df_genes[['gene_id', 'gene_name']]
        return (pd.DataFrame(nearest)
                .merge(annotation, left_index=True, right_on='gene_id', how='left')
                .rename(columns={gene: 'distance'})
                .set_index('gene_id'))


## Ensembl ID to Entrez ID conversion


In [None]:
class EnsemblIDToEntrezIDConverter():
    """Map Ensembl gene IDs to the integer 'gene_id' values of org.Hs.egENSEMBL.

    The mapping table is pulled from the R package org.Hs.eg.db through rpy2
    once, when the converter is created.
    """

    def __init__(self):
        with localconverter(ro.default_converter + pandas2ri.converter):
            df_a = ro.conversion.rpy2py(ro.r('''
                                             library(org.Hs.eg.db)
                                             as.data.frame(org.Hs.egENSEMBL)
                                            '''))
            # One row per Ensembl ID: when an ID has several records only the
            # first one is kept.
            self.entrez_ensembl_df = df_a.groupby(['ensembl_id']).first()

    def convert(self, gene_list_or_set):
        """Convert an iterable of Ensembl IDs to a set of integer gene IDs.

        Everything from the first '.' onwards (e.g. a version suffix) is
        stripped before the lookup. IDs without a match in the mapping table
        are silently dropped.
        """
        # Raw string: "\." in a plain literal is an invalid escape sequence.
        stripped_ids = {re.sub(r"\..*$", "", x) for x in gene_list_or_set}
        if not stripped_ids:
            return set()

        # A sorted list gives the lookup frame a deterministic index order.
        ensembl_id_df = pd.DataFrame(index=sorted(stripped_ids))
        matched = ensembl_id_df.merge(self.entrez_ensembl_df,
                                      left_index=True, right_index=True)
        entrez_ids = set(matched['gene_id'].astype(int))
        return entrez_ids
        
    

## Gene Ontology

In [None]:
def get_entrez_ensembl_df():
    """Load org.Hs.egENSEMBL from R and collapse it to one row per Ensembl ID.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'ensembl_id'; for each ID the first non-null value of every
        other column is kept.
    """
    r_script = '''
                                         library(org.Hs.eg.db)
                                         as.data.frame(org.Hs.egENSEMBL)
                                        '''
    converter = ro.default_converter + pandas2ri.converter
    with localconverter(converter):
        # Evaluate the R snippet and turn the R data.frame into pandas.
        mapping = ro.conversion.rpy2py(ro.r(r_script))
        return mapping.groupby(['ensembl_id']).first()


class MyGeneOntologyAnalysis():
    """GO enrichment of human gene sets against all annotated human genes.

    Construction downloads (if needed) the GO ontology and the NCBI gene2go
    associations and prepares a goatools enrichment study whose background
    population is every gene that has at least one annotation.
    """

    def __init__(self, alpha=0.05):
        """
        Parameters
        ----------
        alpha : float, optional
            Significance cut-off, used both by the enrichment study and by
            `goea_results_significant`. Defaults to 0.05.
        """
        obo_fname = download_go_basic_obo()
        file_gene2go = download_ncbi_associations()
        # Load the ontology from the file that was just downloaded instead of
        # repeating its name as a literal.
        obodag = GODag(obo_fname)

        # Read NCBI's gene2go. Store annotations in a list of namedtuples
        objanno = Gene2GoReader(file_gene2go, taxids=[9606])

        # Get associations for each branch of the GO DAG (BP, MF, CC)
        ns2assoc = objanno.get_ns2assc()

        for nspc, id2gos in ns2assoc.items():
            print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))

        # Background population: genes annotated in any namespace.
        # set().union(...) also works when the mapping is empty.
        genes_with_annotation = set().union(*(id2gos.keys() for id2gos in ns2assoc.values()))

        self.alpha = alpha

        self.goeaobj = GOEnrichmentStudyNS(
            genes_with_annotation,  # List of human genes
            ns2assoc,  # geneid/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=self.alpha,  # significance cut-off
            methods=['fdr_bh'])  # multiple-test correction method

    def goea_results_all(self, gene_set):
        """Run the enrichment study on `gene_set` and return every result record."""
        return self.goeaobj.run_study(gene_set)

    def goea_results_significant(self, gene_set):
        """Return only the records whose `p_fdr_bh` is strictly below `self.alpha`."""
        all_results = self.goea_results_all(gene_set)
        return [r for r in all_results if r.p_fdr_bh < self.alpha]

    


## Word cloud

In [None]:
class GeneOntologyWordCloud():
    """Render GO enrichment results as a word cloud of GO term names."""

    # Stand-in for p-values that underflowed to exactly 0.0, for which log2 is
    # undefined. 5e-324 is the smallest positive float, so such terms get the
    # largest weight any representable p-value could produce.
    _MIN_PVALUE = 5e-324

    def __init__(self):
        self.wc = wordcloud.WordCloud(colormap='rainbow',
                                      stopwords=['integral', 'component', 'of', 'process', 'activity', 'to'],
                                      collocations=True,
                                      ranks_only=True,
                                      )

    def gen_random_text(self):
        """Generate a spacer between gene ontology terms.

        The spacer is three words of 150 random lowercase letters each.
        """
        return ' '.join(''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=150))
                        for _ in range(3))

    def _repeat_count(self, p_value):
        """Number of times a term is repeated: int(-10 * log2(p))."""
        if p_value == 0:
            # math.log2(0) raises ValueError; fall back to the floor value.
            p_value = self._MIN_PVALUE
        return int((-10) * math.log2(p_value))

    def goea_to_text(self, goea_results):
        """Build the text fed to the word cloud.

        Generate a text concatenating GO term names, each name being repeated
        proportionally to the minus log of its p-value and preceded by a random
        spacer. Only enriched results (`enrichment == 'e'`) are used.
        """
        chunks = []
        for x in goea_results:
            if x.enrichment != 'e':
                continue
            chunks.append(''.join(' ' + self.gen_random_text() + ' ' + x.name
                                  for _ in range(self._repeat_count(x.p_uncorrected))))
        return ' '.join(chunks)

    def generate_image(self, goea_results):
        """Return the word cloud for `goea_results` as an image object."""
        return self.wc.generate(self.goea_to_text(goea_results)).to_image()
        

## Putting it all together

In [None]:
def get_gtf_genes_df():
    """Return the gene_id / gene_name table of the GENCODE annotation.

    The table is read from the local cache file "gtf_df_genes.csv" when it is
    available. Otherwise the GTF file is parsed, reduced to its "gene" features
    and the result is written to the cache for the next call.
    """
    cache_path = "gtf_df_genes.csv"
    try:
        gtf_df_genes = pd.read_csv(cache_path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Cache missing or unreadable: rebuild it from the GTF. Other errors
        # (including KeyboardInterrupt) are no longer swallowed.
        gtf_df = read_gtf("/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf")
        gtf_df_genes = gtf_df[gtf_df["feature"] == "gene"][['gene_id', 'gene_name']]
        # Same 0..n-1 index as the frame obtained when reading the cache back.
        gtf_df_genes = gtf_df_genes.reset_index(drop=True)
        gtf_df_genes.to_csv(cache_path, index=False)

    return gtf_df_genes


def fet(a, b, u):
    """Fisher's exact test for the overlap of sets `a` and `b` within universe `u`.

    Parameters
    ----------
    a, b : set
        The two sets to compare; members outside `u` are ignored.
    u : set
        The universe of all elements.

    Returns
    -------
    tuple
        (odds ratio, p-value, m) where m is the 2x2 contingency table
        [[in a & in b, not a & in b], [in a & not b, not a & not b]].
    """
    in_a = u.intersection(a)
    in_b = u.intersection(b)
    out_a = u - a
    out_b = u - b

    m = [
        [len(in_a & in_b), len(out_a & in_b)],
        [len(in_a & out_b), len(out_a & out_b)],
    ]
    odds_ratio, p_value = stats.fisher_exact(m)
    return (odds_ratio, p_value, m)

    
    
    
class NeighborhoodGOWordCloud():
    """Neighbourhood of a feature -> gene-set enrichment -> GO enrichment -> word cloud.

    The constructor loads every input table and builds the helper objects once;
    `pipeline` can then be called repeatedly for different features.
    """

    def __init__(self, n_neighbors):
        """
        Parameters
        ----------
        n_neighbors : int
            Neighbourhood size passed to the neighbour queries in `pipeline`.
        """
        self.n_neighbors = n_neighbors

        # Latent-variable table; only the columns whose name contains 'mu' are kept.
        latent_df = pd.read_csv('../../_m/latent_variables.csv', index_col=0)
        expression_df = latent_df[[col for col in latent_df.columns if 'mu' in col]]

        # gene sets to test for enrichment
        de_table = pd.read_csv(
            '/ceph/projects/v3_phase3_paper/analysis/differential_expression/_m/genes/diffExpr_szVctl_FDR05.txt',
            sep='\t', usecols=[0], index_col=0)
        self.de_genes = set(de_table.index)

        twas_table = pd.read_csv(
            '/ceph/users/apua/projects/caudate_twas_reader/genes/_m/twas_significant_genes.csv')
        self.twas_genes = set(twas_table['gene_id'])

        gwas_table = pd.read_csv(
            '/ceph/projects/v3_phase3_paper/inputs/gwas/PGC2_CLOZUK/table_s3/hg38/genes/_m/gwas_genes.csv')
        self.gwas_genes = set(gwas_table['gene_id'])

        # universe of all genes
        # NOTE(review): the first three index entries are left out of the
        # universe but still take part in the distance matrix below - confirm
        # this is intended.
        self.universe = set(expression_df.index[3:])

        self.gtf_df_genes = get_gtf_genes_df()

        # Helper objects, built once and reused by every pipeline() call.
        self.gn = GeneNeighborhood(expression_df, self.gtf_df_genes)
        self.e2e = EnsemblIDToEntrezIDConverter()
        self.mygoa = MyGeneOntologyAnalysis()
        self.gowc = GeneOntologyWordCloud()

    def pipeline(self, gene_id, filename_prefix):
        """Analyse the neighbourhood of `gene_id` and write the result files.

        Files written, all starting with `filename_prefix`:
        "_neighbors.csv", "_sets_enrichment.csv", "_go_enrichment.tsv" and
        "_go_wordcloud.png". The set-enrichment table is also printed.

        Returns
        -------
        The word cloud image that was saved.
        """
        # 1. Neighbour table on disk, neighbour set for the tests below.
        neighbor_table = self.gn.neighbors_df(gene_id, self.n_neighbors)
        neighbor_table.to_csv("%s_neighbors.csv" % filename_prefix)

        nn = self.gn.neighbors(gene_id, self.n_neighbors)

        # 2. Fisher's exact test of the neighbourhood against each gene set.
        named_sets = [
            ('gwas', self.gwas_genes),
            ('twas', self.twas_genes),
            ('de', self.de_genes),
        ]
        records = [(label, *fet(nn, genes, self.universe)) for label, genes in named_sets]
        sets_df = pd.DataFrame.from_records(records, columns=['set', 'or', 'pvalue', 'm'])
        sets_df.to_csv("%s_sets_enrichment.csv" % filename_prefix)
        print(sets_df)

        # 3. GO enrichment of the neighbourhood, then the word cloud of the
        #    significant terms.
        entrez_ids = self.e2e.convert(nn)
        go_r = self.mygoa.goea_results_significant(entrez_ids)
        self.mygoa.goeaobj.wr_tsv("%s_go_enrichment.tsv" % filename_prefix, go_r)

        p = self.gowc.generate_image(go_r)
        p.save("%s_go_wordcloud.png" % filename_prefix)
        return p
        
    
    

In [None]:
# Build the pipeline object once (loads all inputs); neighbourhood size is 250.
ngowc = NeighborhoodGOWordCloud(250)

In [None]:
# Run the pipeline for this feature; the returned image is the cell output.
# NOTE(review): 'jucntion' in the prefix looks like a typo for 'junction'; it is
# left unchanged because it determines the names of the output files.
ngowc.pipeline('chr11:113412884-113415420(-)', 'DRD2_jucntion_5_7')

In [None]:
# Run the pipeline for this feature; the returned image is the cell output.
# NOTE(review): 'jucntion' in the prefix looks like a typo for 'junction'; it is
# left unchanged because it determines the names of the output files.
ngowc.pipeline('chr11:113414462-113415420(-)', 'DRD2_jucntion_5_6')

In [None]:
# Number of neighbours shared by the two features analysed above.
len(
    set(ngowc.gn.neighbors('chr11:113412884-113415420(-)', ngowc.n_neighbors))
    & set(ngowc.gn.neighbors('chr11:113414462-113415420(-)', ngowc.n_neighbors))
)

In [None]:
# gene_name -> gene_id lookup; if a name occurs more than once the last gene_id wins.
d = dict(zip(ngowc.gtf_df_genes['gene_name'], ngowc.gtf_df_genes['gene_id']))

In [None]:
# Run the pipeline for each gene symbol below, using the symbol itself as the
# output file prefix. A failure on one gene does not stop the remaining ones.
for x in ['C4A', 'CSDC2', 'HCG4', 'REEP2', 'ZNF14', 'CNNM2', 'ENSA', 'PPTC7',
 'NGEF', 'TDRD9', 'ZNF204P', 'ZNF391', 'BAG6', 'HIRIP3', 'IP6K3', 'NELFE',
 'PRRC2A', 'TSNARE1', 'ZC3H7B', 'BNIP3L', 'BRD2', 'CACNA1I', 'CKB', 'ELFN1',
 'LLGL1', 'PCCB', 'PHF1', 'PLPP5', 'PPM1M', 'SREBF2', 'TRMT61A', 'ZSCAN9',
 'DRD2', 'SETD1A']:
    print(x)
    try:
        ngowc.pipeline(d[x], x)
    except Exception as exc:
        # Catch Exception rather than everything, so the loop can still be
        # interrupted, and report why this gene failed.
        print("Error processing %s: %r" % (x, exc))