In [None]:
# @title Licensed under the Apache License, Version 2.0 (the "License"); { display-mode: "form" }
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Overview

This colab analyzes the [GREAT](https://great.stanford.edu) results for
functional enrichment of the input GWAS results.

It assumes you have already submitted the input GWAS results to the GREAT server
using the web API. This is performed in the following way:

## Converting an `association_results.loci.tsv` file to a BED file

Given the GWAS input file `association_results.loci.tsv`, convert it to a
BED-format file named `association_results.bed` with the following command:

```bash
tail -n+2 association_results.loci.tsv \
  | awk -F'\t' -vOFS='\t' '{print $2, $3-1, $3, NR}' \
  > association_results.bed
```

## Running GREAT web API on the resulting BED.

Assuming that your BED file is accessible at

https://my.domain.com/association_results.bed

run the web API and write to the file `association_results.great` in the
following way:

```bash
# Run the GREAT command and write to a temporary file.
# See https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655447/Programming+Interface for details.
wget -O association_results.great.tmp \
  'http://bejerano.stanford.edu/great/public/cgi-bin/greatStart.php?outputType=batch&requestSpecies=hg19&requestName=myjob&requestSender=Client+A&requestURL=https%3A%2F%2Fmy.domain.com%2Fassociation_results.bed'

# Strip off some bad formatting from GREAT for the final output file.
awk '{if(index($0, "<script>") == 1) {print substr($0, index($0, "</script>")+9) } else {print $0}}' association_results.great.tmp > association_results.great
```

In [None]:
import io
from typing import Optional, Sequence
import numpy as np
import pandas as pd

# Disable pandas' display truncation: show the full contents of every cell
# (e.g. long term descriptions) and every row of any displayed DataFrame.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Running analyses once the GREAT TSV files are generated

In [None]:
# Name of the genome assembly.
ASSEMBLY = 'hg19'

# Every column, in file order, of a single ontology output file from
# https://great.stanford.edu
ALL_COLUMNS = [
    # Term identification.
    'Ontology', 'ID', 'Desc',
    # Region-based binomial test.
    'BinomRank', 'BinomP', 'BinomBonfP', 'BinomFdrQ',
    'RegionFoldEnrich', 'ExpRegions', 'ObsRegions', 'GenomeFrac', 'SetCov',
    # Gene-based hypergeometric test.
    'HyperRank', 'HyperP', 'HyperBonfP', 'HyperFdrQ',
    'GeneFoldEnrich', 'ExpGenes', 'ObsGenes', 'TotalGenes', 'GeneSetCov',
    'TermCov',
    # Members contributing to the term.
    'Regions', 'Genes',
]

# Significance-testing columns: for each test (Binom, Hyper), the P, BonfP and
# FdrQ columns, i.e. BinomP, BinomBonfP, BinomFdrQ, HyperP, HyperBonfP,
# HyperFdrQ.
SIGNIFICANCE_COLUMNS = [
    f'{test}{statistic}'
    for test in ('Binom', 'Hyper')
    for statistic in ('P', 'BonfP', 'FdrQ')
]

# Columns usually compared when two analyses are placed side by side.
JOINED_COLUMN_PREFIXES = [
    'Desc',
    *SIGNIFICANCE_COLUMNS,
    'RegionFoldEnrich',
    'GeneFoldEnrich',
]

# Ontologies retained by default. "Ensembl Genes" is left out because
# enrichment for a single gene is not relevant when analyzing GWAS results.
# "Mouse Phenotype" is left out because, empirically, the single knockout
# version of that ontology has provided cleaner enrichment signals.
_DEFAULT_ONTOLOGIES = (
    'GO Biological Process',
    'GO Cellular Component',
    'GO Molecular Function',
    'Human Phenotype',
    'Mouse Phenotype Single KO',
)


def load_great_results(
    great_tsv: str, ontologies: Optional[Sequence[str]] = _DEFAULT_ONTOLOGIES
) -> pd.DataFrame:
  """Returns a pd.DataFrame of the given dataset.

  This assumes the TSV passed in is of the format returned by the GREAT
  programmatic interface, which has all ontology-specific results concatenated
  into a single file.

  Lines starting with '#' are treated as comments: the first one must match the
  expected GREAT version / assembly / association rule banner and the fourth
  one must list the column names in `ALL_COLUMNS` order. All remaining lines
  are parsed as tab-separated data rows.

  Args:
    great_tsv: The input TSV file returned by running GREAT on the data of
      interest.
    ontologies: An optional list of ontologies to restrict results to. If
      unspecified, the default ontologies are retained. If an empty sequence or
      None is passed in, all ontologies are retained.

  Returns:
    A DataFrame of the results, indexed by ('Ontology', 'ID').

  Raises:
    ValueError: The file has fewer than four comment lines, the column headers
      are not ordered as expected, or the initial comment line is not the
      expected one.
  """
  with open(great_tsv) as f:
    lines = f.readlines()

  comment_lines = [line for line in lines if line.startswith('#')]
  # The checks below index the first and fourth comment lines; fail with the
  # documented ValueError rather than an IndexError on a truncated file.
  if len(comment_lines) < 4:
    raise ValueError(
        f'Expected at least 4 comment lines in {great_tsv}, found '
        f'{len(comment_lines)}'
    )
  # split() discards the whitespace after the leading '#' and the newline.
  if comment_lines[3].lstrip('#').split() != ALL_COLUMNS:
    raise ValueError('Column headers not ordered as expected')
  expected_first_line = (
      '# GREAT version 4.0.4\tSpecies assembly: hg19\tAssociation rule: '
      'Basal+extension: 5000 bp upstream, 1000 bp downstream, 1000000 bp max '
      'extension, curated regulatory domains included\n'
  )
  if comment_lines[0] != expected_first_line:
    raise ValueError(f'Unexpected initial comment line: {comment_lines[0]}')

  # Drop comment lines here instead of passing comment='#' to read_csv: that
  # option also cuts off the remainder of any data row that contains a '#'
  # inside a field, silently turning the trailing columns into NaN.
  data_lines = [line for line in lines if not line.startswith('#')]
  df = pd.read_csv(
      io.StringIO(''.join(data_lines)), sep='\t', names=ALL_COLUMNS
  )

  if ontologies:
    keep_mask = df.Ontology.isin(ontologies)
    print(
        f'Retaining {keep_mask.sum()} of {len(df)} entries in {great_tsv} '
        f'when restricting to ontologies {ontologies}'
    )
    df = df[keep_mask]
  return df.set_index(['Ontology', 'ID'])


def significant_results(
    df: pd.DataFrame,
    columns: Sequence[str],
    threshold: float,
    description_filter: str = '',
) -> pd.DataFrame:
  """Returns a copy of the subset of `df` that satisfy the threshold.

  Args:
    df: The results to filter. Must contain every column named in `columns`,
      and a `Desc` column when `description_filter` is given.
    columns: The columns whose values must all be <= `threshold` for a row to
      be kept. If empty, no threshold filtering is applied.
    threshold: The inclusive upper bound applied to each of `columns`.
    description_filter: An optional regular expression. When non-empty, only
      rows whose `Desc` matches it (case-insensitively) are kept; rows with a
      missing `Desc` never match.

  Returns:
    A new DataFrame holding the rows of `df` that pass every filter.
  """
  masks = [df[col] <= threshold for col in columns]
  if description_filter:
    # na=False keeps the mask strictly boolean: a missing description would
    # otherwise put NaN in the mask and make the boolean indexing below fail.
    masks.append(
        df.Desc.str.contains(description_filter, case=False, na=False)
    )
  if not masks:
    # Nothing to filter on. Reducing an empty list would yield a scalar, which
    # is not a valid row mask.
    return df.copy()
  mask = np.logical_and.reduce(masks)
  return df[mask].copy()


def create_latex_table(df: pd.DataFrame) -> str:
  """Returns a LaTeX table of the significant cardio-respiratory terms in `df`.

  Rows are kept when both `BinomBonfP` and `HyperBonfP` are <= 0.05 and the
  description matches the cardiovascular/respiratory regular expression
  (case-insensitively). The kept rows are sorted by `BinomBonfP` and rendered
  with the columns Term, Description, Region P, Gene P and Num regions, wrapped
  in a `table` environment with a fixed caption.

  Args:
    df: GREAT results containing `Desc`, `BinomBonfP`, `HyperBonfP` and
      `ObsRegions`, with `ID` available after `reset_index()` (as is the case
      for the output of `load_great_results`).

  Returns:
    The LaTeX source of the table.
  """
  cardio_respiratory_df = significant_results(
      df,
      ['BinomBonfP', 'HyperBonfP'],
      threshold=0.05,
      description_filter=(
          r'cardiac|cardio|cardial|heart|circulatory|respir|lung|pulmon'
      ),
  )

  columns = ['ID', 'Desc', 'BinomBonfP', 'HyperBonfP', 'ObsRegions']
  name_map = {
      'ID': 'Term',
      'Desc': 'Description',
      'BinomBonfP': 'Region P',
      'HyperBonfP': 'Gene P',
      'ObsRegions': 'Num regions',
  }
  # option_context restores the caller's display precision even if rendering
  # raises, so the global pandas option is never left modified.
  with pd.option_context('display.precision', 2):
    retval = (
        cardio_respiratory_df.reset_index()[columns]
        .sort_values('BinomBonfP')
        .rename(columns=name_map)
        .to_latex(index=False)
    )
  # NOTE(review): the caption hard-codes "82 total terms"; it is not derived
  # from `df`, so confirm it still holds whenever the input results change.
  return (
      r"""\begin{table}[ht]
\small
\centering
"""
      + retval
      + r"""\caption{\textbf{Cardiovascular and respiratory term enrichments of the ML-based
COPD loci.} Enrichments were computed using GREAT~\cite{McLean2010} with default
parameters. The 82 total terms significant at Bonferroni-corrected P-value 0.05
by both the region-based binomial and gene-based hypergeometric tests were
filtered to those with description that matched the regular expression
`cardiac|cardio|cardial|heart|circulatory|respir|lung|pulmon'.}
\end{table}
"""
  )

# Compute enrichments

In [None]:
# The path to the enrichments for the GWAS.
g_great_path = ''  # @param

# Fail early with an actionable message: an unset path would otherwise surface
# as an opaque file-not-found error from inside load_great_results.
if not g_great_path:
  raise ValueError(
      'Set `g_great_path` to the path of the GREAT output file (e.g. '
      'association_results.great) before running this cell.'
  )

# Load results.
g_df = load_great_results(g_great_path)

# Write the LaTeX table of results of respiratory/cardio terms. print() is used
# so the LaTeX source is emitted verbatim rather than as an escaped string repr.
print(create_latex_table(g_df))