In [None]:
#%%appyter init
from appyter import magic
# Initialise the appyter magics. The lambda binds the `globals` builtin as a
# default argument and calls it when invoked, so `magic.init` receives a
# callable that returns this notebook's global namespace.
magic.init(lambda _=globals: _())

# Gene Conversion Appyter

Here we convert a transcript expression count matrix into a gene expression count matrix leveraging official NCBI Symbols.

See: <ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/> for more information.

In [None]:
import numpy as np
import pandas as pd

In [None]:
%%appyter hide_code

{# Declares the form section named 'primary' (title, subtitle and icon shown for it). #}
{% do SectionField(
    name='primary',
    title='Gene Conversion',
    subtitle='Configure your gene conversion',
    img='icon.png',
) %}

## Step 1: Load the input data

It's important to note that the **first column** of your matrix should contain the symbols we're converting, and the **first row** should contain the column headers. More advanced tabular formats are not yet supported.

In [None]:
%%appyter hide_code

{# Informational HTML note in the 'primary' section describing the expected input layout. #}
{% do DescriptionField(
    name='desc',
    text='''
      <div class="alert alert-info mb-0">
      Your input matrix should have the gene identifiers to be
      converted in the <b>first column</b> of the matrix, and
      at most <b>one row of header</b>. In the case of Excel which
      is, in general, not recommended due to its tendency to convert
      gene names into dates, only the first sheet will be used.
      </div>
    ''',
    section='primary',
) %}

{# Input matrix to convert; `file` is later interpolated into the read and write calls. #}
{# The single example entry maps the default file name to a GEO download URL. #}
{% set file = FileField(
  name='file',
  label='File to convert',
  description='Tab-separated matrix to be converted from transcripts to genes',
  default='GSE152418.txt.gz',
  examples={
    'GSE152418.txt.gz': 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE152418&format=file&file=GSE152418%5Fp20047%5FStudy1%5FRawCounts%2Etxt%2Egz',
  },
  section='primary',
) %}

{# File format selector. Each choice maps a label to a keyword-argument snippet that is #}
{# pasted verbatim into the pd.read_csv / to_csv calls; the special value "excel" selects #}
{# the read_excel / to_excel branch instead. The gzipped CSV entry sets no `sep`, so it #}
{# relies on pandas' default ',' separator. #}
{% set file_format = ChoiceField(
  name='file_format',
  label='File Format',
  description='Please select your file format',
  default='TSV (.tsv / .txt)',
  choices={
    'TSV (.tsv / .txt)': "sep='\\t',",
    'GZipped TSV (.tsv.gz / .txt.gz)': "sep='\\t', compression='gzip',",
    'CSV (.csv)': "sep=',',",
    'GZipped CSV (.csv.gz)': "compression='gzip',",
    'Excel Sheet 1 (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt)': "excel",
  },
  section='primary',
) %}

In [None]:
%%appyter code_eval

# Load the uploaded matrix, using its first column as the row index.
# The template emits either the read_excel or the read_csv call, depending on the selected file format.
{% if file_format.value == "excel" %}
# No sheet is specified, so pandas reads the first sheet (sheet_name defaults to 0).
data = pd.read_excel(
    {{ file }},
    index_col=0,
)
{% else %}
# The selected format contributes the extra keyword arguments (separator and/or compression).
data = pd.read_csv(
    {{ file }},
    index_col=0,
    {{ file_format }}
)
{% endif %}
data

In [None]:
%%appyter hide_code

{# Organism selector. Each value is a '<group>/<organism>' path fragment that is inserted #}
{# into the NCBI GENE_INFO FTP URL in front of '.gene_info.gz'. #}
{% set organism = ChoiceField(
  name='organism',
  label='Select the organism',
  description='Different organisms have different sets of genes available; all NCBI supported organisms can be supported.',
  default='Homo sapiens',
  choices={
    'Homo sapiens': 'Mammalia/Homo_sapiens',
    'Mus musculus': 'Mammalia/Mus_musculus',
    'Rattus norvegicus': 'Mammalia/Rattus_norvegicus',
    'Pan troglodytes': 'Mammalia/Pan_troglodytes',
    'Sus scrofa': 'Mammalia/Sus_scrofa',
    'Canis familiaris': 'Mammalia/Canis_familiaris',
    'Bos taurus': 'Mammalia/Bos_taurus',
  },
  section='primary',
) %}

{# Gene types to keep; the selection is compared against the 'type_of_gene' column of the #}
{# NCBI gene_info table. Only 'protein-coding' is selected by default. #}
{% set gene_types = MultiChoiceField(
  name='gene_types',
  label='Gene types',
  description='Types of genes to include in the mapping',
  default=['protein-coding'],
  choices=[
    'protein-coding',
    'ncRNA',
    'pseudo',
    'biological-region',
    'unknown',
    'other',
    'tRNA',
    'snoRNA',
    'snRNA',
    'rRNA',
    'scRNA',
  ],
  section='primary',
) %}

In [None]:
%%appyter code_eval

# Download the gzipped, tab-separated NCBI gene_info table for the selected organism.
ncbi_gene_info = pd.read_csv(
    'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{{ organism }}.gene_info.gz',
    compression='gzip', sep='\t',
)
# Keep only the genes whose type was selected in the form.
# Series.isin is used instead of np.in1d, which is deprecated since NumPy 2.0.
ncbi_gene_info = ncbi_gene_info[ncbi_gene_info['type_of_gene'].isin({{ gene_types }})]
ncbi_gene_info

## Step 1b: Prepare NCBI Gene Synonym lookup table

To perform this conversion, we'll first build a lookup table from the NCBI gene info table that maps every known synonym and identifier of a gene to its official symbol. Then we'll use that lookup table for the conversion.

In [None]:
# create a lookup dictionary using the ncbi table

def maybe_split(record):
  ''' Split a '|' delimited NCBI field into the set of its values.

  NCBI stores nulls as '-' and lists '|' delimited. Empty strings, '-' and
  non-string values yield an empty set.

  :param record: the raw cell value of a gene_info column
  :return: a set with the individual values (possibly empty)
  '''
  # pandas.read_csv turns cells such as 'NA', 'null' or 'nan' into float NaN by
  # default; those have no `.split`, so treat any non-string as "no value".
  if not isinstance(record, str) or record in {'', '-'}:
    return set()
  return set(record.split('|'))

def supplement_dbXref_prefix_omitted(ids):
  ''' Yield each external id as-is and, when it is written as Foreign:ID,
  also the part after the first ':' (most datasets only use the bare ID).
  '''
  for xref in ids:
    # the id exactly as NCBI stores it
    yield xref
    # everything after the first ':' when a prefix is present
    _prefix, colon, bare_id = xref.partition(':')
    if colon:
      yield bare_id

# Collect every unique (synonym, official symbol) pair. Using a set removes the
# duplicates that arise when the same identifier appears in several columns of
# one gene record.
synonym_symbol_pairs = {
  (alternate_symbol, gene_info['Symbol'])
  for _, gene_info in ncbi_gene_info.iterrows()
  for alternate_symbol in set.union(
    maybe_split(gene_info['Symbol']),
    maybe_split(gene_info['Symbol_from_nomenclature_authority']),
    maybe_split(str(gene_info['GeneID'])),
    maybe_split(gene_info['Synonyms']),
    maybe_split(gene_info['Other_designations']),
    maybe_split(gene_info['LocusTag']),
    set(supplement_dbXref_prefix_omitted(maybe_split(gene_info['dbXrefs']))),
  )
}
# zip(*pairs) yields nothing for an empty set, which would make the two-name
# unpacking raise ValueError (e.g. when no gene type matched), so fall back to
# two empty tuples in that case.
synonym, symbol = zip(*synonym_symbol_pairs) if synonym_symbol_pairs else ((), ())
# Series of official symbols indexed by synonym; the index may hold duplicates
# when one synonym belongs to several genes. dtype=object keeps the empty case
# from defaulting to a numeric dtype.
ncbi_lookup = pd.Series(symbol, index=synonym, dtype=object)
ncbi_lookup

In [None]:
# Some synonyms are ambiguous, i.e. they appear more than once in the lookup index.
# Count how many times each synonym occurs so the ambiguous ones can be identified.
index_values = ncbi_lookup.index.value_counts()
index_values

In [None]:
# Synonyms that occur more than once are removed altogether rather than resolved;
# a smarter disambiguation strategy could replace this in the future.
ncbi_lookup_disambiguated = ncbi_lookup.drop(
    index_values.loc[index_values.gt(1)].index
)
ncbi_lookup_disambiguated

## Step 2: Use Disambiguated NCBI lookup table to map our data

In [None]:
%%appyter hide_code

{# Checkbox (off by default) that selects the mapping variant which strips everything #}
{# from the first '.' of each identifier before the lookup. #}
{% set versioned_ensembl_id = BoolField(
  name='versioned_ensembl_id',
  label='Do you have versioned ENSEMBL IDs?',
  description='ENSEMBL Ids may be postfixed with the version e.g `ENSG00000227232.1`. If your ENSEMBL ids are versioned, you can check this to prune the versions for a successful conversion.',
  default=False,
  section='primary',
) %}

In [None]:
%%appyter code_eval

# Mapping between current transcripts to the official NCBI Gene symbols, dropping anything that doesn't map
ncbi_lookup_disambiguated_dict = ncbi_lookup_disambiguated.to_dict()
transcript_genes = pd.Series(
{%- if versioned_ensembl_id.value %}
    data.index.map(lambda i: ncbi_lookup_disambiguated_dict.get(i.split('.')[0])),
{%- else %}
    data.index.map(ncbi_lookup_disambiguated_dict.get),
{%- endif %}
    index=data.index,
).dropna()
transcript_genes

In [None]:
# Variance of every row (transcript), computed across the columns of the matrix
transcript_variance = data.var(axis='columns')
transcript_variance

In [None]:
# Combine variance and gene symbol per transcript; the inner join on the index
# keeps only the transcripts that were mapped to a gene
transcript_gene_variance = transcript_variance.to_frame('variance').join(
    transcript_genes.to_frame('gene'),
    how='inner',
)
transcript_gene_variance

In [None]:
# When several transcripts map to the same gene, keep only the transcript with the highest variance:
# group by gene and take the index label (the transcript identifier) of the maximum variance in each group
transcript_gene_mapping = transcript_gene_variance.groupby('gene')['variance'].idxmax()
transcript_gene_mapping

In [None]:
# Fetch the expression row of the transcript chosen for each gene, then discard
# the helper 'transcript' column so only the expression values remain
data_genes = (
    transcript_gene_mapping
    .to_frame('transcript')
    .merge(data, left_on='transcript', right_index=True)
    .drop(columns='transcript')
)
data_genes

## Step 3. Export results

In [None]:
# Report the matrix dimensions (rows, columns) before and after the conversion
print(f"Converted Matrix of shape {data.shape} => {data_genes.shape}")

In [None]:
%%appyter code_eval

# Write the gene-level matrix to 'converted_' + the input file name.
# The template emits either the to_excel or the to_csv call, depending on the selected file format.
{% if file_format.value == "excel" %}
# NOTE(review): this writes back to the input's extension, which assumes pandas has a
# writer engine for it - confirm for the legacy/binary Excel extensions offered in the form.
data_genes.to_excel(
    'converted_' + {{ file }},
)
{% else %}
# The same separator/compression keyword arguments used for reading are applied when writing.
data_genes.to_csv(
    'converted_' + {{ file }},
    {{ file_format }}
)
{% endif %}
data_genes

In [None]:
%%appyter markdown
# Your results are available for download **[here](./{{ ('converted_' + file.value)|urlencode }})**.