In [1]:
#%%appyter init
from appyter import magic
# Hand the appyter magic a zero-argument callable that returns `globals()`;
# the builtin is bound as the lambda's default argument and invoked on each call.
magic.init(lambda _=globals: _())

# Gene Conversion Appyter

Here we convert a transcript expression count matrix into a gene expression count matrix leveraging official NCBI Symbols.

See: <ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/> for more information.

In [2]:
import numpy as np
import pandas as pd

In [3]:
%%appyter hide_code

{# Declares the 'primary' section that the form fields in this notebook reference through section='primary'. #}
{% do SectionField(
    name='primary',
    title='Gene Conversion',
    subtitle='Configure your gene conversion',
) %}

## Step 1: Load the input data

It's important to note that the **first column** of your matrix should contain the symbols we're converting, and the **first row** should contain the column headers. More advanced tabular formats are not yet supported.

In [4]:
%%appyter hide_code

{# Static HTML notice attached to the 'primary' section describing the expected input layout. #}
{% do DescriptionField(
    name='desc',
    text='''
      <div class="alert alert-info mb-0">
      Your input matrix should have the gene identifiers to be
      converted in the <b>first column</b> of the matrix, and
      at most <b>one row of header</b>. In the case of Excel which
      is, in general, not recommended due to its tendency to convert
      gene names into dates, only the first sheet will be used.
      </div>
    ''',
    section='primary',
) %}

{# Input matrix; `file` is interpolated into both the read call and the 'converted_' export path further down. #}
{% set file = FileField(
  name='file',
  label='File to convert',
  description='Tab-separated matrix to be converted from transcripts to genes',
  default='GSE152418.txt.gz',
  examples={
    'GSE152418.txt.gz': 'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE152418&format=file&file=GSE152418%5Fp20047%5FStudy1%5FRawCounts%2Etxt%2Egz',
  },
  section='primary',
) %}

{# Each choice maps a display label to the keyword-argument text interpolated into pd.read_csv / to_csv; the Excel choice instead maps to the sentinel "read_excel" that the loading cell checks for. #}
{% set file_format = ChoiceField(
  name='file_format',
  label='File Format',
  description='Please select your file format',
  default='TSV (.tsv / .txt)',
  choices={
    'TSV (.tsv / .txt)': "sep='\\t',",
    'GZipped TSV (.tsv.gz / .txt.gz)': "sep='\\t', compression='gzip',",
    'CSV (.csv)': "sep=',',",
    'GZipped CSV (.csv.gz)': "compression='gzip',",
    'Excel Sheet 1 (.xls, .xlsx, .xlsm, .xlsb, .odf, .ods, .odt)': "read_excel",
  },
  section='primary',
) %}

In [6]:
%%appyter code_eval

# Load the input matrix, using its first column as the index.
# Excel input goes through pd.read_excel; for every other format the selected
# file_format value is spliced into pd.read_csv as keyword arguments.
{% if file_format.value == "read_excel" %}
data = pd.read_excel(
    {{ file }},
    index_col=0,
)
{% else %}
data = pd.read_csv(
    {{ file }},
    index_col=0,
    {{ file_format }}
)
{% endif %}
data

```python
data = pd.read_csv(
    'GSE152418.txt.gz',
    index_col=0,
    sep='\t',
)
data
```

Unnamed: 0_level_0,S145_nCOV001_C,S147_nCoV001EUHM-Draw-1,S149_nCoV002EUHM-Draw-2,S150_nCoV003EUHM-Draw-1,S151_nCoV004EUHM-Draw-1,S152_nCoV006EUHM-Draw-1,S153_nCoV007EUHM-Draw-1,S154_nCoV0010EUHM-Draw-1,S155_nCOV021EUHM,S156_nCOV024EUHM-Draw-1,...,S068_272,S069_273,S070_279,S071_280,S181_255,S182_SHXA10,S183_263,S184_SHXA18,S185_266,S186_SHXA14
ENSEMBLID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000227232,1,0,3,16,1,2,18,6,20,0,...,1,1,1,2,3,4,1,0,0,7
ENSG00000278267,3,1,2,0,3,0,0,1,3,2,...,2,7,2,6,3,2,6,12,5,3
ENSG00000243485,2,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
ENSG00000284332,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000271254,40,31,56,42,42,66,40,24,32,49,...,62,48,88,38,63,62,83,30,60,77
ENSG00000275405,31,22,13,19,5,4,18,6,6,4,...,8,6,14,15,2,2,7,5,15,3
ENSG00000275987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000277475,0,0,0,0,1,0,0,0,2,0,...,0,0,0,0,1,0,0,0,0,0


In [7]:
%%appyter hide_code

{# Choice values are path fragments interpolated into the NCBI GENE_INFO FTP URL when the gene table is downloaded. #}
{% set organism = ChoiceField(
  name='organism',
  label='Select the organism',
  description='Different organisms have different sets of genes available; all NCBI supported organisms can be supported.',
  default='Homo sapiens',
  choices={
    'Homo sapiens': 'Mammalia/Homo_sapiens',
    'Mus musculus': 'Mammalia/Mus_musculus',
    'Rattus norvegicus': 'Mammalia/Rattus_norvegicus',
    'Pan troglodytes': 'Mammalia/Pan_troglodytes',
    'Sus scrofa': 'Mammalia/Sus_scrofa',
    'Canis familiaris': 'Mammalia/Canis_familiaris',
    'Bos taurus': 'Mammalia/Bos_taurus',
  },
  section='primary',
) %}

{# The selected values are matched against the `type_of_gene` column of the downloaded NCBI table. #}
{% set gene_types = MultiChoiceField(
  name='gene_types',
  label='Gene types',
  description='Types of genes to include in the mapping',
  default=['protein-coding'],
  choices=[
    'protein-coding',
    'ncRNA',
    'pseudo',
    'biological-region',
    'unknown',
    'other',
    'tRNA',
    'snoRNA',
    'snRNA',
    'rRNA',
    'scRNA',
  ],
  section='primary',
) %}

In [8]:
%%appyter code_eval

# Download the NCBI gene_info table for the selected organism (gzipped TSV).
ncbi_gene_info = pd.read_csv(
    'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/{{ organism }}.gene_info.gz',
    compression='gzip', sep='\t',
)
# Keep only rows whose type_of_gene is among the selected gene types.
# Series.isin is the pandas-native membership test; np.in1d is deprecated as of NumPy 2.0.
ncbi_gene_info = ncbi_gene_info[ncbi_gene_info['type_of_gene'].isin({{ gene_types }})]
ncbi_gene_info

```python
ncbi_gene_info = pd.read_csv(
    'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
    compression='gzip', sep='\t',
)
ncbi_gene_info = ncbi_gene_info[np.in1d(ncbi_gene_info['type_of_gene'], ['protein-coding'])]
ncbi_gene_info
```

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20201124,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20201124,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171428,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20201124,-
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156006,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20201124,-
6,9606,12,SERPINA3,-,AACT|ACT|GIG24|GIG25,MIM:107280|HGNC:HGNC:16|Ensembl:ENSG00000196136,14,14q32.13,serpin family A member 3,protein-coding,SERPINA3,serpin family A member 3,O,alpha-1-antichymotrypsin|cell growth-inhibitin...,20201124,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61730,741158,8923209,ND1,-,-,-,MT,-,NADH dehydrogenase subunit 1,protein-coding,-,-,-,NADH dehydrogenase subunit 1,20200909,-
61731,741158,8923210,ND5,-,-,-,MT,-,NADH dehydrogenase subunit 5,protein-coding,-,-,-,NADH dehydrogenase subunit 5,20200909,-
61732,741158,8923211,ATP8,-,-,-,MT,-,ATP synthase F0 subunit 8,protein-coding,-,-,-,ATP synthase F0 subunit 8,20200909,-
61733,741158,8923212,ND2,-,-,-,MT,-,NADH dehydrogenase subunit 2,protein-coding,-,-,-,NADH dehydrogenase subunit 2,20200909,-


## Step 1b. Prepare NCBI Gene Synonym lookup table

To perform this conversion, we'll first build a lookup table from the NCBI gene info table, then use that lookup for the conversion.

In [9]:
# create a lookup dictionary using the ncbi table

def maybe_split(record):
  ''' Split one NCBI gene_info field into the set of values it holds.

  NCBI stores nulls as '-' and lists '|' delimited. Anything that is not a
  string (e.g. the NaN pandas produces for an empty or 'NA' cell, or None)
  is treated as a null as well, rather than failing on `.split`.

  :param record: raw field value taken from the gene_info table
  :returns: set of the '|'-separated entries; empty set for null records
  '''
  if not isinstance(record, str) or record in {'', '-'}:
    return set()
  return set(record.split('|'))

def supplement_dbXref_prefix_omitted(ids):
  ''' Yield each external id unchanged and, for ids written as Foreign:ID,
  additionally the part after the first ':' (most datasets only carry that part).
  '''
  for xref in ids:
    # the id exactly as NCBI lists it
    yield xref
    # then the same id with its leading prefix removed, when it has one
    _, colon, without_prefix = xref.partition(':')
    if colon:
      yield without_prefix

# Collect every unique (synonym, official Symbol) pair. For each gene the
# synonyms are the union of its Symbol, nomenclature-authority symbol, GeneID
# (as a string), Synonyms, Other_designations, LocusTag and dbXrefs (each
# dbXref both with and without its prefix).
synonym_symbol_pairs = {
  (alternate_symbol, gene_info['Symbol'])
  for _, gene_info in ncbi_gene_info.iterrows()
  for alternate_symbol in set.union(
    maybe_split(gene_info['Symbol']),
    maybe_split(gene_info['Symbol_from_nomenclature_authority']),
    maybe_split(str(gene_info['GeneID'])),
    maybe_split(gene_info['Synonyms']),
    maybe_split(gene_info['Other_designations']),
    maybe_split(gene_info['LocusTag']),
    set(supplement_dbXref_prefix_omitted(maybe_split(gene_info['dbXrefs']))),
  )
}
# zip(*pairs) over an empty set gives nothing to unpack, so fall back to two
# empty tuples when the gene type filter left no rows in ncbi_gene_info.
synonym, symbol = zip(*synonym_symbol_pairs) if synonym_symbol_pairs else ((), ())
# Series of official symbols indexed by synonym; a synonym shared by several
# genes appears once per gene it maps to.
ncbi_lookup = pd.Series(symbol, index=synonym)
ncbi_lookup

HGNC:1471                                                                CAMLG
lc3 synthase                                                            B3GNT5
ENSG00000170703                                                          TTLL6
HGNC:4388                                                                 GNAL
HNP3                                                                     DEFA3
                                                                        ...   
pyruvate dehydrogenase, E1 beta polypeptide                               PDHB
pyruvate dehydrogenase, lipoamide, kinase isozyme 3, mitochondrial        PDK3
Ensembl:ENSG00000171564                                                    FGB
HGNC:HGNC:23536                                                       NAALADL1
MIM:607969                                                                CLK4
Length: 281717, dtype: object

In [10]:
# turns out some synonyms are ambiguous:
# count how many times each synonym occurs in the lookup index
index_values = ncbi_lookup.index.value_counts()
index_values

epididymis secretory sperm binding protein    387
seven transmembrane helix receptor             49
flavoprotein-linked monooxygenase              16
microsomal monooxygenase                       15
H4C4                                           14
                                             ... 
VPS4-1                                          1
HGNC:HGNC:10729                                 1
HGNC:6816                                       1
HGNC:9176                                       1
C16orf41                                        1
Length: 274553, dtype: int64

In [11]:
# Synonyms that occur more than once are ambiguous; for now they are removed
# from the lookup entirely (a smarter resolution could be added in the future).
ambiguous_synonyms = index_values[index_values > 1].index
ncbi_lookup_disambiguated = ncbi_lookup.drop(ambiguous_synonyms)
ncbi_lookup_disambiguated

HGNC:1471                                                                CAMLG
lc3 synthase                                                            B3GNT5
ENSG00000170703                                                          TTLL6
HGNC:4388                                                                 GNAL
HNP3                                                                     DEFA3
                                                                        ...   
pyruvate dehydrogenase, E1 beta polypeptide                               PDHB
pyruvate dehydrogenase, lipoamide, kinase isozyme 3, mitochondrial        PDK3
Ensembl:ENSG00000171564                                                    FGB
HGNC:HGNC:23536                                                       NAALADL1
MIM:607969                                                                CLK4
Length: 269494, dtype: object

## Step 2: Use Disambiguated NCBI lookup table to map our data

In [12]:
%%appyter hide_code

{# When checked, the mapping cell keeps only the part of each index label before the first '.' when looking it up. #}
{% set versioned_ensembl_id = BoolField(
  name='versioned_ensembl_id',
  label='Do you have versioned ENSEMBL IDs?',
  description='ENSEMBL Ids may be postfixed with the version e.g `ENSG00000227232.1`. If your ENSEMBL ids are versioned, you can check this to prune the versions for a successful conversion.',
  default=False,
  section='primary',
) %}

In [13]:
%%appyter code_eval

# Mapping between current transcripts to the official NCBI Gene symbols, dropping anything that doesn't map
ncbi_lookup_disambiguated_dict = ncbi_lookup_disambiguated.to_dict()
transcript_genes = pd.Series(
{%- if versioned_ensembl_id.value %}
    data.index.map(lambda i: ncbi_lookup_disambiguated_dict.get(i.split('.')[0])),
{%- else %}
    data.index.map(ncbi_lookup_disambiguated_dict.get),
{%- endif %}
    index=data.index,
).dropna()
transcript_genes

```python
# Mapping between current transcripts to the official NCBI Gene symbols, dropping anything that doesn't map
ncbi_lookup_disambiguated_dict = ncbi_lookup_disambiguated.to_dict()
transcript_genes = pd.Series(
    data.index.map(ncbi_lookup_disambiguated_dict.get),
    index=data.index,
).dropna()
transcript_genes
```

ENSEMBLID
ENSG00000186092           OR4F5
ENSG00000284733          OR4F29
ENSG00000284662          OR4F16
ENSG00000187634          SAMD11
ENSG00000188976           NOC2L
                       ...     
ENSG00000277196    LOC102724788
ENSG00000277630    LOC100288966
ENSG00000278633    LOC102724151
ENSG00000276345    LOC107987373
ENSG00000271254    LOC102724250
Name: ENSEMBLID, Length: 19286, dtype: object

In [14]:
# Variance of every transcript (row), taken across the sample columns
transcript_variance = data.var(axis='columns')
transcript_variance

ENSEMBLID
ENSG00000223972      0.270945
ENSG00000227232     51.871658
ENSG00000278267      8.074866
ENSG00000243485      0.167558
ENSG00000284332      0.000000
                      ...    
ENSG00000271254    573.469697
ENSG00000275405    118.106952
ENSG00000275987      0.117647
ENSG00000277475      0.528520
ENSG00000268674      0.000000
Length: 60683, dtype: float64

In [15]:
# Join each transcript's variance with its gene symbol on the shared index;
# the default inner merge keeps only transcripts present in both.
variance_frame = transcript_variance.to_frame('variance')
gene_frame = transcript_genes.to_frame('gene')
transcript_gene_variance = variance_frame.merge(
    gene_frame,
    left_index=True,
    right_index=True,
)
transcript_gene_variance

Unnamed: 0_level_0,variance,gene
ENSEMBLID,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000186092,0.311052,OR4F5
ENSG00000284733,0.000000,OR4F29
ENSG00000284662,0.000000,OR4F16
ENSG00000187634,3.910873,SAMD11
ENSG00000188976,97646.467914,NOC2L
...,...,...
ENSG00000277196,1.476827,LOC102724788
ENSG00000277630,0.029412,LOC100288966
ENSG00000278633,0.264706,LOC102724151
ENSG00000276345,625.450980,LOC107987373


In [16]:
# When several transcripts map to the same gene, represent that gene by the
# transcript with the highest variance (idxmax returns that transcript's index label).
variance_by_gene = transcript_gene_variance.groupby('gene')['variance']
transcript_gene_mapping = variance_by_gene.idxmax()
transcript_gene_mapping

gene
A1BG       ENSG00000121410
A1CF       ENSG00000148584
A2M        ENSG00000175899
A2ML1      ENSG00000166535
A3GALT2    ENSG00000184389
                ...       
ZYG11A     ENSG00000203995
ZYG11B     ENSG00000162378
ZYX        ENSG00000159840
ZZEF1      ENSG00000074755
ZZZ3       ENSG00000036549
Name: variance, Length: 19233, dtype: object

In [None]:
# Perform the mapping: look up the rows of `data` for each gene's selected
# transcript, then discard the helper 'transcript' column.
selected_transcripts = transcript_gene_mapping.to_frame('transcript')
data_genes = selected_transcripts.merge(
    data,
    left_on='transcript',
    right_index=True,
).drop(columns='transcript')
data_genes

In [None]:
%%appyter markdown

## Step 3. Export results

Your results are available for download [here](./converted_{{ file.value }}).

In [None]:
# Report the shape of the input matrix and of the converted gene-level matrix.
print(f"Converted Matrix of shape {data.shape} => {data_genes.shape}")

In [None]:
%%appyter code_eval

{% if file_format.value == "excel" %}
data_genes.to_excel(
    'converted_' + {{ file }},
)
{% else %}
data_genes.to_csv(
    'converted_' + {{ file }},
    {{ file_format }}
)
{% endif %}
data