<a href="https://colab.research.google.com/github/Karine-Moussa/PANGO-Genomic-Conversions/blob/main/COVID_19_Variant_Conversion_Utility_(single_input).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PANGO Mutation Nomenclature: Convert Genetic Location to Genomic Location


---



# Instructions
- In the Colab menu, select *Runtime > Run all*

- In the **Input** section, enter the mutation of interest in PANGO format, then press `Enter`

 - `[GENE]:[REF_AMINO_ACID][AMINO_ACID_LOC][ALT_AMINO_ACID]`

   - Example: `S:D1118H`
 -  **Note:** do not include the mutation-type prefix
   - e.g., enter `S:D1118H`, not ~~`aa:`~~`S:D1118H`

---



## INPUT

In [None]:
# Input mutation of interest
import re

# Keep prompting until the text has the shape
# [GENE]:[REF_AMINO_ACID][AMINO_ACID_LOC][ALT_AMINO_ACID], e.g. S:D1118H
while True:
  # Surrounding whitespace (e.g. from copy/paste) is not part of the mutation name
  mut_input = input().strip()
  # Verify nomenclature. fullmatch, unlike match, also rejects trailing junk
  # such as "S:D1118H!!". The raw string avoids the invalid "\*" escape.
  if re.fullmatch(r"([a-z 0-9]+):([a-z]+)([0-9]+)([a-z\*]+)", mut_input, flags=re.IGNORECASE):
    mutation = mut_input
    break
  print("Try again:")


S:D1118H


# MAIN

## Set up

In [None]:
# Install python packages
!pip -q install pyfaidx # -q suppresses pip output message

  Building wheel for pyfaidx (setup.py) ... [?25l[?25hdone


In [None]:
# Import python libraries
import os.path
from os import path
import pandas as pd
from pyfaidx import Fasta

In [None]:
# Get Genome
if path.exists("GCF_009858895.2_ASM985889v3_genomic.fna") == False:
  !wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz
  !gunzip GCF_009858895.2_ASM985889v3_genomic.fna.gz

--2021-05-01 15:31:43--  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.11, 130.14.250.7, 2607:f220:41e:250::10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9591 (9.4K) [application/x-gzip]
Saving to: ‘GCF_009858895.2_ASM985889v3_genomic.fna.gz’


2021-05-01 15:31:43 (139 MB/s) - ‘GCF_009858895.2_ASM985889v3_genomic.fna.gz’ saved [9591/9591]



In [None]:
# Get genome gff 
if path.exists("GCF_009858895.2_ASM985889v3_genomic.gff") == False:
  !wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gff.gz
  !gunzip GCF_009858895.2_ASM985889v3_genomic.gff

--2021-05-01 15:31:44--  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gff.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.11, 130.14.250.7, 2607:f220:41e:250::10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2914 (2.8K) [application/x-gzip]
Saving to: ‘GCF_009858895.2_ASM985889v3_genomic.gff.gz’


2021-05-01 15:31:44 (139 MB/s) - ‘GCF_009858895.2_ASM985889v3_genomic.gff.gz’ saved [2914/2914]



In [None]:
# Load genome, grab DNA like genome["NC_045512.2"][start:end]
# (this notebook slices it with a 0-based start and an exclusive end)
genome = Fasta('GCF_009858895.2_ASM985889v3_genomic.fna')

In [None]:
# Load gff as pandas data frame
# The file is read without a header row, so columns are addressed by integer
# position; lines starting with "#" are skipped.
df = pd.read_csv(
    "GCF_009858895.2_ASM985889v3_genomic.gff",
    sep="\t",
    comment="#",
    header=None,
)

## Conversion

In [None]:
# DNA codon -> one-letter amino acid ("*" marks a stop codon).
# Codons are generated with the second base as the outer loop, then the first
# base, then the third, each over T, C, A, G; the amino-acid string below lists
# the translations in exactly that order (four codons per quoted group).
codon_table = dict(zip(
    (first + second + third
     for second in "TCAG"
     for first in "TCAG"
     for third in "TCAG"),
    "FFLL" "LLLL" "IIIM" "VVVV"    # second base T: TTx, CTx, ATx, GTx
    "SSSS" "PPPP" "TTTT" "AAAA"    # second base C: TCx, CCx, ACx, GCx
    "YY**" "HHQQ" "NNKK" "DDEE"    # second base A: TAx, CAx, AAx, GAx
    "CC*W" "RRRR" "SSRR" "GGGG",   # second base G: TGx, CGx, AGx, GGx
))

In [None]:
# Create dna -> aa translate function
def translate(seq):
    """Translate a DNA sequence into a string of one-letter amino-acid codes.

    The sequence is read in consecutive, non-overlapping 3-base codons starting
    at index 0, and each codon is looked up in ``codon_table``.

    Args:
        seq: DNA sequence. Lower-case bases are accepted and are upper-cased
            before the lookup.

    Returns:
        One character per codon ("*" for a stop codon); "" for an empty input.

    Raises:
        KeyError: if a codon is not in ``codon_table``, e.g. a trailing partial
            codon when ``len(seq)`` is not a multiple of 3, or a codon
            containing a character other than A, C, G or T.
    """
    # The table keys are upper-case, so normalise the input before the lookup
    seq = str(seq).upper()
    return "".join(codon_table[seq[loc:loc+3]] for loc in range(0, len(seq), 3))

In [None]:
# Parse mutation format: [GENE]:[REF_AMINO_ACID][AMINO_ACID_LOC][ALT_AMINO_ACID]
parsed = re.match(r"([a-z 0-9]+):([a-z]+)([0-9]+)([a-z\*]+)", mutation, flags=re.IGNORECASE)
if parsed is None:
  raise ValueError("Mutation '{}' is not in [GENE]:[REF][LOC][ALT] format".format(mutation))
gene_name, before, loc, after = parsed.groups()
# The pattern allows spaces in the gene part; they are not part of the gene name
gene_name = gene_name.strip()
# Matching is case-insensitive, but the codon table yields upper-case amino acids,
# so normalise here; otherwise "s:d1118h" would always fail the reference check
before, after = before.upper(), after.upper()
loc = int(loc)

In [None]:
# Collect GFF information on gene
# Work on a filtered copy: `df` keeps the raw GFF, so this cell can be re-run
# without failing, and no SettingWithCopyWarning is triggered.
gene_rows = df[df[2] == "gene"].copy()
# Column 8 holds ";"-separated key=value attributes; extract the value of the
# "gene" key. A regex is used because str.lstrip("gene=") strips a *set of
# characters*, not the prefix, and would turn e.g. "gene=nsp1" into "sp1".
gene_rows[8] = gene_rows[8].str.extract(r"(?:^|;)gene=([^;]*)", expand=False).str.lower()
gene_matches = gene_rows[gene_rows[8] == gene_name.strip().lower()]
if gene_matches.empty:
  known_genes = ", ".join(sorted(gene_rows[8].dropna().unique()))
  raise ValueError("Gene '{}' not found in GFF; available genes: {}".format(gene_name, known_genes))
# First matching record as a dict keyed by column position
gene_data = gene_matches.to_dict("records")[0]
gene_data


{0: 'NC_045512.2',
 1: 'RefSeq',
 2: 'gene',
 3: 21563,
 4: 25384,
 5: '.',
 6: '+',
 7: '.',
 8: 's'}

In [None]:
### Locate the amino acid in the genome
# gene_data[3] is the gene's start coordinate from the GFF; subtracting 1 gives
# the offset used for slicing the genome.
gene_offset = gene_data[3] - 1
# Each amino acid before position `loc` occupies three nucleotides
codon_offset = 3 * (loc - 1)
start = gene_offset + codon_offset
end = start + 3

## Check

In [None]:
# Check: translate the located codon and compare it with the reference
# amino acid given in the input
codon = str(genome["NC_045512.2"][start:end])
check = translate(codon)
print("PANGO\t\tRef Amino Acid:\t%s" % before)
print("Converter\tRef Amino Acid:\t%s" % check)
# True when the genome agrees with the reference amino acid from the input
check_flag = before == check
if not check_flag:
  print("Check Failed")

PANGO		Ref Amino Acid:	D
Converter	Ref Amino Acid:	D


In [None]:
# Prepare final output
# One tab-aligned line per reported field; joined into a single string below
report_lines = [
  "PANGO Mutation\t\t{}\n".format(mutation),
  "Genomic Start Loc:\t{}\n".format(start),
  "Genomic End Loc:\t{}\n".format(end),
  "Genomic Amino Acid:\t{}\n".format(check),
]
# Flag the report when the reference amino acid did not match the genome
if not check_flag:
  report_lines.append("Note: Amino Acid Verification failed")
output = "".join(report_lines)

# OUTPUT

In [None]:
# Show the conversion report built in the "Prepare final output" cell
print(output)

PANGO Mutation		S:D1118H
Genomic Start Loc:	24913
Genomic End Loc:	24916
Genomic Amino Acid:	D

