<a href="https://colab.research.google.com/github/Karine-Moussa/PANGO-Genomic-Conversions/blob/main/COVID_19_PANGO_Variant_Conversion_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PANGO Mutation Nomenclature: Convert Genetic Location to Genomic Location


---



# Instructions

1. In the Colab menu, select *Runtime > Run all*

2. In the **INPUT** section enter one or more mutations in PANGO format, then hit `ENTER`

 - `[GENE]:[REF_AMINO_ACID][AMINO_ACID_LOC][ALT_AMINO_ACID]`
    - Example: `S:D1118H`
 -  **Note:** do not include mutation type
     - e.g., ~~`aa:`~~`S:D1118H`
 - For multiple mutations, enter mutations in **comma-separated** format

---



## INPUT

In [None]:
# Read one line of text holding the mutations of interest.
# Example: S:D1118H, N:P80R, orf1ab:S1188L
import re
mut_input = input()

S:D1118H, orf1ab:S1188L 


# MAIN

## Check Input Format

In [None]:
# Check inputted mutations. Only proceed with those in the correct format.
import re

# Expected shape: [GENE]:[REF_AMINO_ACID][AMINO_ACID_LOC][ALT_AMINO_ACID], e.g. "S:D1118H".
# Raw string so that `\*` reaches the regex engine without an invalid-escape warning.
mutation_pattern = re.compile(r"([a-z 0-9]+):([a-z]+)([0-9]+)([a-z\*]+)", flags=re.IGNORECASE)

# Split into a new name instead of overwriting `mut_input`: the raw string stays
# available, so re-running this cell does not fail on `.replace` of a list.
mut_tokens = mut_input.replace(" ", "").split(",")

mutations = []
for mut in mut_tokens:
  if not mut:
    # Blank input or a stray/trailing comma: nothing to validate.
    continue
  # fullmatch rejects entries with trailing junk (e.g. "S:D1118H7") that a
  # prefix-only match would let through.
  if mutation_pattern.fullmatch(mut):
    mutations.append(mut)
  else:
    print(f"'{mut}' invalid format")

## Set up

In [None]:
# Install python packages
!pip -q install pyfaidx # -q suppresses pip output message

In [None]:
# Import python libraries
import os.path
from os import path
import pandas as pd
from pyfaidx import Fasta

In [None]:
# Get Genome
if path.exists("GCF_009858895.2_ASM985889v3_genomic.fna") == False:
  !wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz
  !gunzip GCF_009858895.2_ASM985889v3_genomic.fna.gz

In [None]:
# Get genome gff 
if path.exists("GCF_009858895.2_ASM985889v3_genomic.gff") == False:
  !wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gff.gz
  !gunzip GCF_009858895.2_ASM985889v3_genomic.gff

In [None]:
# Open the reference FASTA; DNA is then sliced as genome["NC_045512.2"][start:end]
genome_fasta_path = 'GCF_009858895.2_ASM985889v3_genomic.fna'
genome = Fasta(genome_fasta_path)

In [None]:
# Load gff as pandas data frame (no header row, so columns are labelled 0..8)
gff_raw = pd.read_csv("GCF_009858895.2_ASM985889v3_genomic.gff", comment="#", sep="\t", header=None)

# Collect GFF information on all genes
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `gff_raw`.
df = gff_raw[gff_raw[2] == "gene"].copy()

def _gene_name_from_attributes(attributes):
  """Return the lower-cased value of the `gene=` field of a GFF attribute string."""
  prefix = "gene="
  # Slice the prefix off. str.lstrip("gene=") strips any leading run of the
  # characters g/e/n/= and would also eat the start of a lower-case gene name.
  return [field[len(prefix):].lower() for field in attributes.split(";") if field.startswith(prefix)][0]

# Column 8 (attributes) is replaced by the bare, lower-cased gene name
df[8] = df[8].apply(_gene_name_from_attributes)

# Need to add a row for orf1a.
# NOTE(review): 266..13483 is taken to be the ORF1a span on NC_045512.2 (same
# start as orf1ab). The previously commented-out row used 1..265 — please
# confirm against the RefSeq record.
orf1a = pd.DataFrame([['NC_045512.2', 'RefSeq', 'gene', 266, 13483, '.', '+', '.', 'orf1a']])
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
df = pd.concat([df, orf1a], ignore_index=True)
genes_list = list(df[8])

## Conversion

In [None]:
# DNA codon -> one-letter amino acid ("*" marks a stop codon).
# `_amino_acids` lists the residues for codons enumerated with the first base
# slowest and the third base fastest, each base cycling through T, C, A, G.
# The comprehension walks second base -> first base -> third base so that the
# keys are inserted in the order TTT, TTC, TTA, TTG, CTT, ...
_bases = "TCAG"
_amino_acids = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"
codon_table = {
    first + second + third: _amino_acids[16 * i + 4 * j + k]
    for j, second in enumerate(_bases)
    for i, first in enumerate(_bases)
    for k, third in enumerate(_bases)
}

In [None]:
# Create dna -> aa translate function
def translate(seq, table=None):
    """Translate a DNA sequence into a string of one-letter amino acids.

    Parameters
    ----------
    seq : str
        DNA sequence, read three bases at a time starting at its first base.
        Upper- and lower-case bases are both accepted.
    table : dict, optional
        Codon -> amino-acid mapping. Defaults to the notebook-level
        `codon_table`.

    Returns
    -------
    str
        One character per three-base step. A codon that is not in the table
        (e.g. one containing an ambiguous base, or a trailing fragment of one
        or two bases) is reported as "X" instead of raising KeyError.
    """
    if table is None:
        table = codon_table
    seq = str(seq).upper()
    return "".join(table.get(seq[loc:loc + 3], "X") for loc in range(0, len(seq), 3))

In [None]:
# Create mutant parsing function
def mutant_parse(mutation):
  """Split a mutation string such as "S:D1118H" into its four parts.

  Parameters
  ----------
  mutation : str
      Text of the form [GENE]:[REF_AMINO_ACID][AMINO_ACID_LOC][ALT_AMINO_ACID];
      matching is case-insensitive and anchored at the start of the string.

  Returns
  -------
  tuple
      (gene_name, before, loc, after), with `loc` converted to int.

  Raises
  ------
  ValueError
      If `mutation` does not start with the expected pattern.
  """
  # Raw string: `\*` is passed to the regex engine unchanged, without an
  # invalid-escape warning from the Python compiler.
  parsed = re.match(r"([a-z 0-9]+):([a-z]+)([0-9]+)([a-z\*]+)", mutation, flags=re.IGNORECASE)
  if parsed is None:
    # Without this guard the failure surfaces as an AttributeError on None.
    raise ValueError(f"'{mutation}' invalid format")
  gene_name, before, loc, after = parsed.groups()
  return gene_name, before, int(loc), after
# Smoke call on a sample mutation. It leaves `gene_name`, `before`, `loc` and
# `after` bound at notebook level, where other cells can read them.
gene_name, before, loc, after = mutant_parse("S:D80L")

In [None]:
# Get gene start and end
def _gene_coordinate(x, column):
  """Look up one integer GFF column for gene `x` (case-insensitive); 0 if `x` is not in `genes_list`."""
  gene = x.lower()
  if gene not in genes_list:
    return 0
  # First row of `df` whose name column (8) equals the gene
  return int(df.loc[df[8] == gene, column].values[0])

def get_gene_start(x):
  """Return the value of GFF column 3 (start) for gene `x`, or 0 when the gene is unknown."""
  return _gene_coordinate(x, 3)

def get_gene_end(x):
  """Return the value of GFF column 4 (end) for gene `x`, or 0 when the gene is unknown."""
  return _gene_coordinate(x, 4)

In [None]:
# Locate the amino acid in the genome
def locate_aa(gene_data, loc=None):
  """Return the (start, end) slice bounds of one codon of a gene.

  Parameters
  ----------
  gene_data : mapping or row
      Indexed with key 3 to obtain the gene start coordinate.
  loc : int, optional
      Amino-acid position within the gene, counted from 1. When omitted, the
      notebook-level `loc` is used, which is what this function always did
      before the parameter existed.

  Returns
  -------
  tuple
      (start, end) with end == start + 3, where start is
      gene_data[3] - 1 + 3 * (loc - 1).
  """
  if loc is None:
    # Legacy fallback: read the global explicitly, since the parameter shadows it.
    loc = globals()["loc"]
  start = gene_data[3] - 1 + 3 * (loc - 1)
  return (start, start + 3)

In [None]:
# Create df with information on each mutation
# Parsing into a list of tuples first keeps this cell working when `mutations`
# is empty (row-wise apply on an empty frame cannot fill the four columns).
parsed_mutations = [mutant_parse(mut) for mut in mutations]
snpaa = pd.DataFrame(parsed_mutations, columns=['gene_name', 'before', 'loc', 'after'])
snpaa = snpaa.astype({'loc': int})
snpaa.insert(0, 'mutation', mutations)

# add gene start and end to dataframe (both are 0 for a gene missing from the GFF)
snpaa['gene_start'] = snpaa['gene_name'].apply(get_gene_start).astype(int)
snpaa['gene_end'] = snpaa['gene_name'].apply(get_gene_end).astype(int)

# add genomic start and end to dataframe; these are used as a Python slice
# [mut_start:mut_end] covering the three bases of the codon
snpaa['mut_start'] = snpaa['gene_start'] - 1 + 3 * (snpaa['loc'] - 1)
snpaa['mut_end'] = snpaa['mut_start'] + 3

# A codon can only be looked up when the gene was found (start > 0) and the
# codon ends no later than the gene end.
in_gene = (snpaa['gene_start'] > 0) & (snpaa['mut_end'] <= snpaa['gene_end'])

# add translated sequence; rows that cannot be located get an empty string
# instead of the translation of an unrelated stretch of the genome
snpaa['trans_aa'] = [
    translate(str(genome["NC_045512.2"][int(start):int(end)])) if located else ""
    for start, end, located in zip(snpaa['mut_start'], snpaa['mut_end'], in_gene)
]

# add verification column: the codon was located and the stated reference
# amino acid equals the translated one. The comparison is upper-cased because
# the input format check accepts either case.
snpaa['pass'] = in_gene & (snpaa['before'].str.upper() == snpaa['trans_aa'])

In [None]:
# Write the results table to a CSV file in the working directory
output_csv = 'snpaa_subset.csv'
snpaa.to_csv(output_csv)

# OUTPUT

In [None]:
# Show the results table: one row per accepted mutation
snpaa

Unnamed: 0,mutation,gene_name,before,loc,after,gene_start,gene_end,mut_start,mut_end,trans_aa,pass
0,S:D1118H,S,D,1118,H,21563,25384,24913,24916,D,True
1,orf1ab:S1188L,orf1ab,S,1188,L,266,21555,3826,3829,S,True




<br>

To download output, go to the left tab and select the *files* icon > right-click on `snpaa_subset.csv` > select *download*

<img width="200" alt="image" src="https://i.imgur.com/xAc9Eym.png">