In [2]:
import argparse
import json
from Bio import SeqIO
import pandas as pd
from collections import Counter, defaultdict

In [18]:
def CDS_finder(reference):
    """Map each CDS feature's gene name to the positions of its location.

    Args:
        reference: record object exposing ``features``; every feature needs
            ``type``, ``qualifiers`` and ``location`` attributes.

    Returns:
        dict keyed by the first value of each CDS feature's 'gene' qualifier,
        holding ``list(feature.location)``. When two CDS features share a gene
        name, the later one wins.

    Raises:
        KeyError: if a CDS feature has no 'gene' qualifier.
    """
    coding_features = (feat for feat in reference.features if feat.type == 'CDS')
    return {
        feat.qualifiers['gene'][0]: list(feat.location)
        for feat in coding_features
    }

def Synonymous_Mutations(reffile, node, dictionary_=None, new_=None):
    """Collect, per tree node, nucleotide mutations that share no codon with an amino-acid mutation.

    The tree is walked depth-first (parent before children). For every node
    whose 'branch_attrs' carries 'mutations' with a 'nuc' list, a nucleotide
    mutation (written <ref><position><alt>, e.g. 'G27A') is kept when it
    contains none of the symbols '-', '*', 'N', 'R', 'Y', 'M', 'D' and its
    position is not inside a codon hit by an amino-acid mutation on the same
    branch. Positions are NOT checked against CDS boundaries here.

    Args:
        reffile: GenBank reference, passed to ``SeqIO.read`` exactly once per call.
        node: nested tree node (dict) with 'branch_attrs' and optional
            'name' / 'children' entries.
        dictionary_: optional dict filled in place; created when None.
        new_: optional list stored for ``node`` itself when its 'branch_attrs'
            has no 'mutations' entry (it is replaced by a fresh list otherwise).

    Returns:
        ``dictionary_``, mapping node name -> list of kept mutation strings.
    """
    if dictionary_ is None:
        dictionary_ = dict()
    # Parse the reference once per call rather than once per visited node.
    gene_cds = CDS_finder(SeqIO.read(reffile, "genbank"))
    skipped_symbols = ('-', '*', 'N', 'R', 'Y', 'M', 'D')

    def visit(current, kept):
        # Handle one node, then recurse into its children in order.
        if kept is None:
            kept = []
        if 'mutations' in current['branch_attrs']:
            kept = []
            mutations = current['branch_attrs']['mutations']
            if 'nuc' in mutations:
                aa_mutations = []
                for gene, loc in gene_cds.items():
                    if gene in mutations:
                        for mut in mutations[gene]:
                            # Amino-acid index -> nucleotide coordinate.
                            # NOTE(review): with 0-based Biopython locations this is
                            # presumably the 1-based third base of the codon - confirm,
                            # especially for reverse-strand or spliced CDS.
                            aa_mutations.append(int(mut[1:-1]) * 3 + loc[0])
                # Diagnostic output, kept so the notebook's recorded output stays valid.
                print(aa_mutations, mutations['nuc'])
                codon_ends = set(aa_mutations)
                for mut in mutations['nuc']:
                    if any(symbol in mut for symbol in skipped_symbols):
                        continue
                    position = int(mut[1:-1])
                    # Same codon <=> position, position+1 or position+2 is a recorded codon end.
                    if codon_ends.isdisjoint((position, position + 1, position + 2)):
                        kept.append(mut)
        if 'name' in current:
            dictionary_[current['name']] = kept
        if 'children' in current:
            for child in current['children']:
                visit(child, None)

    visit(node, new_)
    return dictionary_



#ref =  SeqIO.read("/home/laura/code/mutation_patterns/data/areference.gbk", "genbank")


# Load the tree JSON; the parsed document stays bound to `f`, which later cells also read.
with open ("/home/laura/code/mutation_patterns/data/rsv_a_genome.json") as file_:
    f = json.load(file_) 
# NOTE(review): the tree file is named rsv_a_* but this reference is breference.gbk,
# while the commented-out line above points at areference.gbk - confirm the pairing is intended.
ref =  "/home/laura/code/mutation_patterns/data/breference.gbk"
# Walk the tree from its root node; as the cell's last expression the returned dict is displayed.
Synonymous_Mutations(ref, f['tree'])

[390, 716, 1763, 2542, 3511, 4677, 4779, 4878, 4887, 4965, 4968, 4977, 4986, 4998, 5028, 5034, 5061, 5088, 5136, 5289, 5310, 5331, 5364, 5397, 5415, 5427, 5439, 5523, 5532, 5577, 5580, 5595, 5607, 5616, 5625, 5628, 5665, 5689, 5701, 6007, 6013, 8185, 8206, 8263, 8275, 8284, 8293, 8335, 8371, 8661, 8727, 8796, 8970, 9156, 9180, 9195, 9204, 9249, 9819, 10293, 10746, 10947, 12024, 13137, 13455, 13644, 13653, 13683, 13758, 13809, 13824, 14397, 14547] ['G27A', 'T85C', 'G88A', 'C191T', 'C242A', 'A260G', 'G314A', 'T342C', 'T356C', 'C386T', 'C404T', 'A408C', 'A524T', 'G577A', 'T586C', 'C609T', 'G616A', 'A636T', 'C675T', 'T699C', 'G737A', 'G792A', 'G843A', 'C858T', 'A906G', 'T912C', 'G951A', 'C963T', 'G998A', 'T1001C', 'C1009T', 'G1012T', 'C1025T', 'G1026A', 'T1030C', 'T1038C', 'C1039T', 'C1049T', 'C1107T', 'G1115A', 'A1116G', 'A1136G', 'C1196T', 'T1346G', 'A1355G', 'G1421A', 'G1520A', 'G1571A', 'G1628A', 'A1649G', 'C1704T', 'T1748C', 'A1769G', 'T1782C', 'T1787C', 'T1853C', 'G1919A', 'C1958T', 

{'NODE_0000000': [],
 'NODE_0000117': ['G27A',
  'T85C',
  'G88A',
  'C191T',
  'C242A',
  'A260G',
  'G314A',
  'T342C',
  'T356C',
  'C386T',
  'C404T',
  'A408C',
  'A524T',
  'G577A',
  'T586C',
  'C609T',
  'G616A',
  'A636T',
  'C675T',
  'T699C',
  'G737A',
  'G792A',
  'G843A',
  'C858T',
  'A906G',
  'T912C',
  'G951A',
  'C963T',
  'G998A',
  'T1001C',
  'C1009T',
  'G1012T',
  'C1025T',
  'G1026A',
  'T1030C',
  'T1038C',
  'C1039T',
  'C1049T',
  'C1107T',
  'G1115A',
  'A1116G',
  'A1136G',
  'C1196T',
  'T1346G',
  'A1355G',
  'G1421A',
  'G1520A',
  'G1571A',
  'G1628A',
  'A1649G',
  'C1704T',
  'T1748C',
  'A1769G',
  'T1782C',
  'T1787C',
  'T1853C',
  'G1919A',
  'C1958T',
  'G1991T',
  'A2024T',
  'A2063T',
  'T2075C',
  'A2144G',
  'A2159G',
  'T2183A',
  'T2238C',
  'G2325A',
  'A2391G',
  'C2403T',
  'A2409G',
  'C2526T',
  'A2532C',
  'T2561C',
  'T2616C',
  'G2619A',
  'A2640T',
  'T2691C',
  'A2706G',
  'T2730C',
  'A2739G',
  'A2766G',
  'A2802G',
  'A2805G',

In [28]:
def Synonymous_Mutations(reffile, node, dictionary_=None, new_=None):
    """Exploratory walk that prints the mutations recorded on every branch.

    For each node whose 'branch_attrs' has 'mutations' with a 'nuc' entry, the
    amino-acid mutations of every gene found in the reference CDS table are
    printed, followed by the nucleotide mutations. No filtering is performed:
    each named node is stored with an empty list (or with ``new_`` for the
    starting node when one is supplied).

    Args:
        reffile: GenBank reference, passed to ``SeqIO.read`` exactly once per call.
        node: nested tree node (dict) with 'branch_attrs' and optional
            'name' / 'children' entries.
        dictionary_: optional dict filled in place; created when None.
        new_: optional list stored for ``node`` itself.

    Returns:
        ``dictionary_``, mapping node name -> list.
    """
    if dictionary_ is None:
        dictionary_ = dict()
    # Parse the reference once per call rather than once per visited node.
    gene_cds = CDS_finder(SeqIO.read(reffile, "genbank"))

    def visit(current, kept):
        # Print one node's mutations, record it, then recurse into its children.
        if kept is None:
            kept = []
        if 'mutations' in current['branch_attrs']:
            mutations = current['branch_attrs']['mutations']
            if 'nuc' in mutations:
                for gene in gene_cds:
                    if gene in mutations:
                        print(gene, mutations[gene])
                print("nucl", mutations['nuc'])
        if 'name' in current:
            dictionary_[current['name']] = kept
        if 'children' in current:
            for child in current['children']:
                visit(child, None)

    visit(node, new_)
    return dictionary_
# Disabled call of the print-only traversal defined above.
#Synonymous_Mutations(ref, f['tree'])


# Print the entire parsed JSON document held in `f` (produces a very large output).
print(f)

{'version': 'v2', 'meta': {'title': 'RSV-A phylogeny', 'updated': '2023-06-04', 'build_url': 'https://github.com/nextstrain/rsv', 'data_provenance': [{'name': 'GenBank', 'url': 'https://www.ncbi.nlm.nih.gov/genbank/'}], 'maintainers': [{'name': 'Laura Urbanska', 'url': 'http://nextstrain.org'}, {'name': 'Richard Neher', 'url': 'http://nextstrain.org'}], 'display_defaults': {'map_triplicate': True, 'transmission_lines': False}, 'genome_annotations': {'nuc': {'end': 15282, 'start': 1, 'strand': '+', 'seqid': 'results/a/genome/newreference.gbk', 'type': 'source'}, 'F': {'end': 7449, 'seqid': 'results/a/genome/newreference.gbk', 'start': 5725, 'strand': '+', 'type': 'CDS'}, 'G': {'end': 5645, 'seqid': 'results/a/genome/newreference.gbk', 'start': 4680, 'strand': '+', 'type': 'CDS'}, 'L': {'end': 15047, 'seqid': 'results/a/genome/newreference.gbk', 'start': 8560, 'strand': '+', 'type': 'CDS'}, 'M': {'end': 4022, 'seqid': 'results/a/genome/newreference.gbk', 'start': 3252, 'strand': '+', 'ty

In [80]:
# Load the tree JSON. The parsed document is bound to `file_` (the name later
# statements use); the open handle gets its own name so it is not rebound
# inside its own `with` block.
with open("/home/laura/code/mutation_patterns/data/rsv_a_genome.json") as json_handle:
    file_ = json.load(json_handle)

# Positions start..end (inclusive) for every annotation entry of the JSON just
# loaded, instead of relying on `f` left over from an earlier cell.
CDS_locations = dict()
for gene, data in file_['meta']['genome_annotations'].items():
    CDS_locations[gene] = list(range(data['start'], data['end'] + 1))
# The 'nuc' entry is not a gene, so it is dropped from the table.
CDS_locations.pop('nuc')


def CDS_finder(jsonfile):
    """Map every annotated gene of the tree JSON to the positions it spans.

    Args:
        jsonfile: parsed JSON document whose ``['meta']['genome_annotations']``
            maps a name to a dict with integer 'start' and 'end' entries.

    Returns:
        dict of name -> list of integers from 'start' to 'end' inclusive, with
        the 'nuc' entry removed.

    Raises:
        KeyError: if there is no 'nuc' entry, or an entry lacks 'start'/'end'.
    """
    annotations = jsonfile['meta']['genome_annotations']
    positions_by_gene = {
        name: list(range(entry['start'], entry['end'] + 1))
        for name, entry in annotations.items()
    }
    # 'nuc' is not a gene, so it is excluded from the result.
    del positions_by_gene['nuc']
    return positions_by_gene


def all_loc_CDS(jsonfile):
    """Return one flat list of every position covered by any annotated gene.

    Positions are concatenated gene by gene in the order ``CDS_finder`` yields
    them; a position shared by several genes appears once per gene.

    Args:
        jsonfile: parsed JSON document accepted by ``CDS_finder``.

    Returns:
        list of integer positions.
    """
    return [
        position
        for gene_positions in CDS_finder(jsonfile).values()
        for position in gene_positions
    ]
    
    

def Synonymous_Mutations(f, node, dictionary_=None, new_=None):
    """Collect, per tree node, CDS nucleotide mutations that hit no mutated codon.

    The tree is walked depth-first (parent before children). For every node
    whose 'branch_attrs' carries 'mutations' with a 'nuc' list, a nucleotide
    mutation (written <ref><position><alt>) is kept when it contains no '-',
    its position lies inside an annotated gene, and the position is not one of
    the three bases of a codon with an amino-acid mutation on the same branch.

    Args:
        f: parsed tree JSON; its genome annotations are read once per call via
            ``CDS_finder`` and ``all_loc_CDS``.
        node: nested tree node (dict) with 'branch_attrs' and optional
            'name' / 'children' entries.
        dictionary_: optional dict filled in place; created when None.
        new_: optional list stored for ``node`` itself when its 'branch_attrs'
            has no 'mutations' entry (it is replaced by a fresh list otherwise).

    Returns:
        ``dictionary_``, mapping node name -> list of kept mutation strings in
        their original order.
    """
    if dictionary_ is None:
        dictionary_ = dict()
    # The annotation tables are the same for every node: build them once per
    # call, and keep the CDS positions in a set for constant-time lookups.
    gene_cds = CDS_finder(f)
    cds_positions = set(all_loc_CDS(f))

    def visit(current, kept):
        # Handle one node, then recurse into its children in order.
        if kept is None:
            kept = []
        if 'mutations' in current['branch_attrs']:
            kept = []
            mutations = current['branch_attrs']['mutations']
            if 'nuc' in mutations:
                codon_positions = set()
                for gene, loc in gene_cds.items():
                    if gene in mutations:
                        for mut in mutations[gene]:
                            # Codon number p of a gene starting at loc[0] covers
                            # loc[0] + 3p - 3 .. loc[0] + 3p - 1.
                            # NOTE(review): assumes forward-strand, unspliced genes - confirm.
                            codon_last = int(mut[1:-1]) * 3 + loc[0] - 1
                            codon_positions.update((codon_last - 2, codon_last - 1, codon_last))
                for nucl in mutations['nuc']:
                    position = int(nucl[1:-1])
                    if position in codon_positions:
                        continue
                    if '-' not in nucl and position in cds_positions:
                        kept.append(nucl)
        if 'name' in current:
            dictionary_[current['name']] = kept
        if 'children' in current:
            for child in current['children']:
                visit(child, None)

    visit(node, new_)
    return dictionary_

print(Synonymous_Mutations(file_, file_['tree']))

{'NODE_0000000': [], 'NODE_0000117': [], 'NODE_0000118': [], 'KU316171': [], 'NODE_0000119': [], 'MG642060': [], 'KU316112': [], 'NODE_0000120': [], 'KU316140': [], 'KU316106': [], 'NODE_0000123': [], 'KU316168': [], 'KU316155': [], 'NODE_0000124': [], 'KU316103': [], 'NODE_0000017': [], 'KJ723478': [], 'NODE_0000031': [], 'NODE_0000127': [], 'KU316146': [], 'NODE_0000128': [], 'KU316120': [], 'MG642038': [], 'KJ723467': [], 'MG642069': [], 'KJ723461': [], 'KJ723475': [], 'NODE_0000134': [], 'MG642075': [], 'KU316160': [], 'NODE_0000136': [], 'KU316169': [], 'KU316152': [], 'KU316154': [], 'NODE_0000138': [], 'NODE_0000139': [], 'NODE_0000140': [], 'KJ723474': [], 'KJ723489': [], 'NODE_0000141': [], 'KU316119': [], 'NODE_0000142': [], 'KU316109': [], 'KU316124': [], 'NODE_0000143': [], 'NODE_0000144': [], 'NODE_0000145': [], 'KP258729': [], 'KP258741': [], 'NODE_0000146': [], 'MG642067': [], 'NODE_0000148': [], 'KJ723491': [], 'KU316162': [], 'NODE_0000149': [], 'KP258730': [], 'KU3161