In [2]:
import json
from Bio import SeqIO, Phylo, Seq
from string import digits
import pandas as pd
from collections import defaultdict, Counter
import matplotlib.pyplot as plt

In [169]:
def CDS_finder(reference):
    """this function finds CDS location and not CDS location, and saves only those not located at the end of the function"""
    cds_ = dict()
    for feature in reference.features:
        if feature.type == 'CDS': 
            cds_[feature.qualifiers['gene'][0]] = (list(feature.location))
    return(cds_)

ref_file = SeqIO.read("data/areference.gbk", "genbank")

def Synonymous_Mutations(node, dictionary_=None, new_=None):
    """ Finds Synonymous mutations in CDS regions. Input:nested json"""
    gene_cds = CDS_finder(ref_file)
    if new_ is None: new_ = []
    if dictionary_ is None: dictionary_ = dict()
    if 'mutations' in node['branch_attrs']:
        aa_mutations, new_, in_it = ([] for i in range(3))
        if 'nuc' in node['branch_attrs']['mutations']:
            for gene, loc in gene_cds.items():
                if gene in node['branch_attrs']['mutations']:
                    for mut in node['branch_attrs']['mutations'][gene]:
                        aa_mutations.append(int(mut[1:-1])*3+loc[0]) # converting the amino acid location to nucleotide
            for mut in node['branch_attrs']['mutations']['nuc']:
                if '-' not in mut and '*' not in mut and 'N' not in mut and "R" not in mut and "Y" not in mut and "M" not in mut and "D" not in mut:
                    if int(mut[1:-1]) not in aa_mutations and int(mut[1:-1])+2 not in aa_mutations and int(mut[1:-1])+1 not in aa_mutations:  #if the mutation is not in the same codon as a aa mutation
                        new_.append(mut)
                    else: in_it.append(mut[1:-1])
    if 'name' in node:
            dictionary_[node['name']] = new_
    if 'children' in node:
        for child in node['children']:
           Synonymous_Mutations(child, dictionary_, new_=None)
    return(dictionary_)


ref_file = SeqIO.read("data/areference.gbk", "genbank")
with open ("data/rsv_a_genome.json") as file_: f = json.load(file_)  
synonymous = Synonymous_Mutations(f['tree'])



#Extracting all synonymous mutations from json as a list
all_muts = []
for branch, muts in synonymous.items():
    if branch != []:
        for mut in muts:
            if mut[0] and mut[-1] in ["A", "T", "C", "G"]:
                all_muts.append(f'{mut[0]}{mut[-1]}')
all_muts_counter = Counter(all_muts)

#making matrix with mutation rates
df = pd.DataFrame(index=['A', 'C', 'G', 'T'], columns=['A', 'C', 'G', 'T'])      

for mutation, nr in all_muts_counter.items():
    df.at[mutation[0], mutation[-1]] = int(nr)
print("matrix unscaled synonymous")
print(df)


#now have to divide by the nr of locations where the mutation can occur
ref_file = SeqIO.read("data/areference.gbk", "genbank")
gene_cds = CDS_finder(ref_file)

sequence_ref_cds = dict()
whole_seq_CDS = ""
for gene, cds in gene_cds.items(): 
    sequence_ref_cds[gene] = ref_file.seq[cds[0]:cds[-1]]
    whole_seq_CDS = whole_seq_CDS+ref_file.seq[cds[0]:cds[-1]]
whole_seq_CDS = whole_seq_CDS.replace("N", "")
whole_seq_CDS = whole_seq_CDS.replace("M", "")

#translation matrix
translations = {'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'], 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'], 'C': ['TGT', 'TGC'], 'W': ['TGG'], 'E': ['GAA', 'GAG'], 'D': ['GAT', 'GAC'], 'P': ['CCT', 'CCC', 'CCA', 'CCG'], 'V': ['GTT', 'GTC', 'GTA', 'GTG'], 'N': ['AAT', 'AAC'], 'M': ['ATG'], 'K': ['AAA', 'AAG'], 'Y': ['TAT', 'TAC'], 'I': ['ATT', 'ATC', 'ATA'], 'Q': ['CAA', 'CAG'], 'F': ['TTT', 'TTC'], 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], 'T': ['ACT', 'ACC', 'ACA', 'ACG'], '*': ['TAA', 'TAG', 'TGA'], 'A': ['GCT', 'GCC', 'GCA', 'GCG'], 'G': ['GGT', 'GGC', 'GGA', 'GGG'], 'H': ['CAT', 'CAC']}


synonymous_possibilities, nonsynonymous_possibilities = (0 for i in range(2))
for gene, sequence in sequence_ref_cds.items():
    for i, letter in enumerate(sequence):
        if i%3 == 0:
            codon = sequence[i: i+3]
            for key, entry in translations.items():
                if codon in entry:
                    synonymous = len(entry)
                    synonymous_possibilities += synonymous-1 
                    nonsynonymous = 9 - len(entry)-1
                    nonsynonymous_possibilities += nonsynonymous

nonsyn_ratio = nonsynonymous_possibilities/(nonsynonymous_possibilities+synonymous_possibilities)
syn_ratio = synonymous_possibilities/(nonsynonymous_possibilities+synonymous_possibilities)

print("\n","syn  fractions")
print(syn_ratio)

print("\n","scaled by syn ratio")
scaled = df.divide(syn_ratio)
scaled = scaled.fillna(0)
print(scaled)

print("\n", "ratio of nt", "\n")
counter = Counter(whole_seq_CDS)
total = 0
for i in counter.values(): total += i

df_ratios = pd.DataFrame.from_dict(counter, orient='index').astype(int).T
df_ratios = df_ratios.divide(total)
print(df_ratios)

nuc = ["A", "C", "G", "T"]
for n in nuc:
    scaled.loc[[n]] = scaled.loc[[n]].div(float(df_ratios[n]))

print(scaled)

print("\n", "normalized and scaled matrix")

sum_ = scaled.to_numpy().sum()
print("sum of all:",  sum_, "\n")

scaled_and_normalized = scaled.divide(sum_)

print(scaled_and_normalized)




matrix unscaled synonymous
      A     C     G     T
A   NaN   249  2342   662
C   589   NaN    28  4079
G  2301    10   NaN   168
T   541  4138   116   NaN

 syn  fractions
0.34887810367869454

 scaled by syn ratio
             A             C            G             T
A     0.000000    713.716331  6712.946371   1897.510887
C  1688.268750      0.000000    80.257258  11691.762702
G  6595.426815     28.663306     0.000000    481.543548
T  1550.684879  11860.876210   332.494355      0.000000

 ratio of nt 

          A         T        G         C
0  0.384226  0.277973  0.16134  0.176461
              A             C             G             T
A      0.000000   1857.545055  17471.367549   4938.533440
C   9567.377562      0.000000    454.815911  66256.932216
G  40878.949886    177.657322      0.000000   2984.643016
T   5578.544397  42669.162136   1196.138910      0.000000

 normalized and scaled matrix
sum of all: 194031.6674024871 

          A         C         G         T
A  0.000000