In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from Bio import SeqIO
from Bio.Data import CodonTable
from Bio.Seq import Seq

In [2]:
# Read the per-mutation table produced by the amino-acid reconstruction step.
article_pos_file = "data/B.1_2022-08-22.og.aln.pruned.tree.amino_acid.reconstruction.csv"
article_df = pd.read_csv(article_pos_file)  # comma is the default delimiter
print(article_df.shape[0])  # number of rows loaded
article_df.head()

800


Unnamed: 0,site,gene,direction,snp,dimer,apobec,aa_position,parent,parent_codon,parent_aa,child,child_codon,child_aa,mutation_category,score,prediction,homoplasy,occurrence
0,3111,OPG003_CDS_176,reverse,G->A,GA,True,2.0,Node1,TCC,S,Node2,TTC,F,nonsynonymous,155.0,radical,False,1
1,39139,OPG057_CDS_142,reverse,C->T,TC,True,3.0,Node1,GCG,A,Node2,GCA,A,synonymous,,,False,1
2,73239,OPG093_CDS_107,forward,G->A,GA,True,2.0,Node1,AGA,R,Node2,AAA,K,nonsynonymous,26.0,conservative,False,1
3,74205,OPG094_CDS_106,forward,G->A,GA,True,1.0,Node1,GAA,E,Node2,AAA,K,nonsynonymous,56.0,moderately conservative,False,1
4,77383,OPG098_CDS_102,forward,G->A,GA,True,2.0,Node1,TGA,*,Node2,TAA,*,synonymous,,,False,1


In [3]:
# Show the column names of the loaded table.
article_df.columns

Index(['site', 'gene', 'direction', 'snp', 'dimer', 'apobec', 'aa_position',
       'parent', 'parent_codon', 'parent_aa', 'child', 'child_codon',
       'child_aa', 'mutation_category', 'score', 'prediction', 'homoplasy',
       'occurrence'],
      dtype='object')

In [4]:
# The `snp` column holds "REF->ALT"; split it once and expose both halves as columns.
snp_parts = article_df['snp'].map(lambda snp: snp.split('->'))
article_df['REF'] = snp_parts.map(lambda parts: parts[0])
article_df['ALT'] = snp_parts.map(lambda parts: parts[1])

# Keep every original column except `snp`, with REF/ALT placed where `snp` used to be.
ordered_columns = [
    'site', 'gene', 'direction', 'REF', "ALT", 'dimer', 'apobec', 'aa_position',
    'parent', 'parent_codon', 'parent_aa', 'child', 'child_codon',
    'child_aa', 'mutation_category', 'score', 'prediction', 'homoplasy',
    'occurrence',
]
article_df = article_df[ordered_columns]
article_df.head()

Unnamed: 0,site,gene,direction,REF,ALT,dimer,apobec,aa_position,parent,parent_codon,parent_aa,child,child_codon,child_aa,mutation_category,score,prediction,homoplasy,occurrence
0,3111,OPG003_CDS_176,reverse,G,A,GA,True,2.0,Node1,TCC,S,Node2,TTC,F,nonsynonymous,155.0,radical,False,1
1,39139,OPG057_CDS_142,reverse,C,T,TC,True,3.0,Node1,GCG,A,Node2,GCA,A,synonymous,,,False,1
2,73239,OPG093_CDS_107,forward,G,A,GA,True,2.0,Node1,AGA,R,Node2,AAA,K,nonsynonymous,26.0,conservative,False,1
3,74205,OPG094_CDS_106,forward,G,A,GA,True,1.0,Node1,GAA,E,Node2,AAA,K,nonsynonymous,56.0,moderately conservative,False,1
4,77383,OPG098_CDS_102,forward,G,A,GA,True,2.0,Node1,TGA,*,Node2,TAA,*,synonymous,,,False,1


In [6]:
def load_genome(fasta_file, record_id="NC_063383.1"):
    """Load one sequence from a FASTA file.

    Parameters
    ----------
    fasta_file : path or handle accepted by ``SeqIO.parse``.
    record_id : str, optional
        Identifier of the FASTA record to return. Defaults to "NC_063383.1",
        the record this notebook works with.

    Returns
    -------
    The ``.seq`` attribute of the matching record.

    Raises
    ------
    KeyError
        If no record with ``record_id`` is present in the file.
    """
    records = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))
    if record_id not in records:
        raise KeyError(
            f"record {record_id!r} not found in {fasta_file!r}; "
            f"available records: {sorted(records)}"
        )
    return records[record_id].seq

def load_gtf(gtf_file):
    """Read a tab-separated, header-less annotation file into a DataFrame.

    The nine columns are named seqname, source, feature, start, end, score,
    strand, frame and attribute. Text following a '#' is treated as a comment
    by the parser and is not read.
    """
    column_names = [
        "seqname", "source", "feature",
        "start", "end", "score",
        "strand", "frame", "attribute",
    ]
    return pd.read_csv(
        gtf_file,
        sep="\t",
        comment='#',
        header=None,
        names=column_names,
    )

def find_parent_mutation(genome, position, parent, table=None):
    """Return `genome` with the nearest ancestral change at `position` applied.

    Starting at `parent`, the branch leading into each node is looked up in
    `table` (rows whose ``child`` equals the node). The walk moves towards the
    root until one of the following happens:

    * the branch's parent is "Node1": `genome` is returned unchanged;
    * the branch lists `position` in its ``site`` column: the ``ALT`` of the
      first such row is written at `position` and the result is returned;
    * the node has no incoming-branch rows in `table`: `genome` is returned
      unchanged (previously this raised KeyError).

    Parameters
    ----------
    genome : sequence of nucleotides (str or Seq).
    position : int
        1-based site; the base at ``genome[position-1]`` is replaced.
    parent : str
        Node label to start from.
    table : DataFrame, optional
        Must have ``child``, ``parent``, ``site`` and ``ALT`` columns. When
        omitted, the notebook-level ``article_df`` is used (looked up at call
        time). The same table is now used at every level of the walk.

    Returns
    -------
    `genome` itself when nothing is applied, otherwise a new ``Seq``.

    Raises
    ------
    ValueError
        If the parent/child relation in `table` loops back on itself.
    """
    if table is None:
        table = article_df

    # NOTE(review): a change recorded on a branch whose parent is "Node1" is
    # deliberately not applied here (original behaviour kept) — confirm intent.
    visited = set()
    node = parent
    while True:
        if node in visited:
            raise ValueError(f"cycle in parent/child table at node {node!r}")
        visited.add(node)

        branch = table[table['child'] == node]
        if branch.empty:
            return genome

        new_parent = branch['parent'].iloc[0]
        if new_parent == "Node1":
            return genome

        hits = branch[branch['site'] == position]
        if not hits.empty:
            new_genome = list(genome)
            new_genome[position - 1] = hits['ALT'].iloc[0]
            return Seq(''.join(new_genome))

        node = new_parent

def get_cds(gtf, chromosome, position):
    """Return the CDS rows of `gtf` on `chromosome` whose start..end range contains `position`.

    Both interval ends are inclusive. The result is a (possibly empty)
    sub-frame of `gtf` with the original index labels.
    """
    on_chromosome = gtf['seqname'] == chromosome
    is_cds = gtf['feature'] == 'CDS'
    starts_before = gtf['start'] <= position
    ends_after = gtf['end'] >= position
    return gtf[on_chromosome & is_cds & starts_before & ends_after]

def translate_codon(codon):
    """Translate `codon` with Biopython's unambiguous DNA codon table id 11 and return it as str."""
    genetic_code = CodonTable.unambiguous_dna_by_id[11]
    protein = Seq(codon).translate(table=genetic_code)
    return str(protein)

def detect_mutation_impact(genome, gtf, chromosome, position, ref, alt):
    """Predict the codon and amino-acid change caused by a single-base substitution.

    Parameters
    ----------
    genome : Seq
        Plus-strand sequence; the mutated base is ``genome[position-1]``.
    gtf : DataFrame
        Annotation table with seqname/feature/start/end/strand columns.
    chromosome : str
        Value matched against the ``seqname`` column.
    position : int
        1-based coordinate of the substitution.
    ref : str
        Not used; kept so existing callers keep working. The reference base
        that is returned is read from `genome`.
    alt : str
        Alternative base on the plus strand.

    Returns
    -------
    tuple
        ``(original_aa, mutated_aa, original_codon, mutated_codon, ref_nucl,
        codon_position)``. Codons are given in coding orientation and
        ``codon_position`` is 1..3 within the codon. When no CDS covers the
        site, every element except ``ref_nucl`` is the string '-'.

    Notes
    -----
    * Only the first CDS row covering the site is used.
    * For '-' strand features the reading frame is anchored at the feature's
      ``end`` (its 5' end in coding orientation). Anchoring at ``start``, as
      was done before, gives the same codon only when the CDS length is a
      multiple of three.
    * NOTE(review): the ``frame`` (phase) column is ignored, i.e. phase 0 is
      assumed.
    * NOTE(review): in the rows displayed in this notebook the recomputed
      codons are shifted by one base relative to the source table's codons;
      verify annotation coordinates.
    """
    ref_nucl = genome[position-1]

    cds = get_cds(gtf, chromosome, position)
    if cds.empty:
        return '-', '-', '-', '-', ref_nucl, '-'

    first_cds = cds.iloc[0]
    strand = first_cds['strand']

    if strand == '-':
        # Codons are counted downwards from the feature end on the minus strand.
        gene_end = first_cds['end']
        offset = gene_end - position
        codon_end = gene_end - (offset // 3) * 3
        # 1-based span [codon_end-2, codon_end] -> 0-based slice [codon_end-3, codon_end)
        codon_seq = genome[codon_end-3:codon_end].reverse_complement()
        # Express the substituted base on the coding strand as well.
        alt = str(Seq(alt).reverse_complement())
    else:
        gene_start = first_cds['start']
        offset = position - gene_start
        codon_start = gene_start + (offset // 3) * 3
        codon_seq = genome[codon_start-1:codon_start+2]

    # Index of the mutated base inside the codon, in coding orientation.
    mutation_index = offset % 3

    # Translate the original codon
    original_amino_acid = translate_codon(codon_seq)

    # Build and translate the mutated codon
    mutated_codon = list(codon_seq)
    mutated_codon[mutation_index] = alt
    mutated_codon = Seq(''.join(mutated_codon))
    mutated_amino_acid = translate_codon(mutated_codon)

    pos = mutation_index + 1

    return original_amino_acid, mutated_amino_acid, codon_seq, mutated_codon, ref_nucl, pos



# BODY
# Load the reference sequence and annotation, then recompute the codon-level
# consequence of every row of article_df and attach the results as *_v2 columns.
genome = load_genome(os.path.join(r"C:\Users\User\Documents\msu\diplom\APOBEC_mutagenesis\MPXV", "NC_063383.1.fasta"))
gtf = load_gtf(os.path.join(r"C:\Users\User\Documents\msu\diplom\APOBEC_mutagenesis\MPXV\aa_APOBEC_pos", "GCF_014621545.1_ASM1462154v1_genomic.220824.gff"))
chromosome = "NC_063383.1"

# One accumulator per output column; each loop iteration appends one value to each.
original_aa_list, mutated_aa_list = [], []
original_codon_list, mutated_codon_list = [], []
ref_nucl_list, nucl_pos_list = [], []
mutation_category = []

## duplicated data: sites that occur on more than one row
duplicated_df_science = article_df[article_df.duplicated('site', keep=False)].reset_index(drop=True)
duplicates_list = duplicated_df_science['site'].unique()

for row in range(len(article_df)):
    position = article_df.loc[row, "site"]
    ref_nucl = article_df.loc[row, "REF"]
    alt_nucl = article_df.loc[row, "ALT"]

    # Sites seen more than once are evaluated against the genome returned by
    # find_parent_mutation for this row's parent; all others use the reference.
    if position in duplicates_list:
        node = article_df.loc[row, "parent"]
        new_genome = find_parent_mutation(genome, position, node)
        background = new_genome
    else:
        background = genome

    original_aa, mutated_aa, original_codon, mutated_codon, ref, nucl_pos = detect_mutation_impact(
        background, gtf, chromosome, position, ref_nucl, alt_nucl
    )

    if original_aa == "-":
        # CDS empty: placeholders everywhere except the reference base
        record = ('-', '-', '-', '-', '-', "intergenic")
    else:
        if mutated_aa == original_aa:
            category = "synonymous"
        elif mutated_aa == "*":
            category = "nonsense"
        else:
            category = "nonsynonymous"
        record = (original_aa, mutated_aa, str(original_codon), str(mutated_codon), nucl_pos, category)

    accumulators = (
        original_aa_list, mutated_aa_list,
        original_codon_list, mutated_codon_list,
        nucl_pos_list, mutation_category,
    )
    for bucket, value in zip(accumulators, record):
        bucket.append(value)
    ref_nucl_list.append(ref)

# Attach the recomputed values, in this column order.
new_columns = {
    "REF_v2": ref_nucl_list,
    "nucl_pos_codon": nucl_pos_list,
    "parent_codon_v2": original_codon_list,
    "parent_aa_v2": original_aa_list,
    "child_codon_v2": mutated_codon_list,
    "mutated_aa_v2": mutated_aa_list,
    "mutation_category_v2": mutation_category,
}
for column_name, column_values in new_columns.items():
    article_df[column_name] = column_values

In [7]:
# Display the table, now including the recomputed *_v2 columns.
article_df

Unnamed: 0,site,gene,direction,REF,ALT,dimer,apobec,aa_position,parent,parent_codon,...,prediction,homoplasy,occurrence,REF_v2,nucl_pos_codon,parent_codon_v2,parent_aa_v2,child_codon_v2,mutated_aa_v2,mutation_category_v2
0,3111,OPG003_CDS_176,reverse,G,A,GA,True,2.0,Node1,TCC,...,radical,False,1,G,3,ATC,I,ATT,I,synonymous
1,39139,OPG057_CDS_142,reverse,C,T,TC,True,3.0,Node1,GCG,...,,False,1,C,1,GAG,E,AAG,K,nonsynonymous
2,73239,OPG093_CDS_107,forward,G,A,GA,True,2.0,Node1,AGA,...,conservative,False,1,G,1,GAT,D,AAT,N,nonsynonymous
3,74205,OPG094_CDS_106,forward,G,A,GA,True,1.0,Node1,GAA,...,moderately conservative,False,1,G,3,ATG,M,ATA,I,nonsynonymous
4,77383,OPG098_CDS_102,forward,G,A,GA,True,2.0,Node1,TGA,...,,False,1,G,1,GAG,E,AAG,K,nonsynonymous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,127220,OPG148_CDS_52,forward,C,T,TC,True,3.0,Node2,CTC,...,,False,1,C,2,TCA,S,TTA,L,nonsynonymous
796,144527,,,C,T,TC,True,,Node2,,...,,False,1,C,1,CCT,P,TCT,S,nonsynonymous
797,79348,OPG102_CDS_98,forward,C,T,TC,True,1.0,Node1,CAT,...,moderately conservative,False,1,C,3,ATC,I,ATT,I,synonymous
798,86979,OPG108_CDS_92,reverse,C,T,GC,False,2.0,Node1,AGC,...,conservative,False,1,C,3,GAG,E,GAA,E,synonymous


In [8]:
# Mark each row "+" when the REF base from the table equals the base read from
# the genome (REF_v2), and "-" otherwise.
new_col = [
    "+" if reported == observed else "-"
    for reported, observed in zip(article_df["REF"], article_df["REF_v2"])
]
article_df["new_col"] = new_col

ref_mismatches = article_df[article_df['new_col'] == '-']
print('does not match to reference: ', len(ref_mismatches))
ref_mismatches

does not match to reference:  2


Unnamed: 0,site,gene,direction,REF,ALT,dimer,apobec,aa_position,parent,parent_codon,...,homoplasy,occurrence,REF_v2,nucl_pos_codon,parent_codon_v2,parent_aa_v2,child_codon_v2,mutated_aa_v2,mutation_category_v2,new_col
268,168120,OPG193_CDS_17,forward,T,C,,False,2.0,Node386,TTT,...,False,1,C,1,CTT,L,CTT,L,synonymous,-
423,81977,OPG105_CDS_95,forward,G,A,GA,True,1.0,Node561,GAC,...,False,1,A,3,CAA,Q,CAA,Q,synonymous,-


In [9]:
# Keep only G->A and C->T substitutions (REF taken from the genome-derived REF_v2).
is_g_to_a = article_df['REF_v2'].eq("G") & article_df['ALT'].eq("A")
is_c_to_t = article_df['REF_v2'].eq("C") & article_df['ALT'].eq("T")
df1_new1 = article_df[is_g_to_a]
df1_new2 = article_df[is_c_to_t]
# G->A rows first, then C->T rows, renumbered from 0.
df1_new = pd.concat([df1_new1, df1_new2]).reset_index(drop=True)
len(df1_new)

724

In [10]:
#filter TC and GA motifs
from Bio import SeqIO

def load_genome(fasta_file, record_id="NC_063383.1"):
    """Load one sequence from a FASTA file.

    Parameters
    ----------
    fasta_file : path or handle accepted by ``SeqIO.parse``.
    record_id : str, optional
        Identifier of the FASTA record to return. Defaults to "NC_063383.1",
        the record this notebook works with.

    Returns
    -------
    The ``.seq`` attribute of the matching record.

    Raises
    ------
    KeyError
        If no record with ``record_id`` is present in the file.
    """
    records = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))
    if record_id not in records:
        raise KeyError(
            f"record {record_id!r} not found in {fasta_file!r}; "
            f"available records: {sorted(records)}"
        )
    return records[record_id].seq

def find_parent_mutations(genome, parent, table=None):
    """Return `genome` with every change on the path from the root to `parent` applied.

    Starting at `parent`, the branch leading into each node is looked up in
    `table` (rows whose ``child`` equals the node) and the walk continues with
    that branch's parent. It stops after the branch whose parent is "Node1",
    or when a node has no incoming-branch rows.

    The collected branches are then applied from the oldest to the most recent,
    writing each row's own ``ALT`` at its ``site``. When a site changes more
    than once along the path, the most recent change therefore decides the
    final base.

    Parameters
    ----------
    genome : sequence of nucleotides (str, Seq or list of single characters).
    parent : str
        Node label whose genome should be reconstructed.
    table : DataFrame, optional
        Must have ``child``, ``parent``, ``site`` (1-based) and ``ALT``
        columns. When omitted, the notebook-level ``article_df`` is used
        (looked up at call time).

    Returns
    -------
    Seq
        A new sequence; `genome` itself is not modified.

    Raises
    ------
    ValueError
        If the parent/child relation in `table` loops back on itself.
    """
    if table is None:
        table = article_df

    # Collect branches from `parent` upwards (most recent first).
    branches = []
    visited = set()
    node = parent
    while True:
        if node in visited:
            raise ValueError(f"cycle in parent/child table at node {node!r}")
        visited.add(node)

        branch = table[table['child'] == node]
        if branch.empty:
            break
        branches.append(branch)

        new_parent = branch['parent'].iloc[0]
        if new_parent == "Node1":
            break
        node = new_parent

    new_genome = list(genome)
    # Oldest branch first, so later changes overwrite earlier ones at the same site.
    for branch in reversed(branches):
        for position, alt_nucl in zip(branch['site'], branch['ALT']):
            new_genome[position - 1] = alt_nucl
    return Seq(''.join(new_genome))
    
    
genome = load_genome(os.path.join(r"C:\Users\User\Documents\msu\diplom\APOBEC_mutagenesis\MPXV", "NC_063383.1.fasta"))

# Rebuilding a node's genome copies the whole sequence, and many rows share the
# same parent, so each node's genome is built once and reused.
# "Node1" rows use the unmodified reference.
# NOTE(review): the cache holds one full-length sequence per distinct parent node.
ancestral_genomes = {"Node1": genome}

motifs = []
for row in range(0, len(df1_new)):
    pos = df1_new.loc[row, 'site']
    ref = df1_new.loc[row, 'REF']
    node = df1_new.loc[row, 'parent']

    if node not in ancestral_genomes:
        ancestral_genomes[node] = find_parent_mutations(genome, node)
    new_genome = ancestral_genomes[node]

    # Dinucleotide context: for C the preceding base plus the site itself,
    # otherwise the site plus the following base. Stored as plain str.
    if ref == "C":
        motifs.append(str(new_genome[pos-2:pos]))
    else:
        motifs.append(str(new_genome[pos-1:pos+1]))

df1_new['motif'] = motifs

# Keep only rows whose context is GA or TC (GA rows first, then TC rows).
df1_new1 = df1_new[df1_new['motif'] == 'GA']
df1_new2 = df1_new[df1_new['motif'] == 'TC']
df1_new = pd.concat([df1_new1, df1_new2])
len(df1_new)

641

In [11]:
# Drop rows whose parent node is "Node1" and renumber the remainder from 0.
not_from_node1 = df1_new['parent'].ne('Node1')
df1_new = df1_new[not_from_node1].reset_index(drop=True)
len(df1_new)

631

In [12]:
# Tab-separated table of target sites; its column layout is reused further down.
df2 = pd.read_csv("APOBEC_targets_aa.csv", sep='\t')
# Leave the frame as the cell's last expression so it is rendered as a table
# instead of wrapped plain text.
df2.head()

        #CHROM  position REF ALT nucl_pos_codon parent_codon parent_aa  \
0  NC_063383.1        23   C   T              -            -         -   
1  NC_063383.1        60   C   T              -            -         -   
2  NC_063383.1        78   C   T              -            -         -   
3  NC_063383.1        82   C   T              -            -         -   
4  NC_063383.1        98   C   T              -            -         -   

  child_codon mutated_aa mutation_category  
0           -          -        intergenic  
1           -          -        intergenic  
2           -          -        intergenic  
3           -          -        intergenic  
4           -          -        intergenic  


In [13]:
# Add a constant chromosome column and keep only the columns to export, in this order.
export_columns = [
    '#CHROM', 'site', 'REF_v2', 'ALT', 'nucl_pos_codon',
    'parent_codon_v2', 'parent_aa_v2', 'child_codon_v2',
    'mutated_aa_v2', 'mutation_category_v2',
]
df1_new = df1_new.assign(**{'#CHROM': "NC_063383.1"})[export_columns]
df1_new.head()

Unnamed: 0,#CHROM,site,REF_v2,ALT,nucl_pos_codon,parent_codon_v2,parent_aa_v2,child_codon_v2,mutated_aa_v2,mutation_category_v2
0,NC_063383.1,186165,G,A,1,GAT,D,AAT,N,nonsynonymous
1,NC_063383.1,55133,G,A,3,CTC,L,CTT,L,synonymous
2,NC_063383.1,55133,G,A,3,CTC,L,CTT,L,synonymous
3,NC_063383.1,55133,G,A,3,CTC,L,CTT,L,synonymous
4,NC_063383.1,55133,G,A,3,CTC,L,CTT,L,synonymous


In [14]:
# Relabel the columns positionally with df2's column names (lengths must match).
df1_new = df1_new.set_axis(df2.columns, axis=1)
df1_new.head()

Unnamed: 0,#CHROM,position,REF,ALT,nucl_pos_codon,parent_codon,parent_aa,child_codon,mutated_aa,mutation_category
0,NC_063383.1,186165,G,A,1,GAT,D,AAT,N,nonsynonymous
1,NC_063383.1,55133,G,A,3,CTC,L,CTT,L,synonymous
2,NC_063383.1,55133,G,A,3,CTC,L,CTT,L,synonymous
3,NC_063383.1,55133,G,A,3,CTC,L,CTT,L,synonymous
4,NC_063383.1,55133,G,A,3,CTC,L,CTT,L,synonymous


In [15]:
# Save df1_new as a tab-separated file, without the index column.
df1_new.to_csv("article_science_pos_aa.csv", sep='\t', index=False)

In [175]:
# Label each table with its origin, then stack them (df2 rows first) under a fresh index.
df2 = df2.assign(type='TC or GA target site')
df1_new = df1_new.assign(type='Observed APOBEC3-like')
df_concat = pd.concat([df2, df1_new], ignore_index=True)
df_concat.head()

Unnamed: 0,#CHROM,position,REF,ALT,nucl_pos_codon,parent_codon,parent_aa,child_codon,mutated_aa,mutation_category,type
0,NC_063383.1,23,C,T,-,-,-,-,-,intergenic,TC or GA target site
1,NC_063383.1,60,C,T,-,-,-,-,-,intergenic,TC or GA target site
2,NC_063383.1,78,C,T,-,-,-,-,-,intergenic,TC or GA target site
3,NC_063383.1,82,C,T,-,-,-,-,-,intergenic,TC or GA target site
4,NC_063383.1,98,C,T,-,-,-,-,-,intergenic,TC or GA target site


In [176]:
# Save the combined table as a tab-separated file, without the index column.
df_concat.to_csv("combined_table_APOBEC_target.csv", sep='\t', index=False)

In [177]:
# Count rows per (mutation_category, type) pair; the count column is named "number".
df_grouped = (
    df_concat
    .groupby(by=["mutation_category", 'type'])
    .size()
    .reset_index(name='number')
)
df_grouped

Unnamed: 0,mutation_category,type,number
0,intergenic,Observed APOBEC3-like,129
1,intergenic,TC or GA target site,3608
2,nonsense,Observed APOBEC3-like,8
3,nonsense,TC or GA target site,703
4,nonsynonymous,Observed APOBEC3-like,310
5,nonsynonymous,TC or GA target site,15162
6,synonymous,Observed APOBEC3-like,184
7,synonymous,TC or GA target site,5222


## Grantham score

In [51]:
# Load the table with Grantham scores and keep only the "TC or GA target site" rows.
df_score = pd.read_csv("grantham_score.csv")  # comma is the default delimiter
is_target_site = df_score['type'] == "TC or GA target site"
df_score_targets = df_score[is_target_site].reset_index(drop=True)
df_score_targets

Unnamed: 0,X.CHROM,position,REF,ALT,nucl_pos_codon,parent_codon,parent_aa,child_codon,mutated_aa,mutation_category,type,grantham_score,grantham_rank
0,NC_063383.1,7584,C,T,2.0,TCG,S,TTG,L,nonsynonymous,TC or GA target site,145,moderately radical
1,NC_063383.1,7595,C,T,1.0,CTG,L,TTG,L,synonymous,TC or GA target site,0,synonymous
2,NC_063383.1,7609,C,T,3.0,TTC,F,TTT,F,synonymous,TC or GA target site,0,synonymous
3,NC_063383.1,7624,C,T,3.0,ATC,I,ATT,I,synonymous,TC or GA target site,0,synonymous
4,NC_063383.1,7629,C,T,2.0,TCA,S,TTA,L,nonsynonymous,TC or GA target site,145,moderately radical
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8923,NC_063383.1,149265,G,A,2.0,TCG,S,TTG,L,nonsynonymous,TC or GA target site,145,moderately radical
8924,NC_063383.1,149281,G,A,1.0,CCA,P,TCA,S,nonsynonymous,TC or GA target site,0,conservative
8925,NC_063383.1,149300,G,A,3.0,GTC,V,GTT,V,synonymous,TC or GA target site,0,synonymous
8926,NC_063383.1,149314,G,A,1.0,CAA,Q,TAA,*,nonsense,TC or GA target site,0,conservative


In [55]:
# Reshape into the long format used for ggsankey: every input row yields two
# output rows, one for the "parent_aa" stage (pointing at the "mutated_aa"
# stage) and one for the terminal "mutated_aa" stage (no successor).
# Unchanged amino acids are collapsed into the single label '='.
x = []
node = []
next_x = []
next_node = []
grantham_rank_list = []

aa_triplets = zip(
    df_score_targets["parent_aa"],
    df_score_targets["mutated_aa"],
    df_score_targets["grantham_rank"],
)
for parent_aa, mutated_aa, grantham_rank in aa_triplets:
    if parent_aa == mutated_aa:
        source_label = target_label = '='
    else:
        source_label, target_label = parent_aa, mutated_aa

    x.extend(["parent_aa", "mutated_aa"])
    node.extend([source_label, target_label])
    next_x.extend(["mutated_aa", None])
    next_node.extend([target_label, None])
    grantham_rank_list.extend([grantham_rank, grantham_rank])

data = {'x': x, 'node': node, "next_x": next_x, "next_node": next_node, "grantham_rank": grantham_rank_list}
ggsankey_df = pd.DataFrame.from_dict(data)
ggsankey_df

Unnamed: 0,x,node,next_x,next_node,grantham_rank
0,parent_aa,S,mutated_aa,L,moderately radical
1,mutated_aa,L,,,moderately radical
2,parent_aa,=,mutated_aa,=,synonymous
3,mutated_aa,=,,,synonymous
4,parent_aa,=,mutated_aa,=,synonymous
...,...,...,...,...,...
17851,mutated_aa,=,,,synonymous
17852,parent_aa,Q,mutated_aa,*,conservative
17853,mutated_aa,*,,,conservative
17854,parent_aa,=,mutated_aa,=,synonymous


In [56]:
# Save the long-format table as a tab-separated file, without the index column.
ggsankey_df.to_csv("ggsankey_targets.csv", sep='\t', index=False)