## Data proccessing for phylogenetic analysis

Following https://github.com/abacus-gene/paml-tutorial/tree/main/positive-selection/00_data


Input: HRR25_nuc_nonAligned.fasta

mafft output nucletotide: HRR25_mafft_translatorx.nt_ali.fasta
mafft output aminoacid  : HRR25_mafft_translatorx.aa_ali.fasta

# Evaluating gaps in the alignment file

In [3]:
import os, sys
import pandas as pd
import math

def fasta2dict(f):
    d = {}
    with  open(f,"r") as F:
        for line in F:
            if line[0]==">":
                name = line[1:].split("\n")[0].split("|")[0]
                d[name] = ''
            else:
                #print(name)
                d[name]+= line.split("\n")[0]
    F.close()
    return d


def MSAdict2df_AA(d):
    """
    Dictionary of Multiple Sequence Alignment to pandas dataframe for evaluating gap regions.
    """
    n_entries = len(d)
    n_chars   = max([len(d[k]) for k in d])

    df = pd.DataFrame(data={k:list(d[k]) for k in d} )
    
    try:
        gap = df.apply(pd.Series.value_counts, axis=1).fillna(0)["-"]    
    except KeyError:
        gap = pd.Series(data=0, index=[i for i in range(n_chars)])
        
    
    df.insert(0,'Position', [i+1 for i in range(n_chars)])
    df.insert(1,'GapRatio', [(float(i)/float(n_entries)) for i in gap])
    df2 = df.swapaxes("index", "columns")
    return df2 



def MSAdict2df_nt(d):
    """
    Dictionary of Multiple Sequence Alignment to pandas dataframe for evaluating gap regions.
    """
    n_entries = len(d)-1
    n_chars   = max([len(d[k]) for k in d])

    df = pd.DataFrame(data={k:list(d[k]) for k in d} )
    
     
    try:
        gap = df.apply(pd.Series.value_counts, axis=1).fillna(0)["-"]    
    except KeyError:
        gap = pd.Series(data=0, index=[i for i in range(n_chars)])
  
    df.insert(0,'Position', [math.floor(i/3)+1 for i in range(n_chars)])
    df.insert(1,'GapRatio', [(float(i+0)/float(n_entries)) for i in gap])
    df2 = df.swapaxes("index", "columns")
    return df2 


def df2fasta(df,f):
    with open(f,"w") as F:
        for index, row in df.iterrows():
            gene = index
            seq  = ("").join([row[i] for i in df.columns.values.tolist()])
            entry = ">%s\n%s\n"%(gene,seq)
            F.write(entry)
        F.close()
    print("Generated fasta: %s"%(f))
        
    
def df2phy(df,f):
    nrow  = len(df.index)
    nchar = len(df. columns)
    with open(f,"w") as F:
        F.write("%d   %d\n\n"%(nrow,nchar))
        
        for index, row in df.iterrows():
            gene = index
            seq  = ("").join([row[i] for i in df.columns.values.tolist()])
            entry = "%s      %s\n"%(gene,seq)
            F.write(entry)
        F.close() 
        
    
def removekey(d, key):
    r = dict(d)
    del r[key]
    return r


In [4]:
nt_f = "./raw_data/HRR25_mafft_translatorx.nt_ali.fasta"
aa_f = "./raw_data/HRR25_mafft_translatorx.aa_ali.fasta"

aa_d = fasta2dict(aa_f)
nt_d = fasta2dict(nt_f)


# removing post-WGD species with aleged low quality
remove_nodes = ['HRR25_04.1','HRR25_02.1','HRR25_01.1','HRR25_03.1','HRR25_07.1']

for k in remove_nodes:
    aa_d = removekey(aa_d, k)
    nt_d = removekey(nt_d, k)





df_aa= MSAdict2df_AA(aa_d)
df_aa.head()

df_nt= MSAdict2df_nt(nt_d)
df_nt.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2684,2685,2686,2687,2688,2689,2690,2691,2692,2693
Position,1,1,1,2,2,2,3,3,3,4,...,895,896,896,896,897,897,897,898,898,898
GapRatio,0.011364,0.011364,0.011364,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.977273,0.977273,0.977273,0.977273,0.136364,0.136364,0.136364,0.136364,0.136364,0.136364
HRR25_19.1,A,T,G,-,-,-,-,-,-,-,...,C,C,T,G,T,G,G,C,T,C
HRR25_19.2,A,T,G,-,-,-,-,-,-,-,...,-,-,-,-,T,G,G,C,T,A
HRR25_18.1,A,T,G,-,-,-,-,-,-,-,...,A,T,T,A,C,A,A,T,T,T


In [5]:
df_aa.to_csv("./data_prep4codeml/HRR25_mafft_gap_evaluation_aa.csv", sep=',')
df_nt.to_csv("./data_prep4codeml/HRR25_mafft_gap_evaluation_nt.csv", sep=',')



df_aa.head()
#df_aa = df_aa.drop(["Name"], axis=1)
#df_nt = df_nt.drop(["Name"], axis=1)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,888,889,890,891,892,893,894,895,896,897
Position,1,2,3,4,5,6,7,8,9,10,...,889,890,891,892,893,894,895,896,897,898
GapRatio,0.011236,0.988764,0.988764,0.988764,0.011236,0.011236,0.011236,0.011236,0.011236,0.011236,...,0.977528,0.977528,0.977528,0.977528,0.988764,0.977528,0.966292,0.966292,0.134831,0.134831
HRR25_19.1,M,-,-,-,D,L,R,V,G,R,...,H,E,Q,I,E,G,P,L,W,L
HRR25_19.2,M,-,-,-,D,L,R,V,G,R,...,-,-,-,-,-,-,-,-,W,L
HRR25_18.1,M,-,-,-,E,V,K,V,G,K,...,-,-,-,-,-,-,K,L,Q,F


In [6]:
gap_ratio = 0.10
# Define protein positions to be analyzed downstream. Here I take protein position which has ≥90% occupancy
Positions = list(df_aa.loc["Position"][df_aa.loc['GapRatio'] <= gap_ratio])
# Defining column index of protein and nucleotide to extract
AA_col    = [i for i in df_aa.columns if df_aa[i]["Position"] in Positions]
nt_col    = [i for i in df_nt.columns if df_nt[i]["Position"] in Positions]

# Extracting sequence based on protein potision defined above
df_aa_nogaps = df_aa[df_aa.columns[df_aa.columns.isin(AA_col)]]
df_nt_nogaps = df_nt[df_nt.columns[df_nt.columns.isin(nt_col)]]

#df_aa_nogaps = df_aa_nogaps.drop_duplicates(keep='last')
#df_nt_nogaps = df_nt_nogaps.drop_duplicates(keep='last')
df2fasta(df_aa_nogaps.drop(["Position", "GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps.fasta")
df2fasta(df_nt_nogaps.drop(["Position", "GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps.fasta")

df2phy(df_nt_nogaps.drop(["Position","GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps.phy")
df2phy(df_nt_nogaps.iloc[:, [i for i in range(24)]].drop(["Position","GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_4test.phy")

df_aa_nogaps.drop(["Position","GapRatio"]).to_csv("./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps.csv", sep=',')
df_aa_nogaps.drop(["Position","GapRatio"]).to_csv("./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps.csv", sep=',')

Generated fasta: ./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps.fasta
Generated fasta: ./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps.fasta


In [36]:
ref = "HRR25_20.1"

pos_domains = {}

pos = 0
for i in range(0,len(aa_d[ref])):
    msa_pos = i+1
    if aa_d[ref][i] != '-':
        pos+=1
        if pos < 291:
            domain = "kinase"
        elif pos < 395:
            domain = "central"
        else:
            domain = "tail"
        pos_domains[msa_pos] = domain
    else:
        if pos < 291:
            domain = "kinase"
        elif pos < 395:
            domain = "central"
        else:
            domain = "tail"
        pos_domains[msa_pos] = domain


In [39]:
gap_ratio = 0.10
# Define protein positions to be analyzed downstream. Here I take protein position which has ≥90% occupancy
Positions = list(df_aa.loc["Position"][df_aa.loc['GapRatio'] <= gap_ratio])
Positions_domain = [i for i in Positions if pos_domains[i]=="kinase"] 
# Defining column index of protein and nucleotide to extract
AA_col_domain    = [i for i in df_aa.columns if df_aa[i]["Position"] in Positions_domain]
nt_col_domain    = [i for i in df_nt.columns if df_nt[i]["Position"] in Positions_domain]
# Extracting sequence based on protein potision defined above
df_aa_nogaps_domain = df_aa[df_aa.columns[df_aa.columns.isin(AA_col_domain)]]
df_nt_nogaps_domain = df_nt[df_nt.columns[df_nt.columns.isin(nt_col_domain)]]
df2fasta(df_aa_nogaps_domain.drop(["Position", "GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps_kinasedomain.fasta")
df2fasta(df_nt_nogaps_domain.drop(["Position", "GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_kinasedomain.fasta")
df2phy(df_nt_nogaps_domain.drop(["Position","GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_kinasedomain.phy")


Positions_domain = [i for i in Positions if pos_domains[i]=="central"] 
# Defining column index of protein and nucleotide to extract
AA_col_domain    = [i for i in df_aa.columns if df_aa[i]["Position"] in Positions_domain]
nt_col_domain    = [i for i in df_nt.columns if df_nt[i]["Position"] in Positions_domain]
# Extracting sequence based on protein potision defined above
df_aa_nogaps_domain = df_aa[df_aa.columns[df_aa.columns.isin(AA_col_domain)]]
df_nt_nogaps_domain = df_nt[df_nt.columns[df_nt.columns.isin(nt_col_domain)]]
df2fasta(df_aa_nogaps_domain.drop(["Position", "GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps_centraldomain.fasta")
df2fasta(df_nt_nogaps_domain.drop(["Position", "GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_centraldomain.fasta")
df2phy(df_nt_nogaps_domain.drop(["Position","GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_centraldomain.phy")


Positions_domain = [i for i in Positions if pos_domains[i]=="tail"] 
# Defining column index of protein and nucleotide to extract
AA_col_domain    = [i for i in df_aa.columns if df_aa[i]["Position"] in Positions_domain]
nt_col_domain    = [i for i in df_nt.columns if df_nt[i]["Position"] in Positions_domain]
# Extracting sequence based on protein potision defined above
df_aa_nogaps_domain = df_aa[df_aa.columns[df_aa.columns.isin(AA_col_domain)]]
df_nt_nogaps_domain = df_nt[df_nt.columns[df_nt.columns.isin(nt_col_domain)]]
df2fasta(df_aa_nogaps_domain.drop(["Position", "GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps_taildomain.fasta")
df2fasta(df_nt_nogaps_domain.drop(["Position", "GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_taildomain.fasta")
df2phy(df_nt_nogaps_domain.drop(["Position","GapRatio"]),"./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_taildomain.phy")

Generated fasta: ./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps_kinasedomain.fasta
Generated fasta: ./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_kinasedomain.fasta
Generated fasta: ./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps_centraldomain.fasta
Generated fasta: ./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_centraldomain.fasta
Generated fasta: ./data_prep4codeml/HRR25_mafft_translatorx_aa_ali_nogaps_taildomain.fasta
Generated fasta: ./data_prep4codeml/HRR25_mafft_translatorx_nt_ali_nogaps_taildomain.fasta
