In [18]:
import pandas as pd

In [1]:
# Relative paths for the metB/metY/metZ sequence set: the FASTA input for MAFFT,
# the MAFFT alignment output, and the FastTree output.
path_to_mafft_input = "../data/phylogenetics/metb_mety_metz.faa"
path_to_mafft_output = "../results/processed_data/phylogenetics/mafft/metb_mety_metz.msa"
path_to_fasttree_output = "../results/processed_data/phylogenetics/fasttree/metb_mety_metz.nwk"

In [43]:
# !mafft $path_to_mafft_input > $path_to_mafft_output

In [44]:
# !fasttree $path_to_mafft_output > $path_to_fasttree_output

In [14]:
# Directory for all Entrez results, plus one docsum table file per gene
# (these are the files later passed to read_table).
entrez_result_path = "../results/processed_data/phylogenetics/entrez/"
metB_entrez = entrez_result_path + "metB_table.tsf"
metY_entrez = entrez_result_path + "metY_table.tsf"
metZ_entrez = entrez_result_path + "metZ_table.tsf"

In [45]:
# !esearch -db protein -query "metB AND proteobacteria[organism]" | efetch -format docsum | xtract -pattern DocumentSummary -element Id Caption SourceDb Completeness TaxId Organism Title Slen > $metB_entrez

In [46]:
# !esearch -db protein -query "metY AND proteobacteria[organism]" | efetch -format docsum | xtract -pattern DocumentSummary -element Id Caption SourceDb Completeness TaxId Organism Title Slen > $metY_entrez

In [47]:
# !esearch -db protein -query "metZ AND proteobacteria[organism]" | efetch -format docsum | xtract -pattern DocumentSummary -element Id Caption SourceDb Completeness TaxId Organism Title Slen > $metZ_entrez

In [28]:
def read_table(data_path:str)->pd.DataFrame:
    '''Read an E-Direct docsum table into a DataFrame.

    The file is expected to hold one record per line with tab-separated
    fields in the order Id, Caption, SourceDb, Completeness, TaxId,
    Organism, Title, Slen (the element list of the esearch/efetch/xtract
    pipeline used in this notebook).

    Records with only seven fields are treated as lacking the Completeness
    value; the placeholder "no info" is inserted in that position. Blank
    lines are skipped.

    Parameters:
        data_path: path to the tab-separated table.

    Returns:
        DataFrame with the eight columns above; Id, TaxId and Slen are
        converted to 64-bit integers, all other columns stay strings.

    Raises:
        Exception: if the file cannot be read, a record has neither seven
            nor eight fields, or an integer column cannot be converted.
            The original error is attached as the cause.
    '''
    try:
        cols = "Id Caption SourceDb Completeness TaxId Organism Title Slen".split(" ")
        data_as_lines = []
        with open(data_path,"r") as data:
            for line_number, raw_line in enumerate(data, start=1):
                fields = raw_line.rstrip().split("\t")
                # A blank line would otherwise become a one-field row that
                # pandas pads with None and that breaks the integer cast.
                if fields == [""]:
                    continue
                # Completeness info is missing
                if len(fields) == 7:
                    fields.insert(3, "no info")
                if len(fields) != len(cols):
                    raise ValueError(
                        "line {}: expected 7 or 8 tab-separated fields, found {}".format(
                            line_number, len(fields)))
                data_as_lines.append(fields)
        dataframe = pd.DataFrame(data_as_lines,columns=cols)
        # Explicit 64-bit: protein Ids exceed the 32-bit range, and plain
        # `int` maps to a 32-bit dtype on some platforms.
        dataframe = dataframe.astype({"Id":"int64","TaxId":"int64","Slen":"int64"})
        return dataframe
    except Exception as e:
        raise Exception("[-] ERROR creating dataframe with exception: {}".format(e)) from e

In [29]:
# Load the docsum table of each gene into its own DataFrame.
metB_data = read_table(metB_entrez)
metZ_data = read_table(metZ_entrez)
metY_data = read_table(metY_entrez)

In [39]:
def filter_entrez_data(dataframe:pd.DataFrame,length_filter=300,taxid_filter=True,source_db_filter=True)->pd.DataFrame:
    '''Filter an Entrez docsum DataFrame by sequence length, TaxId and source database.

    The filters are applied in this order:
      1. keep rows whose "Slen" is strictly greater than ``length_filter``;
      2. if ``taxid_filter`` is set, keep only the first row of every "TaxId";
      3. if ``source_db_filter`` is set, keep only rows whose "SourceDb"
         equals "refseq".

    A short progress report is printed. The input DataFrame is not modified.

    Parameters:
        dataframe: table with at least the columns "Slen", "TaxId" and "SourceDb".
        length_filter: exclusive lower bound for the sequence length.
        taxid_filter: whether to drop duplicated TaxIds.
        source_db_filter: whether to restrict the result to refseq entries.

    Returns:
        A new, filtered DataFrame (original index labels are kept).

    Raises:
        Exception: if filtering fails, e.g. because a column is missing.
            The original error is attached as the cause.
    '''
    try:
        print("[+] DataFrame has a length of: {}".format(len(dataframe)))
        # Boolean indexing already yields a new frame, so no explicit copy
        # of the input is needed. The threshold is the caller's value, not
        # a hard-coded 300.
        filtered_data = dataframe[dataframe["Slen"] > length_filter]
        print("\t[+] Sequence length filter: {}".format(length_filter))
        if taxid_filter:
            print("\t[+] Applying TaxId filter")
            filtered_data = filtered_data.drop_duplicates(subset="TaxId",keep="first")
        if source_db_filter:
            print("\t[+] Applying SourceDb filter")
            # NOTE(review): because the TaxId de-duplication runs first, a
            # TaxId whose first row is not from refseq is dropped entirely
            # here even if a later row is. Confirm this order is intended.
            filtered_data = filtered_data[filtered_data["SourceDb"] == "refseq"]
        print("[+] Filtered DataFrame has a length of: {}".format(len(filtered_data)))
        return filtered_data
    except Exception as e:
        raise Exception("[-] ERROR filtering entrez dataframe with exception: {}".format(e)) from e

In [42]:
# Apply the default filters to metB and metZ. For metY the SourceDb
# restriction is switched off, so non-refseq records are kept as well.
metB_filtered_data = filter_entrez_data(metB_data)
metZ_filtered_data = filter_entrez_data(metZ_data)
metY_filtered_data = filter_entrez_data(metY_data,source_db_filter=False)

[+] DataFrame has a length of: 38180
	[+] Sequence length filter: 300
	[+] Applying TaxId filter
	[+] Applying SourceDb filter
[+] Filtered DataFrame has a length of: 2000
[+] DataFrame has a length of: 15593
	[+] Sequence length filter: 300
	[+] Applying TaxId filter
	[+] Applying SourceDb filter
[+] Filtered DataFrame has a length of: 3836
[+] DataFrame has a length of: 1114
	[+] Sequence length filter: 300
	[+] Applying TaxId filter
[+] Filtered DataFrame has a length of: 546


In [49]:
# Per-gene text files that receive the filtered protein Ids
# (written by write_protein_id_file, read by efetch via -input).
metB_protein_ids_path = entrez_result_path + "metB_protein_ids.txt"
metY_protein_ids_path = entrez_result_path + "metY_protein_ids.txt"
metZ_protein_ids_path = entrez_result_path + "metZ_protein_ids.txt"

In [50]:
# Preview the first rows of the filtered metY table.
metY_filtered_data.head()

Unnamed: 0,Id,Caption,SourceDb,Completeness,TaxId,Organism,Title,Slen
0,2839447466,CAM0125753,insd,no info,86188,Stenotrophomonas geniculata,O-acetyl-L-homoserine sulfhydrylase [Stenotrop...,428
1,2839447118,CAM0118147,insd,no info,40324,Stenotrophomonas maltophilia,O-acetyl-L-homoserine sulfhydrylase [Stenotrop...,428
2,2814348139,XHV07197,insd,no info,287,Pseudomonas aeruginosa,homocysteine synthase [Pseudomonas aeruginosa],425
3,2814205940,XHT91261,insd,complete,48296,Acinetobacter pittii,aminotransferase class I/II-fold pyridoxal pho...,425
6,67466069,P0A705,swiss_prot,complete,83333,Escherichia coli K-12,RecName: Full=Translation initiation factor IF-2,890


In [54]:
def write_protein_id_file(dataframe:pd.DataFrame,output_file:str)->int:
    '''Write the values of the "Id" column to a text file, one value per line.

    Parameters:
        dataframe: table providing an "Id" column.
        output_file: path of the file to create or overwrite.

    Returns:
        1 once the file has been written.

    Raises:
        Exception: wraps any error raised while reading the column or
            writing the file.
    '''
    try:
        snapshot = dataframe.copy()
        id_values = snapshot["Id"].to_list()
        with open(output_file, "w") as handle:
            # Every Id is written as text and terminated by a newline.
            handle.writelines(str(value) + "\n" for value in id_values)
        return 1
    except Exception as err:
        raise Exception("[-] ERROR with exception: {}".format(err))

In [55]:
# Write the filtered Ids of each gene to its Id list file.
write_protein_id_file(metB_filtered_data, metB_protein_ids_path)
write_protein_id_file(metY_filtered_data, metY_protein_ids_path)
write_protein_id_file(metZ_filtered_data, metZ_protein_ids_path)

1

In [57]:
# Per-gene FASTA files that receive the sequences fetched with efetch.
metB_proteins = entrez_result_path + "metB_proteins.faa"
metY_proteins = entrez_result_path + "metY_proteins.faa"
metZ_proteins = entrez_result_path + "metZ_proteins.faa"

In [59]:
# Fetch the metB protein records listed in the Id file in FASTA format.
!efetch -db protein -input $metB_protein_ids_path -format fasta > $metB_proteins

In [60]:
# Fetch the metY protein records listed in the Id file in FASTA format.
!efetch -db protein -input $metY_protein_ids_path -format fasta > $metY_proteins

In [61]:
# Fetch the metZ protein records listed in the Id file in FASTA format.
!efetch -db protein -input $metZ_protein_ids_path -format fasta > $metZ_proteins

In [113]:
# FASTA file that holds the concatenated metB, metY and metZ sequences.
proteins_path = entrez_result_path + "combined_proteins.faa"

In [114]:
# Concatenate the three per-gene FASTA files into the combined file
# (an existing combined file is overwritten).
!cat $metB_proteins $metY_proteins $metZ_proteins > $proteins_path

In [64]:
# TODO: add the Curvibacter proteins to the combined FASTA file ($proteins_path) ...

In [115]:
# Find non-unique proteins: scan the FASTA headers of the combined file and
# collect every accession once in `proteins`; each further occurrence of an
# already seen accession is appended to `proteins_to_remove`.
proteins = []
proteins_to_remove = []
seen_proteins = set()  # constant-time membership test alongside the ordered list
with open(proteins_path, "r") as protein_file:
    for protein in protein_file:
        if protein.startswith(">"):
            # The accession is the text between the leading ">" and the first
            # space. Slicing off only the first character keeps it correct
            # when the title itself contains ">".
            protein = protein[1:].split(" ")[0]
            if protein in seen_proteins:
                proteins_to_remove.append(protein)
            else:
                seen_proteins.add(protein)
                proteins.append(protein)

In [117]:
# Filter non-unique proteins from the combined FASTA file.
# Every accession that occurs more than once is removed completely (all of
# its records, not only the repeats), and the file at `proteins_path` is
# rewritten in place with the remaining records.

# Pass 1: collect each accession once in `proteins`; every further occurrence
# of an already seen accession goes to `proteins_to_remove`.
proteins = []
proteins_to_remove = []
seen_proteins = set()  # constant-time membership test alongside the ordered list
with open(proteins_path, "r") as protein_file:
    for protein in protein_file:
        if protein.startswith(">"):
            # Accession = text between the leading ">" and the first space.
            # Slicing off only the first character keeps it correct when
            # the title itself contains ">".
            protein_id = protein[1:].split(" ")[0]
            if protein_id in seen_proteins:
                proteins_to_remove.append(protein_id)
            else:
                seen_proteins.add(protein_id)
                proteins.append(protein_id)

# Pass 2: rebuild the records, keyed by their full header line, skipping
# every record whose accession was seen more than once.
duplicated_ids = set(proteins_to_remove)
protein_dict = {}
header = None  # header of the record being collected; None while skipping
with open(proteins_path, "r") as protein_file:
    for protein in protein_file:
        if protein.startswith(">"):
            protein_id = protein[1:].split(" ")[0]
            if protein_id in duplicated_ids:
                # Report the removed accession and skip its sequence lines.
                print(protein_id)
                header = None
            else:
                header = protein
                protein_dict[header] = ""
        elif header is not None:
            # Sequence line of a kept record. Lines that precede the first
            # header are ignored instead of raising a NameError.
            protein_dict[header] += protein

# Overwrite the combined FASTA with the kept records, in their original order.
with open(proteins_path,"w") as protein_file:
    for kept_header, sequence in protein_dict.items():
        protein_file.write(kept_header)
        protein_file.write(sequence)

In [118]:
# Output paths for the alignment and the tree of the combined protein set.
mafft_output = "../results/processed_data/phylogenetics/mafft/combined_proteins.msa"
fasttree_output = "../results/processed_data/phylogenetics/fasttree/combined_proteins.nwk"

In [123]:
# Align the combined proteins with MAFFT (--auto lets MAFFT pick the strategy)
# and redirect the alignment to mafft_output.
!mafft --auto $proteins_path > $mafft_output

nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 39 ambiguous characters.
 6301 / 6385
done.

Constructing a UPGMA tree (efffree=0) ... 
 6380 / 6385
done.

Progressive alignment 1/2... 
STEP  6301 / 6384  h
Reallocating..done. *alloclen = 2936

done.

Making a distance matrix from msa.. 
 6300 / 6385
done.

Constructing a UPGMA tree (efffree=1) ... 
 6380 / 6385
done.

Progressive alignment 2/2... 
STEP  6301 / 6384  h
Reallocating..done. *alloclen = 2893

done.

disttbfast (aa) Version 7.453
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct)

In [124]:
# Build a tree from the MAFFT alignment with FastTree and redirect it to fasttree_output.
!fasttree $mafft_output > $fasttree_output

FastTree Version 2.1.11 Double precision (No SSE3)
Alignment: ../results/processed_data/phylogenetics/mafft/combined_proteins.msa
Amino acid distances: BLOSUM45 Joins: balanced Support: SH-like 1000
Search: Normal +NNI +SPR (2 rounds range 10) +ML-NNI opt-each=1
TopHits: 1.00*sqrtN close=default refresh=0.80
ML Model: Jones-Taylor-Thorton, CAT approximation with 20 rate categories
Ignored unknown character X (seen 39 times)
Initial topology in 34.28 seconds of   6281    1 of   6284 seqs   6100)   
Refining topology: 50 rounds ME-NNIs, 2 rounds ME-SPRs, 25 rounds ML-NNIs
Total branch-length 338.782 after 97.47 sec6101 of 6282 splits, 1 changes (max delta 0.005)     
ML-NNI round 1: LogLk = -958026.706 NNIs 1175 max delta 24.68 Time 303.67es (max delta 24.680)   
Switched to using 20 rate categories (CAT approximation)20 of 20   
Rate categories were divided by 0.990 so that average rate = 1.0
CAT-based log-likelihoods may not be comparable across runs
Use -gamma for approximate but comp