In [1]:
from Bio.Align import AlignInfo
from Bio import AlignIO


from Tools import PhyML, RaxML
from Utils import tree_functions
from Utils.defs_PhyAI import ROOTLIKE_NAME, TEST_DATA_PATH, SEP, DEFAULT_MODEL, SUBTREE1, SUBTREE2, LEARNING_DATA


In [2]:
def validate_input(msa_file, user_tree_file):
	"""
	:param msa_file: the path to an MSA file, one of biopython's formats
	:param user_tree_file: (optional) the path to a user tree file, if fixed tree was desired
	:return: a biopython object of the msa and an ete3 object of the tree if exists
	"""
	# identify format and retrieve all MSAs
	for aln_format in ["clustal", "emboss", "fasta", "fasta-m10", "ig", "maf", "mauve", "nexus", "phylip-relaxed", "phylip-sequential", "stockholm"]:
		try:
			msa_obj = AlignIO.read(msa_file, format=aln_format)
			print("INFO - The MSA file is format: " + aln_format)
			break
		except Exception:
			msa_obj = None
	if msa_obj is None:
		print("ERROR - Error occured: the input file is not a valid alignmnet in a supported format.\n"
					 "Please verify that all sequences are at the same length and that the input format is correct.")
	# validate MSA characters
	msa_info = AlignInfo.SummaryInfo(msa_obj)
	aln_letters = msa_info._get_all_letters()
	for let in aln_letters:
		if not (let.lower() in "acgt-"):
			print("WARNING - There are characters that are not nucleotides or gaps in your input MSA.")
			break
	# validate tree file in Newick format and suits the msa
	tree_obj = None
	if user_tree_file:
		try:
			with open(user_tree_file) as fpr:
				tree_obj = tree_functions.get_newick_tree(fpr.read().strip())
		except:
			print("ERROR - Tree file is invalid. Please verify that it's in Newick format.")
		print(tree_obj)
		# assert that the tree matches the corresponding MSA
		leaves = sorted([node.name for node in tree_obj.get_leaves()])
		seq_names = sorted([rec.id for rec in msa_obj])
		if len(leaves) != len(seq_names) or (not all(x == y for x,y  in zip(seq_names,leaves))):
			print("ERROR - The tips of the tree and the MSA sequences names do not match")

	return msa_obj

In [3]:
#for folder_name in os.listdir(TEST_DATA_PATH):
folder_name = '8'
CWD = TEST_DATA_PATH + folder_name + SEP
orig_msa_file = CWD + 'real_msa.phy'
run_id = DEFAULT_MODEL + "_" + folder_name
stats_file, tree_file = PhyML.run_phyml(orig_msa_file, DEFAULT_MODEL, run_id=run_id)
#msa_obj = validate_input(orig_msa_file, tree_file)

orig_tree_obj = tree_functions.get_phylo_tree(tree_file, orig_msa_file)
orig_tree_obj.get_tree_root().name = ROOTLIKE_NAME
print(orig_tree_obj.get_ascii(attributes=["name", "dist"])) 

INFO - Running PhyML: base_model='GTR' pinv=True gamma=True run_id='GTR+I+G_8'
INFO - Running PhyML: execution_tags='-m 012345 -f m -v e -a e -c 4 -o tlr -d nt -n 1 -b 0 --no_memory_check --run_id GTR+I+G_8'
INFO - The MSA file is format: phylip-relaxed

      /-Sp000
   /-|
  |  |   /-Sp003
  |   \-|
  |     |   /-Sp004
  |      \-|
  |        |   /-Sp001
--|         \-|
  |           |   /-Sp006
  |            \-|
  |              |   /-Sp005
  |               \-|
  |                  \-Sp007
  |
   \-Sp002

                     /-Sp000, 4.10264
              /N1, 0.0
             |      |             /-Sp003, 0.176516
             |       \N4, 0.0238354
             |                   |            /-Sp004, 1.58237
             |                    \N6, 0.769354
             |                               |            /-Sp001, 0.146422
-ROOT_LIKE, 0.0                               \N8, 0.117189
             |                                           |            /-Sp006, 1.136e-05

In [4]:
import pandas as pd
import csv

OUTPUT_TREES_FILE = CWD +  'newicks_step.csv'
with open(OUTPUT_TREES_FILE, "w", newline='') as fpw:
	csvwriter = csv.writer(fpw)
	csvwriter.writerow(['', 'prune_name', 'rgft_name', 'newick'])
print("RUN: parse_phyml_stats_output ===========")
params_dict = (PhyML.parse_phyml_stats_file(stats_file))
#keep pinv and alpha 
freq, rates, pinv, alpha = [params_dict["fA"], params_dict["fC"], params_dict["fG"], params_dict["fT"]], [params_dict["subAC"], params_dict["subAG"], params_dict["subAT"], params_dict["subCG"],params_dict["subCT"], params_dict["subGT"]], params_dict["pInv"], params_dict["gamma"]
df = pd.DataFrame()
df["orig_ds_ll"] = float(params_dict["logL"])
root_children = orig_tree_obj.get_tree_root().get_children()
outpath_prune = CWD + 'ds_summary_SPR_prune.csv'
outpath_rgft = CWD + 'ds_summary_SPR_rgft.csv'




In [5]:
for i, prune_node in enumerate(orig_tree_obj.iter_descendants("levelorder")):
	if prune_node in root_children:
		print("SKIP TREE prune_node: ", prune_node.get_ascii(attributes=["name", "dist"]))
		continue
	prune_name = prune_node.name
	x = prune_node.get_ascii(attributes=["name", "dist"])
	#print(f"+++++++++ Iteration {i} with pruned tree: {x} \n At node name: {prune_name} \n+++++++++")
	nname, subtree1, subtree2 = tree_functions.prune_branch(orig_tree_obj, prune_name) # subtree1 is the pruned subtree. subtree2 is the remaining subtree
	#print(">> pruned node: ", nname)
	#print(">> pruned subtree1", subtree1.get_ascii(attributes=["name", "dist"]))
	#print(f">> ROOT: {subtree1.get_tree_root().name}")
	#print(">> remaining subtree2", subtree2.get_ascii(attributes=["name", "dist"]))
	#print(f">> ROOT: {subtree2.get_tree_root().name}")
	#print("================")
	with open(OUTPUT_TREES_FILE, "a", newline='') as fpa:
		csvwriter = csv.writer(fpa)
		csvwriter.writerow([str(i)+",0", prune_name, SUBTREE1, subtree1.write(format=1)])
		csvwriter.writerow([str(i)+",1", prune_name, SUBTREE2, subtree2.write(format=1)])

	for j, rgft_node in enumerate(subtree2.iter_descendants("levelorder")): # traversing over subtree2 capture cases (1) and (3)
		# skip the ROOT node when regraft 
		ind = str(i) + "," + str(j)
		rgft_name = rgft_node.name
		y = rgft_node.get_ascii(attributes=["name", "dist"])
		print(f"++++++++++++++++++ Iteration {ind} with remaining tree: {y} \n At node name: {rgft_name} \n++++++++++++++++++")
		if nname == rgft_name: # captures case (2)
			continue
		rearr_tree, preserve = tree_functions.regraft_branch(subtree2, rgft_node, subtree1, rgft_name, nname)
		
		#print(">> rearr_tree: ", rearr_tree.get_ascii(attributes=["name", "dist"]))
		#print("-- rearr_tree has ROOT: ", rearr_tree.get_tree_root().name)
		neighbor_tree_str = rearr_tree.write(format=1, format_root_node=True)

		### save tree to file by using "append"
		with open(OUTPUT_TREES_FILE, "a", newline='') as fpa:
			csvwriter = csv.writer(fpa)
			csvwriter.writerow([ind, prune_name, rgft_name, neighbor_tree_str])
		#print("====== neighbor_tree_str ==========")
		#print(neighbor_tree_str)
		#neighbor_tree_str = neighbor_tree_str.replace(";", "R:0.0;")
		total_bl = tree_functions.get_total_branch_lengths(neighbor_tree_str)
		ll_rearr, rtime = RaxML.call_raxml_mem(neighbor_tree_str, orig_msa_file, rates, pinv, alpha, freq)
		print(f">> Total branch lenght: {total_bl}")
		print(f">> PIP Likelihood: {ll_rearr}" )

		df.loc[ind, "prune_name"], df.loc[ind, "rgft_name"] = prune_name, rgft_name
		df.loc[ind, "prune_name"], df.loc[ind, "rgft_name"] = prune_name, rgft_name
		df.loc[ind, "time"] = rtime
		df.loc[ind, "ll"] = ll_rearr

df.to_csv(outpath_prune.format("prune"))
df.to_csv(outpath_rgft.format("rgft"))

SKIP TREE prune_node:  
       /-Sp000, 4.10264
-N1, 0.0
      |             /-Sp003, 0.176516
       \N4, 0.0238354
                   |            /-Sp004, 1.58237
                    \N6, 0.769354
                               |            /-Sp001, 0.146422
                                \N8, 0.117189
                                           |            /-Sp006, 1.136e-05
                                            \N10, 2.22572
                                                       |              /-Sp005, 0.158931
                                                        \N12, 0.0757508
                                                                      \-Sp007, 0.181737
SKIP TREE prune_node:  
--Sp002, 0.132302
++++++++++++++++++ Iteration 2,0 with remaining tree: 
--Sp002, 0.132302 
 At node name: Sp002 
++++++++++++++++++
GTR{0.31152/0.36752/0.36901/0.19015/1.12597/0.15899}+I{0.001}+G{0.672}+F{0.26592/0.14671/0.1774/0.40997}
ll_re:  <re.Match object; span=(2811, 2844), matc

In [6]:
from Utils import collect_features

collect_features.collect_features(CWD, tree_file, outpath_prune, outpath_rgft)

  dff[ind, FEATURES["longest"]] = features_dicts_dict["longest"]				#f3
  dff[ind, FEATURES["ntaxa_{}".format(subtype)]] = features_dicts_dict["ntaxa_{}".format(subtype)][edge]  		#f4,5
  dff[ind, FEATURES["tbl_{}".format(subtype)]] = features_dicts_dict["tbl_{}".format(subtype)][edge]  			#f6,7
  dff[ind, FEATURES["longest_{}".format(subtype)]] = features_dicts_dict["longest_{}".format(subtype)][edge]	#f8,9
  dff[ind, FEATURES["ntaxa_{}".format(subtype)]] = features_dicts_dict["ntaxa_{}".format(subtype)][edge]  		#f4,5
  dff[ind, FEATURES["tbl_{}".format(subtype)]] = features_dicts_dict["tbl_{}".format(subtype)][edge]  			#f6,7
  dff[ind, FEATURES["longest_{}".format(subtype)]] = features_dicts_dict["longest_{}".format(subtype)][edge]	#f8,9
  df_rgft[ind, FEATURES["top_dist"]] = features_dict_prune['top_dist'][prune_name][rgft_name]
  df_rgft[ind, FEATURES["bl_dist"]] = features_dict_prune['bl_dist'][prune_name][rgft_name]
  df_rgft[ind, FEATURES["res_bl"]] = res_bl
  dff[ind, LABEL.f

In [7]:
import os
from Utils.defs_PhyAI import DATA_WITH_PREDS, FEATURES_SHARED, FEATURES, LABEL, SCORES_PER_DS, types_dict, FEATURES_RGFT_ONLY, FEATURES_MERGED, FEATURES_RGFT, FEATURES_PRUNE, GROUP_ID
from Utils import RF_learning_algorithm


In [8]:
import numpy as np
from Utils.tree_functions import get_total_branch_lengths
from Utils.defs_PhyAI import PHYML_TREE_FILENAME, SUMMARY_PER_DS

def parse_relevant_summaries_for_learning(datapath, outpath, move_type, step_number, tree_type='bionj'):

	for i,relpath in enumerate(os.listdir(datapath)):
		if i ==0:
			ds_path_init = datapath+relpath
			cols = list(pd.read_csv(SUMMARY_PER_DS.format(datapath, move_type)))
			cols.insert(1, "path")
			cols.extend([FEATURES[GROUP_ID], FEATURES["group_tbl"]])
			df = pd.DataFrame(index=np.arange(0), columns=cols)
			
		ds_path = datapath + relpath
		ds_tbl = get_total_branch_lengths(tree_file)
		summary_per_ds = SUMMARY_PER_DS.format(datapath, move_type)
		
		if os.path.exists(summary_per_ds) and FEATURES["bl"] in pd.read_csv(summary_per_ds).columns:
			df_ds = pd.read_csv(summary_per_ds)
			df_ds.insert(1, "path", datapath)
			df_ds[FEATURES[GROUP_ID]] = str(i)
			df_ds[FEATURES["group_tbl"]] = ds_tbl
			df = pd.concat([df, df_ds], ignore_index=True)
	
	df.to_csv(outpath)

In [9]:
move_type, st = "merged", "1"
transform_target, validation_set = False, False
#print(CWD)
df_path = TEST_DATA_PATH + LEARNING_DATA.format("all_moves")
#print(df_path)
df_prune_features = TEST_DATA_PATH + LEARNING_DATA.format("all_moves_prune")
#print(df_prune_features)
df_rgft_features = TEST_DATA_PATH + LEARNING_DATA.format("all_moves_rgft")
#print(df_rgft_features)

if not os.path.exists(df_path):
	parse_relevant_summaries_for_learning(CWD, df_prune_features, "prune", st,)
	parse_relevant_summaries_for_learning(CWD, df_rgft_features, "rgft", st)
	shared_cols = FEATURES_SHARED + ["path","prune_name","rgft_name","orig_ds_ll", "ll"]
	df_prune_moves = pd.read_csv(df_prune_features, dtype=types_dict)
	print(df_prune_moves.head())
	df_rgft_moves = pd.read_csv(df_rgft_features, dtype=types_dict)
	print(df_rgft_moves.head(5))
	complete_df = pd.merge(df_prune_moves, df_rgft_moves, on=shared_cols, left_index=True, right_index=True, suffixes=('_prune', '_rgft'))
 
	complete_df = complete_df.rename(columns={FEATURES[f]: FEATURES[f] + "_rgft" for f in FEATURES_RGFT_ONLY})
	complete_df[LABEL.format(move_type)] = complete_df[LABEL.format("prune")]
	complete_df.to_csv(df_path)

df_learning = pd.read_csv(df_path, dtype=types_dict)
df_learning = RF_learning_algorithm.fit_transformation(df_learning, move_type, trans=transform_target)

features = FEATURES_PRUNE if move_type == "prune" else FEATURES_RGFT if move_type == "rgft" else FEATURES_MERGED
features.remove(FEATURES[GROUP_ID])

########################

suf = "_{}_validation_set".format(st) if validation_set else "_{}".format(st)
iftrans = "" if not transform_target else "_ytransformed"
suf += iftrans
csv_with_scores = CWD + SCORES_PER_DS.format(str(len(features))+ suf)
csv_with_preds = CWD + DATA_WITH_PREDS.format(str(len(features)) + suf)
if not os.path.exists(csv_with_scores) or validation_set:
	print("*@*@*@* scores for step{} with {} features are not available, thus applying learning".format(suf, len(features)))
	res_dict, df_out = RF_learning_algorithm.cross_validation_RF(df_learning, move_type, features, trans=transform_target ,validation_set=validation_set)
	df_out.to_csv(csv_with_preds)

	df_datasets =  pd.DataFrame(columns=["init"])
else:
	df_datasets = pd.read_csv(csv_with_scores)
	res_dict = RF_learning_algorithm.extract_scores_dict({}, df_datasets)
df_datasets = RF_learning_algorithm.print_and_index_results(df_datasets, res_dict, features)
df_datasets.to_csv(csv_with_scores)

Empty DataFrame
Columns: [Unnamed: 0.1, Unnamed: 0, path, orig_ds_ll, prune_name, rgft_name, time, ll, ('2,0', 'd_ll_prune'), ('2,0', 'edge_length'), ('2,0', 'longest_branch'), ('2,0', 'name2ntaxa_pruned'), ('2,0', 'tbl_pruned'), ('2,0', 'longest_pruned'), ('2,0', 'name2ntaxa_remaining'), ('2,0', 'tbl_remaining'), ('2,0', 'longest_remaining'), ('2,1', 'd_ll_prune'), ('2,1', 'edge_length'), ('2,1', 'longest_branch'), ('2,1', 'name2ntaxa_pruned'), ('2,1', 'tbl_pruned'), ('2,1', 'longest_pruned'), ('2,1', 'name2ntaxa_remaining'), ('2,1', 'tbl_remaining'), ('2,1', 'longest_remaining'), ('2,2', 'd_ll_prune'), ('2,2', 'edge_length'), ('2,2', 'longest_branch'), ('2,2', 'name2ntaxa_pruned'), ('2,2', 'tbl_pruned'), ('2,2', 'longest_pruned'), ('2,2', 'name2ntaxa_remaining'), ('2,2', 'tbl_remaining'), ('2,2', 'longest_remaining'), ('2,3', 'd_ll_prune'), ('2,3', 'edge_length'), ('2,3', 'longest_branch'), ('2,3', 'name2ntaxa_pruned'), ('2,3', 'tbl_pruned'), ('2,3', 'longest_pruned'), ('2,3', 'name2

MergeError: Can only pass argument "on" OR "left_index" and "right_index", not a combination of both.