In [None]:
from Bio.Align import AlignInfo
from Bio import AlignIO


from Tools import PhyML, RaxML
from Utils import tree_functions
from Utils.defs_PhyAI import ROOTLIKE_NAME, TEST_DATA_PATH, SEP, DEFAULT_MODEL, SUBTREE1, SUBTREE2, LEARNING_DATA


In [None]:
def validate_input(msa_file, user_tree_file=None):
	"""
	Load an MSA (auto-detecting its format) and, optionally, check a user tree against it.

	:param msa_file: the path to an MSA file, one of biopython's formats
	:param user_tree_file: (optional) the path to a user tree file in Newick format, if fixed tree was desired
	:return: a biopython object of the msa (the tree is only validated against the msa, it is not returned)
	:raises ValueError: if the msa cannot be parsed in any of the supported formats,
		or if the tree file cannot be read / parsed
	"""
	# identify format: the first format that biopython manages to parse wins
	supported_formats = ["clustal", "emboss", "fasta", "fasta-m10", "ig", "maf", "mauve", "nexus",
						 "phylip-relaxed", "phylip-sequential", "stockholm"]
	msa_obj = None
	for aln_format in supported_formats:
		try:
			msa_obj = AlignIO.read(msa_file, format=aln_format)
		except Exception:
			# not parsable as this format - try the next candidate
			continue
		print("INFO - The MSA file is format: " + aln_format)
		break
	if msa_obj is None:
		# stop here: everything below needs a parsed alignment
		message = ("ERROR - Error occured: the input file is not a valid alignmnet in a supported format.\n"
				   "Please verify that all sequences are at the same length and that the input format is correct.")
		print(message)
		raise ValueError(message)

	# validate MSA characters (collected through the public record API only)
	aln_letters = set()
	for rec in msa_obj:
		aln_letters.update(str(rec.seq))
	allowed_letters = set("acgt-")
	if any(let.lower() not in allowed_letters for let in aln_letters):
		print("WARNING - There are characters that are not nucleotides or gaps in your input MSA.")

	# validate tree file in Newick format and suits the msa
	if user_tree_file:
		tree_obj = None
		try:
			with open(user_tree_file) as fpr:
				tree_obj = tree_functions.get_newick_tree(fpr.read().strip())
		except Exception as err:
			message = "ERROR - Tree file is invalid. Please verify that it's in Newick format."
			print(message)
			raise ValueError(message) from err
		if tree_obj is None:
			# the parser gave nothing back without raising - still unusable
			message = "ERROR - Tree file is invalid. Please verify that it's in Newick format."
			print(message)
			raise ValueError(message)
		print(tree_obj)
		# assert that the tree matches the corresponding MSA
		leaves = sorted(node.name for node in tree_obj.get_leaves())
		seq_names = sorted(rec.id for rec in msa_obj)
		if leaves != seq_names:
			print("ERROR - The tips of the tree and the MSA sequences names do not match")

	return msa_obj

In [None]:
# Single-dataset run. To process every dataset instead, iterate
# folder_name over os.listdir(TEST_DATA_PATH).
folder_name = '82'
CWD = "".join((TEST_DATA_PATH, folder_name, SEP))
orig_msa_file = "".join((CWD, 'real_msa.phy'))
run_id = "_".join((DEFAULT_MODEL, folder_name))

# PhyML hands back the stats file and the tree file it produced for the MSA.
phyml_outputs = PhyML.run_phyml(orig_msa_file, DEFAULT_MODEL, run_id=run_id)
stats_file, tree_file = phyml_outputs
# Optional sanity check of the inputs (disabled):
# msa_obj = validate_input(orig_msa_file, tree_file)

# Load the starting tree and give its root the reserved root-like name.
orig_tree_obj = tree_functions.get_phylo_tree(tree_file, orig_msa_file)
orig_root = orig_tree_obj.get_tree_root()
orig_root.name = ROOTLIKE_NAME
print(orig_tree_obj.get_ascii(attributes=["name", "dist"]))

In [None]:
import pandas as pd
import csv

# Start a fresh newick log for this dataset: header row only.
OUTPUT_TREES_FILE = CWD + 'newicks_step.csv'
with open(OUTPUT_TREES_FILE, "w", newline='') as fpw:
	csv.writer(fpw).writerow(['iteration', 'prune_name', 'rgft_name', 'newick'])

print("RUN: parse_phyml_stats_output ===========")
params_dict = PhyML.parse_phyml_stats_file(stats_file)

# Model parameters taken from the PhyML stats (pinv and alpha are kept too).
freq = [params_dict[key] for key in ("fA", "fC", "fG", "fT")]
rates = [params_dict[key] for key in ("subAC", "subAG", "subAT", "subCG", "subCT", "subGT")]
pinv = params_dict["pInv"]
alpha = params_dict["gamma"]

df = pd.DataFrame()
orig_ds_ll = float(params_dict["logL"])
root_children = orig_tree_obj.get_tree_root().get_children()

# Per-move summary tables for this dataset.
outpath_prune = CWD + 'ds_summary_SPR_prune.csv'
outpath_rgft = CWD + 'ds_summary_SPR_rgft.csv'


In [None]:
# Enumerate SPR neighbours of the starting tree: every non-root-child node is
# pruned, and the pruned subtree is regrafted on every node of the remaining
# subtree. Each neighbour is logged to OUTPUT_TREES_FILE and scored with RAxML.
#
# Rows are collected in a local list and turned into a DataFrame once at the
# end. This keeps the cell idempotent on re-run and guarantees that every row
# (including the last one) carries orig_ds_ll - assigning the scalar to the
# whole column before the row exists leaves the newest row as NaN.
records = []
for i, prune_node in enumerate(orig_tree_obj.iter_descendants("levelorder")):
	if prune_node in root_children:
		# NOTE(review): this logs the name of the tree root, not of the skipped node - confirm that is intended
		print(f"> SKIPPED Iteration {i} with pruned tree at node name: {prune_node.get_tree_root().name}")
		continue
	prune_name = prune_node.name
	nname, subtree1, subtree2 = tree_functions.prune_branch(orig_tree_obj, prune_name) # subtree1 is the pruned subtree. subtree2 is the remaining subtree
	print(f"+++++++++ Iteration {i} with PRUNED TREE at node name: {nname}")
	# log both halves of the pruned tree; they are told apart from the regraft
	# rows by the SUBTREE1 / SUBTREE2 marker in the rgft_name column
	with open(OUTPUT_TREES_FILE, "a", newline='') as fpa:
		csvwriter = csv.writer(fpa)
		csvwriter.writerow([str(i)+",0", prune_name, SUBTREE1, subtree1.write(format=1)])
		csvwriter.writerow([str(i)+",1", prune_name, SUBTREE2, subtree2.write(format=1)])

	for j, rgft_node in enumerate(subtree2.iter_descendants("levelorder")): # traversing over subtree2 capture cases (1) and (3)
		ind = str(i) + "," + str(j)
		rgft_name = rgft_node.name
		if nname == rgft_name: # captures case (2)
			continue
		print(f"++++++++++++++++++ Iteration {ind} REGRAFT TREE at node name: {rgft_name}")
		rearr_tree, preserve = tree_functions.regraft_branch(subtree2, rgft_node, subtree1, rgft_name, nname)
		neighbor_tree_str = rearr_tree.write(format=1, format_root_node=True)

		### save tree to file by using "append" (so progress survives an interrupted run)
		with open(OUTPUT_TREES_FILE, "a", newline='') as fpa:
			csvwriter = csv.writer(fpa)
			csvwriter.writerow([ind, prune_name, rgft_name, neighbor_tree_str])

		ll_rearr, rtime = RaxML.call_raxml_mem(neighbor_tree_str, orig_msa_file, rates, pinv, alpha, freq)
		print(f"INFO - Total branch lenght: {tree_functions.get_total_branch_lengths(neighbor_tree_str)}")

		records.append({
			"iteration": ind,
			"orig_ds_ll": orig_ds_ll,
			"prune_name": prune_name,
			"rgft_name": rgft_name,
			"time": rtime,
			"ll": ll_rearr,
		})

# explicit column list keeps the CSV header stable even when no neighbour was evaluated
df = pd.DataFrame(records, columns=["iteration", "orig_ds_ll", "prune_name", "rgft_name", "time", "ll"]).set_index("iteration")
# the same table seeds both per-move summaries
df.to_csv(outpath_prune, index_label='iteration')
df.to_csv(outpath_rgft, index_label='iteration')

In [None]:
from Utils import collect_features

# Hand the dataset directory, the starting tree file and the two per-move summary
# CSV paths to the feature collector.
# NOTE(review): presumably this adds the feature columns to those two CSVs - confirm in Utils/collect_features.
collect_features.collect_features(CWD, tree_file, outpath_prune, outpath_rgft)

In [None]:
import os
from Utils.defs_PhyAI import DATA_WITH_PREDS, FEATURES_SHARED, FEATURES, LABEL, SCORES_PER_DS, types_dict, FEATURES_RGFT_ONLY, FEATURES_MERGED, FEATURES_RGFT, FEATURES_PRUNE, GROUP_ID
from Utils import RF_learning_algorithm


In [None]:
import numpy as np
from Utils.tree_functions import get_total_branch_lengths
from Utils.defs_PhyAI import PHYML_TREE_FILENAME, SUMMARY_PER_DS

def parse_relevant_summaries_for_learning(datapath, outpath, move_type, step_number, tree_type='bionj'):
	"""
	Gather the per-dataset summary tables found under ``datapath`` into a single
	table and write it to ``outpath``.

	Every sub-directory of ``datapath`` is treated as one dataset. A dataset
	contributes rows only when its summary file exists and already holds the
	FEATURES["bl"] column; its rows are tagged with a group id (the directory's
	position in the listing) and with the total branch length of its tree.
	Only columns present in every contributing table and in the expected
	column list are kept.

	:param datapath: directory whose sub-directories are the datasets
	:param outpath: path of the combined CSV to write (index labelled 'iteration')
	:param move_type: "prune" or anything else (treated as regraft); selects the
		column list and is substituted into the summary file name
	:param step_number: not used by this function; kept for interface compatibility
	:param tree_type: not used by this function; kept for interface compatibility
	:return: None
	"""
	# NOTE(review): as written in the original, the extra bookkeeping columns are
	# appended on the non-"prune" branch only (conditional expressions bind looser
	# than "+"). The parentheses just make that explicit - confirm whether the
	# "prune" table was meant to carry these columns as well.
	cols = FEATURES_PRUNE if move_type == "prune" else (
		FEATURES_RGFT + ["prune_name","rgft_name","orig_ds_ll", "ll", "time", "d_ll_prune"])

	# the empty frame goes first so the inner join below is restricted to `cols`
	frames = [pd.DataFrame(columns=cols)]
	for i,relpath in enumerate(os.listdir(datapath)):
		if not os.path.isdir(os.path.join(datapath, relpath)):
			continue
		print(i, relpath)

		# built with os.path.join so it agrees with the isdir check above even
		# when datapath has no trailing separator
		ds_path = os.path.join(datapath, relpath) + '/'
		# NOTE(review): `tree_file` is not a parameter; it is read from the
		# notebook's global scope - confirm which tree file is intended here.
		user_tree_file = os.path.join(ds_path, tree_file.format(relpath))
		with open(user_tree_file) as fpr:
			ds_tbl = get_total_branch_lengths(fpr.read().strip())
		summary_per_ds = SUMMARY_PER_DS.format(ds_path, move_type)

		if not os.path.exists(summary_per_ds):
			continue
		# read once; datasets whose summary has no feature columns yet are skipped
		df_ds = pd.read_csv(summary_per_ds)
		if FEATURES["bl"] not in df_ds.columns:
			continue
		df_ds = df_ds.set_index('iteration')
		df_ds[FEATURES[GROUP_ID]] = str(i)
		df_ds[FEATURES["group_tbl"]] = ds_tbl
		frames.append(df_ds)

	# single concatenation and a single write instead of one per dataset
	df = pd.concat(frames, join='inner', axis=0)
	df.to_csv(outpath, index_label='iteration')

In [None]:
import pandas as pd

# Run configuration for the merged (prune + regraft) learning table.
move_type = "merged"
st = "1"
transform_target = False
validation_set = False

# Output locations: the merged table and the two per-move tables it is built from.
df_path = TEST_DATA_PATH + LEARNING_DATA.format("all_moves")
df_prune_features = TEST_DATA_PATH + LEARNING_DATA.format("all_moves_prune")
df_rgft_features = TEST_DATA_PATH + LEARNING_DATA.format("all_moves_rgft")

# Collect the per-dataset summaries, one table per move type.
for move, features_csv in (("prune", df_prune_features), ("rgft", df_rgft_features)):
	parse_relevant_summaries_for_learning(TEST_DATA_PATH, features_csv, move, st)

# Keys that identify the same move in both tables.
shared_cols = FEATURES_SHARED + ["iteration", "prune_name","rgft_name","orig_ds_ll"]

df_prune_moves = pd.read_csv(df_prune_features, dtype=types_dict, index_col=0)
print(df_prune_moves.columns)

df_rgft_moves = pd.read_csv(df_rgft_features, dtype=types_dict, index_col=0)
print(df_rgft_moves.columns)

# Outer-merge on the shared keys; clashing columns get a per-move suffix.
complete_df = df_prune_moves.merge(
	df_rgft_moves,
	how='outer',
	on=shared_cols,
	suffixes=('_prune', '_rgft'),
)

# Regraft-only features exist in one table only, so they are suffixed by hand.
rgft_only_renames = {FEATURES[key]: FEATURES[key] + "_rgft" for key in FEATURES_RGFT_ONLY}
complete_df = complete_df.rename(columns=rgft_only_renames)

# The merged label is a copy of the prune label.
complete_df[LABEL.format(move_type)] = complete_df[LABEL.format("prune")]
complete_df.to_csv(df_path)



In [3]:
import pandas as pd
import csv
import os
from Utils.defs_PhyAI import DATA_WITH_PREDS, FEATURES_SHARED, FEATURES, LABEL, SCORES_PER_DS, types_dict, FEATURES_RGFT_ONLY, FEATURES_MERGED, FEATURES_RGFT, FEATURES_PRUNE, GROUP_ID
from Utils import RF_learning_algorithm

# Train / evaluate the random-forest model on the merged learning table, or
# reuse previously stored scores when they are available.
move_type, st = "merged", "1"
transform_target, validation_set = False, False
# NOTE(review): machine-specific absolute path - point this at the local training data directory.
CWD = "/Users/mihaid/Coding-Projects/thesis/DL_Phylo_Opt/data/training_data/"
df_path = CWD + "learning_all_moves.csv"
df_learning = pd.read_csv(df_path, dtype=types_dict)
df_learning = RF_learning_algorithm.fit_transformation(df_learning, move_type, trans=transform_target)

# Work on a copy: removing the group id from the list imported from
# Utils.defs_PhyAI would silently change it for every other user of that module
# (and for the next run of this cell).
features = list(FEATURES_PRUNE if move_type == "prune" else FEATURES_RGFT if move_type == "rgft" else FEATURES_MERGED)
if FEATURES[GROUP_ID] in features:
	features.remove(FEATURES[GROUP_ID])

########################

suf = "_{}_validation_set".format(st) if validation_set else "_{}".format(st)
iftrans = "" if not transform_target else "_ytransformed"
suf += iftrans
csv_with_scores = CWD + SCORES_PER_DS.format(str(len(features))+ suf)
csv_with_preds = CWD + DATA_WITH_PREDS.format(str(len(features)) + suf)
if not os.path.exists(csv_with_scores) or validation_set:
	print("*@*@*@* scores for step{} with {} features are not available, thus applying learning".format(suf, len(features)))
	# fail early and clearly when the learning table lacks feature columns,
	# instead of a KeyError from inside the cross-validation
	missing_features = [feature for feature in features if feature not in df_learning.columns]
	if missing_features:
		raise KeyError("{} not in the columns of {}".format(missing_features, df_path))
	res_dict, df_out = RF_learning_algorithm.cross_validation_RF(df_learning, move_type, features, trans=transform_target ,validation_set=validation_set)
	df_out.to_csv(csv_with_preds)

	df_datasets =  pd.DataFrame(columns=["init"])
else:
	df_datasets = pd.read_csv(csv_with_scores)
	res_dict = RF_learning_algorithm.extract_scores_dict({}, df_datasets)
df_datasets = RF_learning_algorithm.print_and_index_results(df_datasets, res_dict, features)
df_datasets.to_csv(csv_with_scores)

  df_learning = pd.read_csv(df_path, dtype=types_dict)


*@*@*@* scores for step_1 with 20 features are not available, thus applying learning
10 0 [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 2

KeyError: "['topology_dist_between_rgft', 'tbl_dist_between_rgft', 'res_tree_edge_length_rgft'] not in index"