In [1]:
import os
import pickle
import pandas as pd
import sys
import subprocess
import re
import time
from collections import OrderedDict

In [2]:
# Number of gene trees processed per batch; run_gtree writes one pickle file per batch.
BATCH_SIZE = 10 # default use 10
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
NUM_TOTAL_GENE_TREES = 1000 # for 101-tax it is 1000 [can be taken from file as well.]

print(os.getcwd())

# Working directory of the notebook: the relative paths used later
# (./triplets.soda2103 and the per-batch temp files) resolve against it.
BASE_FOLDER = "/content/drive/MyDrive/101Tax-TRUE-GT/FOLDER_1"
os.chdir(BASE_FOLDER)
print(os.getcwd())

/content
/content/drive/MyDrive/101Tax-TRUE-GT/FOLDER_1


In [3]:
# Replicate range of this run: it names the output folder and bounds the driver loop.
REPLICATE_START = 11 # inclusive
REPLICATE_END = 20 # inclusive

In [4]:
# Folder with the input gene-tree files; get_gtree_file_from_replicate looks for "<replicate>.trueGT" in it.
BASE_FOLDER_GENE_TREE = "/content/drive/MyDrive/101Tax-TRUE-GT/gene-trees-true"
# Output folder for the per-batch pickle files, named after the replicate range of this run.
BASE_FOLDER_WQRTS = "/content/drive/MyDrive/101Tax-TRUE-GT/weighted_quartets-{}-{}".format(REPLICATE_START, REPLICATE_END)

In [5]:
# Make sure the output folder is there; report it when an earlier run already created it.
if os.path.exists(BASE_FOLDER_WQRTS):
    print("{} exists".format(BASE_FOLDER_WQRTS))
else:
    os.mkdir(BASE_FOLDER_WQRTS)

/content/drive/MyDrive/101Tax-TRUE-GT/weighted_quartets-11-20 exists


In [6]:
## The triplets.soda2103 binary is expected in the current working directory; make it executable.
!chmod u+x triplets.soda2103
# cmd = "chmod u+x {}/triplets.soda2103".format()

## Testing batch-wise quartet generation

In [7]:
# Gene-tree file used as input by the test cells that follow.
gt_file = "/content/drive/MyDrive/101Tax-TRUE-GT/all_gt_15Tax.tre"

In [8]:
# Timestamp taken before the test; it is only read by the commented-out single-file cell that follows.
start_time = time.time()

inputFile = gt_file
# Relative path, so it resolves against the current working directory.
outputFile = "test.wqrts.15Tax.txt"

print(f"inputFile = {inputFile}, outputFile = {outputFile}")

inputFile = /content/drive/MyDrive/101Tax-TRUE-GT/all_gt_15Tax.tre, outputFile = test.wqrts.15Tax.txt


In [9]:
#@title Single File wqrt generation [commented]
# dictionary_line = {} # empty dictionary


# tmp_file_name = "TEMP_FILE_PYTHON_FOR_EMBEDDED_QUARTETS"

# with open(inputFile) as fin:
#     for line in fin: # for each gene tree
#         line = line.replace("\n", "")
#         # print("<", line, ">")
#         with open(tmp_file_name, 'w') as f_out_temp:
#             f_out_temp.write(line) # write that gene tree to temporary file.

#         result = subprocess.run(['./triplets.soda2103', 'printQuartets', tmp_file_name], stdout=subprocess.PIPE)
#         results_str = result.stdout.decode('utf-8')
#         results_str = results_str.strip() # remove the empty line at the end
#         results_str = re.sub(".*: ", "", results_str) # remove alpha,beta,gamma names
#         # starting, add (( [two open brackets]
#         results_str = re.sub("\n", "));\n((", results_str) # add initial brackets
#         results_str = re.sub("^", "((", results_str) # for the very first quartet
#         results_str = re.sub("$", "));", results_str) # for the very last quartet

#         results_str = re.sub(" ", ",", results_str) # change white space to comma, ((11,9,|,5,6));
#         results_str = re.sub(",\|,", "),(", results_str) # change ,|, to ),( to form ((11,9),(5,6));

#         results_array = results_str.split("\n") # split to form each quartets

#         # print(results_array)

#         for line_result in results_array:
#             if line_result not in dictionary_line: # THIS line doesn't exist in dictionary
#                 dictionary_line[line_result] = 1 # initialize to 1
#             else: # THIS line does exist in dictionary, so increment
#                 dictionary_line[line_result] += 1

# sorted_dict = OrderedDict(sorted(dictionary_line.items()))

# (pd.DataFrame.from_dict(data=sorted_dict, orient='index')
#    .to_csv(outputFile, header=False, sep=" "))

# end_time = time.time()

# print("--- Time to run => %s seconds ---" % (end_time - start_time))

## Required functions

In [10]:
def get_gtree_file_from_replicate(rep):
    """Return the path of the gene-tree file of replicate ``rep``.

    The file is named ``<rep>.trueGT`` and is looked up in
    ``BASE_FOLDER_GENE_TREE``.
    """
    gtree_name = str(rep) + ".trueGT"
    return os.path.join(BASE_FOLDER_GENE_TREE, gtree_name)

In [11]:
def get_replicate_num_from_gtree(file_gtree):
    """Extract the replicate number from a gene-tree path.

    The number is the part of the file name that precedes the first dot,
    e.g. ``".../11.trueGT"`` gives ``11``.
    """
    file_name = file_gtree.split("/")[-1]
    stem = file_name.split(".")[0]
    return int(stem)

In [12]:
def _wqrt_name_fields(x):
    """Split the file-name part of path ``x`` on "-"."""
    return x.split("/")[-1].split("-")


def get_replicate_num_from_wqrt(x):
    """Replicate number of a ``wqrts-rep-<rep>-startidx-<idx>-batch-<n>.pkl`` path (third "-" field)."""
    return int(_wqrt_name_fields(x)[2])


def get_start_idx_from_wqrt(x):
    """Start index of a ``wqrts-rep-<rep>-startidx-<idx>-batch-<n>.pkl`` path (fifth "-" field)."""
    return int(_wqrt_name_fields(x)[4])

In [13]:
def get_pickle_file_name(replicate, start_idx, batch_size):
    """Name of the pickle file that stores one batch of weighted quartets."""
    return "wqrts-rep-{}-startidx-{}-batch-{}.pkl".format(replicate, start_idx, batch_size)


def get_pickle_file_path(replicate, start_idx, batch_size):
    """Path of the batch pickle file inside ``BASE_FOLDER_WQRTS``."""
    file_name = get_pickle_file_name(replicate, start_idx, batch_size)
    return os.path.join(BASE_FOLDER_WQRTS, file_name)


def get_temp_file_path(replicate, start_idx, batch_size):
    """Relative path of the scratch file that holds a single gene tree for this batch."""
    return "temp-rep-{}-startidx-{}-batch-{}.txt".format(replicate, start_idx, batch_size)

In [14]:
def is_this_replicate(wqrt_file, replicate):
    """Tell whether the quartet file ``wqrt_file`` belongs to ``replicate``.

    Parameters
    ----------
    wqrt_file : str
        File name or path containing ``rep-<number>``.
    replicate : int or str
        Replicate number to look for.

    Returns
    -------
    bool
        True when ``rep-<replicate>`` occurs in ``wqrt_file`` as a complete
        number.
    """
    # A plain substring test would accept "rep-11" when asked for replicate 1;
    # the (?!\d) lookahead rejects a match that is followed by another digit.
    pattern = r"rep-{}(?!\d)".format(re.escape(str(replicate)))
    return re.search(pattern, wqrt_file) is not None

In [15]:
"""
    Run the triplets.soda2103/printQuartets method on single line gene tree and retrieve as results array.
"""
def run_subprocess(file_single_gtree):
    """Run ``./triplets.soda2103 printQuartets`` on a file and format its output.

    The substitutions assume that every output line looks like
    ``<label>: a b | c d``; such a line is rewritten as ``((a,b),(c,d));``.

    Parameters
    ----------
    file_single_gtree : str
        Path of the file handed to the tool (callers write one gene tree to it).

    Returns
    -------
    list of str
        One formatted quartet per non-empty output line, in output order.
        The list is empty when the tool printed nothing, instead of the
        meaningless entry ``"(());"``.
    """
    result = subprocess.run(['./triplets.soda2103', 'printQuartets', file_single_gtree], stdout=subprocess.PIPE)
    results_str = result.stdout.decode('utf-8').strip()

    results_array = []
    for output_line in results_str.split("\n"):
        if not output_line.strip():
            continue  # nothing to format on an empty line
        quartet = re.sub(r".*: ", "", output_line)  # remove alpha,beta,gamma names
        quartet = quartet.replace(" ", ",")  # white space to comma: 11,9,|,5,6
        quartet = quartet.replace(",|,", "),(")  # ,|, to ),( giving 11,9),(5,6
        results_array.append("((" + quartet + "));")  # wrap into ((11,9),(5,6));
    return results_array

In [16]:
def create_batch_dictionary(file_gtree, start_idx, batch_size):
    """Count the quartets produced by one batch of gene trees.

    Lines ``start_idx`` to ``start_idx + batch_size - 1`` of ``file_gtree``
    are written one at a time to a scratch file and passed to
    ``run_subprocess``; every returned quartet string is tallied.

    Parameters
    ----------
    file_gtree : str
        Gene-tree file with one tree per line; its name must start with the
        replicate number (see ``get_replicate_num_from_gtree``).
    start_idx : int
        Zero-based index of the first line of the batch.
    batch_size : int
        Maximum number of lines in the batch.

    Returns
    -------
    OrderedDict
        Quartet string -> number of occurrences, sorted by quartet string.
        Empty when the batch contains no tree.
    """
    replicate_num = get_replicate_num_from_gtree(file_gtree=file_gtree)
    temp_file_path = get_temp_file_path(replicate_num, start_idx, batch_size)

    # Read the file and keep only the lines of this batch.
    with open(file_gtree, encoding='utf-8', mode='r') as fin:
        lines = [l.strip() for l in fin]
    lines = lines[start_idx: start_idx + batch_size]

    dictionary_line = {}  # quartet -> count
    try:
        for line in lines:
            if not line:
                continue  # blank line: there is no tree to hand to the tool

            # The tool reads its tree from a file, so write this one tree out.
            with open(temp_file_path, encoding='utf-8', mode='w') as fout:
                fout.write(line)

            for line_result in run_subprocess(file_single_gtree=temp_file_path):
                dictionary_line[line_result] = dictionary_line.get(line_result, 0) + 1
    finally:
        # Clean up even when the tool fails; the scratch file does not exist
        # at all when the batch had no tree.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    # Sort by quartet string so that the saved batches have a stable order.
    return OrderedDict(sorted(dictionary_line.items()))

################################################################################

In [17]:
# file_gtree = "/content/drive/MyDrive/101Tax-TRUE-GT/01.tre_15Tax"
# file_gtree = "/content/drive/MyDrive/101Tax-TRUE-GT/02.txt"
# sorted_dict = create_batch_dictionary(file_gtree=file_gtree, start_idx=10, batch_size=BATCH_SIZE)

In [18]:
def run_gtree(file_gtree, STARTING_INDEX, batch_size=BATCH_SIZE):
    """Process one gene-tree file batch by batch and pickle every batch.

    Batches start at ``STARTING_INDEX`` and advance by ``batch_size`` until
    the number of lines in ``file_gtree`` is reached. Each batch dictionary
    from ``create_batch_dictionary`` is dumped to the path given by
    ``get_pickle_file_path``.
    """
    replicate_num = get_replicate_num_from_gtree(file_gtree=file_gtree)

    # Number of lines in the file = upper bound for the batch start indices.
    with open(file_gtree, mode='r') as fin:
        line_count = sum(1 for _ in fin)

    batch_starts = range(STARTING_INDEX, line_count, batch_size)
    for batch_start in batch_starts:
        print("Running for rep {} start_idx {} batch_size {}".format(replicate_num, batch_start, batch_size))

        # Quartet counts of this batch, sorted by quartet.
        batch_counts = create_batch_dictionary(
            file_gtree=file_gtree,
            start_idx=batch_start,
            batch_size=batch_size,
        )

        # Destination of the pickle for this batch.
        file_save = get_pickle_file_path(
            replicate=replicate_num,
            start_idx=batch_start,
            batch_size=batch_size,
        )
        print(f"Saving to file = {file_save}\n")

        with open(file_save, mode='wb') as fout:
            pickle.dump(batch_counts, fout, protocol=pickle.HIGHEST_PROTOCOL)

################################################################################
# run_gtree(file_gtree=file_gtree, STARTING_INDEX=0)

## Get the latest replicate and its latest start index

In [19]:
"""
    Retrieve the latest replicate and start_idx
"""
def get_latest_file_stats(replicate_latest=None, directory=BASE_FOLDER_WQRTS):
    """Find the newest batch file so that an interrupted run can resume.

    Only files named ``wqrts-rep-<rep>-startidx-<idx>-batch-<n>.pkl`` are
    considered; anything else in ``directory`` is ignored instead of
    breaking the name parsing.

    Parameters
    ----------
    replicate_latest : int, optional
        Replicate to inspect. By default the highest replicate that has a
        batch file is used.
    directory : str
        Folder containing the batch pickle files.

    Returns
    -------
    tuple of (int, int)
        ``(replicate, start_idx)`` of the newest batch file.
        ``(REPLICATE_START, 0)`` when the folder has no batch file, and
        ``(replicate_latest, 0)`` when the requested replicate has none.
    """
    name_pattern = re.compile(r"wqrts-rep-(\d+)-startidx-(\d+)-batch-\d+\.pkl")

    # (replicate, start_idx) of every batch file found in the folder.
    batches = []
    for file_name in os.listdir(directory):
        match = name_pattern.fullmatch(file_name)
        if match is not None:
            batches.append((int(match.group(1)), int(match.group(2))))

    if not batches:  # nothing was produced yet
        return REPLICATE_START, 0  # start with the first replicate at index 0

    if replicate_latest is None:
        replicate_latest = max(rep for rep, _ in batches)

    # Compare the parsed numbers: a substring test would mix replicate 1 with 11, 12, ...
    start_indices = [idx for rep, idx in batches if rep == replicate_latest]
    start_idx_latest = max(start_indices) if start_indices else 0

    print(f"replicate_latest = {replicate_latest}, start_idx_latest = {start_idx_latest}")

    return replicate_latest, start_idx_latest
################################################################################

## Gather everything and run

In [20]:
# Get latest replicate, required gene tree and start_idx_latest.
# start_rep, start_idx = get_latest_file_stats(directory=BASE_FOLDER_WQRTS)
# gene_tree_file = get_gtree_file_from_replicate(rep=start_rep)


In [25]:
# Resume from the newest batch file in the output folder (REPLICATE_START / index 0
# when the folder is empty). The batch at start_idx is run again, so the newest
# pickle file is rewritten.
start_rep, start_idx = get_latest_file_stats(directory=BASE_FOLDER_WQRTS)
while start_rep <= REPLICATE_END:
    gene_tree_file = get_gtree_file_from_replicate(rep=start_rep)
    run_gtree(file_gtree=gene_tree_file, STARTING_INDEX=start_idx, batch_size=BATCH_SIZE)

    start_idx = 0 # for next replicate, go to 0.
    start_rep += 1 # increment number of reps.

replicate_latest = 20, start_idx_latest = 990
Running for rep 20 start_idx 990 batch_size 10
Saving to file = /content/drive/MyDrive/101Tax-TRUE-GT/weighted_quartets-11-20/wqrts-rep-20-startidx-990-batch-10.pkl
