# Comparison results

In [None]:
# Imports
import os
from pathlib import Path
from datetime import datetime
import pandas as pd

# Constants
# Names of the aligners being compared. The Q1 cell subtracts the aligners found
# in the result file names from this list and divides by its length to report
# how many aligners produced output for each path.
ALIGNERS = ['vgalignermap', 'graphaligner', 'graphchainer', 'vgmap', 'giraffe']

# Functions
## Create the plot
def my_plot(df, x, xlabel, ys, ylabel, title, legend):
    """Draw a bar chart of ``df`` and return the resulting plot object.

    Args:
        df: Object exposing a pandas-style ``plot`` method.
        x: Column used for the x axis.
        xlabel: Text for the x-axis label.
        ys: Column(s) drawn as bars.
        ylabel: Text for the y-axis label.
        title: Chart title.
        legend: Whether to show a legend; when truthy the legend is moved
            to the position ``(1.04, 0.75)``.

    Returns:
        The object returned by ``df.plot`` so callers can keep customising
        or saving the chart.
    """
    plot = df.plot(x=x, y=ys, kind="bar", title=title, legend=legend)
    if legend:
        # NOTE(review): with the default matplotlib backend these are axes
        # coordinates, so x > 1 should push the legend to the right of the bars.
        plot.legend(loc=(1.04, 0.75))
    plot.set_xlabel(xlabel)
    plot.set_ylabel(ylabel)
    return plot


## Create and export plot to external file
def my_plot_export(df, x, xlabel, ys, ylabel, title, file_name):
    """Draw the bar chart with ``my_plot`` and save it under ``./plots``.

    Args:
        df: Object exposing a pandas-style ``plot`` method.
        x: Column used for the x axis.
        xlabel: Text for the x-axis label.
        ys: Column(s) drawn as bars.
        ylabel: Text for the y-axis label.
        title: Chart title.
        file_name: Name of the file created inside ``./plots``.
    """
    # This function has no legend parameter; exported charts always show one.
    plot = my_plot(df, x, xlabel, ys, ylabel, title, True)
    # Make sure the target directory exists before saving into it.
    os.makedirs("./plots", exist_ok=True)
    plot.get_figure().savefig("./plots/{}".format(file_name))

## Drop from df every row whose value in column_name appears in values_list
def drop_rows_with_seqnames(df, column_name, values_list):
    """Return the rows of ``df`` whose ``column_name`` value is NOT listed.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: Column whose values are checked.
        values_list: Values marking the rows to drop.

    Returns:
        A DataFrame holding only the rows that were kept.
    """
    is_listed = df[column_name].isin(values_list)
    return df[~is_listed]

In [None]:
# Every sub-directory of ./datasets is treated as one dataset. Plain files are
# skipped because later cells open paths inside each dataset, and the names are
# sorted because os.listdir returns them in arbitrary order.
_DATASETS_ROOT = os.path.join(".", "datasets")
DATASETS = sorted(
    name
    for name in os.listdir(_DATASETS_ROOT)
    if os.path.isdir(os.path.join(_DATASETS_ROOT, name))
)
DATASETS

In [None]:
# results maps dataset -> path name -> list of (aligner, result file path).
results = dict()
for dataset in DATASETS:
    results_folder = os.path.join(".", "datasets", dataset, "results")

    # Start from an empty mapping for every dataset, so one without a results
    # folder gets an empty entry instead of the previous dataset's results.
    results_by_path = dict()
    if os.path.exists(results_folder):
        for name in os.listdir(results_folder):
            # Assumes names look like "<path>_<aligner>" followed by a
            # four-character suffix such as ".gaf" -- TODO confirm.
            name_without_ext = name[:-4]
            path, aligner = name_without_ext.split('_')
            results_by_path.setdefault(path, []).append(
                (aligner, os.path.join(results_folder, name))
            )

    results[dataset] = results_by_path

# Q1: Which aligners worked and which didn't?

In [None]:
# One record per (dataset, path): which aligners produced a result file, which
# did not, and how many result files exist as a count and as a percentage of ALIGNERS.
aligners_results = []
for dataset in DATASETS:
    results_by_path = results[dataset]
    for path, path_results in results_by_path.items():
        mapped_by = {aligner_name for aligner_name, _ in path_results}
        aligners_results.append({
            'dataset': dataset,
            'path': path,
            'mapped by': mapped_by,
            'not mapped by': set(ALIGNERS) - mapped_by,
            'n aligner that worked': len(path_results),
            '% aligner that worked': len(path_results)/len(ALIGNERS) * 100,
        })
        # TODO: remove .gaf from vg map (in snakefile)


aligners_results_df = pd.DataFrame(aligners_results)
group_by = aligners_results_df.groupby(['dataset', 'path'])
group_by.first()

In [None]:
# One bar chart per dataset: number of aligners that worked for each path.
for name, group in aligners_results_df.groupby("dataset"):
    chart_title = "Aligners that worked for {}".format(name)
    bar_columns = ["n aligner that worked"]
    plot = my_plot(
        group,
        "path",
        "path_name",
        bar_columns,
        "n aligner",
        chart_title,
        False,
    )

# Q2: How did the aligners perform? 

In [None]:
def _parse_elapsed_time(text):
    """Convert an elapsed wall-clock value into a ``datetime.time``.

    Two layouts are accepted: ``h:mm:ss`` (two colons) and ``m:ss.ff``
    (one colon plus a fractional part).

    Raises:
        ValueError: if ``text`` matches neither layout.
    """
    text = text.strip()
    time_format = '%H:%M:%S' if text.count(':') == 2 else '%M:%S.%f'
    return datetime.strptime(text, time_format).time()


# One record per "<path>_<aligner>.time" log: elapsed time and peak memory.
# NOTE(review): the labels look like GNU `time -v` output -- confirm.
aligners_performance = []

for dataset in DATASETS:
    logs_folder = os.path.join(".", "datasets", dataset, "logs")
    if not os.path.exists(logs_folder):
        continue

    for name in os.listdir(logs_folder):
        if not name.endswith(".time"):
            continue

        name_without_ext = name[:-5]

        # Presumably logs of the index-building steps, not of an alignment run.
        if name_without_ext in ('vgindex', 'vgaligner-index'):
            continue

        path, aligner = name_without_ext.split('_')
        curr_record = {'dataset': dataset, 'aligner': aligner, 'path': path}

        log_full_path = os.path.join(logs_folder, name)
        with open(log_full_path, "r") as fp:
            for line in fp:
                stripped = line.lstrip()

                if stripped.startswith("Elapsed (wall clock) time"):
                    # The value follows the "):" that closes the label.
                    elapsed_time_str = stripped.rsplit("):", 1)[-1]
                    curr_record['time'] = _parse_elapsed_time(elapsed_time_str)
                elif stripped.startswith("Maximum resident set size"):
                    space = int(line.split(':')[1])
                    curr_record['space (kbytes)'] = space
                    curr_record['space (mbytes)'] = float("{:.2f}".format(space/1000))

        aligners_performance.append(curr_record)

aligners_performance_df = pd.DataFrame(aligners_performance)
group_by = aligners_performance_df.groupby(['dataset', 'path', 'aligner'])
group_by.first()

# Q3: Parse graph stats

In [None]:
# One record per dataset, read from ./datasets/<dataset>/stats/graph_stats.txt:
# node, edge and self-loop counts plus whether the graph is cyclic.
graphs = list()
for dataset in DATASETS:
    stats_file = os.path.join(".", "datasets", dataset, "stats", "graph_stats.txt")
    # Skip datasets without a stats file, as the other cells do for missing folders.
    if not os.path.isfile(stats_file):
        continue

    curr_graph_stats = {"name": dataset}
    with open(stats_file, "r") as fp:
        for line in fp:
            # Strip the trailing newline, otherwise "cyclic" can never match.
            entry = line.strip()
            if entry.startswith("nodes"):
                curr_graph_stats["nodes"] = int(entry.split("\t")[1])
            elif entry.startswith("edges"):
                curr_graph_stats["edges"] = int(entry.split("\t")[1])
            elif entry.startswith("self-loops"):
                curr_graph_stats["self-loops"] = int(entry.split("\t")[1])
            elif entry in ("cyclic", "acyclic"):
                # Only these two words set the flag; blank or unrelated lines
                # no longer overwrite it with False.
                curr_graph_stats["cyclic"] = entry == "cyclic"

    graphs.append(curr_graph_stats)

graphs_df = pd.DataFrame(graphs)
graphs_df

# Q4: Parse comparison results

In [None]:
# One record per comparison file, taken from its "Reads mapped correctly" line.
jaccard_results = list()
for dataset in DATASETS:
    comparisons_folder = os.path.join(".", "datasets", dataset, "comparisons")
    if not os.path.exists(comparisons_folder):
        continue

    for name in os.listdir(comparisons_folder):
        name_without_ext = name[:-4]
        path, aligner = name_without_ext.split('_')

        comparison_file = os.path.join(comparisons_folder, name)
        if not os.path.exists(comparison_file):
            continue

        # Reset per file so a file without the summary line cannot re-append
        # the record parsed from the previous file.
        curr_result = None
        with open(comparison_file, "r") as fp:
            for line in fp:
                if line.lstrip().startswith("Reads mapped correctly"):
                    # Assumes "<label>: <n_mapped>/<total> <one more token>"
                    # -- TODO confirm against the comparison tool's output.
                    _, value = line.lstrip().split(":")
                    absolute, _ = value.lstrip().split(" ")
                    n_mapped, total_reads = absolute.split("/")
                    curr_result = {
                        'name': dataset,
                        'aligner': aligner,
                        'path': path,
                        'n_returned_alignments': int(total_reads),
                        'n_correct_alignments': int(n_mapped),
                    }
                    break

        if curr_result is not None:
            jaccard_results.append(curr_result)

jaccard_results_df = pd.DataFrame(jaccard_results)
group_by = jaccard_results_df.groupby(['name', 'path', 'aligner'])
group_by.first()

# Read analysis

In [None]:
# One record per read: every line starting with "Type" opens a five-line
# "<label>: <value>" record inside a comparison file.
read_results = list()
for dataset in DATASETS:
    comparisons_folder = os.path.join(".", "datasets", dataset, "comparisons")
    if not os.path.exists(comparisons_folder):
        continue

    for name in os.listdir(comparisons_folder):
        name_without_ext = name[:-4]
        # path is not stored in the record; the unpacking still checks that
        # the file name has the "<path>_<aligner>" shape.
        path, aligner = name_without_ext.split('_')

        comparison_file = os.path.join(comparisons_folder, name)
        if not os.path.exists(comparison_file):
            continue

        with open(comparison_file, "r") as fp:
            lines = [line.strip() for line in fp]

        for i, line in enumerate(lines):
            if not line.startswith("Type"):
                continue

            record_lines = lines[i:i + 5]
            if len(record_lines) < 5:
                # Truncated record at the end of the file: nothing to parse.
                break

            # Split on the first colon only, so values that contain ':'
            # themselves (e.g. read names) are kept whole.
            (is_correct,
             incorrect_aln_len,
             read_name,
             gaf_length,
             correct_length) = [entry.split(":", 1)[1].strip() for entry in record_lines]

            read_results.append({
                'name': dataset,
                'aligner': aligner,
                'read_name': read_name,
                'is_correct': is_correct,
                "incorrect_aln_len": incorrect_aln_len,
                "gaf_length": gaf_length,
                "correct_length": correct_length,
            })

read_results_df = pd.DataFrame(read_results)
read_results_df