In [22]:
import os
import glob
import gzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import metrics
from statsmodels.stats.multitest import multipletests

# Plot defaults for every figure in this notebook: "darkgrid" axes style,
# "notebook" scaling context, and the "deep" colour palette.
sns.set_style("darkgrid")
sns.set_context("notebook")
sns.set_palette("deep")


# SNV Benchmark

## Comparative analysis

### Sensitivity, Precision, and F1


In [23]:
# Sample sheet, read relative to the working directory. The collection loop
# later in this cell reads its "ont_id" and "lp_id" columns from every row.
sample_ids = pd.read_csv("sample_ids.csv")


def read_summary(file_path):
    """Parse the "None"-threshold row of a whitespace-delimited summary file.

    The first line whose stripped text starts with ``"None"`` is split on
    whitespace; fields 1-7 are converted into the counts and scores below
    (field 0 is the threshold label itself).
    Presumably the input is an RTG vcfeval ``summary.txt`` -- confirm against
    the pipeline that writes it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the summary file.

    Returns
    -------
    dict
        Keys ``True-pos-baseline``, ``True-pos-call``, ``False-pos``,
        ``False-neg`` (int) and ``Precision``, ``Sensitivity``, ``F-measure``
        (float). An empty dict is returned, after printing a message, when
        the file is missing, has no "None" row, or cannot be parsed.
    """
    # (output key, converter) in the order the fields appear after the label.
    field_parsers = (
        ("True-pos-baseline", int),
        ("True-pos-call", int),
        ("False-pos", int),
        ("False-neg", int),
        ("Precision", float),
        ("Sensitivity", float),
        ("F-measure", float),
    )

    try:
        with open(file_path, "r") as handle:
            rows = handle.readlines()

        for row in rows:
            if not row.strip().startswith("None"):
                continue
            fields = row.split()
            # Positions start at 1 because fields[0] is the "None" label.
            return {
                key: convert(fields[position])
                for position, (key, convert) in enumerate(field_parsers, start=1)
            }

        print(f"No 'None' threshold line found in {file_path}")
        return {}
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return {}
    except Exception as e:
        # Deliberately best-effort: a malformed file is reported and skipped
        # so that one bad sample does not abort the whole collection loop.
        print(f"Error reading file {file_path}: {str(e)}")
        return {}


def calculate_rtg_statistics(df):
    """Summarise the accuracy metrics of ``df`` per complexity group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``complexity`` column plus ``Precision``,
        ``Sensitivity`` and ``F-measure`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``complexity``; columns are a two-level index of
        (metric, statistic) with statistics mean, std, median, min and max.
    """
    summary_funcs = ("mean", "std", "median", "min", "max")
    metric_columns = ("Precision", "Sensitivity", "F-measure")

    # Every metric gets the same list of summary statistics.
    aggregation_plan = {column: list(summary_funcs) for column in metric_columns}

    grouped = df.groupby("complexity")
    return grouped.agg(aggregation_plan)


# One accumulator per (technology, complexity) pair; each holds one dict per
# sample whose summary file could be parsed.
ont_snv_hc_metrics = []
ont_snv_lc_metrics = []
illumina_snv_hc_metrics = []
illumina_snv_lc_metrics = []

complexities = ["hc", "lc"]
technologies = ["ont", "illumina"]

# Routing table: which accumulator receives a given (technology, complexity).
metrics_by_group = {
    ("ont", "hc"): ont_snv_hc_metrics,
    ("ont", "lc"): ont_snv_lc_metrics,
    ("illumina", "hc"): illumina_snv_hc_metrics,
    ("illumina", "lc"): illumina_snv_lc_metrics,
}

for row_idx, row in sample_ids.iterrows():
    ont_id, lp_id = row["ont_id"], row["lp_id"]

    for tech in technologies:
        # The summary path carries no technology component; the two
        # technologies are told apart only by their sample identifiers.
        sample_id = lp_id if tech != "ont" else ont_id

        for complexity in complexities:
            summary_file = (
                f"output/snv/rtg_vcfeval/{complexity}/{sample_id}.snv/summary.txt"
            )
            summary = read_summary(summary_file)

            if not summary:
                print(f"Skipping empty summary for {sample_id}, {tech}, {complexity}")
                continue

            metrics_by_group[(tech, complexity)].append(
                {"sample_id": sample_id, "complexity": complexity, **summary}
            )

# Stack the hc rows on top of the lc rows, one frame per technology.
ont_snv_metrics_df = pd.DataFrame([*ont_snv_hc_metrics, *ont_snv_lc_metrics])
illumina_snv_metrics_df = pd.DataFrame(
    [*illumina_snv_hc_metrics, *illumina_snv_lc_metrics]
)

ont_snv_stats = calculate_rtg_statistics(ont_snv_metrics_df)

ont_snv_stats


Unnamed: 0_level_0,Precision,Precision,Precision,Precision,Precision,Sensitivity,Sensitivity,Sensitivity,Sensitivity,Sensitivity,F-measure,F-measure,F-measure,F-measure,F-measure
Unnamed: 0_level_1,mean,std,median,min,max,mean,std,median,min,max,mean,std,median,min,max
complexity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
hc,0.9591,0.003764,0.9599,0.955,0.9624,0.965467,0.004704,0.9666,0.9603,0.9695,0.9623,0.004288,0.9633,0.9576,0.966
lc,0.789733,0.004941,0.789,0.7852,0.795,0.753633,0.005595,0.7514,0.7495,0.76,0.771233,0.00527,0.7697,0.7669,0.7771


In [24]:
# Per-complexity summary statistics for the Illumina metrics table; the bare
# name on the last line makes the notebook display the resulting frame.
illumina_snv_stats = calculate_rtg_statistics(illumina_snv_metrics_df)

illumina_snv_stats


Unnamed: 0_level_0,Precision,Precision,Precision,Precision,Precision,Sensitivity,Sensitivity,Sensitivity,Sensitivity,Sensitivity,F-measure,F-measure,F-measure,F-measure,F-measure
Unnamed: 0_level_1,mean,std,median,min,max,mean,std,median,min,max,mean,std,median,min,max
complexity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
hc,0.9655,0.000721,0.9653,0.9649,0.9663,0.972567,0.000586,0.9728,0.9719,0.973,0.969067,0.000153,0.9691,0.9689,0.9692
lc,0.8017,0.001572,0.8014,0.8003,0.8034,0.743833,0.002098,0.7431,0.7422,0.7462,0.771667,0.001401,0.7721,0.7701,0.7728


In [25]:
def perform_ttest(ont_data, illumina_data, metric, equal_var=True):
    """Compare one metric between the two technologies with a two-sample t-test.

    Parameters
    ----------
    ont_data, illumina_data : pandas.DataFrame (or any mapping of column -> values)
        Tables holding the per-sample values for each technology.
    metric : str
        Column to compare, e.g. ``"Precision"``.
    equal_var : bool, default True
        Forwarded to ``scipy.stats.ttest_ind``. ``True`` runs Student's
        t-test with a pooled variance (the behaviour this function always
        had); ``False`` runs Welch's t-test, which does not assume the two
        groups share a variance.

    Returns
    -------
    tuple
        ``(t_statistic, p_value)`` as returned by ``scipy.stats.ttest_ind``.

    Notes
    -----
    NOTE(review): the summary tables in this notebook show standard
    deviations that differ several-fold between technologies, so
    ``equal_var=False`` may be the more defensible choice -- confirm before
    changing the reported numbers.
    """
    ont_values = ont_data[metric]
    illumina_values = illumina_data[metric]
    t_stat, p_value = stats.ttest_ind(
        ont_values, illumina_values, equal_var=equal_var
    )
    return t_stat, p_value


metrics_to_test = ["Precision", "Sensitivity", "F-measure"]
test_result_fields = ["t_statistic", "p_value", "adjusted_p_value"]

# Both summary tables receive the same test columns. They are initialised
# with NaN rather than None so the new columns are float64 instead of object.
for stats_df in (ont_snv_stats, illumina_snv_stats):
    for metric in metrics_to_test:
        for field in test_result_fields:
            stats_df[(metric, field)] = np.nan

# tested_cells[i] is the (complexity, metric) cell that all_p_values[i]
# belongs to, so the corrected values can be written back without replaying
# the loops in the same order.
tested_cells = []
all_p_values = []

for complexity in complexities:
    ont_data = ont_snv_metrics_df[ont_snv_metrics_df["complexity"] == complexity]
    illumina_data = illumina_snv_metrics_df[
        illumina_snv_metrics_df["complexity"] == complexity
    ]

    for metric in metrics_to_test:
        t_stat, p_value = perform_ttest(ont_data, illumina_data, metric)

        # The comparison is symmetric, so both tables carry identical values.
        for stats_df in (ont_snv_stats, illumina_snv_stats):
            stats_df.loc[complexity, (metric, "t_statistic")] = t_stat
            stats_df.loc[complexity, (metric, "p_value")] = p_value

        tested_cells.append((complexity, metric))
        all_p_values.append(p_value)

# Benjamini-Hochberg FDR correction across every (complexity, metric) test.
# Element 1 of the returned tuple holds the corrected p-values; indexing it
# avoids rebinding `_`, which IPython uses for the last cell output.
adjusted_p_values = multipletests(all_p_values, method="fdr_bh")[1]

for (complexity, metric), adjusted_p_value in zip(tested_cells, adjusted_p_values):
    for stats_df in (ont_snv_stats, illumina_snv_stats):
        stats_df.loc[complexity, (metric, "adjusted_p_value")] = adjusted_p_value

ont_snv_stats


Unnamed: 0_level_0,Precision,Precision,Precision,Precision,Precision,Sensitivity,Sensitivity,Sensitivity,Sensitivity,Sensitivity,...,F-measure,Precision,Precision,Precision,Sensitivity,Sensitivity,Sensitivity,F-measure,F-measure,F-measure
Unnamed: 0_level_1,mean,std,median,min,max,mean,std,median,min,max,...,max,t_statistic,p_value,adjusted_p_value,t_statistic,p_value,adjusted_p_value,t_statistic,p_value,adjusted_p_value
complexity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
hc,0.9591,0.003764,0.9599,0.955,0.9624,0.965467,0.004704,0.9666,0.9603,0.9695,...,0.966,-2.892209,0.044464,0.072478,-2.594476,0.060398,0.072478,-2.731299,0.052373,0.072478
lc,0.789733,0.004941,0.789,0.7852,0.795,0.753633,0.005595,0.7514,0.7495,0.76,...,0.7771,-3.997535,0.016163,0.072478,2.840613,0.046839,0.072478,-0.137637,0.897177,0.897177


In [26]:
# Display the Illumina summary table, which now also carries the t_statistic,
# p_value and adjusted_p_value columns written by the previous cell.
illumina_snv_stats


Unnamed: 0_level_0,Precision,Precision,Precision,Precision,Precision,Sensitivity,Sensitivity,Sensitivity,Sensitivity,Sensitivity,...,F-measure,Precision,Precision,Precision,Sensitivity,Sensitivity,Sensitivity,F-measure,F-measure,F-measure
Unnamed: 0_level_1,mean,std,median,min,max,mean,std,median,min,max,...,max,t_statistic,p_value,adjusted_p_value,t_statistic,p_value,adjusted_p_value,t_statistic,p_value,adjusted_p_value
complexity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
hc,0.9655,0.000721,0.9653,0.9649,0.9663,0.972567,0.000586,0.9728,0.9719,0.973,...,0.9692,-2.892209,0.044464,0.072478,-2.594476,0.060398,0.072478,-2.731299,0.052373,0.072478
lc,0.8017,0.001572,0.8014,0.8003,0.8034,0.743833,0.002098,0.7431,0.7422,0.7462,...,0.7728,-3.997535,0.016163,0.072478,2.840613,0.046839,0.072478,-0.137637,0.897177,0.897177


### AUC
