In [1]:
import pandas as pd
import json
from glob import glob
from collections import Counter
import os
import numpy as np
from sklearn.metrics import f1_score
import scanpy as sc

In [17]:
# Load the Solo result file for the "imgExtra" data (noMissing variant);
# the next cell reads the per-setting metrics stored in its `.uns` mapping.
adata = sc.read_h5ad("expression/results/solo/solo_imgExtra_noMissing.h5ad")

In [18]:
# Collect the F1 score from every `.uns` entry that carries one.
[entry["f1_score"] for entry in adata.uns.values() if "f1_score" in entry]

[0.35602094240837695,
 0.3408521303258145,
 0.2765957446808511,
 0.3557951482479784,
 0.33658536585365856,
 0.3484848484848485,
 0.3486682808716707,
 0.34549878345498786,
 0.3652173913043478]

In [2]:
# Inter-labeler agreement: load both annotators' targets and pair them by image.
labeler1_target = pd.read_csv("crop_target/targets.csv")
labeler2_target = pd.read_csv("crop_target/xinya_targets.csv")
# Inner join on image_id; the shared columns receive the default _x / _y
# suffixes for labeler 1 / labeler 2 respectively.
combine = labeler1_target.merge(labeler2_target, on="image_id")

In [3]:
# Display the merged table (labeler-1 columns end in _x, labeler-2 columns in _y).
combine

Unnamed: 0,image_id,cell_num_x,class_x,difficult_x,cell_num_y,class_y,difficult_y
0,Image10_25_15,1,Singlet,False,1,Singlet,False
1,Image10_9_18,1,Singlet,False,1,Singlet,False
2,Image10_4_3,1,Singlet,False,1,Singlet,False
3,Image10_32_12,1,Singlet,False,1,Singlet,False
4,Image10_23_4,0,Missing,False,1,Singlet,False
...,...,...,...,...,...,...,...
5293,Image5_11_10,1,Singlet,False,1,Singlet,False
5294,Image5_24_20,0,Missing,False,0,Missing,False
5295,Image5_39_11,0,Missing,False,0,Missing,False
5296,Image5_7_8,1,Singlet,False,1,Singlet,False


In [4]:
# Number of merged rows relative to the number of rows labeler 2 annotated.
len(combine) / len(labeler2_target)

0.7727537922987164

In [5]:
# Fraction of merged rows on which both labelers recorded the same cell count.
combine["cell_num_x"].eq(combine["cell_num_y"]).mean()

0.9363910909777274

In [6]:
# show that model has enough samples to learn different classes
def count_classes(split_file):
    """Tally the entries of a split file by how many fields each line has.

    Every line is stripped and split on whitespace. A line with exactly one
    field is counted as ``missing``; a line with more than one field is
    counted as ``cells``. Blank lines are ignored.

    NOTE: a later cell of this notebook defines another ``count_classes`` with
    a five-element return value, which shadows this one once it has run.

    Args:
        split_file: path to the text file to read.

    Returns:
        Tuple ``(missing, cells)`` of line counts.
    """
    with open(split_file) as handle:
        field_counts = [len(row.strip().split()) for row in handle.readlines()]

    missing = sum(1 for n_fields in field_counts if n_fields == 1)
    cells = sum(1 for n_fields in field_counts if n_fields > 1)
    return missing, cells

# Tally missing / cell training samples for every split file and write one CSV
# per labeler and per experiment (LOOCV folds, cross-resolution split).
# Example split path: Faster-RCNN-Ensemble/train_val_split/loocv/Image1/2007_train_5.txt
def _tabulate_split_counts(pattern, with_fold):
    """Build a table of (missing, cells) counts for all split files matching `pattern`.

    Args:
        pattern: glob pattern of the ``2007_train_<model>.txt`` files.
        with_fold: when True, the parent directory name of each file is
            recorded as a leading ``fold`` column.

    Returns:
        DataFrame with columns ``[fold,] model, missing, cells``.
    """
    rows = []
    # glob() yields paths in arbitrary order; sort so the CSV rows are reproducible.
    for split_file in sorted(glob(pattern)):
        # The model number is the trailing "_<n>" of the file stem.
        model = int(os.path.basename(split_file).split(".")[0].split("_")[-1])
        # Relies on the two-value count_classes defined in this cell's notebook section.
        missing, cells = count_classes(split_file)
        row = [model, missing, cells]
        if with_fold:
            row.insert(0, str(split_file.split("/")[-2]))
        rows.append(row)
    columns = (["fold"] if with_fold else []) + ["model", "missing", "cells"]
    return pd.DataFrame(rows, columns=columns)


# (glob pattern, record the fold directory?, output CSV)
split_count_jobs = [
    ("Faster-RCNN-Ensemble/train_val_split/loocv/Image*/2007_train_*.txt", True,
     "./Figure_revision/supp_count_loocv_classes.csv"),
    ("Faster-RCNN-Ensemble-Xinya/train_val_split/loocv/Image*/2007_train_*.txt", True,
     "./Figure_revision/supp_count_loocv_classes_labeler2.csv"),
    ("Faster-RCNN-Ensemble/train_val_split/for_expression/2007_train_*.txt", False,
     "./Figure_revision/supp_count_resolution_classes.csv"),
    ("Faster-RCNN-Ensemble-Xinya/train_val_split/for_expression/2007_train_*.txt", False,
     "./Figure_revision/supp_count_resolution_classes_labeler2.csv"),
]
for pattern, with_fold, out_csv in split_count_jobs:
    records = _tabulate_split_counts(pattern, with_fold)
    records.to_csv(out_csv, index=False)

In [27]:
def count_classes(split_file):
    """Summarise a split file: image sets present and per-class line counts.

    Each non-blank line is split on whitespace. The first field is treated as
    a path; the part of its basename before the first ``_`` is recorded as the
    image set. The number of fields decides the class of the line:

    * 1 field  -> ``missing``
    * 2 fields -> ``cells`` and ``singlet``
    * 3+ fields -> ``cells`` and ``doublet``

    Blank lines are skipped (indexing the first field of an empty line would
    otherwise raise ``IndexError``).

    Args:
        split_file: path to the text file to read.

    Returns:
        Tuple ``(image_sets, missing, cells, singlet, doublet)`` where
        ``image_sets`` is a ``", "``-joined string of the distinct image-set
        names in a deterministic order (shorter names first, then
        alphabetical, so ``Image2`` precedes ``Image10``).
    """
    missing, cells, singlet, doublet = 0, 0, 0, 0
    image_sets = set()
    with open(split_file) as f:
        for line in f:
            contents = line.strip().split()
            if not contents:
                continue
            image_sets.add(contents[0].split("/")[-1].split("_")[0])
            num_cols = len(contents)
            if num_cols == 1:
                missing += 1
            elif num_cols == 2:
                cells += 1
                singlet += 1
            else:
                cells += 1
                doublet += 1

    # Set iteration order varies between interpreter runs; impose a stable order
    # so the exported summary is reproducible.
    ordered_sets = sorted(image_sets, key=lambda name: (len(name), name))
    return ", ".join(ordered_sets), missing, cells, singlet, doublet

def count_test_classes(target, fold):
    """Count test-set rows per class for one fold of the annotation table.

    Args:
        target: DataFrame with ``image_id``, ``cell_num`` and ``difficult`` columns.
        fold: ``"low_resolution"`` selects ids containing ``Image5_`` or
            ``Image11_``; a value starting with ``"Image"`` selects ids
            containing ``fold + "_"``. In both cases rows flagged difficult
            are dropped. Any other value leaves ``target`` unfiltered.

    Returns:
        Tuple ``(missing, cells, singlet, doublet)``: rows with ``cell_num``
        equal to 0, greater than 0, equal to 1, and greater than 1.
    """
    if fold == "low_resolution":
        id_fragments = ["Image5_", "Image11_"]
    elif fold.startswith("Image"):
        id_fragments = [fold + "_"]
    else:
        id_fragments = None

    if id_fragments is not None:
        in_fold = target.image_id.str.contains(id_fragments[0])
        for fragment in id_fragments[1:]:
            in_fold = in_fold | target.image_id.str.contains(fragment)
        target = target[in_fold & (target.difficult == False)]

    n_cells = target.cell_num
    return (
        int((n_cells == 0).sum()),
        int((n_cells > 0).sum()),
        int((n_cells == 1).sum()),
        int((n_cells > 1).sum()),
    )

# Train / val / test class counts for every labeler-1 model, written as one TSV.
# Example split path: Faster-RCNN-Ensemble/train_val_split/loocv/Image1/2007_train_5.txt
summary_columns = ["fold", "model"]
for split_name in ("train", "val"):
    summary_columns += [f"{split_name}_image_sets", f"{split_name}_missing", f"{split_name}_cells",
                        f"{split_name}_singlet", f"{split_name}_doublet"]
summary_columns += ["test_missing", "test_cells", "test_singlet", "test_doublet"]

# (glob pattern, fixed fold name or None to take the fold from the parent directory)
split_sources = [
    ("Faster-RCNN-Ensemble/train_val_split/loocv/Image*/2007_train_*.txt", None),
    ("Faster-RCNN-Ensemble/train_val_split/for_expression/2007_train_*.txt", "low_resolution"),
]

records = []
for pattern, fixed_fold in split_sources:
    for train_file in glob(pattern):
        # The validation split sits next to the training split under the "2007_val" name.
        val_file = train_file.replace("2007_train", "2007_val")
        fold = fixed_fold if fixed_fold is not None else str(train_file.split("/")[-2])
        model = int(os.path.basename(train_file).split(".")[0].split("_")[-1])
        train_stats = count_classes(train_file)
        val_stats = count_classes(val_file)
        test_stats = count_test_classes(labeler1_target, fold)
        records.append([fold, model, *train_stats, *val_stats, *test_stats])

records = pd.DataFrame(records, columns=summary_columns)
records.to_csv("./Figure_revision/supp_summary_cv_labeler1.tsv", index=False, sep="\t")

# Train / val / test class counts for every labeler-2 model, written as one TSV.
summary_columns = ["fold", "model",
                   "train_image_sets", "train_missing", "train_cells", "train_singlet", "train_doublet",
                   "val_image_sets", "val_missing", "val_cells", "val_singlet", "val_doublet",
                   "test_missing", "test_cells", "test_singlet", "test_doublet"]

# (glob pattern, fixed fold name or None to take the fold from the parent directory)
split_sources = [
    ("Faster-RCNN-Ensemble-Xinya/train_val_split/loocv/Image*/2007_train_*.txt", None),
    ("Faster-RCNN-Ensemble-Xinya/train_val_split/for_expression/2007_train_*.txt", "low_resolution"),
]

records = []
for pattern, fixed_fold in split_sources:
    for train_file in glob(pattern):
        # The validation split sits next to the training split under the "2007_val" name.
        val_file = train_file.replace("2007_train", "2007_val")
        fold = fixed_fold if fixed_fold is not None else str(train_file.split("/")[-2])
        model = int(os.path.basename(train_file).split(".")[0].split("_")[-1])
        train_stats = count_classes(train_file)
        val_stats = count_classes(val_file)
        # Test counts must come from labeler 2's annotations: the splits and the
        # output file are labeler 2's. The earlier code passed labeler1_target
        # here, which duplicated the labeler-1 test columns.
        test_stats = count_test_classes(labeler2_target, fold)
        records.append([fold, model, *train_stats, *val_stats, *test_stats])

records = pd.DataFrame(records, columns=summary_columns)
records.to_csv("./Figure_revision/supp_summary_cv_labeler2.tsv", index=False, sep="\t")

In [4]:
def get_loocv(model_folder):
    """Gather the LOOCV performance tables stored under ``./<model_folder>/map_out/loocv``.

    Two kinds of files are read:

    * ``*/performance*.json`` - one file per held-out image directory. Each
      top-level key is a threshold mapping to ``accuracy``,
      ``balanced_accuracy``, ``weighted_f1`` and ``avg_f1``. The directory
      name becomes ``testset``.
    * ``avg_scores*.tsv`` - header-less tables with the same five metric
      columns; their rows get ``testset == "average"``.

    ``model_id`` is ``"ensemble"`` unless the file stem contains ``"model"``,
    in which case it is the last ``-``-separated (JSON) or ``_``-separated
    (TSV) token of the stem.

    Args:
        model_folder: folder name, also stored in the ``model`` column.

    Returns:
        DataFrame with columns thresh, acc, balance_acc, weight_f1, avg_f1,
        model, cv_type, testset, model_id.
    """
    metric_names = ["thresh", "acc", "balance_acc", "weight_f1", "avg_f1"]
    columns = metric_names + ["model", "cv_type", "testset", "model_id"]

    rows = []
    for json_file in glob(f"./{model_folder}/map_out/loocv/*/performance*.json"):
        image_id = json_file.split("/")[-2]
        stem = os.path.splitext(os.path.basename(json_file))[0]
        model_id = stem.split("-")[-1] if "model" in stem else "ensemble"
        with open(json_file) as f:
            per_thresh = json.load(f)
        rows.extend(
            [float(thresh), scores["accuracy"], scores["balanced_accuracy"],
             scores["weighted_f1"], scores["avg_f1"],
             model_folder, "loocv", image_id, model_id]
            for thresh, scores in per_thresh.items()
        )
    loocv_performances = pd.DataFrame(rows, columns=columns)

    for tsv_file in glob(f"./{model_folder}/map_out/loocv/avg_scores*.tsv"):
        stem = os.path.splitext(os.path.basename(tsv_file))[0]
        model_id = stem.split("_")[-1] if "model" in stem else "ensemble"
        loocv_overall = pd.read_table(tsv_file, sep="\t", header=None, names=metric_names).assign(
            model=model_folder,
            cv_type="loocv",
            testset="average",
            model_id=model_id,
        )
        loocv_performances = pd.concat([loocv_performances, loocv_overall], ignore_index=True)

    return loocv_performances

def get_cross_resolution(model_folder):
    """Gather the cross-resolution performance tables of one model folder.

    Reads every header-less ``performance*.tsv`` under
    ``./<model_folder>/map_out/for_expression`` (columns thresh, acc,
    balance_acc, weight_f1, avg_f1) and tags its rows with the model folder,
    ``cv_type == "cross-resolution"``, ``testset == "low-resolution"`` and a
    ``model_id``. ``model_id`` is ``"ensemble"`` unless the file stem contains
    ``"model"``, in which case it is the last ``-``-separated token of the stem.

    Args:
        model_folder: folder name, also stored in the ``model`` column.

    Returns:
        The per-file tables stacked in glob order with their original row
        indices kept; an empty DataFrame when no file matches.
    """
    glob_path = f"./{model_folder}/map_out/for_expression/performance*.tsv"

    # Collect the per-file frames and concatenate once: growing a frame by
    # concatenating onto an empty DataFrame in a loop re-copies earlier rows
    # and relies on pandas' deprecated handling of empty entries in concat.
    frames = []
    for tsv_file in glob(glob_path):
        base_name = os.path.splitext(os.path.basename(tsv_file))[0]
        model_id = "ensemble" if "model" not in base_name else base_name.split("-")[-1]
        performances = pd.read_table(
            tsv_file, sep="\t", header=None,
            names=["thresh", "acc", "balance_acc", "weight_f1", "avg_f1"])
        performances["model"] = model_folder
        performances["cv_type"] = "cross-resolution"
        performances["testset"] = "low-resolution"
        performances["model_id"] = model_id
        frames.append(performances)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

# Export the performances of the single models and of the ensemble model.
# LOOCV results (per held-out image set plus the "average" rows):
records = get_loocv("Faster-RCNN-Ensemble")
records.to_csv("./Figure_revision/supp_loocv_singleVSensemble.csv", index=False)

# Cross-resolution results (testset is "low-resolution"):
records = get_cross_resolution("Faster-RCNN-Ensemble")
records.to_csv("./Figure_revision/supp_resolution_singleVSensemble.csv", index=False)

In [5]:
# Per-epoch train / val losses of the labeler-1 models (LOOCV and cross-resolution runs).
# glob() returns paths in arbitrary order and model_id below is derived from row
# order, so every listing is sorted: runs are then numbered by loss-directory name,
# identically for the train and val files and across re-executions.
train_losses_file_loocv = sorted(glob("Faster-RCNN-Ensemble/logs/loocv/Image*/loss_2023_12_*/epoch_loss_*.txt"))
val_losses_file_loocv = sorted(glob("Faster-RCNN-Ensemble/logs/loocv/Image*/loss_2023_12_*/epoch_val_loss_*.txt"))
train_losses_file_reso = sorted(glob("Faster-RCNN-Ensemble/logs/for_expression/loss_2023_11_*/epoch_loss_*.txt"))
val_losses_file_reso = sorted(glob("Faster-RCNN-Ensemble/logs/for_expression/loss_2023_11_*/epoch_val_loss_*.txt"))

# (files, cv_type, loss_type)
loss_sources = [
    (train_losses_file_loocv, "loocv", "train"),
    (val_losses_file_loocv, "loocv", "val"),
    (train_losses_file_reso, "cross-resolution", "train"),
    (val_losses_file_reso, "cross-resolution", "val"),
]

records = []
for files, cv_type, loss_type in loss_sources:
    for file in files:
        parts = file.split("/")
        if cv_type == "loocv":
            # <model>/logs/loocv/<fold>/<loss_id>/<file>
            fold, loss_id = parts[3], parts[4]
        else:
            # <model>/logs/for_expression/<loss_id>/<file>
            fold, loss_id = "low_resolution", parts[3]
        with open(file) as f:
            # One line per epoch; the first whitespace-separated token is the loss.
            for i, line in enumerate(f):
                loss = float(line.strip().split()[0])
                records.append([fold, loss_id, cv_type, loss_type, i, loss])

records = pd.DataFrame(records, columns=["fold", "loss_id", "cv_type", "loss_type", "epoch", "loss"])

# Number the runs 1..k inside each (fold, epoch, loss_type) group in row order.
records["model_id"] = records.groupby(["fold", "epoch", "loss_type"]).cumcount() + 1

records.to_csv("./Figure_revision/supp_loss_info_labeler1.csv", index=False)

In [6]:
# Per-epoch train / val losses of the labeler-2 models (LOOCV runs only).
# glob() returns paths in arbitrary order and model_id below is derived from row
# order, so both listings are sorted: runs are then numbered by loss-directory
# name, identically for the train and val files and across re-executions.
train_losses_file_loocv = sorted(glob("Faster-RCNN-Ensemble-Xinya/logs/loocv/Image*/loss_2024_01_1*/epoch_loss_*.txt"))
val_losses_file_loocv = sorted(glob("Faster-RCNN-Ensemble-Xinya/logs/loocv/Image*/loss_2024_01_1*/epoch_val_loss_*.txt"))
# train_losses_file_reso = glob("Faster-RCNN-Ensemble-Xinya/logs/for_expression/*/epoch_loss_*.txt")
# val_losses_file_reso = glob("Faster-RCNN-Ensemble-Xinya/logs/for_expression/*/epoch_val_loss_*.txt")

records = []
for files, loss_type in [(train_losses_file_loocv, "train"), (val_losses_file_loocv, "val")]:
    cv_type = "loocv"
    for file in files:
        # <model>/logs/loocv/<fold>/<loss_id>/<file>
        parts = file.split("/")
        fold, loss_id = parts[3], parts[4]
        with open(file) as f:
            # One line per epoch; the first whitespace-separated token is the loss.
            for i, line in enumerate(f):
                loss = float(line.strip().split()[0])
                records.append([fold, loss_id, cv_type, loss_type, i, loss])

records = pd.DataFrame(records, columns=["fold", "loss_id", "cv_type", "loss_type", "epoch", "loss"])

# Number the runs 1..k inside each (fold, epoch, loss_type) group in row order.
records["model_id"] = records.groupby(["fold", "epoch", "loss_type"]).cumcount() + 1

records.to_csv("./Figure_revision/supp_loss_info_labeler2.csv", index=False)

In [7]:
# Export the LOOCV performance of the "Faster-RCNN-noAug" model folder ...
records = get_loocv("Faster-RCNN-noAug")
records.to_csv("./Figure_revision/supp_loocv_noAug.csv", index=False)

# ... and of the "Faster-RCNN" folder, saved under the "wAug" name.
records = get_loocv("Faster-RCNN")
records.to_csv("./Figure_revision/supp_loocv_wAug.csv", index=False)

In [10]:
# Toy example: rank class labels by descending frequency, breaking ties with a
# fixed class priority (Doublet before Singlet before Missing).
data = ["Singlet", "Singlet", "Doublet", "Doublet", "Missing"]
counter = Counter(data)
class_order = {"Doublet": 0, "Singlet": 1, "Missing": 2}
# Two stable passes: order by class priority first, then by count (descending).
# A stable sort keeps the priority order among labels with equal counts.
by_priority = sorted(counter.items(), key=lambda pair: class_order[pair[0]])
sorted_items = sorted(by_priority, key=lambda pair: pair[1], reverse=True)

In [9]:
# Display the LOOCV table of the "Faster-RCNN-noAug" folder for inspection.
get_loocv("Faster-RCNN-noAug")

Unnamed: 0,thresh,acc,balance_acc,weight_f1,avg_f1,model,cv_type,testset,model_id
0,0.3,0.901114,0.910930,0.912522,0.901114,Faster-RCNN-noAug,loocv,Image10,ensemble
1,0.4,0.912256,0.904280,0.921030,0.912256,Faster-RCNN-noAug,loocv,Image10,ensemble
2,0.5,0.916435,0.893184,0.924276,0.916435,Faster-RCNN-noAug,loocv,Image10,ensemble
3,0.6,0.923398,0.883079,0.929253,0.923398,Faster-RCNN-noAug,loocv,Image10,ensemble
4,0.7,0.924791,0.853631,0.928787,0.924791,Faster-RCNN-noAug,loocv,Image10,ensemble
...,...,...,...,...,...,...,...,...,...
61,0.4,0.931548,0.910976,0.935748,0.931548,Faster-RCNN-noAug,loocv,average,ensemble
62,0.5,0.937502,0.909570,0.940199,0.937502,Faster-RCNN-noAug,loocv,average,ensemble
63,0.6,0.941130,0.909653,0.942723,0.941130,Faster-RCNN-noAug,loocv,average,ensemble
64,0.7,0.943117,0.897548,0.943203,0.943117,Faster-RCNN-noAug,loocv,average,ensemble


In [2]:
def count_cells(detection_result_file, thresh=0.7):
    """Count the detections in a result file whose score exceeds `thresh`.

    Each non-blank line is split on whitespace and its second field is read as
    the detection's confidence score.

    Args:
        detection_result_file: path to the detection result text file.
        thresh: a detection is kept only when its score is strictly greater
            than this value.

    Returns:
        Tuple ``(n_valid_cell, scores)``: the number of kept detections and
        their scores as a comma-joined string in file order (the score text is
        copied verbatim), or ``np.nan`` when nothing is kept.
    """
    # Use a context manager so the handle is closed deterministically; this
    # function is called once per crop, model and threshold.
    with open(detection_result_file, "r") as res_file:
        detected_cells = [line.strip().split() for line in res_file if line.strip()]

    kept_scores = [str(cell[1]) for cell in detected_cells if float(cell[1]) > thresh]
    n_valid_cell = len(kept_scores)
    avg_conf_score = np.nan if n_valid_cell == 0 else ",".join(kept_scores)
    return n_valid_cell, avg_conf_score

In [3]:
# Per-crop detection counts and confidence scores of the five
# Faster-RCNN-Ensemble LOOCV models, for every non-difficult crop in
# crop_target/targets.csv and several confidence thresholds.
image_sets = ["Image" + str(i) for i in [1,2,3,5,6,7,8,9,10,11]]
N = len(image_sets)

# The annotation table does not depend on the loop variables, so read it once.
all_targets = pd.read_csv("crop_target/targets.csv")
all_targets["image_set"] = all_targets["image_id"].apply(lambda x: x.split("_")[0])

# Collect one frame per (image_set, thresh, model) and concatenate once at the
# end; concatenating inside the loop re-copies all earlier rows every time.
frames = []
for image_set in image_sets:
    targets = all_targets[(all_targets.image_set == image_set) & ~all_targets.difficult]

    for thresh in [0, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
        for model in ["model1", "model2", "model3", "model4", "model5"]:
            targets_copy = targets.copy()
            targets_copy["model"] = model
            targets_copy["thresh"] = thresh

            # count_cells returns (number of kept detections, joined scores or NaN).
            detections = [
                count_cells(f"Faster-RCNN-Ensemble/map_out/loocv/{image_set}/detection-results-{model}/{image_id}.txt", thresh)
                for image_id in targets_copy["image_id"]
            ]
            targets_copy["num_cells"] = [n_cells for n_cells, _ in detections]
            targets_copy["score_cells"] = [scores for _, scores in detections]
            frames.append(targets_copy)

records = pd.concat(frames)
records.to_csv("./Figure_revision/supp_loocv_nCell_confScores_wThresh_labeler1.csv", index=False)

In [4]:
# Per-crop detection counts and confidence scores of the five
# Faster-RCNN-Xinya2Kaiwen LOOCV models, for every non-difficult crop in
# crop_target/targets.csv and several confidence thresholds.
image_sets = ["Image" + str(i) for i in [1,2,3,5,6,7,8,9,10,11]]
N = len(image_sets)

# The annotation table does not depend on the loop variables, so read it once.
all_targets = pd.read_csv("crop_target/targets.csv")
all_targets["image_set"] = all_targets["image_id"].apply(lambda x: x.split("_")[0])

# Collect one frame per (image_set, thresh, model) and concatenate once at the
# end; concatenating inside the loop re-copies all earlier rows every time.
frames = []
for image_set in image_sets:
    targets = all_targets[(all_targets.image_set == image_set) & ~all_targets.difficult]

    for thresh in [0, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
        for model in ["model1", "model2", "model3", "model4", "model5"]:
            targets_copy = targets.copy()
            targets_copy["model"] = model
            targets_copy["thresh"] = thresh

            # count_cells returns (number of kept detections, joined scores or NaN).
            detections = [
                count_cells(f"Faster-RCNN-Xinya2Kaiwen/map_out/loocv/{image_set}/detection-results-{model}/{image_id}.txt", thresh)
                for image_id in targets_copy["image_id"]
            ]
            targets_copy["num_cells"] = [n_cells for n_cells, _ in detections]
            targets_copy["score_cells"] = [scores for _, scores in detections]
            frames.append(targets_copy)

records = pd.concat(frames)
records.to_csv("./Figure_revision/supp_loocv_nCell_confScores_wThresh_labeler2to1.csv", index=False)

In [5]:
# Per-crop detection counts and confidence scores of the five
# Faster-RCNN-Ensemble-Xinya LOOCV models, for every non-difficult crop in
# crop_target/xinya_targets.csv and several confidence thresholds.
# (Image6 is not part of this image-set list.)
image_sets = ["Image" + str(i) for i in [1,2,3,5,7,8,9,10,11]]
N = len(image_sets)

# The annotation table does not depend on the loop variables, so read it once.
all_targets = pd.read_csv("crop_target/xinya_targets.csv")
all_targets["image_set"] = all_targets["image_id"].apply(lambda x: x.split("_")[0])

# Collect one frame per (image_set, thresh, model) and concatenate once at the
# end; concatenating inside the loop re-copies all earlier rows every time.
frames = []
for image_set in image_sets:
    targets = all_targets[(all_targets.image_set == image_set) & ~all_targets.difficult]

    for thresh in [0, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
        for model in ["model1", "model2", "model3", "model4", "model5"]:
            targets_copy = targets.copy()
            targets_copy["model"] = model
            targets_copy["thresh"] = thresh

            # count_cells returns (number of kept detections, joined scores or NaN).
            detections = [
                count_cells(f"Faster-RCNN-Ensemble-Xinya/map_out/loocv/{image_set}/detection-results-{model}/{image_id}.txt", thresh)
                for image_id in targets_copy["image_id"]
            ]
            targets_copy["num_cells"] = [n_cells for n_cells, _ in detections]
            targets_copy["score_cells"] = [scores for _, scores in detections]
            frames.append(targets_copy)

records = pd.concat(frames)
records.to_csv("./Figure_revision/supp_loocv_nCell_confScores_wThresh_labeler2.csv", index=False)

In [6]:
# Per-crop detection counts and confidence scores of the five
# Faster-RCNN-Kaiwen2Xinya LOOCV models, for every non-difficult crop in
# crop_target/xinya_targets.csv and several confidence thresholds.
# (Image6 is not part of this image-set list.)
image_sets = ["Image" + str(i) for i in [1,2,3,5,7,8,9,10,11]]
N = len(image_sets)

# The annotation table does not depend on the loop variables, so read it once.
all_targets = pd.read_csv("crop_target/xinya_targets.csv")
all_targets["image_set"] = all_targets["image_id"].apply(lambda x: x.split("_")[0])

# Collect one frame per (image_set, thresh, model) and concatenate once at the
# end; concatenating inside the loop re-copies all earlier rows every time.
frames = []
for image_set in image_sets:
    targets = all_targets[(all_targets.image_set == image_set) & ~all_targets.difficult]

    for thresh in [0, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
        for model in ["model1", "model2", "model3", "model4", "model5"]:
            targets_copy = targets.copy()
            targets_copy["model"] = model
            targets_copy["thresh"] = thresh

            # count_cells returns (number of kept detections, joined scores or NaN).
            detections = [
                count_cells(f"Faster-RCNN-Kaiwen2Xinya/map_out/loocv/{image_set}/detection-results-{model}/{image_id}.txt", thresh)
                for image_id in targets_copy["image_id"]
            ]
            targets_copy["num_cells"] = [n_cells for n_cells, _ in detections]
            targets_copy["score_cells"] = [scores for _, scores in detections]
            frames.append(targets_copy)

records = pd.concat(frames)
records.to_csv("./Figure_revision/supp_loocv_nCell_confScores_wThresh_labeler1to2.csv", index=False)

In [2]:
import scanpy as sc

In [16]:
# Collect the F1 score of every setting stored in the `.uns` mapping of the
# img5 result files. Rows are appended to `res_methods` in the order listed.
res_methods = {
    "method": [],
    "settings": [],
    "is_rmEmpty": [],
    "f1_score": [],
}

# (method label, result file, is_rmEmpty, keep only dict-valued `.uns` entries?)
# The dict-only filter is applied to Scrublet and Solo but not to
# DoubletDetection and SoCube, exactly as in the earlier per-method loops.
h5ad_sources = [
    ("DoubletDetection", "./expression/results/DoubletDetection/doubletdetection_img5_noMissing.h5ad", True, False),
    ("DoubletDetection", "./expression/results/DoubletDetection/doubletdetection_img5.h5ad", False, False),
    ("Scrublet", "./expression/results/scrublet/scrublet_img5.h5ad", False, True),
    ("Scrublet", "./expression/results/scrublet/scrublet_img5_noMissing.h5ad", True, True),
    ("Solo", "./expression/results/solo/solo_img5.h5ad", False, True),
    ("Solo", "./expression/results/solo/solo_img5_noMissing.h5ad", True, True),
    ("SoCube", "./expression/results/socube/socube_img5.h5ad", False, False),
    ("SoCube", "./expression/results/socube/socube_img5_noMissing.h5ad", True, False),
]

for method, h5ad_path, is_rm_empty, dict_only in h5ad_sources:
    res_adata = sc.read_h5ad(h5ad_path)
    for k, v in res_adata.uns.items():
        # The "hvg" entry is not a scored setting.
        if k == "hvg" or (dict_only and not isinstance(v, dict)):
            continue
        res_methods["method"].append(method)
        res_methods["settings"].append(k)
        res_methods["is_rmEmpty"].append(is_rm_empty)
        res_methods["f1_score"].append(v["f1_score"])

In [17]:
# Append the F1 scores kept in CSV score tables (rows whose index mentions
# "img5") to `res_methods`. `is_rmEmpty` is True when the index mentions "noMissing".
# NOTE: this cell appends to the dict built in the previous cell, so running it
# twice duplicates its rows.
res_scDblFinder = pd.read_csv("./expression/results/scDblFinder/scDblFinder/scores.csv", index_col=0)
for run_name, scores in res_scDblFinder.iterrows():
    if "img5" not in run_name:
        continue
    res_methods["method"].append("scDblFinder")
    res_methods["settings"].append(run_name)
    res_methods["is_rmEmpty"].append("noMissing" in run_name)
    res_methods["f1_score"].append(scores["f1"])

res_scds = pd.read_csv("./expression/results/scds/scds/scores.csv", index_col=0)
for run_name, scores in res_scds.iterrows():
    if "img5" not in run_name:
        continue
    res_methods["method"].append("scds")
    # For scds the setting label is "<method>_<thresh>" taken from the row itself.
    res_methods["settings"].append(scores["method"] + "_" + str(scores["thresh"]))
    res_methods["is_rmEmpty"].append("noMissing" in run_name)
    res_methods["f1_score"].append(scores["f1"])

res_doubletfinder = pd.read_csv("./expression/results/DoubletFinder/DoubletFinder/scores.csv", index_col=0)
for run_name, scores in res_doubletfinder.iterrows():
    if "img5" not in run_name:
        continue
    res_methods["method"].append("DoubletFinder")
    res_methods["settings"].append(run_name)
    res_methods["is_rmEmpty"].append("noMissing" in run_name)
    res_methods["f1_score"].append(scores["f1"])

In [18]:
# Write the img5 F1 scores gathered in `res_methods` to CSV.
pd.DataFrame(res_methods).to_csv("./Figure_revision/supp_doublet_detection_f1_score_img5.csv", index=False)

In [19]:
# Collect the F1 score of every setting stored in the `.uns` mapping of the
# img11 result files. Rows are appended to `res_methods` in the order listed.
res_methods = {
    "method": [],
    "settings": [],
    "is_rmEmpty": [],
    "f1_score": [],
}

# (method label, result file, is_rmEmpty, keep only dict-valued `.uns` entries?)
# The dict-only filter is applied to Scrublet and Solo but not to
# DoubletDetection and SoCube, exactly as in the earlier per-method loops.
h5ad_sources = [
    ("DoubletDetection", "./expression/results/DoubletDetection/doubletdetection_img11_noMissing.h5ad", True, False),
    ("DoubletDetection", "./expression/results/DoubletDetection/doubletdetection_img11.h5ad", False, False),
    ("Scrublet", "./expression/results/scrublet/scrublet_img11.h5ad", False, True),
    ("Scrublet", "./expression/results/scrublet/scrublet_img11_noMissing.h5ad", True, True),
    ("Solo", "./expression/results/solo/solo_img11.h5ad", False, True),
    ("Solo", "./expression/results/solo/solo_img11_noMissing.h5ad", True, True),
    ("SoCube", "./expression/results/socube/socube_img11.h5ad", False, False),
    ("SoCube", "./expression/results/socube/socube_img11_noMissing.h5ad", True, False),
]

for method, h5ad_path, is_rm_empty, dict_only in h5ad_sources:
    res_adata = sc.read_h5ad(h5ad_path)
    for k, v in res_adata.uns.items():
        # The "hvg" entry is not a scored setting.
        if k == "hvg" or (dict_only and not isinstance(v, dict)):
            continue
        res_methods["method"].append(method)
        res_methods["settings"].append(k)
        res_methods["is_rmEmpty"].append(is_rm_empty)
        res_methods["f1_score"].append(v["f1_score"])

In [20]:
# Append the F1 scores kept in CSV score tables (rows whose index mentions
# "img11") to `res_methods`. `is_rmEmpty` is True when the index mentions "noMissing".
# NOTE: this cell appends to the dict built in the previous cell, so running it
# twice duplicates its rows.
res_scDblFinder = pd.read_csv("./expression/results/scDblFinder/scDblFinder/scores.csv", index_col=0)
for run_name, scores in res_scDblFinder.iterrows():
    if "img11" not in run_name:
        continue
    res_methods["method"].append("scDblFinder")
    res_methods["settings"].append(run_name)
    res_methods["is_rmEmpty"].append("noMissing" in run_name)
    res_methods["f1_score"].append(scores["f1"])

res_scds = pd.read_csv("./expression/results/scds/scds/scores.csv", index_col=0)
for run_name, scores in res_scds.iterrows():
    if "img11" not in run_name:
        continue
    res_methods["method"].append("scds")
    # For scds the setting label is "<method>_<thresh>" taken from the row itself.
    res_methods["settings"].append(scores["method"] + "_" + str(scores["thresh"]))
    res_methods["is_rmEmpty"].append("noMissing" in run_name)
    res_methods["f1_score"].append(scores["f1"])

res_doubletfinder = pd.read_csv("./expression/results/DoubletFinder/DoubletFinder/scores.csv", index_col=0)
for run_name, scores in res_doubletfinder.iterrows():
    if "img11" not in run_name:
        continue
    res_methods["method"].append("DoubletFinder")
    res_methods["settings"].append(run_name)
    res_methods["is_rmEmpty"].append("noMissing" in run_name)
    res_methods["f1_score"].append(scores["f1"])

In [21]:
# Write the img11 F1 scores gathered in `res_methods` to CSV.
pd.DataFrame(res_methods).to_csv("./Figure_revision/supp_doublet_detection_f1_score_img11.csv", index=False)

In [3]:
# Collect the F1 score of every setting stored in the `.uns` mapping of the
# imgExtra result files (Scrublet, Solo and SoCube only). Rows are appended to
# `res_methods` in the order listed.
res_methods = {
    "method": [],
    "settings": [],
    "is_rmEmpty": [],
    "f1_score": [],
}

# (method label, result file, is_rmEmpty, keep only dict-valued `.uns` entries?)
# The dict-only filter is applied to Scrublet and Solo but not to SoCube,
# exactly as in the earlier per-method loops.
h5ad_sources = [
    ("Scrublet", "./expression/results/scrublet/scrublet_imgExtra.h5ad", False, True),
    ("Scrublet", "./expression/results/scrublet/scrublet_imgExtra_noMissing.h5ad", True, True),
    ("Solo", "./expression/results/solo/solo_imgExtra.h5ad", False, True),
    ("Solo", "./expression/results/solo/solo_imgExtra_noMissing.h5ad", True, True),
    ("SoCube", "./expression/results/socube/socube_imgExtra.h5ad", False, False),
    ("SoCube", "./expression/results/socube/socube_imgExtra_noMissing.h5ad", True, False),
]

for method, h5ad_path, is_rm_empty, dict_only in h5ad_sources:
    res_adata = sc.read_h5ad(h5ad_path)
    for k, v in res_adata.uns.items():
        # The "hvg" entry is not a scored setting.
        if k == "hvg" or (dict_only and not isinstance(v, dict)):
            continue
        res_methods["method"].append(method)
        res_methods["settings"].append(k)
        res_methods["is_rmEmpty"].append(is_rm_empty)
        res_methods["f1_score"].append(v["f1_score"])

In [7]:
# Display the collected imgExtra F1 scores as a table.
pd.DataFrame(res_methods)

Unnamed: 0,method,settings,is_rmEmpty,f1_score
0,Scrublet,eva_doublets_thresh_0.20,False,0.430769
1,Scrublet,eva_doublets_thresh_0.25,False,0.434783
2,Scrublet,eva_doublets_thresh_0.30,False,0.410959
3,Scrublet,eva_doublets_thresh_0.35,False,0.40327
4,Scrublet,eva_doublets_thresh_0.40,False,0.328671
5,Scrublet,eva_doublets_thresh_0.45,False,0.258333
6,Scrublet,eva_doublets_thresh_0.50,False,0.153846
7,Scrublet,eva_doublets_thresh_0.55,False,0.116959
8,Scrublet,eva_doublets_thresh_0.60,False,0.050314
9,Scrublet,eva_doublets_thresh_0.20,True,0.428822


In [15]:
# Load the image-based predictions for img5 and keep only observations that
# have a class label and are not flagged difficult.
res_image_based = sc.read_h5ad("./expression/results/image-based/adata_img5_wPreds.h5ad")
keep_obs = res_image_based.obs["class"].notnull() & (res_image_based.obs["difficult"] == False)
# Boolean indexing returns a view of the AnnData; take an explicit copy so that
# adding `.obs` columns afterwards modifies a real object and does not trigger
# the implicit view-to-copy conversion warning.
res_image_based = res_image_based[keep_obs].copy()

In [16]:
# Binary "is Missing" indicators (1 = Missing) for the annotated class and for
# the image-based prediction, then the F1 score of predicting Missing.
res_image_based.obs["isMissing"] = res_image_based.obs["class"].eq("Missing").astype(int)
res_image_based.obs["isMissing_pred"] = res_image_based.obs["pred_image_class"].eq("Missing").astype(int)
f1_image_based = f1_score(
    y_true=res_image_based.obs["isMissing"],
    y_pred=res_image_based.obs["isMissing_pred"],
)


  res_image_based.obs["isMissing"] = (res_image_based.obs["class"] == "Missing").astype(int)


In [17]:
# Display the F1 score of the image-based "Missing" prediction.
f1_image_based

0.9847715736040609