# HLCA Datasets gene harmonisation

This notebook uses `idtrack` to harmonise the gene identifiers of the datasets in the Human Lung Cell Atlas (HLCA).

__author__ = "Kemal Inecik, Ciro Ramírez-Suástegui"

__maintainer__ = "Lisa Sikkema, Ciro Ramírez-Suástegui"

__email__ = "lisa.sikkema@helmholtz-muenchen.de, ciro.rsuastegui@helmholtz-munich.de"

### Import

In [3]:
import os
import time
import pickle

In [None]:
import scanpy as sc
import idtrack

In [3]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell execution, so edits to library source are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Load data

Initialize the graph and pathfinder

In [4]:
# Directory handed to idtrack as its local repository; the result pickles of
# this notebook are written into the same directory further below.
local_dir = "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp"

# Create the idtrack API object and turn on its logging.
idt = idtrack.API(local_repository=local_dir)
idt.configure_logger()

# Load the human graph for Ensembl release 107.
# NOTE(review): the effect of return_test=True is not visible in this
# notebook - confirm against the idtrack documentation.
idt.initialize_graph(
    organism_name="homo_sapiens",
    ensembl_release=107,
    return_test=True,
)

2022-11-01 16:45:08 INFO:graph_maker: The graph is being read.


In [5]:
# Pre-compute the cached properties of the graph up front (the log output of
# this cell lists each property as it is calculated).
idt.calculate_graph_caches() 

2022-11-01 16:45:47 INFO:the_graph: Cached properties being calculated: combined_edges
2022-11-01 16:46:32 INFO:the_graph: Cached properties being calculated: combined_edges_assembly_specific_genes
2022-11-01 16:46:35 INFO:the_graph: Cached properties being calculated: combined_edges_genes
2022-11-01 16:46:46 INFO:the_graph: Cached properties being calculated: lower_chars_graph
2022-11-01 16:46:47 INFO:the_graph: Cached properties being calculated: get_active_ranges_of_id
2022-11-01 16:47:06 INFO:the_graph: Cached properties being calculated: available_external_databases
2022-11-01 16:47:07 INFO:the_graph: Cached properties being calculated: external_database_connection_form
2022-11-01 16:47:40 INFO:the_graph: Cached properties being calculated: available_genome_assemblies
2022-11-01 16:47:40 INFO:the_graph: Cached properties being calculated: available_external_databases_assembly
2022-11-01 16:47:41 INFO:the_graph: Cached properties being calculated: node_trios


Paths to the HLCA dataset files, keyed by dataset name (a dataset may consist of several `.h5ad` files)

In [6]:
# Root of the HLCA reproducibility data and the two folders that hold the
# extension datasets ("ready/full" and "raw").
base_path = "/lustre/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/HLCA_reproducibility/data"
dset0_dir = os.path.join(base_path, "HLCA_extended/extension_datasets/ready/full")
dset1_dir = os.path.join(base_path, "HLCA_extended/extension_datasets/raw")


def _paths(directory, *file_names):
    """Return the list of ``<directory>/<file_name>`` paths for the given file names."""
    return [f"{directory}/{file_name}" for file_name in file_names]


# Dataset name -> list of .h5ad files belonging to that dataset.
# The insertion order is the order in which the datasets are processed.
adata_dict = {
    "Kaminski_2020": _paths(dset0_dir, "adams.h5ad"),
    "Meyer_2021": _paths(dset0_dir, "meyer_2021.h5ad"),
    "MeyerNikolic_unpubl": _paths(dset0_dir, "meyer_nikolic_unpubl.h5ad"),
    "Barbry_unpubl": _paths(dset0_dir, "barbry.h5ad"),
    "Regev_2021": _paths(
        dset0_dir, "delorey_cryo.h5ad", "delorey_fresh.h5ad", "delorey_nuclei.h5ad"
    ),
    "Thienpont_2018": _paths(dset1_dir, "Lambrechts/lambrechts.h5ad"),
    "Budinger_2020": _paths(dset0_dir, "bharat.h5ad"),
    "Banovich_Kropski_2020": _paths(dset0_dir, "haberman.h5ad"),
    "Sheppard_2020": _paths(dset0_dir, "tsukui.h5ad"),
    "Wunderink_2021": _paths(dset0_dir, "grant_cryo.h5ad", "grant_fresh.h5ad"),
    # "wouters_labs.h5ad" is deliberately left out of this dataset.
    "Lambrechts_2021": _paths(dset0_dir, "wouters.h5ad"),
    "Zhang_2021": _paths(dset1_dir, "Liao/covid_for_publish.h5ad"),
    "Duong_lungMAP_unpubl": _paths(dset0_dir, "duong.h5ad"),
    "Janssen_2020": _paths(dset0_dir, "mould.h5ad"),
    "Sun_2020": _paths(
        dset0_dir,
        "wang_sub_batch1.h5ad",
        "wang_sub_batch2.h5ad",
        "wang_sub_batch3.h5ad",
        "wang_sub_batch4.h5ad",
    ),
    "Gomperts_2021": _paths(
        dset0_dir, "carraro_ucla.h5ad", "carraro_cff.h5ad", "carraro_csmc.h5ad"
    ),
    "Eils_2020": _paths(dset0_dir, "lukassen.h5ad"),
    "Schiller_2020": _paths(dset0_dir, "mayr.h5ad"),
    "Misharin_Budinger_2018": _paths(dset0_dir, "reyfman_disease.h5ad"),
    "Shalek_2018": _paths(dset0_dir, "ordovasmontanes.h5ad"),
    "Schiller_2021": _paths(dset0_dir, "schiller_discovair.h5ad"),
    "Peer_Massague_2020": _paths(dset0_dir, "laughney.h5ad"),
    "Lafyatis_2019": _paths(dset0_dir, "valenzi.h5ad"),
    "Tata_unpubl": _paths(dset0_dir, "tata_unpubl.h5ad"),
    "Xu_2020": _paths(dset0_dir, "guo.h5ad"),
    "Sims_2019": _paths(dset0_dir, "szabo.h5ad"),
    "Schultze_unpubl": _paths(dset0_dir, "schultze_unpubl.h5ad"),
}

Run the ID conversion to HGNC symbols (a.k.a. gene names)

In [7]:
# Convert the gene identifiers of every HLCA dataset to the target database
# below and keep one binned conversion report per dataset in `result`.
result = dict()
final_database = "HGNC Symbol"

for dataset_name, dataset_paths in adata_dict.items():

    # Several datasets are split over more than one .h5ad file. Start from the
    # identifiers of the first file (kept exactly as stored) and append any
    # identifier that only occurs in one of the remaining files, so no file of
    # the dataset is silently ignored.
    first_path, *remaining_paths = dataset_paths
    adata = sc.read(first_path)
    gene_list = list(adata.var.index)
    known_genes = set(gene_list)
    for remaining_path in remaining_paths:
        adata = sc.read(remaining_path)
        for gene in adata.var.index:
            if gene not in known_genes:
                known_genes.add(gene)
                gene_list.append(gene)

    # Map every identifier to `final_database`, then bin the outcomes
    # (the bins are the categories printed below, e.g. matching_1_to_1).
    matching = idt.convert_identifier_multiple(
        gene_list, final_database=final_database, pbar_prefix=dataset_name
    )
    binned_conversions = idt.classify_multiple_conversion(matching)

    idt.print_binned_conversion(binned_conversions)
    # NOTE(review): the printed tuple looks like (genome assembly, Ensembl
    # release) of the input identifiers - confirm against the idtrack docs.
    print(f"Source release: {idt.infer_identifier_source(gene_list)}")

    result[dataset_name] = binned_conversions

Kaminski_2020:   0%|          | 0/45947 [00:00<?, ?it/s]

changed_only_1_to_n: 59
changed_only_1_to_1: 4002
alternative_target_1_to_1: 12331
alternative_target_1_to_n: 13
matching_1_to_0: 98
matching_1_to_1: 33446
matching_1_to_n: 59
input_identifiers: 45947
Source release: (38, 94)


Meyer_2021:   0%|          | 0/20922 [00:00<?, ?it/s]

changed_only_1_to_n: 3
changed_only_1_to_1: 997
alternative_target_1_to_1: 368
alternative_target_1_to_n: 2
matching_1_to_0: 17
matching_1_to_1: 20532
matching_1_to_n: 3
input_identifiers: 20922
Source release: (38, 84)


MeyerNikolic_unpubl:   0%|          | 0/33582 [00:00<?, ?it/s]



changed_only_1_to_n: 15
changed_only_1_to_1: 2711
alternative_target_1_to_1: 8342
alternative_target_1_to_n: 10
matching_1_to_0: 106
matching_1_to_1: 25109
matching_1_to_n: 15
input_identifiers: 33582
Source release: (38, 93)


Barbry_unpubl:   0%|          | 0/16859 [00:00<?, ?it/s]



changed_only_1_to_n: 1
changed_only_1_to_1: 972
alternative_target_1_to_1: 1679
alternative_target_1_to_n: 4
matching_1_to_0: 20
matching_1_to_1: 15155
matching_1_to_n: 1
input_identifiers: 16859
Source release: (38, 98)


Regev_2021:   0%|          | 0/30983 [00:00<?, ?it/s]



changed_only_1_to_n: 8
changed_only_1_to_1: 2520
alternative_target_1_to_1: 7387
alternative_target_1_to_n: 9
matching_1_to_0: 113
matching_1_to_1: 23466
matching_1_to_n: 8
input_identifiers: 30983
Source release: (38, 93)


Thienpont_2018:   0%|          | 0/27958 [00:00<?, ?it/s]

changed_only_1_to_n: 8
changed_only_1_to_1: 3598
alternative_target_1_to_1: 5342
alternative_target_1_to_n: 43
matching_1_to_0: 171
matching_1_to_1: 22394
matching_1_to_n: 8
input_identifiers: 27958
Source release: (38, 84)


Budinger_2020:   0%|          | 0/26316 [00:00<?, ?it/s]

changed_only_1_to_n: 6
changed_only_1_to_1: 2122
alternative_target_1_to_1: 4807
alternative_target_1_to_n: 6
matching_1_to_0: 56
matching_1_to_1: 21441
matching_1_to_n: 6
input_identifiers: 26316
Source release: (38, 93)


Banovich_Kropski_2020:   0%|          | 0/33694 [00:00<?, ?it/s]



changed_only_1_to_n: 15
changed_only_1_to_1: 4470
alternative_target_1_to_1: 8131
alternative_target_1_to_n: 57
matching_1_to_0: 262
matching_1_to_1: 25229
matching_1_to_n: 15
input_identifiers: 33694
Source release: (38, 84)


Sheppard_2020:   0%|          | 0/27147 [00:00<?, ?it/s]

changed_only_1_to_n: 6
changed_only_1_to_1: 3464
alternative_target_1_to_1: 5020
alternative_target_1_to_n: 40
matching_1_to_0: 166
matching_1_to_1: 21915
matching_1_to_n: 6
input_identifiers: 27147
Source release: (38, 84)


Wunderink_2021:   0%|          | 0/21819 [00:00<?, ?it/s]



changed_only_1_to_n: 2
changed_only_1_to_1: 1712
alternative_target_1_to_1: 2907
alternative_target_1_to_n: 1
matching_1_to_0: 53
matching_1_to_1: 18856
matching_1_to_n: 2
input_identifiers: 21819
Source release: (38, 93)


Lambrechts_2021:   0%|          | 0/33538 [00:00<?, ?it/s]



changed_only_1_to_n: 15
changed_only_1_to_1: 2693
alternative_target_1_to_1: 8342
alternative_target_1_to_n: 10
matching_1_to_0: 80
matching_1_to_1: 25091
matching_1_to_n: 15
input_identifiers: 33538
Source release: (38, 93)


Zhang_2021:   0%|          | 0/18474 [00:00<?, ?it/s]

changed_only_1_to_n: 1
changed_only_1_to_1: 806
alternative_target_1_to_1: 99
alternative_target_1_to_n: 0
matching_1_to_0: 10
matching_1_to_1: 18364
matching_1_to_n: 1
input_identifiers: 18474
Source release: (38, 87)


Duong_lungMAP_unpubl:   0%|          | 0/27678 [00:00<?, ?it/s]

changed_only_1_to_n: 6
changed_only_1_to_1: 2293
alternative_target_1_to_1: 5994
alternative_target_1_to_n: 10
matching_1_to_0: 51
matching_1_to_1: 21617
matching_1_to_n: 6
input_identifiers: 27678
Source release: (38, 93)


Janssen_2020:   0%|          | 0/33538 [00:00<?, ?it/s]



changed_only_1_to_n: 15
changed_only_1_to_1: 2693
alternative_target_1_to_1: 8342
alternative_target_1_to_n: 10
matching_1_to_0: 80
matching_1_to_1: 25091
matching_1_to_n: 15
input_identifiers: 33538
Source release: (38, 93)


Sun_2020:   0%|          | 0/26578 [00:00<?, ?it/s]

changed_only_1_to_n: 6
changed_only_1_to_1: 2192
alternative_target_1_to_1: 5417
alternative_target_1_to_n: 8
matching_1_to_0: 50
matching_1_to_1: 21097
matching_1_to_n: 6
input_identifiers: 26578
Source release: (38, 93)


Gomperts_2021:   0%|          | 0/31229 [00:00<?, ?it/s]



changed_only_1_to_n: 14
changed_only_1_to_1: 4744
alternative_target_1_to_1: 5910
alternative_target_1_to_n: 28
matching_1_to_0: 348
matching_1_to_1: 24929
matching_1_to_n: 14
input_identifiers: 31229
Source release: (38, 89)


Eils_2020:   0%|          | 0/32738 [00:00<?, ?it/s]



changed_only_1_to_n: 17
changed_only_1_to_1: 5448
alternative_target_1_to_1: 7519
alternative_target_1_to_n: 44
matching_1_to_0: 612
matching_1_to_1: 24546
matching_1_to_n: 17
input_identifiers: 32738
Source release: (37, 79)


Schiller_2020:   0%|          | 0/32104 [00:00<?, ?it/s]



changed_only_1_to_n: 8
changed_only_1_to_1: 4469
alternative_target_1_to_1: 6431
alternative_target_1_to_n: 26
matching_1_to_0: 533
matching_1_to_1: 25106
matching_1_to_n: 8
input_identifiers: 32104
Source release: (38, 89)


Misharin_Budinger_2018:   0%|          | 0/27181 [00:00<?, ?it/s]

changed_only_1_to_n: 7
changed_only_1_to_1: 3464
alternative_target_1_to_1: 5009
alternative_target_1_to_n: 43
matching_1_to_0: 177
matching_1_to_1: 21945
matching_1_to_n: 7
input_identifiers: 27181
Source release: (38, 84)


Shalek_2018:   0%|          | 0/25328 [00:00<?, ?it/s]



changed_only_1_to_n: 7
changed_only_1_to_1: 3647
alternative_target_1_to_1: 3642
alternative_target_1_to_n: 33
matching_1_to_0: 492
matching_1_to_1: 21154
matching_1_to_n: 7
input_identifiers: 25328
Source release: (37, 79)


Schiller_2021:   0%|          | 0/17533 [00:00<?, ?it/s]

changed_only_1_to_n: 0
changed_only_1_to_1: 497
alternative_target_1_to_1: 193
alternative_target_1_to_n: 0
matching_1_to_0: 10
matching_1_to_1: 17330
matching_1_to_n: 0
input_identifiers: 17533
Source release: (38, 99)


Peer_Massague_2020:   0%|          | 0/19222 [00:00<?, ?it/s]

changed_only_1_to_n: 4
changed_only_1_to_1: 1545
alternative_target_1_to_1: 1247
alternative_target_1_to_n: 11
matching_1_to_0: 63
matching_1_to_1: 17897
matching_1_to_n: 4
input_identifiers: 19222
Source release: (38, 86)


Lafyatis_2019:   0%|          | 0/22164 [00:00<?, ?it/s]



changed_only_1_to_n: 6
changed_only_1_to_1: 1032
alternative_target_1_to_1: 442
alternative_target_1_to_n: 2
matching_1_to_0: 21
matching_1_to_1: 21693
matching_1_to_n: 6
input_identifiers: 22164
Source release: (38, 84)


Tata_unpubl:   0%|          | 0/31915 [00:00<?, ?it/s]



changed_only_1_to_n: 11
changed_only_1_to_1: 1954
alternative_target_1_to_1: 7632
alternative_target_1_to_n: 6
matching_1_to_0: 36
matching_1_to_1: 24230
matching_1_to_n: 11
input_identifiers: 31915
Source release: (38, 93)


Xu_2020:   0%|          | 0/32738 [00:00<?, ?it/s]



changed_only_1_to_n: 17
changed_only_1_to_1: 5448
alternative_target_1_to_1: 7519
alternative_target_1_to_n: 44
matching_1_to_0: 612
matching_1_to_1: 24546
matching_1_to_n: 17
input_identifiers: 32738
Source release: (37, 79)


Sims_2019:   0%|          | 0/60725 [00:00<?, ?it/s]

changed_only_1_to_n: 985
changed_only_1_to_1: 8062
alternative_target_1_to_1: 16431
alternative_target_1_to_n: 265
matching_1_to_0: 1461
matching_1_to_1: 41583
matching_1_to_n: 985
input_identifiers: 60725




Source release: (38, 83)


Schultze_unpubl:   0%|          | 0/24532 [00:00<?, ?it/s]

changed_only_1_to_n: 5
changed_only_1_to_1: 2070
alternative_target_1_to_1: 3820
alternative_target_1_to_n: 6
matching_1_to_0: 46
matching_1_to_1: 20655
matching_1_to_n: 5
input_identifiers: 24532
Source release: (38, 91)


In [8]:
# Pickle the per-dataset conversion reports into the idtrack working folder.
# The file name carries the target database (which contains a space for
# "HGNC Symbol") and a UTC timestamp, so repeated runs never overwrite each other.
time_suffix = time.strftime("%Y%m%d-%H%M%S", time.gmtime())
file_path = os.path.join(
    local_dir,
    f"results_for_hlca_datasets_{final_database}_{time_suffix}.pk",
)
with open(file_path, mode="wb") as handle:
    pickle.dump(result, handle)
print(f"Saved: {file_path}")

Saved: /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/results_for_hlca_datasets_HGNC Symbol_20221101-175029.pk


Run the ID conversion to Ensembl gene IDs

In [9]:
# Convert the gene identifiers of every HLCA dataset to the target database
# below and keep one binned conversion report per dataset in `result`.
# Note that this rebinds `result`, replacing whatever it held before.
result = dict()
final_database = "ensembl_gene"

for dataset_name, dataset_paths in adata_dict.items():

    # Several datasets are split over more than one .h5ad file. Start from the
    # identifiers of the first file (kept exactly as stored) and append any
    # identifier that only occurs in one of the remaining files, so no file of
    # the dataset is silently ignored.
    first_path, *remaining_paths = dataset_paths
    adata = sc.read(first_path)
    gene_list = list(adata.var.index)
    known_genes = set(gene_list)
    for remaining_path in remaining_paths:
        adata = sc.read(remaining_path)
        for gene in adata.var.index:
            if gene not in known_genes:
                known_genes.add(gene)
                gene_list.append(gene)

    # Map every identifier to `final_database`, then bin the outcomes
    # (the bins are the categories printed below, e.g. matching_1_to_1).
    matching = idt.convert_identifier_multiple(
        gene_list, final_database=final_database, pbar_prefix=dataset_name
    )
    binned_conversions = idt.classify_multiple_conversion(matching)

    idt.print_binned_conversion(binned_conversions)
    # NOTE(review): the printed tuple looks like (genome assembly, Ensembl
    # release) of the input identifiers - confirm against the idtrack docs.
    print(f"Source release: {idt.infer_identifier_source(gene_list)}")

    result[dataset_name] = binned_conversions

Kaminski_2020:   0%|          | 0/45947 [00:00<?, ?it/s]

changed_only_1_to_n: 498
changed_only_1_to_1: 45351
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 98
matching_1_to_1: 45351
matching_1_to_n: 498
input_identifiers: 45947
Source release: (38, 94)


Meyer_2021:   0%|          | 0/20922 [00:00<?, ?it/s]

changed_only_1_to_n: 213
changed_only_1_to_1: 20692
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 17
matching_1_to_1: 20692
matching_1_to_n: 213
input_identifiers: 20922
Source release: (38, 84)


MeyerNikolic_unpubl:   0%|          | 0/33582 [00:00<?, ?it/s]



changed_only_1_to_n: 290
changed_only_1_to_1: 33186
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 106
matching_1_to_1: 33186
matching_1_to_n: 290
input_identifiers: 33582
Source release: (38, 93)


Barbry_unpubl:   0%|          | 0/16859 [00:00<?, ?it/s]



changed_only_1_to_n: 101
changed_only_1_to_1: 16738
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 20
matching_1_to_1: 16738
matching_1_to_n: 101
input_identifiers: 16859
Source release: (38, 98)


Regev_2021:   0%|          | 0/30983 [00:00<?, ?it/s]



changed_only_1_to_n: 222
changed_only_1_to_1: 30648
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 113
matching_1_to_1: 30648
matching_1_to_n: 222
input_identifiers: 30983
Source release: (38, 93)


Thienpont_2018:   0%|          | 0/27958 [00:00<?, ?it/s]

changed_only_1_to_n: 246
changed_only_1_to_1: 27541
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 171
matching_1_to_1: 27541
matching_1_to_n: 246
input_identifiers: 27958
Source release: (38, 84)


Budinger_2020:   0%|          | 0/26316 [00:00<?, ?it/s]

changed_only_1_to_n: 194
changed_only_1_to_1: 26066
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 56
matching_1_to_1: 26066
matching_1_to_n: 194
input_identifiers: 26316
Source release: (38, 93)


Banovich_Kropski_2020:   0%|          | 0/33694 [00:00<?, ?it/s]



changed_only_1_to_n: 336
changed_only_1_to_1: 33096
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 262
matching_1_to_1: 33096
matching_1_to_n: 336
input_identifiers: 33694
Source release: (38, 84)


Sheppard_2020:   0%|          | 0/27147 [00:00<?, ?it/s]

changed_only_1_to_n: 239
changed_only_1_to_1: 26742
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 166
matching_1_to_1: 26742
matching_1_to_n: 239
input_identifiers: 27147
Source release: (38, 84)


Wunderink_2021:   0%|          | 0/21819 [00:00<?, ?it/s]



changed_only_1_to_n: 169
changed_only_1_to_1: 21597
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 53
matching_1_to_1: 21597
matching_1_to_n: 169
input_identifiers: 21819
Source release: (38, 93)


Lambrechts_2021:   0%|          | 0/33538 [00:00<?, ?it/s]



changed_only_1_to_n: 290
changed_only_1_to_1: 33168
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 80
matching_1_to_1: 33168
matching_1_to_n: 290
input_identifiers: 33538
Source release: (38, 93)


Zhang_2021:   0%|          | 0/18474 [00:00<?, ?it/s]

changed_only_1_to_n: 159
changed_only_1_to_1: 18305
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 10
matching_1_to_1: 18305
matching_1_to_n: 159
input_identifiers: 18474
Source release: (38, 87)


Duong_lungMAP_unpubl:   0%|          | 0/27678 [00:00<?, ?it/s]

changed_only_1_to_n: 180
changed_only_1_to_1: 27447
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 51
matching_1_to_1: 27447
matching_1_to_n: 180
input_identifiers: 27678
Source release: (38, 93)


Janssen_2020:   0%|          | 0/33538 [00:00<?, ?it/s]



changed_only_1_to_n: 290
changed_only_1_to_1: 33168
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 80
matching_1_to_1: 33168
matching_1_to_n: 290
input_identifiers: 33538
Source release: (38, 93)


Sun_2020:   0%|          | 0/26578 [00:00<?, ?it/s]

changed_only_1_to_n: 161
changed_only_1_to_1: 26367
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 50
matching_1_to_1: 26367
matching_1_to_n: 161
input_identifiers: 26578
Source release: (38, 93)


Gomperts_2021:   0%|          | 0/31229 [00:00<?, ?it/s]



changed_only_1_to_n: 274
changed_only_1_to_1: 30607
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 348
matching_1_to_1: 30607
matching_1_to_n: 274
input_identifiers: 31229
Source release: (38, 89)


Eils_2020:   0%|          | 0/32738 [00:00<?, ?it/s]



changed_only_1_to_n: 284
changed_only_1_to_1: 31842
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 612
matching_1_to_1: 31842
matching_1_to_n: 284
input_identifiers: 32738
Source release: (37, 79)


Schiller_2020:   0%|          | 0/32104 [00:00<?, ?it/s]



changed_only_1_to_n: 263
changed_only_1_to_1: 31308
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 533
matching_1_to_1: 31308
matching_1_to_n: 263
input_identifiers: 32104
Source release: (38, 89)


Misharin_Budinger_2018:   0%|          | 0/27181 [00:00<?, ?it/s]

changed_only_1_to_n: 251
changed_only_1_to_1: 26753
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 177
matching_1_to_1: 26753
matching_1_to_n: 251
input_identifiers: 27181
Source release: (38, 84)


Shalek_2018:   0%|          | 0/25328 [00:00<?, ?it/s]



changed_only_1_to_n: 198
changed_only_1_to_1: 24638
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 492
matching_1_to_1: 24638
matching_1_to_n: 198
input_identifiers: 25328
Source release: (37, 79)


Schiller_2021:   0%|          | 0/17533 [00:00<?, ?it/s]

changed_only_1_to_n: 139
changed_only_1_to_1: 17384
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 10
matching_1_to_1: 17384
matching_1_to_n: 139
input_identifiers: 17533
Source release: (38, 99)


Peer_Massague_2020:   0%|          | 0/19222 [00:00<?, ?it/s]

changed_only_1_to_n: 155
changed_only_1_to_1: 19004
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 63
matching_1_to_1: 19004
matching_1_to_n: 155
input_identifiers: 19222
Source release: (38, 86)


Lafyatis_2019:   0%|          | 0/22164 [00:00<?, ?it/s]



changed_only_1_to_n: 258
changed_only_1_to_1: 21885
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 21
matching_1_to_1: 21885
matching_1_to_n: 258
input_identifiers: 22164
Source release: (38, 84)


Tata_unpubl:   0%|          | 0/31915 [00:00<?, ?it/s]



changed_only_1_to_n: 280
changed_only_1_to_1: 31599
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 36
matching_1_to_1: 31599
matching_1_to_n: 280
input_identifiers: 31915
Source release: (38, 93)


Xu_2020:   0%|          | 0/32738 [00:00<?, ?it/s]



changed_only_1_to_n: 284
changed_only_1_to_1: 31842
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 612
matching_1_to_1: 31842
matching_1_to_n: 284
input_identifiers: 32738
Source release: (37, 79)


Sims_2019:   0%|          | 0/60725 [00:00<?, ?it/s]

changed_only_1_to_n: 1830
changed_only_1_to_1: 57434
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 1461
matching_1_to_1: 57434
matching_1_to_n: 1830
input_identifiers: 60725




Source release: (38, 83)


Schultze_unpubl:   0%|          | 0/24532 [00:00<?, ?it/s]

changed_only_1_to_n: 196
changed_only_1_to_1: 24290
alternative_target_1_to_1: 0
alternative_target_1_to_n: 0
matching_1_to_0: 46
matching_1_to_1: 24290
matching_1_to_n: 196
input_identifiers: 24532
Source release: (38, 91)


### Save

In [10]:
# Pickle the per-dataset conversion reports into the idtrack working folder.
# The file name carries the target database and a UTC timestamp, so repeated
# runs never overwrite each other.
time_suffix = time.strftime("%Y%m%d-%H%M%S", time.gmtime())
file_path = os.path.join(
    local_dir,
    f"results_for_hlca_datasets_{final_database}_{time_suffix}.pk",
)
with open(file_path, mode="wb") as handle:
    pickle.dump(result, handle)
print(f"Saved: {file_path}")

Saved: /lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/results_for_hlca_datasets_ensembl_gene_20221101-191345.pk
