## Gene names processing

In this notebook we digest idtrack's output. We take its result dictionaries and build a table of gene-name/Ensembl-ID pairs to determine the final name of each gene. In the process we also identify, per dataset, the repeated gene names that will be merged.

__author__ = "Ciro Ramírez-Suástegui"

__copyright__ = "Copyright 2022-11-09, Helmholtz Zentrum Muenchen"

__license__ = "GPL"

__version__ = "0.0.9"

__email__ = "ciro.suastegui@helmholtz-muenchen.de, ksuasteguic@gmail.com"

__status__ = "Prototype"

#### Structure

* [Global variables and paths](#bullet1)
* [Loading data](#bullet2)
* [Pre-processing](#bullet3)
* [Main](#bullet4)
* [Conclusions](#bullet5)
* [Save](#bullet6)

### Environment setup

In [1]:
%load_ext autoreload
%autoreload 2
import importlib.util

# `importlib.util` is a submodule: it must be imported explicitly, because a bare
# `import importlib` does not guarantee that `importlib.util` is loaded.
# `find_spec` returns None when the optional `lab_black` formatter is not installed.
spam_spec = importlib.util.find_spec("lab_black")
if spam_spec is not None:
    %load_ext lab_black

In [2]:
# basic modules
import warnings, os, re
import time, sys

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [3]:
# in-house/developing modules
# tools modules
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools
from collections import Counter
from tqdm import tqdm
import pickle

In [4]:
# Record the versions of scanpy/anndata and the other loaded packages for reproducibility.
sc.logging.print_versions()

-----
anndata     0.8.0
scanpy      1.9.1
-----
7b32b9a39ad70713acde__mypyc NA
PIL                         9.2.0
autoreload                  NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
black                       22.6.0
blib2to3                    NA
cffi                        1.15.1
click                       8.1.3
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.6.3
decorator                   5.1.1
defusedxml                  0.7.1
entrypoints                 0.4
h5py                        3.7.0
igraph                      0.9.11
ipykernel                   6.16.0
ipython_genutils            0.2.0
jedi                        0.18.1
joblib                      1.1.0
jupyter_server              1.21.0
kiwisolver                  1.4.4
lab_black                   NA
leidenalg                   0.8.10
llvmlite                    0.39.0
matplotlib    

In [5]:
def parentpath(_path, n):
    """Return ``_path`` with its last ``n`` path components removed.

    Components are obtained by splitting on ``os.sep``. ``n=0`` returns the path
    unchanged, and an ``n`` larger than the number of components returns ``""``.
    """
    parts = _path.split(os.sep)
    return os.sep.join(parts[: max(len(parts) - n, 0)])


# Plain path operations are used instead of `re.sub`: a filesystem path is not a
# safe regex pattern (unescaped dots, backslashes, parentheses, ...).
print("Environment:", os.path.dirname(os.__file__))
cwd = os.getcwd()
# Show the working directory relative to the folder two levels up.
print("Working at:", cwd[len(parentpath(cwd, 2)) :])

Environment: /home/icb/ciro.suastegui/miniconda3/envs/HLCA_basic/lib/python3.7
Working at: /notebooks/3_atlas_extension


In [6]:
def make_symbol(x):
    """Reduce an idtrack-derived identifier string to a bare gene symbol.

    The clean-up is applied in four steps:

    1. drop everything from a line break followed by ``Name`` to the end of
       that line;
    2. on every line, drop everything up to and including the last run of
       spaces;
    3. keep only the last line;
    4. strip a leading ``ENSG<digits>.<version>_`` prefix or an
       ``_ENSG<digits>`` suffix.

    Per the original author's notes, steps 1-3 handle artifacts where idtrack
    returns a mapping structure containing several names (this looks like the
    text representation of a pandas Series; confirm against idtrack's output).

    Parameters
    ----------
    x : str
        Raw identifier string.

    Returns
    -------
    str
        The cleaned symbol.
    """
    # Raw strings are used for every pattern: "\." inside a normal string literal
    # is an invalid escape sequence.
    symbol = re.sub(r"\nName.*", "", x)
    symbol = re.sub(r".* {1,}", "", symbol)
    symbol = re.sub(r".*\n", "", symbol)
    # A broader alternative previously considered: "ENSG.*_|_ENSG[0-9]{10,}"
    return re.sub(r"^ENSG[0-9]{10,}\.[0-9]{1,}_|_ENSG[0-9]{10,}", "", symbol)

### Global variables and paths <a class="anchor" id="bullet1"></a>

In [7]:
# Folder holding one `<dataset>.txt` feature list per dataset.
feats_dir = "../../data/HLCA_extended/extension_datasets/features/"
# Reference file name inside `feats_dir`; the output file names are derived from it.
feats_inp = feats_dir + "all_update-with-core.pk"
# Merged datasets to expand: merged name -> {column name: list of member studies}.
adata_split = dict(
    HLCA=dict(
        study=[
            "Nawijn_2021_HLCA",
            "Barbry_Leroy_2020_HLCA",
            "Meyer_2019_HLCA",
            "Banovich_Kropski_2020_HLCA",
            "Seibold_2020_HLCA",
            "Jain_Misharin_2021_HLCA",
            "Teichmann_Meyer_2019_HLCA",
            "Misharin_2021_HLCA",
            "Lafyatis_Rojas_2019_HLCA",
            "Krasnow_2020_HLCA",
            "Misharin_Budinger_2018_HLCA",
        ]
    )
)

### Loading data <a class="anchor" id="bullet2"></a>

In [8]:
# idtrack results for the HLCA core only.
# NOTE(review): absolute, user-specific cluster path; consider moving it to a
# configurable location. Unpickling can execute arbitrary code, so only load
# files from a trusted source.
results_core = pd.read_pickle(
    "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/results_for_hlca_core_only_HGNC Symbol_20221110-094335.pk"
)
# idtrack results for the extension datasets (same caveats as above).
results_idtrack = pd.read_pickle(
    "/lustre/groups/ml01/workspace/kemal.inecik/idtrack_temp/results_for_hlca_datasets_HGNC Symbol_20221101-175029.pk"
)

In [9]:
# Peek at the core results: available keys and the first three entries of "HLCA_txt".
results_core.keys()
results_core["HLCA_txt"][:3]

dict_keys(['HLCA_txt'])

[{'target_id': ['TSPAN6'],
  'last_node': [('ENSG00000000003.15', 'TSPAN6')],
  'final_database': 'HGNC Symbol',
  'graph_id': 'TSPAN6',
  'query_id': 'TSPAN6',
  'no_corresponding': False,
  'no_conversion': False,
  'no_target': False},
 {'target_id': ['TNMD'],
  'last_node': [('ENSG00000000005.6', 'TNMD')],
  'final_database': 'HGNC Symbol',
  'graph_id': 'TNMD',
  'query_id': 'TNMD',
  'no_corresponding': False,
  'no_conversion': False,
  'no_target': False},
 {'target_id': ['DPM1'],
  'last_node': [('ENSG00000000419.14', 'DPM1')],
  'final_database': 'HGNC Symbol',
  'graph_id': 'DPM1',
  'query_id': 'DPM1',
  'no_corresponding': False,
  'no_conversion': False,
  'no_target': False}]

In [10]:
# Peek at the extension results: number of datasets, their names, the keys
# stored for one dataset, and the size and first entries of its "input_identifiers".
len(results_idtrack)
results_idtrack.keys()
results_idtrack["Kaminski_2020"].keys()
len(results_idtrack["Kaminski_2020"]["input_identifiers"])
results_idtrack["Kaminski_2020"]["input_identifiers"][:3]

27

dict_keys(['Kaminski_2020', 'Meyer_2021', 'MeyerNikolic_unpubl', 'Barbry_unpubl', 'Regev_2021', 'Thienpont_2018', 'Budinger_2020', 'Banovich_Kropski_2020', 'Sheppard_2020', 'Wunderink_2021', 'Lambrechts_2021', 'Zhang_2021', 'Duong_lungMAP_unpubl', 'Janssen_2020', 'Sun_2020', 'Gomperts_2021', 'Eils_2020', 'Schiller_2020', 'Misharin_Budinger_2018', 'Shalek_2018', 'Schiller_2021', 'Peer_Massague_2020', 'Lafyatis_2019', 'Tata_unpubl', 'Xu_2020', 'Sims_2019', 'Schultze_unpubl'])

dict_keys(['changed_only_1_to_n', 'changed_only_1_to_1', 'alternative_target_1_to_1', 'alternative_target_1_to_n', 'matching_1_to_0', 'matching_1_to_1', 'matching_1_to_n', 'input_identifiers'])

45947

[{'target_id': ['TSPAN6'],
  'last_node': [('ENSG00000000003.15', 'TSPAN6')],
  'final_database': 'HGNC Symbol',
  'graph_id': 'TSPAN6',
  'query_id': 'TSPAN6',
  'no_corresponding': False,
  'no_conversion': False,
  'no_target': False},
 {'target_id': ['TNMD'],
  'last_node': [('ENSG00000000005.6', 'TNMD')],
  'final_database': 'HGNC Symbol',
  'graph_id': 'TNMD',
  'query_id': 'TNMD',
  'no_corresponding': False,
  'no_conversion': False,
  'no_target': False},
 {'target_id': ['DPM1'],
  'last_node': [('ENSG00000000419.14', 'DPM1')],
  'final_database': 'HGNC Symbol',
  'graph_id': 'DPM1',
  'query_id': 'DPM1',
  'no_corresponding': False,
  'no_conversion': False,
  'no_target': False}]

### Pre-processing <a class="anchor" id="bullet3"></a>

In [11]:
# Register every core result as an additional dataset in `results_idtrack`,
# named after its key without the "_txt" part and holding only "input_identifiers".
for core_key, core_entries in results_core.items():
    ds = core_key.replace("_txt", "")
    print(ds)
    results_idtrack[ds] = {"input_identifiers": core_entries}

HLCA


In [12]:
# Confirm that the core dataset ("HLCA") now appears among the dataset names.
results_idtrack.keys()

dict_keys(['Kaminski_2020', 'Meyer_2021', 'MeyerNikolic_unpubl', 'Barbry_unpubl', 'Regev_2021', 'Thienpont_2018', 'Budinger_2020', 'Banovich_Kropski_2020', 'Sheppard_2020', 'Wunderink_2021', 'Lambrechts_2021', 'Zhang_2021', 'Duong_lungMAP_unpubl', 'Janssen_2020', 'Sun_2020', 'Gomperts_2021', 'Eils_2020', 'Schiller_2020', 'Misharin_Budinger_2018', 'Shalek_2018', 'Schiller_2021', 'Peer_Massague_2020', 'Lafyatis_2019', 'Tata_unpubl', 'Xu_2020', 'Sims_2019', 'Schultze_unpubl', 'HLCA'])

In [13]:
# Dataset names to replace: old name -> new name.
ds_rename = {"MeyerNikolic_unpubl": "Meyer_Nikolic_2022"}
for old_name, new_name in ds_rename.items():
    # Every entry of `ds_rename` is applied (not only the first one), and the
    # guard makes the cell safe to re-run after the old key has been removed.
    if old_name in results_idtrack:
        results_idtrack[new_name] = results_idtrack.pop(old_name)
results_idtrack.keys()

dict_keys(['Kaminski_2020', 'Meyer_2021', 'Barbry_unpubl', 'Regev_2021', 'Thienpont_2018', 'Budinger_2020', 'Banovich_Kropski_2020', 'Sheppard_2020', 'Wunderink_2021', 'Lambrechts_2021', 'Zhang_2021', 'Duong_lungMAP_unpubl', 'Janssen_2020', 'Sun_2020', 'Gomperts_2021', 'Eils_2020', 'Schiller_2020', 'Misharin_Budinger_2018', 'Shalek_2018', 'Schiller_2021', 'Peer_Massague_2020', 'Lafyatis_2019', 'Tata_unpubl', 'Xu_2020', 'Sims_2019', 'Schultze_unpubl', 'HLCA', 'Meyer_Nikolic_2022'])

#### Feature names standardisation

In [14]:
# Build, per dataset, a table with one row per input identifier:
#   original   - the queried identifier ("query_id")
#   ensembl_id - first element of the first "last_node" entry ("" when there is none)
#   new        - the name chosen for the feature
#   repeat     - True when `new` occurs at least twice within the dataset
# Only the first "last_node" / "target_id" entry of each record is used.
# NOTE: duplicated "query_id" values within a dataset are not checked here.
final_simple = dict()
for ds in results_idtrack:
    ds_n = len(results_idtrack[ds]["input_identifiers"])
    print(f"\033[0;34m====================\033[0m {ds_n} | {ds}")
    ds_dict = {"original": [], "ensembl_id": [], "new": []}
    for i in results_idtrack[ds]["input_identifiers"]:
        ensembl_id = i["last_node"][0][0] if len(i["last_node"]) > 0 else ""
        if len(i["target_id"]) == 0:
            # no target found: keep the queried identifier
            symbol = i["query_id"]
        elif re.match(r"^ENSG", i["target_id"][0]):
            # the target is itself an Ensembl-like ID: use the `last_node` ID instead
            symbol = ensembl_id
        else:
            symbol = i["target_id"][0]
        ds_dict["original"].append(i["query_id"])
        ds_dict["ensembl_id"].append(ensembl_id)
        ds_dict["new"].append(symbol)
    # Counting once and looking names up in the Counter avoids testing
    # membership in a list for every row.
    new_counts = Counter(ds_dict["new"])
    final_simple[ds] = pd.DataFrame(
        {
            "original": ds_dict["original"],
            "ensembl_id": ds_dict["ensembl_id"],
            "new": ds_dict["new"],
            "repeat": [new_counts[name] >= 2 for name in ds_dict["new"]],
        }
    )



In [15]:
# `ds` is whatever dataset the previous loop ended on; show its repeated names
# that have both an Ensembl ID and a non-empty new name, ordered by new name.
print(ds)
simple = final_simple[ds]
keep = (simple["ensembl_id"] != "") & (simple["new"] != "") & simple["repeat"]
temp = simple.loc[keep, :].sort_values(by=["new"])
temp

Meyer_Nikolic_2022


Unnamed: 0,original,ensembl_id,new,repeat
79,ABCF2,ENSG00000033050.9,ABCF2,True
80,ABCF2-1,ENSG00000033050.9,ABCF2,True
81,ABCF2.1,ENSG00000033050.9,ABCF2,True
6826,ACSL6,ENSG00000164398.15,ACSL6,True
3172,AC026398.1,ENSG00000164398.15,ACSL6,True
...,...,...,...,...
165,AC002310.4,ENSG00000261459.1,ZNF747,True
1275,AC008770.2,ENSG00000267179.1,ZNF763,True
33433,ZNF763,ENSG00000197054.12,ZNF763,True
816,AC006978.2,ENSG00000180233.11,ZNRF2,True


In [16]:
# Read each dataset's feature list from `<feats_dir>/<dataset>.txt` and print
# it next to the converted table, so lengths and first entries can be compared.
feature_old = {}
for ds in results_idtrack.keys():
    print(f"\033[94m*******************\033[0m {ds}")
    with open(os.path.join(feats_dir, f"{ds}.txt")) as f:
        # Strip only the newline: `line[:-1]` would cut the last character of
        # the final name when the file does not end with a line break.
        feature_old[ds] = [line.rstrip("\n") for line in f]
    previews = {
        "read": list(feature_old[ds]),
        "ense": list(final_simple[ds]["ensembl_id"]),
        "orig": list(final_simple[ds]["original"]),
        "symb": list(final_simple[ds]["new"]),
    }
    for label, values in previews.items():
        print(f"({len(values)}) {label}:", values[:3])

[94m*******************[0m Kaminski_2020
(45947) read: ['TSPAN6', 'TNMD', 'DPM1']
(45947) ense: ['ENSG00000000003.15', 'ENSG00000000005.6', 'ENSG00000000419.14']
(45947) orig: ['TSPAN6', 'TNMD', 'DPM1']
(45947) symb: ['TSPAN6', 'TNMD', 'DPM1']
[94m*******************[0m Meyer_2021
(20922) read: ['FAM87B', 'LINC00115', 'FAM41C']
(20922) ense: ['ENSG00000177757.2', 'ENSG00000225880.6', 'ENSG00000230368.2']
(20922) orig: ['FAM87B', 'LINC00115', 'FAM41C']
(20922) symb: ['FAM87B', 'LINC00115', 'FAM41C']
[94m*******************[0m Barbry_unpubl
(16859) read: ['ST6GALNAC6', 'TAF10', 'AL031595.3']
(16859) ense: ['ENSG00000160408.16', 'ENSG00000166337.11', 'ENSG00000280434.1']
(16859) orig: ['ST6GALNAC6', 'TAF10', 'AL031595.3']
(16859) symb: ['ST6GALNAC6', 'TAF10', 'ENSG00000280434.1']
[94m*******************[0m Regev_2021
(30983) read: ['A1BG', 'A1BG-AS1', 'A1CF']
(30983) ense: ['ENSG00000121410.12', 'ENSG00000268895.6', 'ENSG00000148584.15']
(30983) orig: ['A1BG', 'A1BG-AS1', 'A1CF']


### Main <a class="anchor" id="bullet4"></a>

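Expand the merged datasets: each merged entry (e.g. `HLCA`) is replaced by one entry per constituent study, all sharing the same conversion table.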

In [17]:
# Shallow copy: the per-dataset tables themselves are shared with `final_simple`.
feature_conversion = dict(final_simple)
for ds_merged, split_spec in adata_split.items():
    print(f"\033[1m\033[93mExpanding\033[0m {ds_merged}")
    # the first (here: only) key names the column listing the member studies
    ds_column = next(iter(split_spec))
    for ds in split_spec[ds_column]:
        # every member study points to the very same table object as the merged entry
        feature_conversion[ds] = feature_conversion[ds_merged]
    del feature_conversion[ds_merged]

[1m[93mExpanding[0m HLCA


Concatenate the per-dataset tables so that they can be pivoted into a table with studies as columns.

In [19]:
# Stack all per-dataset tables into one long table with a `study` column.
# The frames are concatenated in a single call (instead of growing the table
# inside a loop), and the "HLCA" -> "Core" renaming is applied to every
# dataset, including the first one.
feature_conversion_all = pd.concat(
    [
        final_simple[ds].assign(study=ds.replace("HLCA", "Core"))
        for ds in final_simple
    ],
    axis=0,
    ignore_index=True,
)
feature_conversion_all

Unnamed: 0,original,ensembl_id,new,repeat,study
0,TSPAN6,ENSG00000000003.15,TSPAN6,False,Kaminski_2020
1,TNMD,ENSG00000000005.6,TNMD,False,Kaminski_2020
2,DPM1,ENSG00000000419.14,DPM1,False,Kaminski_2020
3,SCYL3,ENSG00000000457.14,SCYL3,False,Kaminski_2020
4,C1orf112,ENSG00000000460.17,C1ORF112,False,Kaminski_2020
...,...,...,...,...,...
810964,ZYG11A,ENSG00000203995.10,ZYG11A,False,Meyer_Nikolic_2022
810965,ZYG11B,ENSG00000162378.13,ZYG11B,False,Meyer_Nikolic_2022
810966,ZYX,ENSG00000285443.2,ZYX,False,Meyer_Nikolic_2022
810967,ZZEF1,ENSG00000074755.15,ZZEF1,False,Meyer_Nikolic_2022


In [20]:
# Number of distinct new names across all datasets.
n_new_names = feature_conversion_all["new"].nunique(dropna=False)
print(f"Total: {n_new_names}")

Total: 59574


In [21]:
# Spot-check one gene across studies (GTF2H2 is another candidate to look at).
is_hla_b = feature_conversion_all["new"] == "HLA-B"
temp = feature_conversion_all.loc[is_hla_b, :]
temp.sort_values(by=["study"])

Unnamed: 0,original,ensembl_id,new,repeat,study
179604,HLA-B,ENSG00000224608.10,HLA-B,False,Banovich_Kropski_2020
70495,HLA-B,ENSG00000224608.10,HLA-B,False,Barbry_unpubl
155894,HLA-B,ENSG00000224608.10,HLA-B,False,Budinger_2020
768997,HLA-B,ENSG00000224608.10,HLA-B,False,Core
318193,HLA-B,ENSG00000224608.10,HLA-B,False,Duong_lungMAP_unpubl
433274,HLA-B,ENSG00000224608.10,HLA-B,False,Eils_2020
397023,HLA-B,ENSG00000224608.10,HLA-B,False,Gomperts_2021
341942,HLA-B,ENSG00000224608.10,HLA-B,False,Janssen_2020
20934,HLA-B,ENSG00000224608.10,HLA-B,False,Kaminski_2020
583854,HLA-B,ENSG00000224608.10,HLA-B,False,Lafyatis_2019


In [22]:
# Flag new names that still contain a space or a line break.
artifacts = feature_conversion_all["new"].str.contains(r"[ \n]").tolist()
feature_conversion_all.loc[artifacts, :]

Unnamed: 0,original,ensembl_id,new,repeat,study


In [23]:
# Rows that are both flagged as artifacts and marked as repeated, ordered by name and study.
feature_conversion_all.loc[artifacts & feature_conversion_all["repeat"], :].sort_values(
    by=["new", "study"]
)

Unnamed: 0,original,ensembl_id,new,repeat,study


In [24]:
# Derive the flat-table output path from the input path. A literal replacement
# is used: as a regex, the "." in "-with-core.pk" would match any character.
feats_out_table = feats_inp.replace("-with-core.pk", "_table_flat.csv")
feats_out_table

'../../data/HLCA_extended/extension_datasets/features/all_update_table_flat.csv'

In [25]:
# Save the long (one row per original feature and study) table; the row index is written too.
feature_conversion_all.to_csv(feats_out_table)

In [23]:
%%time
# For every (new name, study) pair, join all original names that map to it
# with ";" and store the result on each row of the pair.
originals_by_pair = feature_conversion_all.groupby(["new", "study"])["original"]
feature_conversion_all["old_aggr"] = originals_by_pair.transform(";".join)

CPU times: user 58 s, sys: 1.23 s, total: 59.2 s
Wall time: 59.4 s


In [24]:
# Show the repeated names together with their aggregated original names.
feature_conversion_all.loc[feature_conversion_all["repeat"], :].sort_values(
    by=["new", "study"]
)

Unnamed: 0,original,ensembl_id,new,repeat,study,old_aggr
411948,AADACL2-AS1,ENSG00000262466.5,AADACL2-AS1,True,Gomperts_2021,AADACL2-AS1;RP11-454C18.2
421058,RP11-454C18.2,ENSG00000262466.5,AADACL2-AS1,True,Gomperts_2021,AADACL2-AS1;RP11-454C18.2
671366,AC069067.1,ENSG00000262466.5,AADACL2-AS1,True,Sims_2019,AC069067.1;AADACL2-AS1
717146,AADACL2-AS1,ENSG00000262466.5,AADACL2-AS1,True,Sims_2019,AC069067.1;AADACL2-AS1
408801,AATBC,ENSG00000215458.8,AATBC,True,Gomperts_2021,AATBC;LOC284837
...,...,...,...,...,...,...
412680,ZZZ3,ENSG00000036549.13,ZZZ3,True,Gomperts_2021,AC118549.1;ZZZ3
476416,ZZZ3,ENSG00000036549.13,ZZZ3,True,Schiller_2020,ZZZ3;AC118549.1
482250,AC118549.1,ENSG00000036549.13,ZZZ3,True,Schiller_2020,ZZZ3;AC118549.1
398996,DKC1,ENSG00000130826.19,dkc1,True,Gomperts_2021,DKC1;MIR644B


In [25]:
# Keep a single row per (new name, study) pair; `old_aggr` already carries all
# the original names of the pair, so no information about them is lost.
df = feature_conversion_all.drop_duplicates(subset=["new", "study"])
df.shape

(787718, 6)

In [26]:
# Repeated names after de-duplication: one row per (new name, study) pair.
df.loc[df["repeat"], :].sort_values(by=["new", "study"])

Unnamed: 0,original,ensembl_id,new,repeat,study,old_aggr
411948,AADACL2-AS1,ENSG00000262466.5,AADACL2-AS1,True,Gomperts_2021,AADACL2-AS1;RP11-454C18.2
671366,AC069067.1,ENSG00000262466.5,AADACL2-AS1,True,Sims_2019,AC069067.1;AADACL2-AS1
408801,AATBC,ENSG00000215458.8,AATBC,True,Gomperts_2021,AATBC;LOC284837
456790,AP001053.11,ENSG00000215458.8,AATBC,True,Schiller_2020,AP001053.11;AATBC
670632,AC092375.1,ENSG00000257408.2,ABCA3P1,True,Sims_2019,AC092375.1;RP11-645C24.4
...,...,...,...,...,...,...
397476,ZUP1,ENSG00000153975.10,ZUP1,True,Gomperts_2021,ZUP1;ZUFSP
476405,ZUFSP,ENSG00000153975.10,ZUP1,True,Schiller_2020,ZUFSP;ZUP1
392189,AC118549.1,ENSG00000036549.13,ZZZ3,True,Gomperts_2021,AC118549.1;ZZZ3
476416,ZZZ3,ENSG00000036549.13,ZZZ3,True,Schiller_2020,ZZZ3;AC118549.1


In [27]:
# Wide table: one row per new name, one column per study, cells holding the
# aggregated original names (missing where the study lacks that name).
wide = df.pivot(index="new", columns="study", values="old_aggr")
feature_conversion_df = wide.reset_index()
feature_conversion_df.shape
feature_conversion_df

(59574, 29)

study,new,Banovich_Kropski_2020,Barbry_unpubl,Budinger_2020,Core,Duong_lungMAP_unpubl,Eils_2020,Gomperts_2021,Janssen_2020,Kaminski_2020,...,Schultze_unpubl,Shalek_2018,Sheppard_2020,Sims_2019,Sun_2020,Tata_unpubl,Thienpont_2018,Wunderink_2021,Xu_2020,Zhang_2021
0,1-DEC,,,,,,,1-DEC,,,...,,,,,,,,,,
1,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,...,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG
2,A1BG-AS1,A1BG-AS1,,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,...,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1
3,A1CF,A1CF,,,A1CF,A1CF,A1CF,A1CF,A1CF,A1CF,...,,,A1CF,A1CF,A1CF,A1CF,A1CF,,A1CF,A1CF
4,A2M,A2M,,A2M,A2M,A2M,A2M,A2M,A2M,A2M,...,A2M,A2M,A2M,A2M,A2M,A2M,A2M,A2M,A2M,A2M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59569,rab1b,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,...,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B,RAB1B
59570,stc1,STC1,,STC1,STC1,STC1,STC1,STC1,STC1,STC1,...,STC1,STC1,STC1,STC1,STC1,STC1,STC1,,STC1,STC1
59571,stxbp1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,...,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1,STXBP1
59572,tec,TEC,TEC,TEC,TEC,TEC,TEC,TEC,TEC,TEC,...,TEC,TEC,TEC,TEC,TEC,TEC,TEC,TEC,TEC,TEC


In [28]:
# Spot-check the same gene in the wide table.
feature_conversion_df.loc[feature_conversion_df["new"].isin(["HLA-B"]), :]

study,new,Banovich_Kropski_2020,Barbry_unpubl,Budinger_2020,Core,Duong_lungMAP_unpubl,Eils_2020,Gomperts_2021,Janssen_2020,Kaminski_2020,...,Schultze_unpubl,Shalek_2018,Sheppard_2020,Sims_2019,Sun_2020,Tata_unpubl,Thienpont_2018,Wunderink_2021,Xu_2020,Zhang_2021
29576,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,...,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B,HLA-B


### Conclusions <a class="anchor" id="bullet5"></a>

### Save <a class="anchor" id="bullet6"></a>

In [29]:
# Derive the wide-table output path from the input path. A literal replacement
# is used: as a regex, the "." in "-with-core.pk" would match any character.
feats_out_table = feats_inp.replace("-with-core.pk", "_table.csv")
feats_out_table

'../../data/HLCA_extended/extension_datasets/features/all_update_table.csv'

In [30]:
# Save the wide (new name x study) table; the row index is written too.
feature_conversion_df.to_csv(feats_out_table)

In [31]:
# Derive the pickle output path from the input path. A literal replacement
# is used: as a regex, the "." in "-with-core.pk" would match any character.
feats_out_pk = feats_inp.replace("-with-core.pk", "_digested.pk")
feats_out_pk

'../../data/HLCA_extended/extension_datasets/features/all_update_digested.pk'

In [32]:
# Persist the per-study conversion tables (the expanded `feature_conversion` dict).
with open(feats_out_pk, "wb") as pickle_file:
    pickle.dump(feature_conversion, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

Done.