In [None]:
import re
from urllib.parse import urlparse

import pandas as pd
import requests
from datasets import Dataset, load_dataset
from tqdm import tqdm

In [None]:
# Load the `cell_line_metadata` configuration of the Tahoe-100M dataset (train split).
ds = load_dataset("vevotx/Tahoe-100M", name="cell_line_metadata", split="train")

# Guard clause: the DataFrame conversion below is only written for a plain `Dataset`.
if not isinstance(ds, Dataset):
    raise TypeError("Expected a `Dataset`, got something else.")
tahoe_cell_line = pd.DataFrame(ds.to_dict())


# Distinct Cellosaurus accessions found in the metadata table.
cellosaurus_ids = tahoe_cell_line["Cell_ID_Cellosaur"].unique().tolist()

## CHEMBL API (Deprecated, jump to [Cellosaurus API](#cellosaurus-api))
Because we are not sure whether the Tahoe cell line metadata `cell_name` maps to a unique entry in the [cell line ontology search](https://www.ebi.ac.uk/ols4/ontologies), let's try mapping the Cellosaurus ID to an ontology entry instead. To do so:
- Let's map the `Cell_ID_Cellosaur` onto an ID that is usable on the [cell line ontology search](https://www.ebi.ac.uk/ols4/ontologies), like `CLO` or `EFO`.
- Let's use the [CHEMBL web service](https://chembl.gitbook.io/chembl-interface-documentation/web-services/chembl-data-web-services) for [cell lines](https://www.ebi.ac.uk/chembl/api/data/cell_line?cellosaurus_id__in=CVCL_0023,CVCL_1381), which provides such a mapping.
- Alternatively, a [CHEMBL browser](https://www.ebi.ac.uk/chembl/explore/cell_lines/) is provided.

In [None]:
# Build one batched CHEMBL query: the `cellosaurus_id__in` filter receives all
# accessions as a single comma-separated list.
cell_ids = ",".join(cellosaurus_ids)
url = "https://www.ebi.ac.uk/chembl/api/data/cell_line?cellosaurus_id__in=" + cell_ids


# Make the request
def recursive_cell_line_request(url, headers=None, processed=0, pbar=None, timeout=60):
    """Fetch every page of a CHEMBL `cell_line` query and return all records.

    The response of each page is expected to contain a `cell_lines` list and a
    `page_meta.next` entry holding the path of the next page (or `None` on the
    last page). Pages are followed iteratively, so the number of pages is not
    limited by Python's recursion depth.

    Args:
        url: Full URL of the first page to request.
        headers: HTTP headers sent with every request. Defaults to
            `{"Accept": "application/json"}`.
        processed: Unused; kept so existing positional/keyword callers keep working.
        pbar: Optional progress bar exposing `update(n)` and `close()`. When
            omitted, a `tqdm` bar is created whose total is the number of
            `CVCL_*` accessions found in `url`.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list: The concatenated `cell_lines` entries of all pages, in page order.

    Raises:
        requests.HTTPError: If any page answers with an HTTP error status.
    """
    if headers is None:
        # Built per call so the default is never a shared mutable object.
        headers = {"Accept": "application/json"}
    if pbar is None:
        total = len(re.findall(r"CVCL_\w+", url))
        pbar = tqdm(total=total, desc="Fetching cell lines from CHEMBL")

    # `page_meta.next` is a path relative to the host, so keep scheme + host around.
    url_parsed = urlparse(url)
    base_url = f"{url_parsed.scheme}://{url_parsed.netloc}"

    cell_lines_list = []
    try:
        while url is not None:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            page_records = data["cell_lines"]
            cell_lines_list.extend(page_records)
            pbar.update(len(page_records))
            next_path = data["page_meta"]["next"]
            url = None if next_path is None else f"{base_url}{next_path}"
    finally:
        # Close the bar on success and on failure so it never stays dangling.
        pbar.close()
    return cell_lines_list

# Fetch every CHEMBL record matching the Tahoe Cellosaurus accessions (all pages).
cell_lines_list = recursive_cell_line_request(
    url=url, headers={"Accept": "application/json"}
)

tahoe_chembl_df = pd.DataFrame(cell_lines_list)
tahoe_chembl_df = tahoe_chembl_df.where(tahoe_chembl_df.notna())  # make None to NaN

# Collect the matched accessions once, straight from the raw records: this avoids
# rebuilding a list for every lookup and also works when CHEMBL returned no record
# at all (in which case the DataFrame has no `cellosaurus_id` column).
matched_ids = {record.get("cellosaurus_id") for record in cell_lines_list}
no_res_cells = [cell for cell in cellosaurus_ids if cell not in matched_ids]
print(
    f"There are {len(no_res_cells)}/{len(cellosaurus_ids)} cell lines "
    "that didn't map to any record in CHEMBL database.\n\n"
    f"Those are: {no_res_cells}"
)

In [None]:
# Show the CHEMBL mapping table; as the last expression of the cell it is rendered as output.
tahoe_chembl_df

## CELLOSAURUS API

This API is preferred over the [CHEMBL API](#chembl-api-deprecated-jump-to-cellosaurus-api) because Cellosaurus IDs are guaranteed to map to a database entry, which was not the case for CHEMBL.

[Description of the cellosaurus API fields](https://api.cellosaurus.org/api-fields)
- id: Recommended name. Most frequently the name of the cell line as provided in the original publication.
- ac: Primary accession. It is the unique identifier of the cell line.
- sx: Sex of the individual from which the cell line originates.
- ag: Age at sampling time of the individual from which the cell line was established.
- ca: Category to which a cell line belongs, one of 14 defined terms. Example: cancer cell line, hybridoma, transformed cell line.
- dr: Cross-references to external resources: cell lines catalogs, databases, resources listing cell lines as samples or to ontologies. 
  - This is what will contain the `EFO` or the `CLO`
- cell-type: Cell type from which the cell line is derived.
- derived-from-site: Body part (tissue or organ) the cell line is derived from.
- di: Disease(s) suffered by the individual from which the cell line originated with its NCI Thesaurus or ORDO identifier.
- hi: Parent cell line from which the cell line originates.
- ch: Child cell line(s) originating from the cell line.

There are many more metadata fields, but these are the ones that were chosen.
The query link is then constructed as follows:
```txt
### Example
https://api.cellosaurus.org/cell-line/CVCL_0023?fields=id,ac,sx,ag,ca,dr,cell-type,derived-from-site,di,hi,ch&format=json
```

Unlike the [Chembl API](#chembl-api-deprecated-jump-to-cellosaurus-api), it is not possible to query IDs in batch, so we loop over `cellosaurus_ids` and send one request per ID.

In [None]:
# Per-accession endpoint template: `{cell_id}` is filled in for each request, and
# the `fields` parameter restricts the response to the metadata used below.
url = (
    "https://api.cellosaurus.org/cell-line/{cell_id}"
    "?fields=id,ac,sx,ag,ca,dr,cell-type,derived-from-site,di,hi,ch"
    "&format=json"
)
headers = {"Accept": "application/json"}


# One flattened record (dict) per Cellosaurus accession; turned into a DataFrame afterwards.
data_rows = []
for cell_id in tqdm(cellosaurus_ids, desc="Fetching cell lines from Cellosaurus"):
    # One request per accession; the timeout (seconds) keeps a stalled connection
    # from hanging the whole notebook.
    response = requests.get(url.format(cell_id=cell_id), headers=headers, timeout=60)
    # Surface HTTP errors here rather than as an obscure JSON/KeyError further down.
    response.raise_for_status()
    data = response.json()["Cellosaurus"]["cell-line-list"][0]

    # accession-list is a list of 'type' and 'value'; the first entry is used as the primary accession.
    accession_list = data.pop("accession-list", None)
    if accession_list:
        data["cellosaurus_id"] = accession_list[0]["value"]

    # name-list is a list of 'type' and 'value' (only interested in value); the first entry is used.
    name_list = data.pop("name-list", None)
    if name_list:
        data["name"] = name_list[0]["value"]

    # child-list is a list of dict with 'accession' and 'name'; only children whose
    # accession is of type 'primary' are kept. Missing 'accession'/'name' entries
    # are tolerated instead of crashing on `None`.
    child_list = data.pop("child-list", None)
    if child_list:
        child_ids, child_names = [], []
        for child in child_list:
            child_accession = child.get("accession") or {}
            child_name = child.get("name") or {}
            if child_accession.get("type") == "primary":
                child_ids.append(child_accession.get("value"))
                child_names.append(child_name.get("value"))
        data.update({
            "child_ids": child_ids,
            "child_names": child_names,
        })

    # derived-from is a list of dict with 'accession' and 'label' (as name)
    parent_list = data.pop("derived-from", None)
    if parent_list:
        data.update({
            "parent_ids": [parent.get("accession") for parent in parent_list],
            "parent_names": [parent.get("label") for parent in parent_list],
        })

    # disease-list is a list of dict with 'accession', 'database', 'label' (as name)
    disease_list = data.pop("disease-list", None)
    if disease_list:
        data.update({
            "disease_ids": [disease.get("accession") for disease in disease_list],
            "disease_databases": [disease.get("database") for disease in disease_list],
            "disease_names": [disease.get("label") for disease in disease_list],
        })

    # xref-list is a list of dict with 'accession', 'database' etc...
    # Only cross-references to the four databases below are kept.
    xref_list = data.pop("xref-list", None)
    if xref_list:
        xrefs_by_database = {"BTO": [], "EFO": [], "CLO": [], "ChEMBL-Cells": []}
        for xref in xref_list:
            database = xref.get("database")
            if database in xrefs_by_database:
                xrefs_by_database[database].append(xref.get("accession"))
        data.update({
            "BTO_id": xrefs_by_database["BTO"],
            "EFO_id": xrefs_by_database["EFO"],
            "CLO_id": xrefs_by_database["CLO"],
            "CHEMBL_id": xrefs_by_database["ChEMBL-Cells"],
        })

    # derived-from-site-list is a list of dict with 'value' and many more.
    # NOTE(review): the 'label' key is read here although the description above
    # mentions 'value' -- confirm against an actual API response.
    site_list = data.pop("derived-from-site-list", None)
    if site_list:
        data["organ"] = [site.get("label") for site in site_list]

    # cell-type is a dict with 'value' and 'xref', the latter being a dict with
    # the accession id, the database and the label.
    cell_type = data.pop("cell-type", None)
    if cell_type:
        cell_type_xref = cell_type["xref"]
        data.update({
            "cell_type": cell_type_xref["label"],
            "cell_type_id": cell_type_xref["accession"],
            "cell_type_database": cell_type_xref["database"],
        })

    data_rows.append(data)


def _is_present(value):
    """Return False for missing scalars (None/NaN) and for empty lists, True otherwise."""
    if isinstance(value, list):
        return len(value) != 0
    return pd.notna(value)


def _unwrap_singleton(value):
    """Replace a one-element list by its only element; leave anything else untouched."""
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value


tahoe_cellosaurus_df = pd.DataFrame(data_rows)
# Cells flagged False in the mask (None/NaN or empty list) are replaced by NaN ...
presence_mask = tahoe_cellosaurus_df.map(_is_present)
# ... and the remaining single-element lists are then collapsed to their element.
tahoe_cellosaurus_df = tahoe_cellosaurus_df.where(presence_mask).map(_unwrap_singleton)