In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

import numpy as np
import pandas as pd

import kraft

In [None]:
# Directory that receives the downloaded CTRP files and every table written by
# this notebook. Set the CTRP_DIRECTORY_PATH environment variable before
# starting the kernel to relocate it; when unset, the original location is used,
# so existing setups keep working unchanged.
directory_path = os.environ.get(
    "CTRP_DIRECTORY_PATH", "/home/kwat/garden/data/cancer_cell_line/ctrp/"
)

# NOTE(review): presumably ensures the directory exists — confirm against
# kraft.path.path.
kraft.path.path(directory_path)

In [None]:
# CTRP v2.0 expanded dataset archive; later cells read the v20.meta.*.txt files
# from directory_path (the helper's name indicates it downloads and extracts).
ctrp_archive_url = "ftp://caftpd.nci.nih.gov/pub/OCG-DCC/CTD2/Broad/CTRPv2.0_2015_ctd2_ExpandedDataset/CTRPv2.0_2015_ctd2_ExpandedDataset.zip"

kraft.internet.download_and_extract(ctrp_archive_url, directory_path)

# AUC table (new-abs-auc-with-qc.txt) that is read further down to build
# compound_x_cell_line.
auc_table_url = "https://github.com/remontoire-pac/ctrp-reference/raw/master/auc/new-abs-auc-with-qc.txt"

kraft.internet.download(auc_table_url, directory_path)

In [None]:
# Per-compound metadata: keep file columns 0, 1, 6 and 7 only.
# NOTE(review): unpacking assumes those columns are, in file order, the compound
# ID, compound name, target gene(s) and activity — confirm against the header.
per_compound_path = "{}/v20.meta.per_compound.txt".format(directory_path)

per_compound_table = pd.read_csv(per_compound_path, sep="\t", usecols=(0, 1, 6, 7))

# Transposing the 2D array yields one row per selected column, so the four
# names below each receive one column of the table.
compound_ids, compounds, genes, activities = per_compound_table.to_numpy().T

## Make gene_x_compound and activity_x_compound

In [None]:
# Guard: compound names are used as the Series index in the following cells, so
# stop here if kraft.array.check_has_duplicate reports any repeated name.
assert not kraft.array.check_has_duplicate(compounds)

In [None]:
# Target-gene annotation keyed by compound name.
gene_by_compound = pd.Series(genes, index=compounds)

# NOTE(review): presumably yields one row per distinct gene value and one column
# per compound — confirm against kraft.series.binarize.
gene_x_compound = kraft.series.binarize(gene_by_compound)

gene_x_compound.index.name = "Gene"

# Persist the table next to the downloaded inputs as tab-separated text.
gene_x_compound_path = "{}/gene_x_compound.tsv".format(directory_path)

gene_x_compound.to_csv(gene_x_compound_path, sep="\t")

# Last expression: rendered as the cell output.
gene_x_compound

In [None]:
# Activity annotation keyed by compound name.
activity_by_compound = pd.Series(activities, index=compounds)

# NOTE(review): presumably yields one row per distinct activity value and one
# column per compound — confirm against kraft.series.binarize.
activity_x_compound = kraft.series.binarize(activity_by_compound)

activity_x_compound.index.name = "Activity"

# Persist the table next to the downloaded inputs as tab-separated text.
activity_x_compound_path = "{}/activity_x_compound.tsv".format(directory_path)

activity_x_compound.to_csv(activity_x_compound_path, sep="\t")

# Last expression: rendered as the cell output.
activity_x_compound

## Make compound_x_cell_line

In [None]:
# Per-cell-line metadata: keep file columns 0 and 1 only.
# NOTE(review): unpacking assumes column 0 is the cell-line ID and column 1 the
# cell-line name — confirm against the file header.
per_cell_line_path = "{}/v20.meta.per_cell_line.txt".format(directory_path)

per_cell_line_table = pd.read_csv(per_cell_line_path, sep="\t", usecols=(0, 1))

cell_line_ids, raw_cell_lines = per_cell_line_table.to_numpy().T

# Pass the names through kraft.cell_line.rename (presumably to canonical
# cell-line names — TODO confirm) and keep the result as an array.
cell_lines = np.asarray(kraft.cell_line.rename(raw_cell_lines))

In [None]:
# AUC table: keep file columns 0, 3 and 5 only.
# NOTE(review): unpacking assumes those columns are, in file order, the score,
# the compound ID and the cell-line ID — confirm against the file header.
auc_table_path = "{}/new-abs-auc-with-qc.txt".format(directory_path)

auc_table = pd.read_csv(auc_table_path, sep="\t", usecols=(0, 3, 5))

map_scores, raw_map_compound_ids, raw_map_cell_line_ids = auc_table.to_numpy().T

# Store the IDs as generic Python objects; they are used as dictionary keys in
# the ID-to-name lookups below.
map_compound_ids = raw_map_compound_ids.astype(object)

map_cell_line_ids = raw_map_cell_line_ids.astype(object)

In [None]:
# ID -> name lookups built from the metadata tables. Pairs are matched by
# position; if an ID repeats, the last name seen wins.
compound_id_to_name = dict(zip(compound_ids, compounds))

cell_line_id_to_name = dict(zip(cell_line_ids, cell_lines))

In [None]:
# Translate every AUC-table row from IDs to names; an ID missing from the
# metadata raises KeyError.
# NOTE: this rebinds compounds and cell_lines from one entry per metadata row to
# one entry per AUC-table row, so earlier cells that use these names should not
# be re-run after this one without reloading the metadata first.
compounds = np.asarray(
    [compound_id_to_name[compound_id] for compound_id in map_compound_ids]
)

cell_lines = np.asarray(
    [cell_line_id_to_name[cell_line_id] for cell_line_id in map_cell_line_ids]
)

In [None]:
# Build the compound-by-cell-line score table from the three parallel arrays.
# NOTE(review): function=min presumably resolves several scores for the same
# (compound, cell line) pair by keeping the smallest — confirm against
# kraft.dataframe.pivot.
pivot_axis_names = {"axis_0_name": "Compound", "axis_1_name": "Cell Line"}

compound_x_cell_line = kraft.dataframe.pivot(
    compounds, cell_lines, map_scores, function=min, **pivot_axis_names
)

# Persist the table next to the downloaded inputs as tab-separated text.
compound_x_cell_line_path = "{}/compound_x_cell_line.tsv".format(directory_path)

compound_x_cell_line.to_csv(compound_x_cell_line_path, sep="\t")

# Last expression: rendered as the cell output.
compound_x_cell_line