In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd

import kraft

In [None]:
# Notebook settings read from setting.json; SETTING["directory_path"] is used
# below as the parent of the mutation/ working directory.
SETTING = kraft.json.read("setting.json")

In [None]:
# Everything this notebook downloads and writes goes under <settings directory>/mutation/.
directory_path = "{root}/mutation/".format(root=SETTING["directory_path"])

# presumably creates the directory when it is missing -- confirm against kraft.path.path
kraft.path.path(directory_path)

In [None]:
# Files to fetch into directory_path (presumably the source of CCLE_mutations.csv
# read in the next cell -- confirm).
urls = ("https://ndownloader.figshare.com/files/22629110",)

for url in urls:

    download_result = kraft.internet.download(url, directory_path)

    print(download_result)

In [None]:
mutation_file_path = "{}/CCLE_mutations.csv".format(directory_path)

# The file is parsed as tab-separated despite its .csv extension. Only five
# columns are kept, selected by position in the file; the following cells
# address them as positions 0-4 of the resulting array.
table = pd.read_csv(
    mutation_file_path, sep="\t", usecols=(0, 7, 18, 32, 33)
).to_numpy()

len(table)

In [None]:
# Position 3 of the trimmed table (file column 32) is the annotation being
# filtered on; keep only the rows labelled "damaging".
is_damaging = table[:, 3] == "damaging"

table = table[is_damaging]

len(table)

In [None]:
# Split the table into one array per kept column. Position 3 (the column the
# previous cell filtered on) is left out.
genes, variant_classifications, protein_changes, cell_lines = table.T[[0, 1, 2, 4]]

In [None]:
# presumably standardizes the raw cell line labels -- confirm against
# kraft.name_biology.name_cell_lines. Note this rebinds cell_lines, so running
# the cell twice applies the naming helper to already-named values.
cell_lines = kraft.name_biology.name_cell_lines(cell_lines)

# Only the first item returned by map_int is used: it is looked up by cell line
# label below to get a column position.
cell_line_maps = kraft.array.map_int(cell_lines)

cell_line_to_i = cell_line_maps[0]

len(cell_line_to_i)

In [None]:
# Only the first item returned by map_int is used: it is looked up by gene
# below to get a row position.
gene_maps = kraft.array.map_int(genes)

gene_to_i = gene_maps[0]

len(gene_to_i)

In [None]:
def combine(gene, str_):
    """Build the label "<gene>_<str_>".

    Any str_ that is not a string (for example the float NaN that stands in
    for a missing table cell) is treated as empty, giving "<gene>_".
    """

    suffix = str_ if isinstance(str_, str) else ""

    return "{}_{}".format(gene, suffix)

In [None]:
# One "<gene>_<variant classification>" label per table row.
gene_variant_classifications = np.asarray(
    [
        combine(gene_, variant_classification_)
        for gene_, variant_classification_ in zip(genes, variant_classifications)
    ]
)

# Only the first item returned by map_int is used (label -> row position).
gene_variant_classification_maps = kraft.array.map_int(gene_variant_classifications)

gene_variant_classification_to_i = gene_variant_classification_maps[0]

len(gene_variant_classification_to_i)

In [None]:
# One "<gene>_<protein change>" label per table row.
gene_protein_changes = np.asarray(list(map(combine, genes, protein_changes)))

# Only the first item returned by map_int is used (label -> row position).
gene_protein_change_maps = kraft.array.map_int(gene_protein_changes)

gene_protein_change_to_i = gene_protein_change_maps[0]

len(gene_protein_change_to_i)

In [None]:
# Integer count matrices, all starting at zero: one row per distinct label and
# one column per distinct cell line.
n_cell_line = len(cell_line_to_i)

gene_x_cell_line = np.zeros((len(gene_to_i), n_cell_line), dtype=int)

gene_variant_classification_x_cell_line = np.zeros(
    (len(gene_variant_classification_to_i), n_cell_line), dtype=int
)

gene_protein_change_x_cell_line = np.zeros(
    (len(gene_protein_change_to_i), n_cell_line), dtype=int
)

In [None]:
# For every table row, add one to the row's cell line column in each of the
# three count matrices: at its gene, at its "<gene>_<variant classification>"
# label and at its "<gene>_<protein change>" label.
#
# The two combined labels were produced by combine() and stored in NumPy string
# arrays, so they are always str instances here; they are looked up directly,
# without any missing-value substitution.
#
# The counts are accumulated in place: re-running this cell without first
# re-creating the zero matrices double-counts every mutation.
for gene, gene_variant_classification, gene_protein_change, cell_line in zip(
    genes, gene_variant_classifications, gene_protein_changes, cell_lines
):

    cell_line_i = cell_line_to_i[cell_line]

    gene_x_cell_line[gene_to_i[gene], cell_line_i] += 1

    gene_variant_classification_x_cell_line[
        gene_variant_classification_to_i[gene_variant_classification], cell_line_i
    ] += 1

    gene_protein_change_x_cell_line[
        gene_protein_change_to_i[gene_protein_change], cell_line_i
    ] += 1

In [None]:
# Wrap the gene counts in a labelled frame: rows are genes, columns are cell lines.
gene_x_cell_line = pd.DataFrame(
    gene_x_cell_line, index=gene_to_i, columns=cell_line_to_i
).rename_axis(index="Gene", columns="Cell Line")

# presumably raises when the axes are invalid (e.g. duplicated labels) --
# confirm against kraft.dataframe.error_axes
kraft.dataframe.error_axes(gene_x_cell_line)

gene_x_cell_line_file_path = "{}/gene_x_cell_line.tsv".format(directory_path)

gene_x_cell_line.to_csv(gene_x_cell_line_file_path, sep="\t")

gene_x_cell_line

In [None]:
# Binarize the counts: any value above 1 is capped at 1, zeros stay 0.
gene_x_cell_line_x_01 = gene_x_cell_line.clip(upper=1)

gene_x_cell_line_x_01_file_path = "{}/gene_x_cell_line_x_01.tsv".format(
    directory_path
)

gene_x_cell_line_x_01.to_csv(gene_x_cell_line_x_01_file_path, sep="\t")

gene_x_cell_line_x_01

In [None]:
# Wrap the counts in a labelled frame: rows are "<gene>_<variant classification>"
# labels, columns are cell lines.
gene_variant_classification_x_cell_line = pd.DataFrame(
    gene_variant_classification_x_cell_line,
    columns=cell_line_to_i,
    index=gene_variant_classification_to_i,
)

gene_variant_classification_x_cell_line.columns.name = "Cell Line"

gene_variant_classification_x_cell_line.index.name = "Gene Variant Classification"

# presumably raises when the axes are invalid -- confirm against
# kraft.dataframe.error_axes
kraft.dataframe.error_axes(gene_variant_classification_x_cell_line)

gene_variant_classification_file_path = (
    "{}/gene_variant_classification_x_cell_line.tsv".format(directory_path)
)

gene_variant_classification_x_cell_line.to_csv(
    gene_variant_classification_file_path, sep="\t"
)

gene_variant_classification_x_cell_line

In [None]:
# Wrap the counts in a labelled frame: rows are "<gene>_<protein change>"
# labels, columns are cell lines.
gene_protein_change_x_cell_line = pd.DataFrame(
    gene_protein_change_x_cell_line,
    columns=cell_line_to_i,
    index=gene_protein_change_to_i,
)

gene_protein_change_x_cell_line.columns.name = "Cell Line"

# NOTE(review): the row labels are "<gene>_<protein change>", so this name is
# inconsistent with "Gene Variant Classification" used for the sibling frame.
# It is written to the output file, so it is kept as is here -- confirm whether
# any reader depends on it before renaming.
gene_protein_change_x_cell_line.index.name = "Protein Change"

# presumably raises when the axes are invalid -- confirm against
# kraft.dataframe.error_axes
kraft.dataframe.error_axes(gene_protein_change_x_cell_line)

gene_protein_change_file_path = "{}/gene_protein_change_x_cell_line.tsv".format(
    directory_path
)

gene_protein_change_x_cell_line.to_csv(gene_protein_change_file_path, sep="\t")

gene_protein_change_x_cell_line