In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd

import kraft

In [None]:
# Directory that holds the CCLE mutation input file; the derived .tsv tables
# produced further down are written into this same directory.
# Note: the value already ends with "/", and later cells join with "{}/...",
# so the resulting paths contain a doubled slash.
directory_path = "/home/kwat/garden/data/cancer_cell_line/mutation/"

# NOTE(review): presumably prepares/ensures the directory path exists —
# confirm against kraft.path.path.
kraft.path.path(directory_path)

In [None]:
# Read six columns of the CCLE mutation table (picked by their position in the
# file) and convert the result to a NumPy array; later cells address these
# columns by position 0-5.
# NOTE(review): the file carries a .csv extension but is parsed as
# tab-separated — confirm the delimiter.
mutation_file_path = "{}/CCLE_mutations.csv".format(directory_path)

table = pd.read_csv(
    mutation_file_path, sep="\t", usecols=(0, 7, 13, 18, 32, 33)
).to_numpy()

# Number of rows read.
table.shape[0]

In [None]:
# Keep only the rows whose column 4 equals "damaging".
is_damaging = table[:, 4] == "damaging"

table = table[is_damaging]

# Number of rows that survive the filter.
table.shape[0]

In [None]:
# Replace the cell-line names in column 5 with their renamed form, in place.
# NOTE(review): the renaming rules live in kraft.cell_line.rename — confirm
# what it does with names it cannot resolve.
renamed_cell_lines = kraft.cell_line.rename(table[:, 5])

table[:, 5] = renamed_cell_lines

cell_lines = table[:, 5]

# The first item returned by map_int is used later as a
# cell-line-name -> column-position lookup.
cell_line_to_i = kraft.array.map_int(cell_lines)[0]

# Number of entries in the lookup.
len(cell_line_to_i)

In [None]:
# Column 0 of the table holds the gene of each mutation row.
genes = table[:, 0]

# The first item returned by map_int is used later as a
# gene -> row-position lookup for the gene-level count matrix.
# NOTE(review): presumably one entry per distinct gene — confirm against
# kraft.array.map_int.
gene_to_i = kraft.array.map_int(genes)[0]

# Number of entries in the lookup.
len(gene_to_i)

In [None]:
def combine(gene_str):
    """Build a "<gene>_<annotation>" label from a (gene, annotation) pair.

    Parameters
    ----------
    gene_str : sequence of length 2
        The gene followed by its annotation. An annotation that is not a
        ``str`` (e.g. a missing value) is treated as the empty string.

    Returns
    -------
    numpy.ndarray
        A 0-d array of dtype ``object`` wrapping the label. Returning an
        object array lets ``np.apply_along_axis`` allocate an object-dtype
        result for the labels.
    """

    gene, annotation = gene_str

    suffix = annotation if isinstance(annotation, str) else ""

    label = "{}_{}".format(gene, suffix)

    return np.asarray(label, dtype="object")

In [None]:
# Pair column 0 (gene) with column 1 and turn every row into a single
# "<gene>_<value>" label through `combine`.
gene_and_variant_classification = table[:, [0, 1]]

variant_classifications = np.apply_along_axis(
    combine, 1, gene_and_variant_classification
)

# The first item returned by map_int is used later as a label -> row-position lookup.
variant_classification_to_i = kraft.array.map_int(variant_classifications)[0]

# Number of entries in the lookup.
len(variant_classification_to_i)

In [None]:
# Pair column 0 (gene) with column 3 and turn every row into a single
# "<gene>_<value>" label through `combine`.
gene_and_protein_change = table[:, [0, 3]]

protein_changes = np.apply_along_axis(combine, 1, gene_and_protein_change)

# The first item returned by map_int is used later as a label -> row-position lookup.
protein_change_to_i = kraft.array.map_int(protein_changes)[0]

# Number of entries in the lookup.
len(protein_change_to_i)

In [None]:
# Allocate one zero-filled integer count matrix per feature level.
# Rows follow the feature lookup, columns follow the cell-line lookup.
n_cell_line = len(cell_line_to_i)

gene_x_cell_line = np.zeros((len(gene_to_i), n_cell_line), dtype=int)

variant_classification_x_cell_line = np.zeros(
    (len(variant_classification_to_i), n_cell_line), dtype=int
)

protein_change_x_cell_line = np.zeros(
    (len(protein_change_to_i), n_cell_line), dtype=int
)

In [None]:
# Tally every mutation row into the three count matrices: per gene, per
# gene + variant classification, and per gene + protein change.
#
# The combined "<gene>_<value>" labels were already built once per table row
# (`variant_classifications`, `protein_changes`, same row order as `table`), and
# `combine` already maps non-string values to "". They are therefore reused
# here instead of being re-normalized and re-built through `combine` + `str`
# on every iteration.
#
# Guard against stale notebook state: the label arrays must line up with the
# current `table` row for row, otherwise `zip` would silently truncate.
assert len(variant_classifications) == len(table) == len(protein_changes)

for gene, cell_line, gene_variant_classification, gene_protein_change in zip(
    table[:, 0], table[:, 5], variant_classifications, protein_changes
):

    cell_line_i = cell_line_to_i[cell_line]

    gene_x_cell_line[gene_to_i[gene], cell_line_i] += 1

    variant_classification_x_cell_line[
        variant_classification_to_i[gene_variant_classification], cell_line_i
    ] += 1

    protein_change_x_cell_line[
        protein_change_to_i[gene_protein_change], cell_line_i
    ] += 1

In [None]:
# Wrap the gene-level counts in a labeled DataFrame. This rebinds
# `gene_x_cell_line` from the raw count array to the DataFrame.
# NOTE(review): passing the lookups as index/columns assumes iterating them
# yields the labels in the same order as their positions — confirm against
# kraft.array.map_int.
gene_x_cell_line_path = "{}/gene_x_cell_line.tsv".format(directory_path)

gene_x_cell_line = pd.DataFrame(
    gene_x_cell_line,
    index=gene_to_i,
    columns=cell_line_to_i,
)

gene_x_cell_line.columns.name = "Cell Line"

gene_x_cell_line.index.name = "Gene"

# NOTE(review): presumably validates the axes and raises on a problem — confirm.
kraft.dataframe.error_axes(gene_x_cell_line)

gene_x_cell_line.to_csv(gene_x_cell_line_path, sep="\t")

gene_x_cell_line

In [None]:
# Cap every gene-level value at 1 and save the capped table next to the
# count table.
gene_x_cell_line_x_01_path = "{}/gene_x_cell_line_x_01.tsv".format(directory_path)

gene_x_cell_line_x_01 = gene_x_cell_line.clip(upper=1)

gene_x_cell_line_x_01.to_csv(gene_x_cell_line_x_01_path, sep="\t")

gene_x_cell_line_x_01

In [None]:
# Wrap the gene + variant-classification counts in a labeled DataFrame. This
# rebinds `variant_classification_x_cell_line` from the raw count array to the
# DataFrame.
# NOTE(review): passing the lookups as index/columns assumes iterating them
# yields the labels in the same order as their positions — confirm against
# kraft.array.map_int.
variant_classification_x_cell_line_path = (
    "{}/gene_variant_classification_x_cell_line.tsv".format(directory_path)
)

variant_classification_x_cell_line = pd.DataFrame(
    variant_classification_x_cell_line,
    index=variant_classification_to_i,
    columns=cell_line_to_i,
)

variant_classification_x_cell_line.columns.name = "Cell Line"

variant_classification_x_cell_line.index.name = "Variant Classification"

# NOTE(review): presumably validates the axes and raises on a problem — confirm.
kraft.dataframe.error_axes(variant_classification_x_cell_line)

variant_classification_x_cell_line.to_csv(
    variant_classification_x_cell_line_path, sep="\t"
)

variant_classification_x_cell_line

In [None]:
# Wrap the gene + protein-change counts in a labeled DataFrame. This rebinds
# `protein_change_x_cell_line` from the raw count array to the DataFrame.
# NOTE(review): passing the lookups as index/columns assumes iterating them
# yields the labels in the same order as their positions — confirm against
# kraft.array.map_int.
protein_change_x_cell_line_path = "{}/gene_protein_change_x_cell_line.tsv".format(
    directory_path
)

protein_change_x_cell_line = pd.DataFrame(
    protein_change_x_cell_line,
    index=protein_change_to_i,
    columns=cell_line_to_i,
)

protein_change_x_cell_line.columns.name = "Cell Line"

protein_change_x_cell_line.index.name = "Protein Change"

# NOTE(review): presumably validates the axes and raises on a problem — confirm.
kraft.dataframe.error_axes(protein_change_x_cell_line)

protein_change_x_cell_line.to_csv(protein_change_x_cell_line_path, sep="\t")

protein_change_x_cell_line