In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd

import kraft

In [None]:
# Load the notebook settings from setting.json (path is relative to the working
# directory); SETTING["directory_path"] is used below to locate the data.
SETTING = kraft.json.read("setting.json")

In [None]:
# Directory that every RNAi file in this notebook is downloaded into and read
# from: the "rnai/" subdirectory of the configured directory_path.
directory_path = "{}/rnai/".format(SETTING["directory_path"])

# NOTE(review): presumably ensures the directory exists before downloading;
# confirm against kraft.path.path.
kraft.path.path(directory_path)

In [None]:
# Download URL -> local file name, in download order. A file name of None is
# handed to kraft.internet.download as is.
# NOTE(review): presumably the name is then taken from the server response;
# a later cell reads "D2_combined_gene_dep_scores.csv" — confirm.
url_to_file_name = {
    "https://ndownloader.figshare.com/files/11489723": "shRNAmapping.csv",
    "https://ndownloader.figshare.com/files/11489654": "achilles55kbatch1repcollapsedlfc.csv",
    "https://ndownloader.figshare.com/files/11489657": "achilles55kbatch2repcollapsedlfc.csv",
    "https://ndownloader.figshare.com/files/11489660": "achilles98krepcollapsedlfc.csv",
    "https://ndownloader.figshare.com/files/11489702": "drivepoolalfcmat.csv",
    "https://ndownloader.figshare.com/files/11489705": "drivepoolblfcmat.csv",
    "https://ndownloader.figshare.com/files/13515395": None,
}

for url, file_name in url_to_file_name.items():

    # Show whatever the downloader returns for each file.
    downloaded = kraft.internet.download(url, directory_path, file_name=file_name)

    print(downloaded)

## Log fold change

In [None]:
# barcode -> list of "<barcode>_<gene>" labels, one per mapping row that
# mentions the barcode (a barcode can appear on several rows).
barcode_to_combinations = {}

shrna_mapping_file_path = "{}/shRNAmapping.csv".format(directory_path)

# Only the first two columns are read; each row is unpacked as (barcode, gene).
barcode_gene_pairs = pd.read_csv(shrna_mapping_file_path, usecols=(0, 1)).to_numpy()

for barcode, gene in barcode_gene_pairs:

    combination = "{}_{}".format(barcode, gene)

    barcode_to_combinations.setdefault(barcode, []).append(combination)

# Number of distinct barcodes in the mapping.
len(barcode_to_combinations)

In [None]:
def make_barcode_gene_x_(barcode_x_, barcode_to_labels=None):
    """
    Expand a barcode-indexed dataframe into a "Barcode Gene"-indexed dataframe.

    Each input row is emitted once per label that its index value maps to, so
    a barcode mapped to several labels yields several rows with identical
    values. A barcode missing from the mapping is kept as a single row labeled
    with the barcode itself.

    Parameters
    ----------
    barcode_x_ : pandas.DataFrame
        Dataframe whose index holds the barcodes to relabel.
    barcode_to_labels : dict or None, optional
        Mapping from barcode to an iterable of row labels. When None (the
        default), the notebook-level ``barcode_to_combinations`` is used, which
        keeps existing one-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the same columns as ``barcode_x_``, the expanded
        row labels as index, and the index named "Barcode Gene".
    """

    if barcode_to_labels is None:

        # Backward-compatible default: the mapping built earlier in the notebook.
        barcode_to_labels = barcode_to_combinations

    axis_0_labels = []

    rows = []

    matrix = barcode_x_.to_numpy()

    for row, barcode in zip(matrix, barcode_x_.index.to_numpy()):

        # Unmapped barcodes fall back to a single label: the barcode itself.
        for label in barcode_to_labels.get(barcode, (barcode,)):

            axis_0_labels.append(label)

            rows.append(row)

    barcode_gene_x_ = pd.DataFrame(
        rows, index=axis_0_labels, columns=barcode_x_.columns
    )

    barcode_gene_x_.index.name = "Barcode Gene"

    return barcode_gene_x_

### 50K

In [None]:
# Read both batch files, using the first column of each as the row index.
batch_1_file_path = "{}/achilles55kbatch1repcollapsedlfc.csv".format(directory_path)

dataframe_1 = pd.read_csv(batch_1_file_path, index_col=0)

print(dataframe_1.shape)

batch_2_file_path = "{}/achilles55kbatch2repcollapsedlfc.csv".format(directory_path)

dataframe_2 = pd.read_csv(batch_2_file_path, index_col=0)

print(dataframe_2.shape)

# Stack the two batches row-wise.
_50k_x_cell_line = pd.concat([dataframe_1, dataframe_2])

print(_50k_x_cell_line.shape)

# Collapse rows that share an index label into their per-column median.
_50k_x_cell_line = _50k_x_cell_line.groupby(level=0).median()

print(_50k_x_cell_line.shape)

# Relabel the rows, then rename the columns through kraft.
_50k_x_cell_line = _50k_x_cell_line.pipe(make_barcode_gene_x_)

_50k_cell_lines = kraft.name_biology.name_cell_lines(
    _50k_x_cell_line.columns.to_numpy()
)

_50k_x_cell_line.columns = _50k_cell_lines

# NOTE(review): presumably raises if the axes are invalid (e.g. duplicated
# labels) — confirm against kraft.dataframe.error_axes.
kraft.dataframe.error_axes(_50k_x_cell_line)

_50k_x_cell_line

### 100K

In [None]:
# Read the file, using its first column as the row index.
_100k_file_path = "{}/achilles98krepcollapsedlfc.csv".format(directory_path)

_100k_x_cell_line = pd.read_csv(_100k_file_path, index_col=0)

print(_100k_x_cell_line.shape)

# Relabel the rows, then rename the columns through kraft.
_100k_x_cell_line = _100k_x_cell_line.pipe(make_barcode_gene_x_)

_100k_cell_lines = kraft.name_biology.name_cell_lines(
    _100k_x_cell_line.columns.to_numpy()
)

_100k_x_cell_line.columns = _100k_cell_lines

# NOTE(review): presumably raises if the axes are invalid (e.g. duplicated
# labels) — confirm against kraft.dataframe.error_axes.
kraft.dataframe.error_axes(_100k_x_cell_line)

_100k_x_cell_line

### Drive

In [None]:
# Read both pool files, using the first column of each as the row index.
pool_a_file_path = "{}/drivepoolalfcmat.csv".format(directory_path)

dataframe_a = pd.read_csv(pool_a_file_path, index_col=0)

print(dataframe_a.shape)

pool_b_file_path = "{}/drivepoolblfcmat.csv".format(directory_path)

dataframe_b = pd.read_csv(pool_b_file_path, index_col=0)

print(dataframe_b.shape)

# Stack the two pools row-wise.
drive_x_cell_line = pd.concat([dataframe_a, dataframe_b])

print(drive_x_cell_line.shape)

# Collapse rows that share an index label into their per-column median.
drive_x_cell_line = drive_x_cell_line.groupby(level=0).median()

print(drive_x_cell_line.shape)

# Relabel the rows into a separate frame, then rename its columns through kraft.
drive_gene_x_cell_line = drive_x_cell_line.pipe(make_barcode_gene_x_)

drive_cell_lines = kraft.name_biology.name_cell_lines(
    drive_gene_x_cell_line.columns.to_numpy()
)

drive_gene_x_cell_line.columns = drive_cell_lines

# NOTE(review): presumably raises if the axes are invalid (e.g. duplicated
# labels) — confirm against kraft.dataframe.error_axes.
kraft.dataframe.error_axes(drive_gene_x_cell_line)

drive_gene_x_cell_line

## Demeter (50K, 100K, and Drive)

In [None]:
# Read the combined gene dependency scores, using the first column as the row index.
gene_score_file_path = "{}/D2_combined_gene_dep_scores.csv".format(directory_path)

gene_x_cell_line = pd.read_csv(gene_score_file_path, index_col=0)

# Rows: keep only the first whitespace-separated token of each label.
gene_x_cell_line.index = [
    label.split()[0] for label in gene_x_cell_line.index.to_numpy()
]

gene_x_cell_line.index.name = "Gene"

# Columns: rename through kraft.
gene_x_cell_line.columns = kraft.name_biology.name_cell_lines(
    gene_x_cell_line.columns.to_numpy()
)

gene_x_cell_line.columns.name = "Cell Line"

# NOTE(review): presumably raises if the axes are invalid (e.g. duplicated
# labels) — confirm against kraft.dataframe.error_axes.
kraft.dataframe.error_axes(gene_x_cell_line)

# Save the table as tab-separated values next to the downloaded files.
gene_x_cell_line_file_path = "{}/gene_x_cell_line.tsv".format(directory_path)

gene_x_cell_line.to_csv(gene_x_cell_line_file_path, sep="\t")

gene_x_cell_line