In [None]:
import gzip
import os
import tarfile

import pandas as pd
import scanpy as sc
from utils import download_file

In [None]:
# Dataset identifier; the output directory is named after it.
name = "ReplogleNorman2020_E7"
dir_path = name

# Create the output directory; an already existing directory is not an error.
os.makedirs(dir_path, exist_ok=True)

Download the raw data and extract the downloaded tar file:

In [None]:
# GEO download endpoint for the GSE146194 raw-data archive.
url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE146194&format=file"
tar_file_path = os.path.join(dir_path, "GSE146194_RAW.tar")
extract_dir_path = os.path.join(dir_path, "GSE146194_RAW")

download_file(url=url, path=tar_file_path)

# The archive was fetched from the network, so extract it with the "data"
# filter: it refuses members that would be written outside the destination
# directory (absolute paths, "..", links pointing outside) and ignores
# special files. Passing filter=None leaves the choice to the interpreter
# version and triggers a DeprecationWarning on Python 3.12/3.13.
with tarfile.open(name=tar_file_path, mode="r") as tar:
    tar.extractall(path=extract_dir_path, filter="data")

Load the data for experiment 7 (CRISPRi multiplexing):

In [None]:
# File-name prefix of the experiment 7 files inside the extracted archive.
exp7_prefix = "GSM4367985_exp7."

# Read the 10x matrix/barcodes/features triple, using gene IDs as var_names.
adata = sc.read_10x_mtx(
    extract_dir_path,
    var_names="gene_ids",
    cache=False,
    prefix=exp7_prefix,
)

Add info from the `*.cell_identities.csv.gz` file to `adata`:

In [None]:
cell_identities_file_path = os.path.join(
    extract_dir_path, "GSM4367985_exp7.cell_identities.csv.gz"
)
barcodes_file_path = os.path.join(extract_dir_path, "GSM4367985_exp7.barcodes.tsv.gz")

# Per-cell annotations; the first row of the file is used as the header.
with gzip.open(filename=cell_identities_file_path, mode="r") as cell_identities_file:
    cell_identities_df = pd.read_csv(cell_identities_file)

# Barcode list; the file has no header row, so the column is named here.
with gzip.open(filename=barcodes_file_path, mode="r") as barcodes_file:
    barcodes_df = pd.read_csv(barcodes_file, header=None, names=["cell_barcode"])

# Left-join on the barcode so that every barcode from the barcodes file is
# kept (rows without a matching annotation are left with missing values),
# then index by barcode so the index matches the obs_names of adata, and
# finally convert all columns to strings.
merged_df = (
    barcodes_df.merge(cell_identities_df, on="cell_barcode", how="left")
    .set_index("cell_barcode")
    .astype(str)
)

# Use the merged annotations as obs of adata.
adata.obs = merged_df

Add info from the `*.features.tsv.gz` file to `adata`:

In [None]:
features_file_path = os.path.join(extract_dir_path, "GSM4367985_exp7.features.tsv.gz")

# The features file is tab-separated and has no header row, so the separator
# and the column names must be given explicitly. With the default comma
# separator each whole line would end up in the first column.
with gzip.open(filename=features_file_path, mode="r") as features_file:
    features_df = pd.read_csv(
        filepath_or_buffer=features_file,
        sep="\t",
        header=None,
        names=["gene_id", "gene_name", "gene_expression"],
    )

# Index by gene ID so the features_df index matches the var_names of adata
# (adata was loaded with var_names="gene_ids").
features_df = features_df.set_index(keys="gene_id")

# Convert all columns to strings.
features_df = features_df.astype(dtype=str)

# Add the features_df as var to adata.
adata.var = features_df

In [None]:
# TODO: In adata.obs:
# - Rename "gene" | "type" | "target" -> "condition"
# - Rename "cell_barcode" | "type" | "gene_type" -> "cell_type"

Save the data to an H5AD file:

In [None]:
# The H5AD file is written into the dataset directory.
h5ad_file_path = os.path.join(dir_path, "adata.h5ad")
adata.write(h5ad_file_path)