In [1]:
from pathlib import Path
import pandas as pd
import hashlib
from tqdm import tqdm
import concurrent.futures

In [2]:
def get_md5sum(file_path: Path, chunk_size: int = 131072) -> str:
    """Compute the MD5 checksum of a file.

    The file is opened in binary mode and read in fixed-size chunks, so the
    whole file never has to be held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration. Must be positive.
            Defaults to 131072 (128 KiB).

    Returns:
        The hexadecimal MD5 digest of the file contents.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b"" right away (the loop would hash nothing) and a
        # negative size reads the whole file in one go; reject both explicitly.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


# def compute_md5_parallel(file_paths: list[Path]) -> list[str]:
#     """Compute MD5 checksums for multiple files in parallel, with a progress bar."""
#     # Initialize a ProcessPoolExecutor
#     with concurrent.futures.ProcessPoolExecutor() as executor:
#         # Create a list to hold the futures
#         futures = [executor.submit(get_md5sum, file_path) for file_path in file_paths]

#         # Use tqdm to create a progress bar for the futures
#         results = []
#         for future in tqdm(
#             concurrent.futures.as_completed(futures),
#             total=len(file_paths),
#             desc="Computing MD5",
#         ):
#             results.append(future.result())

#     return results

In [3]:
# Staging directory that receives the symlinks created by the cells below
# (raw FASTQ files and processed files are both linked into this one folder).
SYMLINK_SCRNASEQ_FOLDER = Path(
    "/data/torsten/lara-haematopoesis-mouse/geo_upload_space/scrna_seq"
)
# Create the directory together with any missing parents; no error if it exists.
SYMLINK_SCRNASEQ_FOLDER.mkdir(parents=True, exist_ok=True)

## Create symlinks

### RAW data

In [4]:
# Root directory that is searched recursively for gzipped FASTQ files.
ROOT_RAW_DATA_DIR = Path(
    "/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse"
)

In [5]:
# Collect every gzipped FASTQ file below the raw-data root.
# The result is sorted because directory traversal order depends on the
# filesystem; a stable order keeps the per-directory file lists (and therefore
# the "file name 1"/"file name 2" columns built from them later) reproducible
# across runs.
fastq_files = sorted(
    p
    for p in ROOT_RAW_DATA_DIR.rglob("*")
    if p.is_file() and p.suffixes == [".fastq", ".gz"]
)

In [6]:
# sample name -> link names of all raw files belonging to that sample
sample_to_raw_files = {}
# source directory -> link names of the FASTQ files found in that directory
parent_to_fastq = {}
# link name -> MD5 checksum; stays empty while the hashing line below is disabled
raw_md5sums = {}

for p in tqdm(fastq_files):
    # Flatten the directory layout into one link name:
    # "<part of the file name before the first dot>_<run directory>.fastq.gz".
    # assumes the layout <sample>/<run>/<subdir>/<file> — TODO confirm
    stem = p.name.split(".", maxsplit=1)[0]
    run_name = p.parts[-3]
    new_name = stem + "_" + run_name + ".fastq.gz"
    # NOTE(review): a leading "3" is stripped from the link name; the reason is
    # not visible in this notebook — confirm it is still required.
    if new_name.startswith("3"):
        new_name = new_name[1:]
    new_file = SYMLINK_SCRNASEQ_FOLDER / new_name

    # A dangling link left over from an earlier run is neither a file nor absent:
    # is_file() is False, yet symlink_to() would raise FileExistsError. Remove it
    # so the cell can be re-run safely.
    if new_file.is_symlink() and not new_file.exists():
        new_file.unlink()
    if not new_file.is_symlink() and not new_file.is_file():
        new_file.symlink_to(p)

    sample = p.parts[-4]
    sample_to_raw_files.setdefault(sample, []).append(new_file.name)
    parent_to_fastq.setdefault(p.parent, []).append(new_file.name)
    # this is slow:
    # raw_md5sums[new_file.name] = get_md5sum(p)

# Every source directory must contribute exactly two distinct link names,
# because the paired-end table built later has exactly two file-name columns.
for parent, val in parent_to_fastq.items():
    assert len(set(val)) == 2, f"expected 2 distinct FASTQ files in {parent}, got {val}"

100%|██████████| 80/80 [00:03<00:00, 24.65it/s]


### Processed data

In [7]:
# Root directory whose contents are collected recursively and symlinked into
# the staging folder as processed data.
ROOT_PROCESSED_DATA_DIR = Path(
    "/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse_processed"
)

In [8]:
# Collect the processed files below the processed-data root.
# Only regular files are kept (mirroring the raw-data cell): rglob("*") also
# yields directories, and a symlink to a directory is not an upload file and
# makes the linking cell fail with FileExistsError when it is re-run.
# Sorting keeps the order independent of filesystem traversal order.
processed_files = sorted(
    p for p in ROOT_PROCESSED_DATA_DIR.rglob("*") if p.is_file()
)

In [9]:
# link name -> MD5 checksum; stays empty while the hashing line below is disabled
parsed_md5sums = {}
for p in tqdm(processed_files):
    # Processed entries keep their original name inside the staging folder.
    new_name = p.name
    new_file = SYMLINK_SCRNASEQ_FOLDER / new_name

    # A dangling link left over from an earlier run would make symlink_to()
    # raise FileExistsError; remove it so the link is recreated.
    if new_file.is_symlink() and not new_file.exists():
        new_file.unlink()
    # Skip anything that is already linked (including links that point at
    # directories, for which is_file() alone would be False) or already a file.
    if not new_file.is_symlink() and not new_file.is_file():
        new_file.symlink_to(p)
    # parsed_md5sums[new_file.name] = get_md5sum(p)

100%|██████████| 20/20 [00:00<00:00, 20.90it/s]


## Prepare GEO metadata table
### SAMPLES

In [10]:
# Sample identifiers, one per row of the SAMPLES table.
# They are used as "*library name"/"*title", as the prefix of the expected
# processed file names, and as the join key against the per-sample raw-file
# table (whose keys come from the raw-data directory names).
sample_names = [
    "Activated_OP1L_NM_NA_Rep1",
    "Activated_OP1L_NM_NA_Rep2",
    "Quiescent_OP1L_NM_NA_Rep1",
    "Quiescent_OP1L_NM_NA_Rep2",
    "exVivo_OP2_IL1b_1",
    "exVivo_OP2_IL1b_2",
    "exVivo_OP2_resting_1",
    "exVivo_OP2_resting_2",
    "exVivo_OP2_TGFb_1",
    "exVivo_OP2_TGFb_2",
]

In [11]:
# For every sample, the two processed file names listed in the SAMPLES table:
# the filtered feature-barcode matrix and the per-cell protospacer calls.
sample_to_p_files = {
    s: [
        f"{s}_filtered_feature_bc_matrix.h5",
        f"{s}_protospacer_calls_per_cell.csv",
    ]
    for s in sample_names
}

In [12]:
# Metadata fields that carry the same value for every sample; scalar values are
# broadcast to all rows by the DataFrame constructor. Fields set to None are
# left empty in this table.
shared_fields = {
    "*organism": "Mus musculus",
    "**tissue": "Heart",
    "**cell line": None,
    "**cell type": "Fibroblast",
    "genotype": None,
    "treatment": None,
    "*molecule": None,
    "*single or paired-end": "paired-end",
    "*instrument model": "Illumina NextSeq 2000",
    "description": None,
}

# One row per sample; library name and title both use the sample identifier.
samples = pd.DataFrame(
    {"*library name": sample_names, "*title": sample_names, **shared_fields}
)

In [13]:
def _files_to_columns(files_by_sample, column_label):
    """Expand a ``{sample: [file, ...]}`` mapping into a sample-indexed table.

    Each list element becomes its own column and every column receives the same
    header ``column_label``, so the header is repeated once per file column.
    """
    table = pd.Series(files_by_sample).apply(pd.Series)
    table.columns = [column_label] * table.shape[1]
    return table


processed_files = _files_to_columns(sample_to_p_files, "*processed data file")

raw_files = _files_to_columns(sample_to_raw_files, "raw file")

In [14]:
# Attach the processed-file and raw-file columns to the sample metadata by
# matching each table's index against the "*library name" column, then export.
# NOTE(review): this cell rebinds `samples`, so it is presumably meant to be run
# once per kernel session.
join_key = "*library name"
samples_with_processed = samples.join(processed_files, on=join_key)
samples = samples_with_processed.join(raw_files, on=join_key)
samples.to_excel("SAMPLES.xlsx")

### PAIRED-END EXPERIMENTS

In [15]:
# One row per source directory, one column per FASTQ link name found in it.
# The directory index is dropped so only the two file names remain.
fastq_by_parent = pd.Series(parent_to_fastq)
p_e_exps = fastq_by_parent.apply(pd.Series)
p_e_exps = p_e_exps.reset_index(drop=True)
p_e_exps.columns = ["file name 1", "file name 2"]
p_e_exps.to_excel("PAIRED_END.xlsx")

### MD5 checksums

In [16]:
# Write each checksum mapping as a two-column sheet:
# "file name" (the index) and "file checksum" (the values).
for checksums, workbook in (
    (raw_md5sums, "MD5_RAW_FILES.xlsx"),
    (parsed_md5sums, "MD5_PARSED_FILES.xlsx"),
):
    checksum_series = pd.Series(checksums, name="file checksum")
    checksum_table = checksum_series.rename_axis("file name").to_frame()
    checksum_table.to_excel(workbook)

In [17]:
# Show the raw-file checksums; the mapping is empty as long as the hashing line
# in the raw-data symlink loop stays commented out.
raw_md5sums

{}