In [1]:
from pathlib import Path
import pandas as pd
import hashlib
from tqdm import tqdm

In [2]:
def get_md5sum(file_path: Path, chunk_size: int = 131072) -> str:
    """Compute the MD5 checksum of a file.

    The file is opened in binary mode and hashed piece by piece, so files
    larger than the available memory can be processed.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration. Defaults to
            131072 (128 KiB), the value previously hard-coded here.

    Returns:
        The hexadecimal MD5 digest of the file contents.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    # A chunk size of 0 would make the first read return b"" and silently
    # yield the digest of an empty file, so reject it up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops as soon as read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

In [3]:
# Staging folder that receives the symlinks created in the cells below.
# NOTE(review): the constant name says SCRNASEQ while the path ends in
# chip_seq — presumably carried over from a scRNA-seq notebook; the name is
# kept so that the later cells referencing it still resolve.
SYMLINK_SCRNASEQ_FOLDER = Path(
    "/data/torsten/lara-haematopoesis-mouse/geo_upload_space/chip_seq"
)
# Create the folder together with any missing parents; no error if it exists.
SYMLINK_SCRNASEQ_FOLDER.mkdir(parents=True, exist_ok=True)

## Create symlinks

### RAW data

In [15]:
# Root directory that is searched recursively for the raw *.fastq.gz files.
ROOT_RAW_DATA_DIR = Path(
    "/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse_chip"
)

Laura had already filled in the metadata except for the checksums.
So I just need to keep the filenames as they are and compute the checksums.

In [16]:
# Collect every gzipped FASTQ file below the raw-data root.
# Only the last two suffixes are compared: a name such as
# "sample.R1.fastq.gz" has suffixes [".R1", ".fastq", ".gz"] and would be
# skipped by an equality test against the full suffix list.
fastq_files = [
    p
    for p in ROOT_RAW_DATA_DIR.rglob("*")
    if p.is_file() and p.suffixes[-2:] == [".fastq", ".gz"]
]

In [9]:
# Map each raw FASTQ file name to its MD5 checksum (progress shown by tqdm).
raw_md5sums = {
    fastq_path.name: get_md5sum(fastq_path) for fastq_path in tqdm(fastq_files)
}


100%|██████████| 8/8 [03:52<00:00, 29.01s/it]


In [17]:
# Link every raw FASTQ file into the staging folder under its own file name.
for p in tqdm(fastq_files):
    new_file = SYMLINK_SCRNASEQ_FOLDER / p.name
    # is_file()/exists() follow symlinks, so a dangling link left over from an
    # earlier run looks "absent" and symlink_to() would then raise
    # FileExistsError. Remove such a broken link so it can be recreated.
    if new_file.is_symlink() and not new_file.exists():
        new_file.unlink()
    # Anything that already exists at the destination is left untouched.
    if not new_file.exists():
        new_file.symlink_to(p)

100%|██████████| 8/8 [00:00<00:00, 16.95it/s]


### Processed data

In [10]:
# Root directory that is searched recursively for the processed files.
ROOT_PROCESSED_DATA_DIR = Path(
    "/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse_chip_processed"
)

In [11]:
# Collect every regular file below the processed-data root.
# rglob("*") also yields directories; those must be skipped because the
# checksum step opens each entry as a file and the symlink step links it.
processed_files = [p for p in ROOT_PROCESSED_DATA_DIR.rglob("*") if p.is_file()]

In [12]:
# Show the collected paths to check what will be checksummed and symlinked.
processed_files

[PosixPath('/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse_chip_processed/ChIP-Seq.pf.res_dds_TGFbvsNormal.csv'),
 PosixPath('/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse_chip_processed/Normal_H2azAc_ChIP18_sample.bw'),
 PosixPath('/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse_chip_processed/Normal_H2azAc_ChIP19_sample.bw'),
 PosixPath('/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse_chip_processed/TGFb_H2azAc_ChIP18_sample.bw'),
 PosixPath('/data/buckets/rrx-datascience-dev/torsten/lara-haematopoesis-mouse_chip_processed/TGFb_H2azAc_ChIP19_sample.bw')]

In [13]:
# Map each processed file name to its MD5 checksum (progress shown by tqdm).
parsed_md5sums = {
    processed_path.name: get_md5sum(processed_path)
    for processed_path in tqdm(processed_files)
}

100%|██████████| 5/5 [00:33<00:00,  6.71s/it]


In [18]:
# Link every processed file into the staging folder under its own file name.
for p in tqdm(processed_files):
    new_file = SYMLINK_SCRNASEQ_FOLDER / p.name
    # is_file()/exists() follow symlinks, so a dangling link left over from an
    # earlier run looks "absent" and symlink_to() would then raise
    # FileExistsError. Remove such a broken link so it can be recreated.
    if new_file.is_symlink() and not new_file.exists():
        new_file.unlink()
    # Anything that already exists at the destination is left untouched.
    if not new_file.exists():
        new_file.symlink_to(p)

100%|██████████| 5/5 [00:00<00:00, 21.55it/s]


## Prepare GEO metadata table

### MD5 checksums

In [14]:
def _write_checksum_table(checksums, workbook):
    """Write a name -> checksum mapping to an Excel workbook.

    The index column is labelled "file name" and the single value column
    "file checksum".
    """
    checksum_series = pd.Series(checksums, name="file checksum")
    checksum_table = checksum_series.rename_axis("file name").to_frame()
    checksum_table.to_excel(workbook)


# One workbook per file category, written in the same order as before.
_write_checksum_table(raw_md5sums, "MD5_RAW_FILES.xlsx")
_write_checksum_table(parsed_md5sums, "MD5_PARSED_FILES.xlsx")

In [17]:
# Sanity check: show the raw-file checksum mapping.
# NOTE(review): the saved output of this cell is an empty dict, although the
# checksum loop reports 8 files processed — presumably stale output from an
# out-of-order run; confirm with Restart Kernel -> Run All.
raw_md5sums

{}