## Data Cleaning

- Cleaning the MH dataset to be raw counts only for future processing

In [3]:
import pandas as pd
from pathlib import Path
import os

# Ensure you are always in the parent dir
os.chdir('/home/kyan/git/cv-scdl3991/')
data_path = Path('data/MH/Moffitt_and_Bambah-Mukku_et_al_merfish_all_cells.csv')

In [4]:
# Load the CSV file, csvs are pretty inefficient so this takes a while

df = pd.read_csv(data_path)

# Remove columns 2 to 8 (index 1 to 7)
df = df.drop(df.columns[1:9], axis=1)

In [6]:
# Save the modified DataFrame back to a new CSV file
output_file = data_path.parent / 'MH_merfish_raw_counts.csv'
df.to_csv(output_file, index=False)

In [1]:
import scanpy as sc
import pandas as pd
import math
import numpy as np
from sklearn import metrics
import torch
# Plotting

import seaborn as sns

# System
from pathlib import Path
import os
from GraphST import GraphST
from GraphST.utils import clustering
import itertools

import pyreadr

# Ensure you are always in the parent dir
os.chdir('/home/kyan/git/cv-scdl3991/')
# data_path = Path('data/MH/MH_raw_counts.csv')
data_path = Path('data/')
output_path = Path('outputs/clustering/')
# Warnings 
import warnings
warnings.simplefilter("ignore")

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# setting seed
torch.manual_seed(17)

<torch._C.Generator at 0x7fb5e65db830>

In [14]:
# Function to load all valid DLPC datasets from the DPLC directory
def load_dlpc_datasets(dlpc_dir):
    dlpc_dir = Path(dlpc_dir)
    datasets = []

    # Iterate through all directories in the DPLC folder
    for folder in dlpc_dir.iterdir():
        if folder.is_dir():  # Check if it's a directory
            patient_id = folder.name
            count_file = folder / (patient_id + "_filtered_feature_bc_matrix.h5")
            if count_file.exists():  # Check if the data file exists
                print(f"Loading data from {folder.name}...")
                adata = sc.read_visium(folder, count_file=f"{patient_id}_filtered_feature_bc_matrix.h5")
                adata.uns['name'] = folder.name
                datasets.append(adata)
            else:
                print(f"Skipping folder {folder.name}: no valid data file found.")
    
    return datasets

In [38]:
res = load_dlpc_datasets(data_path/"DLPC")

Loading data from 151671...
Loading data from 151509...
Loading data from 151675...
Loading data from 151507...
Loading data from 151669...
Skipping folder DLPFC12: no valid data file found.
Loading data from 151670...
Loading data from 151674...
Loading data from 151676...
Loading data from 151508...
Loading data from 151510...
Loading data from 151673...
Loading data from 151672...


In [18]:
res[1]

AnnData object with n_obs × n_vars = 4789 × 33538
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial', 'name'
    obsm: 'spatial'

In [4]:
dlpc_dir = Path(data_path/"DLPC")

In [5]:
datasets = []

In [7]:
for folder in dlpc_dir.iterdir():
    print(folder)
    print(folder.name)

data/DLPC/151671
151671
data/DLPC/151509
151509
data/DLPC/151675
151675
data/DLPC/151507
151507
data/DLPC/151669
151669
data/DLPC/DLPFC12
DLPFC12
data/DLPC/151670
151670
data/DLPC/151674
151674
data/DLPC/.DS_Store
.DS_Store
data/DLPC/151676
151676
data/DLPC/151508
151508
data/DLPC/151510
151510
data/DLPC/.DS_Store:Zone.Identifier
.DS_Store:Zone.Identifier
data/DLPC/151673
151673
data/DLPC/151672
151672


In [24]:
data_path_MH = Path('data/MH')
def load_MH_datasets(MH_dir):
    datasets = []
    # Files must follow the naming scheme MH_{sample}.h5ad
    for file in MH_dir.glob("MH_*.h5ad"):
        print(f"Loading data from {file.stem}...")
        adata = sc.read_h5ad(file)
        adata.uns['name'] = file.stem
        datasets.append(adata)
    return datasets

In [39]:
res1 = load_MH_datasets(data_path_MH)

Loading data from MH_11...
Loading data from MH_5...
Loading data from MH_6...


In [40]:
res1.extend(res)

In [45]:
len(res)

12

In [46]:
len(res[1:])

11