In [19]:
import os
import pandas as pd

# --- CONFIG ---
# Top of the directory tree that generate_csv() walks looking for .h5 files.
ROOT_DIR = "/mnt/d/YC.Liu/UNI2_h_features"
# File the slide_id/path index is written to (relative to the working directory).
OUTPUT_CSV = "univ2_paths.csv"

def generate_csv(root_dir=None, output_csv=None):
    """Index every ``.h5`` file under a directory tree into a two-column CSV.

    The tree is walked recursively; each file whose name ends in ``.h5``
    becomes one row with ``slide_id`` (the file name minus the trailing
    ``.h5``) and ``path`` (directory joined with the file name).

    Args:
        root_dir: Directory to scan. Defaults to the notebook-level ``ROOT_DIR``.
        output_csv: Destination CSV. Defaults to the notebook-level ``OUTPUT_CSV``.

    Returns:
        None. The CSV is written only when at least one ``.h5`` file is found;
        a short summary (or a "not found" message) is printed either way.
    """
    # Resolve defaults at call time so the notebook constants stay the single
    # source of truth when the function is called without arguments.
    if root_dir is None:
        root_dir = ROOT_DIR
    if output_csv is None:
        output_csv = OUTPUT_CSV

    suffix = ".h5"
    records = []

    print(f"üìÇ Scanning: {root_dir}")

    # os.walk handles any folder depth automatically.
    for current_dir, subdirs, filenames in os.walk(root_dir):
        # os.walk yields entries in filesystem order; sorting (in place for
        # subdirs, so the traversal itself is ordered) makes the CSV row order
        # reproducible across machines and runs.
        subdirs.sort()
        for filename in sorted(filenames):
            if not filename.endswith(suffix):
                continue
            records.append({
                # Strip only the trailing extension. str.replace would also
                # remove any ".h5" occurring earlier in the name.
                "slide_id": filename[:-len(suffix)],
                "path": os.path.join(current_dir, filename),
            })

    df = pd.DataFrame(records)

    if df.empty:
        print("‚ùå No .h5 files found. Check your directory path.")
        return

    df.to_csv(output_csv, index=False)
    print(f"‚úÖ Success! Found {len(df)} slides.")
    print(f"üìù Saved to: {os.path.abspath(output_csv)}")
    print("\nPreview:")
    print(df.head())

# Build the index now: walks ROOT_DIR and writes OUTPUT_CSV as a side effect.
generate_csv()

üìÇ Scanning: /mnt/d/YC.Liu/UNI2_h_features
‚úÖ Success! Found 28000 slides.
üìù Saved to: /mnt/e/YC.Liu/OceanLIT/data/uni_v2_features_prep/univ2_paths.csv

Preview:
                                 slide_id  \
0  01BR001-0684a407-f446-486d-9160-b483cb   
1  01BR001-09b84012-cb83-4c2e-a33b-ebdb1b   
2  01BR001-17d55745-4ab3-4a80-a182-af938b   
3  01BR001-45196bfe-2f9f-41ff-beea-df78b6   
4  01BR001-4ffefc66-d0ba-4a36-b4fa-35bd91   

                                                path  
0  /mnt/d/YC.Liu/UNI2_h_features/CPTAC/cptac_brca...  
1  /mnt/d/YC.Liu/UNI2_h_features/CPTAC/cptac_brca...  
2  /mnt/d/YC.Liu/UNI2_h_features/CPTAC/cptac_brca...  
3  /mnt/d/YC.Liu/UNI2_h_features/CPTAC/cptac_brca...  
4  /mnt/d/YC.Liu/UNI2_h_features/CPTAC/cptac_brca...  


In [20]:
# Filter to TCGA + CPTAC only (more homogeneous).
# Match a directory component named exactly TCGA or CPTAC instead of the bare
# substring: a plain 'TCGA|CPTAC' search also matches file names, which would
# pull in slides from other cohorts that the prefix-based cancer-type parsing
# in the next cell cannot label.
df = pd.read_csv("univ2_paths.csv")
in_tcga_or_cptac = df['path'].str.contains(r'/(?:TCGA|CPTAC)/', regex=True, na=False)
# .copy() gives an independent frame so later column assignments on
# df_filtered do not act on a slice of df.
df_filtered = df[in_tcga_or_cptac].copy()
df_filtered.to_csv("ssl_tcga_cptac.csv", index=False)

In [21]:
import pandas as pd

df_filtered = pd.read_csv("ssl_tcga_cptac.csv")

# Example paths the parser below has to handle:
#   /mnt/d/YC.Liu/UNI2_h_features/CPTAC/cptac_brca/01BR001-0684a407-f446-486d-9160-b483cb.h5
#   /mnt/d/YC.Liu/UNI2_h_features/TCGA/TCGA-CESC/TCGA-FU-A40J-01Z-00-DX1.E75FB5CE-00E2-4566-894E-566D73CF5DF0.h5
TCGA_PREFIX = '/mnt/d/YC.Liu/UNI2_h_features/TCGA/'
CPTAC_PREFIX = '/mnt/d/YC.Liu/UNI2_h_features/CPTAC/'


def cancer_type_from_path(path):
    """Derive the upper-cased cancer-type code from a feature-file path.

    The code is the second token of the first directory below the cohort
    prefix: ``TCGA-CESC`` -> ``CESC`` (split on ``-``) and ``cptac_brca`` ->
    ``BRCA`` (split on ``_``). The TCGA sub-cohorts ``BRCA_IDC`` and
    ``BRCA_OTHERS`` are merged into ``BRCA``.

    Args:
        path: Absolute path of an ``.h5`` feature file.

    Returns:
        The cancer-type string, or ``None`` when the path is under neither
        cohort prefix.

    Raises:
        IndexError: If the project directory lacks the expected separator.
    """
    if path.startswith(TCGA_PREFIX):
        project_dir = path[len(TCGA_PREFIX):].split('/')[0]
        cancer = project_dir.split('-')[1].upper()
        if cancer in ("BRCA_IDC", "BRCA_OTHERS"):
            return "BRCA"
        return cancer
    if path.startswith(CPTAC_PREFIX):
        project_dir = path[len(CPTAC_PREFIX):].split('/')[0]
        return project_dir.split('_')[1].upper()
    # Explicitly unlabeled. The previous loop fell through here and reused the
    # label computed for the preceding row.
    return None


# One pass over the column instead of a full-frame boolean scan per row.
df_filtered['cancer_type'] = df_filtered['path'].map(cancer_type_from_path, na_action='ignore')

unlabeled = df_filtered['cancer_type'].isna()
if unlabeled.any():
    print(f"Warning: {int(unlabeled.sum())} path(s) are outside the TCGA/CPTAC prefixes and were left unlabeled.")

# Distinct labels; kept as a set because the next cell reports len(cancer_type).
cancer_type = set(df_filtered['cancer_type'].dropna())

print(sorted(cancer_type))

['ACC', 'BLCA', 'BRCA', 'CCRCC', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA', 'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LGG', 'LIHC', 'LSCC', 'LUAD', 'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PDA', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM']


In [22]:
# Slides per cancer type (most frequent first), followed by the number of
# distinct cancer types collected in the previous cell.
per_type_counts = df_filtered['cancer_type'].value_counts()
print(per_type_counts)
n_cancer_types = len(cancer_type)
print(n_cancer_types)

cancer_type
BRCA     1779
LUAD     1656
GBM      1319
LSCC     1081
HNSC      862
LGG       844
COAD      814
CCRCC     783
UCEC      661
SARC      600
PDA       557
KIRC      519
THCA      518
LUSC      512
SKCM      475
BLCA      457
PRAD      449
STAD      400
LIHC      372
OV        329
KIRP      297
CESC      279
TGCT      254
ACC       227
PAAD      203
PCPG      196
THYM      180
READ      158
ESCA      158
KICH      109
MESO       87
UCS        87
UVM        80
DLBC       44
CHOL       39
Name: count, dtype: int64
35


In [23]:
# Persist the labelled table, then end the cell with the preview: a notebook
# only renders a cell's last expression, so a head() call placed before
# to_csv() was evaluated and thrown away.
df_filtered.to_csv("ssl_tcga_cptac_with_cancer_type.csv", index=False)
df_filtered.head()

In [24]:
import os
import h5py
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import umap

# --- CONFIGURATION ---
# Slide index (with cancer_type column) written by the previous cell.
CSV_PATH = "ssl_tcga_cptac_with_cancer_type.csv"
# Folder that receives the generated plots.
OUTPUT_DIR = "stats_output_ssl"
# Probability that a given slide is drawn for the UMAP (keeps memory in check).
Sample_Ratio = 0.05
# Upper bound on patches taken from each drawn slide.
Patches_Per_Slide = 50
# Name of the dataset inside each H5 file.
Feature_Key = 'features'

# Ten most frequent cancer types and their slide counts (from value_counts):
#   BRCA     1779
#   LUAD     1656
#   GBM      1319
#   LSCC     1081
#   HNSC      862
#   LGG       844
#   COAD      814
#   CCRCC     783
#   UCEC      661
#   SARC      600
cancer_type_to_vis = [
    "BRCA",
    "LUAD",
    "GBM",
    "LSCC",
    "HNSC",
    "LGG",
    "COAD",
    "CCRCC",
    "UCEC",
    "SARC",
]

def analyze_dataset(csv_path=None, output_dir=None, seed=42):
    """Summarise patches per slide and plot a UMAP of sampled patch features.

    Every H5 file listed in the CSV is opened to read the shape of its
    ``Feature_Key`` dataset (no feature data is loaded for this). Summary
    statistics are printed and a histogram is saved as
    ``patch_distribution.png``. Each slide is additionally drawn with
    probability ``Sample_Ratio``; drawn slides whose cancer type is in
    ``cancer_type_to_vis`` contribute up to ``Patches_Per_Slide`` feature rows
    to a UMAP embedding saved as ``umap_projection.png``.

    Args:
        csv_path: CSV with ``slide_id``, ``path`` and ``cancer_type`` columns.
            Defaults to the notebook-level ``CSV_PATH``.
        output_dir: Directory for the two PNGs (created if missing). Defaults
            to the notebook-level ``OUTPUT_DIR``.
        seed: Seed for slide/patch sampling and for UMAP, so the figure can be
            regenerated. Pass ``None`` for unseeded behaviour; note that
            umap-learn runs single-threaded when given a fixed random state.

    Returns:
        None. Results are printed and written to ``output_dir``.
    """
    # Resolve defaults at call time so the notebook constants stay authoritative.
    if csv_path is None:
        csv_path = CSV_PATH
    if output_dir is None:
        output_dir = OUTPUT_DIR

    max_umap_points = 20000  # cap on points fed to UMAP, for speed

    os.makedirs(output_dir, exist_ok=True)

    # 1. Load the CSV
    if not os.path.exists(csv_path):
        print(f"‚ùå Error: {csv_path} not found. Run your generation script first.")
        return

    df = pd.read_csv(csv_path)
    print(f"üìä Loaded dataset with {len(df)} slides.")

    rng = np.random.default_rng(seed)
    vis_types = set(cancer_type_to_vis)

    patch_counts = []   # one entry per readable slide
    umap_features = []  # list of (k, D) arrays, one per sampled slide
    umap_labels = []    # cancer type repeated once per sampled patch
    n_missing_key = 0   # files that lack the Feature_Key dataset

    print("üöÄ Starting analysis...")

    # 2. Iterate and Collect Stats
    slide_rows = zip(df['path'], df['slide_id'], df['cancer_type'])
    for path, slide_id, cancer_type in tqdm(slide_rows, total=len(df), desc="Scanning Files"):
        try:
            with h5py.File(path, 'r') as f:
                if Feature_Key not in f:
                    n_missing_key += 1
                    continue

                # Shape is read from metadata only; nothing is loaded to RAM.
                # Expected layout is (1, Num_Patches, Dim) or (Num_Patches, Dim).
                dset = f[Feature_Key]
                shape = dset.shape
                has_batch_axis = len(shape) == 3
                n_patches = shape[1] if has_batch_axis else shape[0]
                patch_counts.append(n_patches)

                # --- Sampling for UMAP ---
                if rng.random() >= Sample_Ratio:
                    continue
                if cancer_type not in vis_types:
                    continue

                if n_patches > Patches_Per_Slide:
                    # Read only the chosen rows from disk instead of the whole
                    # matrix. h5py requires index lists in increasing order.
                    chosen = np.sort(rng.choice(n_patches, Patches_Per_Slide, replace=False))
                    sampled_feats = dset[0, chosen] if has_batch_axis else dset[chosen]
                else:
                    sampled_feats = dset[0] if has_batch_axis else dset[:]

                umap_features.append(sampled_feats)
                umap_labels.extend([cancer_type] * len(sampled_feats))
        except Exception as e:
            # Best effort: one unreadable file must not abort a long scan.
            print(f"‚ö†Ô∏è Error reading {slide_id}: {e}")

    if n_missing_key:
        print(f"Skipped {n_missing_key} file(s) without a '{Feature_Key}' dataset.")

    # min()/max() raise on an empty array, so stop before the statistics.
    if not patch_counts:
        print("No readable feature files; nothing to summarise.")
        return

    # 3. Generate Statistics
    patch_counts = np.array(patch_counts)
    print("\n" + "="*40)
    print("üìà DATASET STATISTICS")
    print("="*40)
    print(f"Total Slides:      {len(patch_counts)}")
    print(f"Total Patches:     {patch_counts.sum():,}")
    print(f"Avg Patches/Slide: {patch_counts.mean():.2f}")
    print(f"Median Patches:    {np.median(patch_counts):.2f}")
    print(f"Min Patches:       {patch_counts.min()}")
    print(f"Max Patches:       {patch_counts.max()}")
    print("="*40 + "\n")

    # 4. Plot Histogram (Patch Distribution)
    plt.figure(figsize=(10, 6))
    sns.histplot(patch_counts, bins=50, kde=True, color='skyblue')
    plt.title(f"Distribution of Patches per Slide (N={len(patch_counts)})")
    plt.xlabel("Number of Patches")
    plt.ylabel("Count")
    plt.grid(axis='y', alpha=0.5)
    hist_path = os.path.join(output_dir, "patch_distribution.png")
    plt.savefig(hist_path)
    print(f"üñºÔ∏è  Histogram saved to: {hist_path}")
    plt.close()

    # 5. Compute & Plot UMAP
    if len(umap_features) > 0:
        print("\nüß† Computing UMAP Projection...")
        X = np.vstack(umap_features)
        plot_labels = np.array(umap_labels)

        if X.shape[0] > max_umap_points:
            print(f"   (Subsampling UMAP points from {X.shape[0]} to {max_umap_points:,} for speed)")
            keep = rng.choice(X.shape[0], max_umap_points, replace=False)
            X = X[keep]
            plot_labels = plot_labels[keep]

        # Fit UMAP (seeded so the layout is repeatable)
        reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=seed)
        embedding = reducer.fit_transform(X)

        # Plot
        plt.figure(figsize=(12, 10))

        unique_labels = np.unique(plot_labels)

        if len(unique_labels) > 1 and unique_labels[0] != "Unknown":
            # One scatter call per cancer type so each gets its own colour
            # and legend entry.
            for lbl in unique_labels:
                mask = plot_labels == lbl
                plt.scatter(embedding[mask, 0], embedding[mask, 1], label=lbl, s=3, alpha=0.6)
            plt.legend(markerscale=5)
        else:
            # Single group: plain one-colour scatter.
            plt.scatter(embedding[:, 0], embedding[:, 1], c='purple', s=2, alpha=0.3)

        plt.title(f"UMAP Projection of Feature Embeddings\n(Sampled {len(X)} patches)")
        plt.axis('off')
        umap_path = os.path.join(output_dir, "umap_projection.png")
        plt.savefig(umap_path)
        print(f"üñºÔ∏è  UMAP Plot saved to: {umap_path}")
        plt.close()
    else:
        print("‚ö†Ô∏è Not enough data sampled for UMAP.")

# Run the full scan and write the plots; the recorded run below took about 19 minutes for 17,385 slides.
analyze_dataset()

üìä Loaded dataset with 17385 slides.
üöÄ Starting analysis...


Scanning Files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17385/17385 [19:24<00:00, 14.92it/s]  



üìà DATASET STATISTICS
Total Slides:      17385
Total Patches:     146,192,632
Avg Patches/Slide: 8409.12
Median Patches:    6926.00
Min Patches:       6
Max Patches:       67820

üñºÔ∏è  Histogram saved to: stats_output_ssl/patch_distribution.png

üß† Computing UMAP Projection...
   (Subsampling UMAP points from 25234 to 20,000 for speed)
üñºÔ∏è  UMAP Plot saved to: stats_output_ssl/umap_projection.png
