---
Goal: Overlap Analysis between different methods 

Perv: 01_Hill_AF_QC_comparison_preprocessing.ipynb (importing QC analaysis and merging )

Next: 03_Hill_AF_QC_comparison_preprocessing.ipynb (Filtering)

---

---
# 1- Import merged data

In [3]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

# Set up plotting parameters
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, facecolor='white', frameon=False)
sns.set_style("whitegrid")

# Define paths
output_base = "/work/archive/farhadie/public_studies/Hill/QC_comparison_analysis"
data_dir = os.path.join(output_base, 'processed_data')
results_dir = os.path.join(output_base, 'results')
figures_dir = os.path.join(output_base, 'figures')

# Find the most recent merged file
print("="*60)
print("Loading Merged Data")
print("="*60)

# List all h5ad files in the data directory
h5ad_files = [f for f in os.listdir(data_dir) if f.endswith('.h5ad') and 'merged' in f]

if len(h5ad_files) == 0:
    print("❌ No merged h5ad files found!")
    print(f"Looking in: {data_dir}")
else:
    # Sort by modification time (most recent first)
    h5ad_files.sort(key=lambda x: os.path.getmtime(os.path.join(data_dir, x)), reverse=True)
    
    print(f"Found {len(h5ad_files)} merged file(s):")
    for i, f in enumerate(h5ad_files):
        file_path = os.path.join(data_dir, f)
        file_size = os.path.getsize(file_path) / (1024 * 1024)  # MB
        mod_time = datetime.fromtimestamp(os.path.getmtime(file_path))
        print(f"  [{i}] {f}")
        print(f"      Size: {file_size:.1f} MB, Modified: {mod_time.strftime('%Y-%m-%d %H:%M:%S')}")
    
    # Load the most recent file
    selected_file = h5ad_files[0]
    file_path = os.path.join(data_dir, selected_file)
    
    print(f"\n{'='*60}")
    print(f"Loading: {selected_file}")
    print(f"{'='*60}")
    
    adata_merged = sc.read_h5ad(file_path)
    
    print(f"\n✓ Data loaded successfully!")
    print(f"\n{'='*60}")
    print("AnnData Object Summary:")
    print(f"{'='*60}")
    print(adata_merged)
    
    # Display basic statistics
    print(f"\n{'='*60}")
    print("Dataset Statistics:")
    print(f"{'='*60}")
    print(f"  Total cells: {adata_merged.n_obs:,}")
    print(f"  Total genes: {adata_merged.n_vars:,}")
    print(f"  Samples: {adata_merged.obs['sample_id'].nunique()}")
    
    # AF status distribution
    print(f"\n  AF Status:")
    for status, count in adata_merged.obs['af_status'].value_counts().items():
        print(f"    {status}: {count:,} cells ({count/adata_merged.n_obs*100:.1f}%)")
    
    # Sex distribution
    print(f"\n  Sex:")
    for sex, count in adata_merged.obs['sex'].value_counts().items():
        print(f"    {sex}: {count:,} cells ({count/adata_merged.n_obs*100:.1f}%)")
    
    # Check available QC flags
    print(f"\n{'='*60}")
    print("Available QC Flags:")
    print(f"{'='*60}")
    qc_flags = [col for col in adata_merged.obs.columns if 'flag' in col]
    for flag in sorted(qc_flags):
        n_flagged = adata_merged.obs[flag].sum()
        print(f"  {flag}: {n_flagged:,} ({n_flagged/adata_merged.n_obs*100:.1f}%)")
    
    # Check available metadata columns
    print(f"\n{'='*60}")
    print("Metadata Columns in .obs:")
    print(f"{'='*60}")
    print(adata_merged.obs.columns.tolist())
    
    # Check if QC metrics are available
    print(f"\n{'='*60}")
    print("QC Metrics Available:")
    print(f"{'='*60}")
    qc_metrics = ['total_counts', 'n_genes_by_counts', 'pct_counts_mt', 
                  'log1p_total_counts', 'log1p_n_genes_by_counts']
    for metric in qc_metrics:
        if metric in adata_merged.obs.columns:
            print(f"  ✓ {metric}")
        else:
            print(f"  ✗ {metric}")
    
    # Memory usage
    print(f"\n{'='*60}")
    print("Memory Usage:")
    print(f"{'='*60}")
    memory_mb = adata_merged.X.data.nbytes / (1024 * 1024) if hasattr(adata_merged.X, 'data') else adata_merged.X.nbytes / (1024 * 1024)
    print(f"  Matrix size: {memory_mb:.1f} MB")

# Create a backup copy (optional)
print(f"\n{'='*60}")
print("Creating backup copy for safety...")
print(f"{'='*60}")
adata_backup = adata_merged.copy()
print("✓ Backup created: adata_backup")

  from pkg_resources import get_distribution, DistributionNotFound


Loading Merged Data
Found 1 merged file(s):
  [0] Hill_merged_with_QC_flags_20251017.h5ad
      Size: 3454.3 MB, Modified: 2025-10-17 12:26:24

Loading: Hill_merged_with_QC_flags_20251017.h5ad

✓ Data loaded successfully!

AnnData Object Summary:
AnnData object with n_obs × n_vars = 262003 × 38606
    obs: 'fraction_unspliced', 'n_genes_by_counts', 'total_counts', 'total_counts_nuclear', 'pct_counts_nuclear', 'score_nuclear', 'total_counts_MT', 'pct_counts_MT', 'score_MT', 'total_counts_CM_cyto', 'pct_counts_CM_cyto', 'score_CM_cyto', 'total_counts_CM_nucl', 'pct_counts_CM_nucl', 'score_CM_nucl', 'total_counts_VEC', 'pct_counts_VEC', 'score_VEC', 'total_counts_PER', 'pct_counts_PER', 'score_PER', 'total_counts_SMC', 'pct_counts_SMC', 'score_SMC', 'total_counts_AD', 'pct_counts_AD', 'score_AD', 'total_counts_SC', 'pct_counts_SC', 'score_SC', 'total_counts_N', 'pct_counts_N', 'score_N', 'total_counts_EEC', 'pct_counts_EEC', 'score_EEC', 'total_counts_FB', 'pct_counts_FB', 'score_FB', 'to

---
# 2- Preprocessing for UMAP

In [4]:
print("="*60)
print("Starting Preprocessing Pipeline (Memory Optimized)")
print("="*60)
print(f"\nStarting with {adata_merged.n_obs:,} cells and {adata_merged.n_vars:,} genes")

# Make a working copy
adata = adata_merged.copy()

# ====================================================================
# Step 1: Basic filtering on genes (NOT cells)
# ====================================================================
print(f"\n{'='*60}")
print("Step 1: Gene Filtering")
print(f"{'='*60}")

# Keep genes expressed in at least 3 cells
sc.pp.filter_genes(adata, min_cells=3)
print(f"  Genes after min_cells filter: {adata.n_vars:,}")

# ====================================================================
# Step 2: Normalization
# ====================================================================
print(f"\n{'='*60}")
print("Step 2: Normalization")
print(f"{'='*60}")

# Store raw counts for later use
adata.layers['counts'] = adata.X.copy()
print("  ✓ Raw counts saved to .layers['counts']")

# Normalize to 10,000 counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)
print("  ✓ Normalized to 10,000 counts per cell")

# Log transform
sc.pp.log1p(adata)
print("  ✓ Log1p transformation applied")

# ====================================================================
# Step 3: Highly Variable Genes Selection
# ====================================================================
print(f"\n{'='*60}")
print("Step 3: Highly Variable Genes")
print(f"{'='*60}")

# Identify highly variable genes
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    batch_key='sample_id',
    flavor='seurat_v3',
    subset=False
)

n_hvg = adata.var['highly_variable'].sum()
print(f"  Highly variable genes identified: {n_hvg}")

# Save a copy before scaling
adata.raw = adata
print("  ✓ Unscaled data saved to .raw")

# ====================================================================
# Step 4: Subset to HVGs and Scale (MEMORY EFFICIENT)
# ====================================================================
print(f"\n{'='*60}")
print("Step 4: Subsetting to HVGs and Scaling")
print(f"{'='*60}")

# IMPORTANT: Subset to HVGs BEFORE scaling to save memory
adata_hvg = adata[:, adata.var['highly_variable']].copy()
print(f"  ✓ Subset to {adata_hvg.n_vars} highly variable genes")

# NOW scale only the HVGs (much smaller matrix)
# Skip regress_out to save memory - batch correction will handle this
sc.pp.scale(adata_hvg, max_value=10)
print(f"  ✓ Scaled HVGs to unit variance (max_value=10)")
print(f"  ⚠️  Skipped regress_out to conserve memory - Harmony will handle batch effects")

# ====================================================================
# Step 5: PCA
# ====================================================================
print(f"\n{'='*60}")
print("Step 5: PCA")
print(f"{'='*60}")

# Run PCA on the scaled HVGs
sc.tl.pca(adata_hvg, svd_solver='arpack', n_comps=50)
print(f"  ✓ PCA computed with 50 components")

# Transfer PCA to original object
adata.obsm['X_pca'] = adata_hvg.obsm['X_pca']
adata.uns['pca'] = adata_hvg.uns['pca']
adata.varm['PCs'] = np.zeros((adata.n_vars, 50))
adata.varm['PCs'][adata.var['highly_variable'], :] = adata_hvg.varm['PCs']

# Plot variance ratio
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
variance_ratio = adata.uns['pca']['variance_ratio']
ax.plot(range(1, len(variance_ratio) + 1), variance_ratio, 'o-')
ax.set_xlabel('PC')
ax.set_ylabel('Variance Ratio')
ax.set_title('PCA Variance Ratio')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(figures_dir, '01_pca_variance_ratio.png'), dpi=300, bbox_inches='tight')
plt.close()
print(f"  ✓ PCA variance plot saved")

# Clean up to save memory
del adata_hvg
import gc
gc.collect()
print(f"  ✓ Memory cleanup done")

# ====================================================================
# Step 6: Batch Correction with Harmony
# ====================================================================
print(f"\n{'='*60}")
print("Step 6: Batch Correction (Harmony)")
print(f"{'='*60}")

try:
    import harmonypy
    
    # Run Harmony integration
    sc.external.pp.harmony_integrate(
        adata,
        key='sample_id',
        basis='X_pca',
        adjusted_basis='X_pca_harmony',
        max_iter_harmony=20
    )
    print(f"  ✓ Harmony batch correction completed")
    pca_key = 'X_pca_harmony'
    
except ImportError:
    print("  ⚠️  harmonypy not available, using uncorrected PCA")
    pca_key = 'X_pca'
except Exception as e:
    print(f"  ⚠️  Harmony failed: {e}")
    print(f"  Using uncorrected PCA")
    pca_key = 'X_pca'

# ====================================================================
# Step 7: Neighbor Graph
# ====================================================================
print(f"\n{'='*60}")
print("Step 7: Computing Neighbor Graph")
print(f"{'='*60}")

# Compute neighbors
sc.pp.neighbors(
    adata,
    n_neighbors=15,
    n_pcs=30,
    use_rep=pca_key,
    metric='cosine'
)
print(f"  ✓ Neighbor graph computed (n_neighbors=15, n_pcs=30)")

# ====================================================================
# Step 8: UMAP
# ====================================================================
print(f"\n{'='*60}")
print("Step 8: UMAP Embedding")
print(f"{'='*60}")

# Compute UMAP
sc.tl.umap(adata, min_dist=0.3, spread=1.0)
print(f"  ✓ UMAP computed")

# ====================================================================
# Step 9: Leiden Clustering (for reference)
# ====================================================================
print(f"\n{'='*60}")
print("Step 9: Leiden Clustering")
print(f"{'='*60}")

# Run Leiden clustering at different resolutions
for resolution in [0.5, 1.0, 1.5]:
    sc.tl.leiden(adata, resolution=resolution, key_added=f'leiden_res{resolution}')
    n_clusters = adata.obs[f'leiden_res{resolution}'].nunique()
    print(f"  Resolution {resolution}: {n_clusters} clusters")

# ====================================================================
# Summary
# ====================================================================
print(f"\n{'='*60}")
print("Preprocessing Complete!")
print(f"{'='*60}")
print(f"  Final cells: {adata.n_obs:,}")
print(f"  Final genes: {adata.n_vars:,}")
print(f"  HVGs: {adata.var['highly_variable'].sum()}")
print(f"  PCA components: {adata.obsm['X_pca'].shape[1]}")
print(f"  UMAP computed: {'X_umap' in adata.obsm}")
print(f"  Batch correction used: {pca_key}")

# Save preprocessed data
preprocessed_file = os.path.join(data_dir, f'Hill_preprocessed_with_UMAP_{datetime.now().strftime("%Y%m%d")}.h5ad')
adata.write_h5ad(preprocessed_file)
print(f"\n✓ Preprocessed data saved to:")
print(f"  {preprocessed_file}")

# Quick visualization - Basic UMAP
print(f"\n{'='*60}")
print("Creating Basic UMAP Visualizations")
print(f"{'='*60}")

fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Sample ID
sc.pl.umap(adata, color='sample_id', ax=axes[0,0], show=False, 
           legend_fontsize=4, title='Sample ID', frameon=False)

# AF status
sc.pl.umap(adata, color='af_status', ax=axes[0,1], show=False, title='AF Status', 
           palette={'case': 'red', 'control': 'blue'}, frameon=False)

# Sex
sc.pl.umap(adata, color='sex', ax=axes[0,2], show=False, title='Sex',
           palette={'m': 'steelblue', 'f': 'pink'}, frameon=False)

# Leiden clusters
sc.pl.umap(adata, color='leiden_res1.0', ax=axes[1,0], show=False, 
           title='Leiden Clusters (res=1.0)', frameon=False)

# Total counts
sc.pl.umap(adata, color='total_counts', ax=axes[1,1], show=False, 
           title='Total Counts', cmap='viridis', frameon=False)

# MT percentage
sc.pl.umap(adata, color='pct_counts_mt', ax=axes[1,2], show=False, 
           title='MT Percentage', cmap='Reds', frameon=False)

plt.tight_layout()
plt.savefig(os.path.join(figures_dir, '02_basic_UMAP_overview.png'), dpi=300, bbox_inches='tight')
plt.close()

print(f"✓ Basic UMAP visualizations saved")

Starting Preprocessing Pipeline (Memory Optimized)

Starting with 262,003 cells and 38,606 genes

Step 1: Gene Filtering
filtered out 2968 genes that are detected in less than 3 cells
  Genes after min_cells filter: 35,638

Step 2: Normalization
  ✓ Raw counts saved to .layers['counts']
normalizing counts per cell
    finished (0:00:01)
  ✓ Normalized to 10,000 counts per cell
  ✓ Log1p transformation applied

Step 3: Highly Variable Genes
extracting highly variable genes


  return fn(*args_all, **kw)


--> added
    'highly_variable', boolean vector (adata.var)
    'highly_variable_rank', float vector (adata.var)
    'means', float vector (adata.var)
    'variances', float vector (adata.var)
    'variances_norm', float vector (adata.var)
  Highly variable genes identified: 2000
  ✓ Unscaled data saved to .raw

Step 4: Subsetting to HVGs and Scaling
  ✓ Subset to 2000 highly variable genes


  return dispatch(args[0].__class__)(*args, **kw)


  ✓ Scaled HVGs to unit variance (max_value=10)
  ⚠️  Skipped regress_out to conserve memory - Harmony will handle batch effects

Step 5: PCA
computing PCA
    with n_comps=50
    finished (0:00:50)
  ✓ PCA computed with 50 components
  ✓ PCA variance plot saved
  ✓ Memory cleanup done

Step 6: Batch Correction (Harmony)


2025-10-17 16:33:49,356 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2025-10-17 16:34:01,184 - harmonypy - INFO - sklearn.KMeans initialization complete.
2025-10-17 16:34:01,901 - harmonypy - INFO - Iteration 1 of 20
2025-10-17 16:34:59,155 - harmonypy - INFO - Iteration 2 of 20
2025-10-17 16:35:55,684 - harmonypy - INFO - Iteration 3 of 20
2025-10-17 16:36:52,345 - harmonypy - INFO - Converged after 3 iterations


  ✓ Harmony batch correction completed

Step 7: Computing Neighbor Graph
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:56)
  ✓ Neighbor graph computed (n_neighbors=15, n_pcs=30)

Step 8: UMAP Embedding
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm)
    'umap', UMAP parameters (adata.uns) (0:02:17)
  ✓ UMAP computed

Step 9: Leiden Clustering
running Leiden clustering



 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata, resolution=resolution, key_added=f'leiden_res{resolution}')


    finished: found 28 clusters and added
    'leiden_res0.5', the cluster labels (adata.obs, categorical) (0:08:05)
  Resolution 0.5: 28 clusters
running Leiden clustering
    finished: found 32 clusters and added
    'leiden_res1.0', the cluster labels (adata.obs, categorical) (0:06:42)
  Resolution 1.0: 32 clusters
running Leiden clustering
    finished: found 42 clusters and added
    'leiden_res1.5', the cluster labels (adata.obs, categorical) (0:06:18)
  Resolution 1.5: 42 clusters

Preprocessing Complete!
  Final cells: 262,003
  Final genes: 35,638
  HVGs: 2000
  PCA components: 50
  UMAP computed: True
  Batch correction used: X_pca_harmony

✓ Preprocessed data saved to:
  /work/archive/farhadie/public_studies/Hill/QC_comparison_analysis/processed_data/Hill_preprocessed_with_UMAP_20251017.h5ad

Creating Basic UMAP Visualizations
✓ Basic UMAP visualizations saved


In [5]:
!ls -lh  /work/archive/farhadie/public_studies/Hill/QC_comparison_analysis/

total 14K
-rw-r--r-- 1 farhadie domain users 793 Oct 17 12:26 analysis_log_20251017.txt
drwxr-xr-x 2 farhadie domain users   4 Oct 17 17:01 figures
drwxr-xr-x 2 farhadie domain users   5 Oct 17 17:01 processed_data
drwxr-xr-x 2 farhadie domain users   2 Oct 17 12:26 results


In [14]:
# ====================================================================
# Individual UMAP Visualizations
# ====================================================================

print("="*60)
print("Creating Individual UMAP Plots")
print("="*60)

# Define variables to plot
plot_vars = [
    'sample_id',
    'af_status', 
    'sex',
    'leiden_res1.0',
    'pct_counts_mt',
    'fraction_unspliced',
    'pct_counts_nuclear',
    'pct_counts_CM_cyto',
    'pct_counts_CM_nucl',
    'qclus',
    'doublet_score',
    'combined_qc_5pct_flag',
    'combined_qc_10pct_flag',
    'combined_qc_20pct_flag',
    'combined_qc_25pct_flag'
]

# Define color palettes for categorical variables
color_palettes = {
    'af_status': {'case': '#E74C3C', 'control': '#3498DB'},
    'sex': {'m': '#5DADE2', 'f': '#EC7063'}
}

# Create individual UMAPs
for var in plot_vars:
    print(f"  Creating UMAP for: {var}")
    
    # Determine if variable is categorical or continuous
    is_categorical = adata.obs[var].dtype.name in ['category', 'object', 'bool']
    
    # Set figure size
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    
    # Plot parameters
    plot_params = {
        'show': False,
        'ax': ax,
        'title': var.replace('_', ' ').title(),
        'frameon': False
    }
    
    # Special handling for boolean flags - convert to string first
    if adata.obs[var].dtype.name == 'bool':
        # Create a temporary string version for plotting
        temp_col = f'{var}_str'
        adata.obs[temp_col] = adata.obs[var].map({True: 'Flagged', False: 'Pass'})
        adata.obs[temp_col] = pd.Categorical(adata.obs[temp_col])
        
        # Use the string version for plotting
        plot_params['palette'] = {'Flagged': '#E74C3C', 'Pass': '#95A5A6'}
        plot_params['legend_loc'] = 'right margin'
        
        # Plot with temp column
        sc.pl.umap(adata, color=temp_col, **plot_params)
        
        # Clean up temp column
        adata.obs.drop(columns=[temp_col], inplace=True)
        
    else:
        # Add palette if available for non-boolean categorical
        if var in color_palettes:
            plot_params['palette'] = color_palettes[var]
        
        # Add cmap for continuous variables
        if not is_categorical:
            plot_params['cmap'] = 'viridis'
            plot_params['vmax'] = 'p99'  # Clip at 99th percentile
        
        # Plot
        sc.pl.umap(adata, color=var, **plot_params)
    
    # Save as PNG
    png_file = os.path.join(figures_dir, f'UMAP_{var}.png')
    plt.savefig(png_file, dpi=300, bbox_inches='tight')
    
    # Save as PDF
    pdf_file = os.path.join(figures_dir, f'UMAP_{var}.pdf')
    plt.savefig(pdf_file, bbox_inches='tight')
    
    plt.close()

print(f"✓ All individual UMAPs saved to {figures_dir}")

# ====================================================================
# Highest Expressed Genes
# ====================================================================

print(f"\n{'='*60}")
print("Creating Highest Expressed Genes Plot")
print(f"{'='*60}")

fig = sc.pl.highest_expr_genes(adata, n_top=20, show=False)

# Save
png_file = os.path.join(figures_dir, 'highest_expr_genes_top20.png')
pdf_file = os.path.join(figures_dir, 'highest_expr_genes_top20.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Highest expressed genes plot saved")

# ====================================================================
# QC Metrics Distributions
# ====================================================================

print(f"\n{'='*60}")
print("Creating QC Metrics Distribution Plots")
print(f"{'='*60}")

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), dpi=150, sharey=False)

# Distribution plots
sns.histplot(adata.obs['n_genes_by_counts'], ax=ax1, stat='density', bins=100, kde=True)
ax1.set_xlabel('Number of Genes')
ax1.set_title('Genes per Cell')

sns.histplot(adata.obs['total_counts'], ax=ax2, stat='density', bins=100, kde=True)
ax2.set_xlabel('Total Counts')
ax2.set_title('UMI Counts per Cell')

sns.histplot(adata.obs['pct_counts_mt'], ax=ax3, stat='density', bins=100, kde=True)
ax3.set_xlabel('MT Percentage')
ax3.set_title('Mitochondrial %')

# Add common y-label
fig.text(-0.01, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')
fig.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'QC_metrics_distributions.png')
pdf_file = os.path.join(figures_dir, 'QC_metrics_distributions.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ QC metrics distribution plot saved")

# ====================================================================
# Scatter Plot: Total Counts vs Genes
# ====================================================================

print(f"\n{'='*60}")
print("Creating Scatter Plot: Total Counts vs Genes")
print(f"{'='*60}")

fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=150)

sc.pl.scatter(
    adata, 
    x='total_counts', 
    y='n_genes_by_counts', 
    color='pct_counts_mt',
    ax=ax,
    show=False,
    title='QC Metrics: Counts vs Genes',
    frameon=False
)

# Save
png_file = os.path.join(figures_dir, 'scatter_counts_vs_genes_by_MT.png')
pdf_file = os.path.join(figures_dir, 'scatter_counts_vs_genes_by_MT.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Scatter plot saved")

# ====================================================================
# Summary
# ====================================================================

print(f"\n{'='*60}")
print("All Visualizations Complete!")
print(f"{'='*60}")
print(f"Output directory: {figures_dir}")
print(f"\nFiles created:")
print(f"  - {len(plot_vars)} individual UMAP plots (PNG + PDF)")
print(f"  - 1 highest expressed genes plot (PNG + PDF)")
print(f"  - 1 QC metrics distribution plot (PNG + PDF)")
print(f"  - 1 scatter plot (PNG + PDF)")
print(f"\nTotal files: {(len(plot_vars) + 3) * 2}")

Creating Individual UMAP Plots
  Creating UMAP for: sample_id
  Creating UMAP for: af_status
  Creating UMAP for: sex
  Creating UMAP for: leiden_res1.0
  Creating UMAP for: pct_counts_mt
  Creating UMAP for: fraction_unspliced
  Creating UMAP for: pct_counts_nuclear
  Creating UMAP for: pct_counts_CM_cyto
  Creating UMAP for: pct_counts_CM_nucl
  Creating UMAP for: qclus
  Creating UMAP for: doublet_score
  Creating UMAP for: combined_qc_5pct_flag
  Creating UMAP for: combined_qc_10pct_flag
  Creating UMAP for: combined_qc_20pct_flag
  Creating UMAP for: combined_qc_25pct_flag
✓ All individual UMAPs saved to /work/archive/farhadie/public_studies/Hill/QC_comparison_analysis/figures

Creating Highest Expressed Genes Plot
normalizing counts per cell
    finished (0:00:00)
✓ Highest expressed genes plot saved

Creating QC Metrics Distribution Plots
✓ QC metrics distribution plot saved

Creating Scatter Plot: Total Counts vs Genes
✓ Scatter plot saved

All Visualizations Complete!
Output d

---
# 3- Comparison between different QC approaches 

In [29]:
# ====================================================================
# Correct Multi-way Overlap Analysis
# ====================================================================

print("="*60)
print("Multi-way Overlap Analysis: QClus vs 4 Combined QC Methods")
print("="*60)

# ====================================================================
# Define 5 independent QC methods
# ====================================================================

print("\nDefining 5 independent QC methods...")

qc_methods = {
    'QClus': set(adata.obs[adata.obs['qclus'] != 'passed'].index),
    'Combined_5pct': set(adata.obs[adata.obs['combined_qc_5pct_flag'] == True].index),
    'Combined_10pct': set(adata.obs[adata.obs['combined_qc_10pct_flag'] == True].index),
    'Combined_20pct': set(adata.obs[adata.obs['combined_qc_20pct_flag'] == True].index),
    'Combined_25pct': set(adata.obs[adata.obs['combined_qc_25pct_flag'] == True].index)
}

total_cells = adata.n_obs

# Print sizes (all should be constant)
print("\nQC Method Sizes (all FIXED):")
for method, cells in qc_methods.items():
    print(f"  {method}: {len(cells):,} cells ({len(cells)/total_cells*100:.2f}%)")

# ====================================================================
# Calculate all pairwise overlaps
# ====================================================================

print(f"\n{'='*60}")
print("Pairwise Overlaps")
print(f"{'='*60}")

method_names = list(qc_methods.keys())
overlap_matrix = np.zeros((5, 5), dtype=int)

for i, method1 in enumerate(method_names):
    for j, method2 in enumerate(method_names):
        overlap_matrix[i, j] = len(qc_methods[method1] & qc_methods[method2])

# Create overlap dataframe
overlap_df = pd.DataFrame(overlap_matrix, index=method_names, columns=method_names)
print("\nOverlap Matrix:")
print(overlap_df)

# Save
csv_file = os.path.join(results_dir, 'pairwise_overlap_matrix.csv')
overlap_df.to_csv(csv_file)
print(f"\n✓ Overlap matrix saved: {csv_file}")

# ====================================================================
# Calculate Jaccard similarity
# ====================================================================

print(f"\n{'='*60}")
print("Jaccard Similarity (Overlap / Union)")
print(f"{'='*60}")

jaccard_matrix = np.zeros((5, 5))

for i, method1 in enumerate(method_names):
    for j, method2 in enumerate(method_names):
        if i == j:
            jaccard_matrix[i, j] = 1.0
        else:
            intersection = len(qc_methods[method1] & qc_methods[method2])
            union = len(qc_methods[method1] | qc_methods[method2])
            jaccard_matrix[i, j] = intersection / union if union > 0 else 0

jaccard_df = pd.DataFrame(jaccard_matrix, index=method_names, columns=method_names)
print("\nJaccard Similarity Matrix:")
print(jaccard_df.round(3))

# Save
csv_file = os.path.join(results_dir, 'jaccard_similarity_matrix.csv')
jaccard_df.to_csv(csv_file)
print(f"\n✓ Jaccard similarity saved: {csv_file}")

# ====================================================================
# Prepare data for UpSet plot in R
# ====================================================================

print(f"\n{'='*60}")
print("Preparing Data for UpSet Plot")
print(f"{'='*60}")

# Create membership matrix
upset_data = []

for cell in adata.obs.index:
    row = {
        'cell_barcode': cell,
        'QClus': cell in qc_methods['QClus'],
        'Combined_5pct': cell in qc_methods['Combined_5pct'],
        'Combined_10pct': cell in qc_methods['Combined_10pct'],
        'Combined_20pct': cell in qc_methods['Combined_20pct'],
        'Combined_25pct': cell in qc_methods['Combined_25pct']
    }
    upset_data.append(row)

upset_df = pd.DataFrame(upset_data)

# Save
csv_file = os.path.join(results_dir, 'upset_5way_data.csv')
upset_df.to_csv(csv_file, index=False)
print(f"✓ UpSet data saved: {csv_file}")

# Calculate intersection counts
print(f"\nCalculating all intersections...")
intersection_counts = upset_df.groupby([
    'QClus', 'Combined_5pct', 'Combined_10pct', 'Combined_20pct', 'Combined_25pct'
]).size().reset_index(name='Count')

intersection_counts = intersection_counts[intersection_counts['Count'] > 0].sort_values('Count', ascending=False)

print(f"\nTop 15 intersections:")
print(intersection_counts.head(15).to_string(index=False))

# Save
csv_file = os.path.join(results_dir, 'intersection_counts_5way.csv')
intersection_counts.to_csv(csv_file, index=False)
print(f"\n✓ Intersection counts saved: {csv_file}")

# ====================================================================
# Prepare data for Venn diagrams (pairwise comparisons with QClus)
# ====================================================================

print(f"\n{'='*60}")
print("Preparing Venn Diagram Data (QClus vs each Combined)")
print(f"{'='*60}")

venn_data = []

for combined_method in ['Combined_5pct', 'Combined_10pct', 'Combined_20pct', 'Combined_25pct']:
    qclus_set = qc_methods['QClus']
    combined_set = qc_methods[combined_method]
    
    overlap = qclus_set & combined_set
    qclus_only = qclus_set - combined_set
    combined_only = combined_set - qclus_set
    union = qclus_set | combined_set
    
    venn_data.append({
        'Comparison': f'QClus vs {combined_method}',
        'Method1': 'QClus',
        'Method2': combined_method,
        'Method1_Total': len(qclus_set),
        'Method2_Total': len(combined_set),
        'Method1_Only': len(qclus_only),
        'Method2_Only': len(combined_only),
        'Overlap': len(overlap),
        'Union': len(union),
        'Jaccard': len(overlap) / len(union)
    })
    
    print(f"\n{combined_method}:")
    print(f"  QClus only: {len(qclus_only):,}")
    print(f"  {combined_method} only: {len(combined_only):,}")
    print(f"  Overlap: {len(overlap):,}")
    print(f"  Jaccard: {len(overlap) / len(union):.3f}")

venn_df = pd.DataFrame(venn_data)

# Save
csv_file = os.path.join(results_dir, 'venn_pairwise_data.csv')
venn_df.to_csv(csv_file, index=False)
print(f"\n✓ Venn pairwise data saved: {csv_file}")

# ====================================================================
# Visualize overlap heatmap
# ====================================================================

print(f"\n{'='*60}")
print("Creating Overlap Heatmap")
print(f"{'='*60}")

fig, ax = plt.subplots(1, 1, figsize=(10, 9))

# Plot heatmap
im = ax.imshow(overlap_matrix, cmap='YlOrRd', aspect='auto')

# Set ticks
ax.set_xticks(np.arange(len(method_names)))
ax.set_yticks(np.arange(len(method_names)))
ax.set_xticklabels(method_names, fontsize=11)
ax.set_yticklabels(method_names, fontsize=11)

# Rotate x labels
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Add text annotations
for i in range(len(method_names)):
    for j in range(len(method_names)):
        value = overlap_matrix[i, j]
        percentage = (value / total_cells) * 100
        
        # Determine text color
        text_color = "white" if value > overlap_matrix.max() / 2 else "black"
        
        # Add text
        text = ax.text(j, i, f'{value:,}\n({percentage:.1f}%)',
                      ha="center", va="center", color=text_color,
                      fontsize=9, fontweight='bold')

ax.set_title('5-way Overlap Matrix: QClus vs Combined QC Methods', 
             fontsize=14, fontweight='bold', pad=20)

# Add colorbar
cbar = fig.colorbar(im, ax=ax, label='Number of Cells')

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'overlap_heatmap_5way.png')
pdf_file = os.path.join(figures_dir, 'overlap_heatmap_5way.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Overlap heatmap saved")

# ====================================================================
# Visualize Jaccard similarity heatmap
# ====================================================================

print(f"\n{'='*60}")
print("Creating Jaccard Similarity Heatmap")
print(f"{'='*60}")

fig, ax = plt.subplots(1, 1, figsize=(10, 9))

# Plot heatmap
im = ax.imshow(jaccard_matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)

# Set ticks
ax.set_xticks(np.arange(len(method_names)))
ax.set_yticks(np.arange(len(method_names)))
ax.set_xticklabels(method_names, fontsize=11)
ax.set_yticklabels(method_names, fontsize=11)

# Rotate x labels
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Add text annotations
for i in range(len(method_names)):
    for j in range(len(method_names)):
        value = jaccard_matrix[i, j]
        
        # Determine text color
        text_color = "white" if value > 0.5 else "black"
        
        # Add text
        text = ax.text(j, i, f'{value:.3f}',
                      ha="center", va="center", color=text_color,
                      fontsize=10, fontweight='bold')

ax.set_title('Jaccard Similarity Between QC Methods', 
             fontsize=14, fontweight='bold', pad=20)

# Add colorbar
cbar = fig.colorbar(im, ax=ax, label='Jaccard Index')

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'jaccard_similarity_heatmap_5way.png')
pdf_file = os.path.join(figures_dir, 'jaccard_similarity_heatmap_5way.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Jaccard similarity heatmap saved")

# ====================================================================
# Generate updated R script
# ====================================================================

print(f"\n{'='*60}")
print("Generating R Script for Venn and UpSet Plots")
print(f"{'='*60}")

r_script = f"""
# R script for creating Venn diagrams and UpSet plot
# 5-way comparison: QClus vs 4 Combined QC methods

library(VennDiagram)
library(UpSetR)
library(tidyverse)

# Set paths
results_dir <- "{results_dir}"
figures_dir <- "{figures_dir}"

# ====================================================================
# Load data
# ====================================================================

venn_data <- read.csv(file.path(results_dir, "venn_pairwise_data.csv"))
upset_data <- read.csv(file.path(results_dir, "upset_5way_data.csv"))

# ====================================================================
# Create pairwise Venn diagrams (QClus vs each Combined)
# ====================================================================

comparisons <- c("Combined_5pct", "Combined_10pct", "Combined_20pct", "Combined_25pct")
labels <- c("5%", "10%", "20%", "25%")

for(i in 1:length(comparisons)) {{
  
  row <- venn_data[venn_data$Method2 == comparisons[i], ]
  
  venn.plot <- draw.pairwise.venn(
    area1 = row$Method1_Total,
    area2 = row$Method2_Total,
    cross.area = row$Overlap,
    category = c("QClus", paste0("Combined QC (", labels[i], " SoupX)")),
    fill = c("#E74C3C", "#3498DB"),
    alpha = 0.5,
    cat.fontface = "bold",
    cat.cex = 1.3,
    cex = 1.6,
    cat.pos = c(-20, 20),
    cat.dist = 0.05,
    euler.d = TRUE,
    scaled = TRUE
  )
  
  # Save PNG
  png(file.path(figures_dir, paste0("venn_QClus_vs_Combined_", labels[i], ".png")),
      width = 900, height = 900, res = 150)
  grid.draw(venn.plot)
  dev.off()
  
  # Save PDF
  pdf(file.path(figures_dir, paste0("venn_QClus_vs_Combined_", labels[i], ".pdf")),
      width = 9, height = 9)
  grid.draw(venn.plot)
  dev.off()
  
  cat("Created Venn diagram: QClus vs Combined", labels[i], "\\n")
}}

# ====================================================================
# Create 5-way UpSet plot
# ====================================================================

# Prepare data
upset_matrix <- upset_data %>%
  select(-cell_barcode) %>%
  mutate(across(everything(), as.numeric))

# Create UpSet plot
png(file.path(figures_dir, "upset_5way_QClus_vs_CombinedQC.png"),
    width = 1400, height = 900, res = 150)

upset(upset_matrix,
      sets = c("QClus", "Combined_5pct", "Combined_10pct", "Combined_20pct", "Combined_25pct"),
      order.by = "freq",
      mainbar.y.label = "Number of Cells",
      sets.x.label = "Total Cells Flagged",
      text.scale = c(1.5, 1.3, 1.3, 1.2, 1.5, 1.3),
      point.size = 4,
      line.size = 1.2,
      set_size.show = TRUE,
      set_size.angles = 0,
      mb.ratio = c(0.6, 0.4))

dev.off()

# Save PDF
pdf(file.path(figures_dir, "upset_5way_QClus_vs_CombinedQC.pdf"),
    width = 14, height = 9)

upset(upset_matrix,
      sets = c("QClus", "Combined_5pct", "Combined_10pct", "Combined_20pct", "Combined_25pct"),
      order.by = "freq",
      mainbar.y.label = "Number of Cells",
      sets.x.label = "Total Cells Flagged",
      text.scale = c(1.5, 1.3, 1.3, 1.2, 1.5, 1.3),
      point.size = 4,
      line.size = 1.2,
      set_size.show = TRUE,
      set_size.angles = 0,
      mb.ratio = c(0.6, 0.4))

dev.off()

cat("\\n5-way UpSet plot created\\n")
cat("All visualizations complete!\\n")
"""

# Save R script
r_script_file = os.path.join(results_dir, 'create_venn_upset_5way.R')
with open(r_script_file, 'w') as f:
    f.write(r_script)

print(f"✓ R script saved: {r_script_file}")

# ====================================================================
# Summary
# ====================================================================

print(f"\n{'='*60}")
print("5-way Analysis Complete!")
print(f"{'='*60}")
print(f"\nQC Method Sizes (FIXED):")
for method, cells in qc_methods.items():
    print(f"  {method}: {len(cells):,} cells")

print(f"\nFiles created:")
print(f"  - pairwise_overlap_matrix.csv")
print(f"  - jaccard_similarity_matrix.csv")
print(f"  - upset_5way_data.csv")
print(f"  - intersection_counts_5way.csv")
print(f"  - venn_pairwise_data.csv")
print(f"  - create_venn_upset_5way.R")
print(f"  - 2 heatmaps (overlap + Jaccard)")

print(f"\nTo create Venn and UpSet plots in R:")
print(f"  source('{r_script_file}')")

Multi-way Overlap Analysis: QClus vs 4 Combined QC Methods

Defining 5 independent QC methods...

QC Method Sizes (all FIXED):
  QClus: 113,366 cells (43.27%)
  Combined_5pct: 38,581 cells (14.73%)
  Combined_10pct: 50,387 cells (19.23%)
  Combined_20pct: 74,457 cells (28.42%)
  Combined_25pct: 86,575 cells (33.04%)

Pairwise Overlaps

Overlap Matrix:
                 QClus  Combined_5pct  Combined_10pct  Combined_20pct  \
QClus           113366          28442           34982           45585   
Combined_5pct    28442          38581           38581           38581   
Combined_10pct   34982          38581           50387           50387   
Combined_20pct   45585          38581           50387           74457   
Combined_25pct   50456          38581           50387           74457   

                Combined_25pct  
QClus                    50456  
Combined_5pct            38581  
Combined_10pct           50387  
Combined_20pct           74457  
Combined_25pct           86575  

✓ Overla

---
# 4- Comparison with orginal paper

In [34]:
# ====================================================================
# Add 'paper' column
# ====================================================================

print("="*60)
print("Adding 'paper' column to AnnData (Fast)")
print("="*60)

# Load paper metadata
metadata_path = "/work/archive/public_studies/Hill/cellranger_runs/GSE255612_AF_snRNA_MetaData.txt"
paper_metadata = pd.read_csv(metadata_path, sep="\t", low_memory=False)

# Remove TYPE row
paper_metadata = paper_metadata.drop(0).reset_index(drop=True)

print(f"\nPaper metadata: {len(paper_metadata):,} cells")
print(f"AnnData: {adata.n_obs:,} cells")

# Extract first 16 characters from paper barcodes (before -1-X)
paper_barcodes_16 = set([bc[:16] for bc in paper_metadata['NAME'].values])
print(f"\nUnique 16-char barcodes in paper: {len(paper_barcodes_16):,}")

# Extract first 16 characters from adata barcodes (before _sampleID)
print("\nExtracting barcodes from adata...")
adata_barcodes_16 = [bc.split('_')[0][:16] for bc in adata.obs.index]

# Fast matching using set
print("Matching barcodes...")
adata.obs['paper'] = [bc in paper_barcodes_16 for bc in adata_barcodes_16]

# Summary
n_in_paper = sum(adata.obs['paper'])
n_not_in_paper = len(adata.obs) - n_in_paper

print(f"\nResults:")
print(f"  In paper: {n_in_paper:,} ({n_in_paper/adata.n_obs*100:.2f}%)")
print(f"  Not in paper: {n_not_in_paper:,} ({n_not_in_paper/adata.n_obs*100:.2f}%)")

# Convert to categorical
adata.obs['paper'] = pd.Categorical(adata.obs['paper'])

# Quick stats
print(f"\n{'='*60}")
print("Paper vs AF status:")
print(pd.crosstab(adata.obs['paper'], adata.obs['af_status']))

print(f"\n{'='*60}")
print("QC metrics comparison:")
for metric in ['total_counts', 'n_genes_by_counts', 'pct_counts_mt']:
    paper_mean = adata.obs[adata.obs['paper'].astype(bool)][metric].mean()
    non_paper_mean = adata.obs[~adata.obs['paper'].astype(bool)][metric].mean()
    print(f"  {metric}: Paper={paper_mean:.1f}, Non-paper={non_paper_mean:.1f}")

# UMAP
print(f"\n{'='*60}")
print("Creating UMAP...")

fig, ax = plt.subplots(1, 1, figsize=(10, 8))
adata.obs['paper_str'] = adata.obs['paper'].astype(bool).map({True: 'In Paper', False: 'Not in Paper'})

sc.pl.umap(adata, color='paper_str', 
           palette={'In Paper': '#2ECC71', 'Not in Paper': '#95A5A6'},
           ax=ax, show=False, title='Paper Status', frameon=False)

plt.savefig(os.path.join(figures_dir, 'UMAP_paper_status.png'), dpi=300, bbox_inches='tight')
plt.savefig(os.path.join(figures_dir, 'UMAP_paper_status.pdf'), bbox_inches='tight')
plt.close()

adata.obs.drop(columns=['paper_str'], inplace=True)
print("✓ UMAP saved")

# Save
output_file = os.path.join(data_dir, f'Hill_preprocessed_with_paper_{datetime.now().strftime("%Y%m%d")}.h5ad')
adata.write_h5ad(output_file)
print(f"\n✓ Saved: {output_file}")

print(f"\n{'='*60}")
print("Done!")
print(f"{'='*60}")

Adding 'paper' column to AnnData (Fast)

Paper metadata: 179,697 cells
AnnData: 262,003 cells

Unique 16-char barcodes in paper: 175,275

Extracting barcodes from adata...
Matching barcodes...

Results:
  In paper: 178,930 (68.29%)
  Not in paper: 83,073 (31.71%)

Paper vs AF status:
af_status   case  control
paper                    
False      43892    39181
True       95964    82966

QC metrics comparison:
  total_counts: Paper=3172.8, Non-paper=3757.0
  n_genes_by_counts: Paper=1653.7, Non-paper=1738.4
  pct_counts_mt: Paper=1.0, Non-paper=2.6

Creating UMAP...
✓ UMAP saved

✓ Saved: /work/archive/farhadie/public_studies/Hill/QC_comparison_analysis/processed_data/Hill_preprocessed_with_paper_20251018.h5ad

Done!


In [39]:
# ====================================================================
# Overlap Analysis: 6-way comparison (including Paper FILTERED)
# ====================================================================

print("="*60)
print("6-way Overlap Analysis: QClus + Combined QC + Paper (FILTERED)")
print("="*60)

# ====================================================================
# Define 6 independent QC methods - CORRECTED PAPER
# ====================================================================

print("\nDefining 6 independent QC/filter methods...")

qc_methods = {
    'QClus': set(adata.obs[adata.obs['qclus'] != 'passed'].index),
    'Combined_5pct': set(adata.obs[adata.obs['combined_qc_5pct_flag'] == True].index),
    'Combined_10pct': set(adata.obs[adata.obs['combined_qc_10pct_flag'] == True].index),
    'Combined_20pct': set(adata.obs[adata.obs['combined_qc_20pct_flag'] == True].index),
    'Combined_25pct': set(adata.obs[adata.obs['combined_qc_25pct_flag'] == True].index),
    'Paper_Filtered': set(adata.obs[~adata.obs['paper'].astype(bool)].index)  # NOT in paper = filtered
}

total_cells = adata.n_obs

# Print sizes
print("\nMethod Sizes (all FIXED):")
for method, cells in qc_methods.items():
    print(f"  {method}: {len(cells):,} cells ({len(cells)/total_cells*100:.2f}%)")

# ====================================================================
# Calculate pairwise overlaps - 6x6 matrix
# ====================================================================

print(f"\n{'='*60}")
print("Calculating Pairwise Overlaps")
print(f"{'='*60}")

method_names = list(qc_methods.keys())
n_methods = len(method_names)

# Overlap matrix (intersection counts)
overlap_matrix = np.zeros((n_methods, n_methods), dtype=int)

for i, method1 in enumerate(method_names):
    for j, method2 in enumerate(method_names):
        overlap_matrix[i, j] = len(qc_methods[method1] & qc_methods[method2])

# Create overlap dataframe
overlap_df = pd.DataFrame(overlap_matrix, index=method_names, columns=method_names)

print("\nOverlap Matrix (cell counts):")
print(overlap_df)

# Save
csv_file = os.path.join(results_dir, 'pairwise_overlap_matrix_6way.csv')
overlap_df.to_csv(csv_file)
print(f"\n✓ Overlap matrix saved: {csv_file}")

# ====================================================================
# Calculate Jaccard similarity
# ====================================================================

print(f"\n{'='*60}")
print("Calculating Jaccard Similarity")
print(f"{'='*60}")

jaccard_matrix = np.zeros((n_methods, n_methods))

for i, method1 in enumerate(method_names):
    for j, method2 in enumerate(method_names):
        if i == j:
            jaccard_matrix[i, j] = 1.0
        else:
            intersection = len(qc_methods[method1] & qc_methods[method2])
            union = len(qc_methods[method1] | qc_methods[method2])
            jaccard_matrix[i, j] = intersection / union if union > 0 else 0

jaccard_df = pd.DataFrame(jaccard_matrix, index=method_names, columns=method_names)

print("\nJaccard Similarity Matrix:")
print(jaccard_df.round(3))

# Save
csv_file = os.path.join(results_dir, 'jaccard_similarity_matrix_6way.csv')
jaccard_df.to_csv(csv_file)
print(f"\n✓ Jaccard similarity saved: {csv_file}")

# ====================================================================
# Overlap percentage matrix (% of row method that overlaps with column)
# ====================================================================

print(f"\n{'='*60}")
print("Calculating Overlap Percentages")
print(f"{'='*60}")

overlap_pct_matrix = np.zeros((n_methods, n_methods))

for i, method1 in enumerate(method_names):
    for j, method2 in enumerate(method_names):
        if len(qc_methods[method1]) > 0:
            overlap_pct_matrix[i, j] = (overlap_matrix[i, j] / len(qc_methods[method1])) * 100
        else:
            overlap_pct_matrix[i, j] = 0

overlap_pct_df = pd.DataFrame(overlap_pct_matrix, index=method_names, columns=method_names)

print("\nOverlap Percentage Matrix (% of row found in column):")
print(overlap_pct_df.round(1))

# Save
csv_file = os.path.join(results_dir, 'overlap_percentage_matrix_6way.csv')
overlap_pct_df.to_csv(csv_file)
print(f"\n✓ Overlap percentage saved: {csv_file}")

# ====================================================================
# Visualize overlap count heatmap
# ====================================================================

print(f"\n{'='*60}")
print("Creating Overlap Count Heatmap")
print(f"{'='*60}")

fig, ax = plt.subplots(1, 1, figsize=(12, 10))

# Plot heatmap
im = ax.imshow(overlap_matrix, cmap='YlOrRd', aspect='auto')

# Set ticks
ax.set_xticks(np.arange(n_methods))
ax.set_yticks(np.arange(n_methods))
ax.set_xticklabels(method_names, fontsize=11)
ax.set_yticklabels(method_names, fontsize=11)

# Rotate x labels
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Add text annotations
for i in range(n_methods):
    for j in range(n_methods):
        value = overlap_matrix[i, j]
        percentage = (value / total_cells) * 100
        
        # Determine text color
        text_color = "white" if value > overlap_matrix.max() / 2 else "black"
        
        # Add text
        text = ax.text(j, i, f'{value:,}\n({percentage:.1f}%)',
                      ha="center", va="center", color=text_color,
                      fontsize=9, fontweight='bold')

ax.set_title('6-way Overlap: QClus + Combined QC + Paper Filtered', 
             fontsize=14, fontweight='bold', pad=20)

# Add colorbar
cbar = fig.colorbar(im, ax=ax, label='Number of Cells')

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'overlap_heatmap_6way_counts.png')
pdf_file = os.path.join(figures_dir, 'overlap_heatmap_6way_counts.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Overlap count heatmap saved")

# ====================================================================
# Visualize Jaccard similarity heatmap
# ====================================================================

print(f"\n{'='*60}")
print("Creating Jaccard Similarity Heatmap")
print(f"{'='*60}")

fig, ax = plt.subplots(1, 1, figsize=(12, 10))

# Plot heatmap
im = ax.imshow(jaccard_matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)

# Set ticks
ax.set_xticks(np.arange(n_methods))
ax.set_yticks(np.arange(n_methods))
ax.set_xticklabels(method_names, fontsize=11)
ax.set_yticklabels(method_names, fontsize=11)

# Rotate x labels
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Add text annotations
for i in range(n_methods):
    for j in range(n_methods):
        value = jaccard_matrix[i, j]
        
        # Determine text color
        text_color = "white" if value > 0.5 else "black"
        
        # Add text
        text = ax.text(j, i, f'{value:.3f}',
                      ha="center", va="center", color=text_color,
                      fontsize=10, fontweight='bold')

ax.set_title('Jaccard Similarity Between QC Methods', 
             fontsize=14, fontweight='bold', pad=20)

# Add colorbar
cbar = fig.colorbar(im, ax=ax, label='Jaccard Index (Intersection/Union)')

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'jaccard_similarity_heatmap_6way.png')
pdf_file = os.path.join(figures_dir, 'jaccard_similarity_heatmap_6way.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Jaccard similarity heatmap saved")

# ====================================================================
# Visualize overlap percentage heatmap
# ====================================================================

print(f"\n{'='*60}")
print("Creating Overlap Percentage Heatmap")
print(f"{'='*60}")

fig, ax = plt.subplots(1, 1, figsize=(12, 10))

# Plot heatmap
im = ax.imshow(overlap_pct_matrix, cmap='Blues', aspect='auto', vmin=0, vmax=100)

# Set ticks
ax.set_xticks(np.arange(n_methods))
ax.set_yticks(np.arange(n_methods))
ax.set_xticklabels(method_names, fontsize=11)
ax.set_yticklabels(method_names, fontsize=11)

# Rotate x labels
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Add text annotations
for i in range(n_methods):
    for j in range(n_methods):
        value = overlap_pct_matrix[i, j]
        
        # Determine text color
        text_color = "white" if value > 50 else "black"
        
        # Add text
        text = ax.text(j, i, f'{value:.1f}%',
                      ha="center", va="center", color=text_color,
                      fontsize=9, fontweight='bold')

ax.set_title('Overlap Percentage: % of Row Method Found in Column Method', 
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Column Method', fontsize=12, fontweight='bold')
ax.set_ylabel('Row Method (% of these cells also in column)', fontsize=12, fontweight='bold')

# Add colorbar
cbar = fig.colorbar(im, ax=ax, label='Percentage (%)')

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'overlap_percentage_heatmap_6way.png')
pdf_file = os.path.join(figures_dir, 'overlap_percentage_heatmap_6way.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Overlap percentage heatmap saved")

# ====================================================================
# Detailed overlap report
# ====================================================================

print(f"\n{'='*60}")
print("Generating Detailed Overlap Report")
print(f"{'='*60}")

# Pairwise comparisons with Paper_Filtered
paper_comparisons = []

for method in ['QClus', 'Combined_5pct', 'Combined_10pct', 'Combined_20pct', 'Combined_25pct']:
    method_set = qc_methods[method]
    paper_set = qc_methods['Paper_Filtered']
    
    overlap = method_set & paper_set
    method_only = method_set - paper_set
    paper_only = paper_set - method_set
    union = method_set | paper_set
    
    jaccard = len(overlap) / len(union) if len(union) > 0 else 0
    
    paper_comparisons.append({
        'Method': method,
        'Method_Total': len(method_set),
        'Paper_Filtered_Total': len(paper_set),
        'Overlap': len(overlap),
        'Method_Only': len(method_only),
        'Paper_Filtered_Only': len(paper_only),
        'Union': len(union),
        'Jaccard': jaccard,
        'Pct_Method_in_Paper_Filtered': (len(overlap) / len(method_set) * 100) if len(method_set) > 0 else 0,
        'Pct_Paper_Filtered_in_Method': (len(overlap) / len(paper_set) * 100) if len(paper_set) > 0 else 0
    })

paper_comp_df = pd.DataFrame(paper_comparisons)

print("\nComparison with Paper Filtered Cells:")
print(paper_comp_df.to_string(index=False))

# Save
csv_file = os.path.join(results_dir, 'paper_filtered_comparison_detailed.csv')
paper_comp_df.to_csv(csv_file, index=False)
print(f"\n✓ Paper filtered comparison saved: {csv_file}")

# ====================================================================
# Summary
# ====================================================================

print(f"\n{'='*60}")
print("6-way Analysis Complete!")
print(f"{'='*60}")

print(f"\nFiles created:")
print(f"  - pairwise_overlap_matrix_6way.csv")
print(f"  - jaccard_similarity_matrix_6way.csv")
print(f"  - overlap_percentage_matrix_6way.csv")
print(f"  - paper_filtered_comparison_detailed.csv")
print(f"  - 3 heatmaps (counts, Jaccard, percentage) - PNG + PDF each")

print(f"\nTotal files: 10 (4 CSV + 6 images)")

6-way Overlap Analysis: QClus + Combined QC + Paper (FILTERED)

Defining 6 independent QC/filter methods...

Method Sizes (all FIXED):
  QClus: 113,366 cells (43.27%)
  Combined_5pct: 38,581 cells (14.73%)
  Combined_10pct: 50,387 cells (19.23%)
  Combined_20pct: 74,457 cells (28.42%)
  Combined_25pct: 86,575 cells (33.04%)
  Paper_Filtered: 83,073 cells (31.71%)

Calculating Pairwise Overlaps

Overlap Matrix (cell counts):
                 QClus  Combined_5pct  Combined_10pct  Combined_20pct  \
QClus           113366          28442           34982           45585   
Combined_5pct    28442          38581           38581           38581   
Combined_10pct   34982          38581           50387           50387   
Combined_20pct   45585          38581           50387           74457   
Combined_25pct   50456          38581           50387           74457   
Paper_Filtered   57721          28521           36978           47788   

                Combined_25pct  Paper_Filtered  
QClus      

---
# 5- Cell typing

In [42]:
# ====================================================================
# Cell Type Assignment Based on Marker Scores
# ====================================================================

print("="*60)
print("Cell Type Assignment Using Marker Scores")
print("="*60)

# ====================================================================
# Define valid cell types (exclude QC metrics)
# ====================================================================

# Define cell type abbreviations and their full names
cell_type_mapping = {
    'CM_nucl': 'Cardiomyocytes (Nuclear)',
    'VEC': 'Vascular Endothelial Cells',
    'PER': 'Pericytes',
    'SMC': 'Smooth Muscle Cells',
    'AD': 'Adipocytes',
    'SC': 'Schwann Cells',
    'N': 'Neuronal',
    'EEC': 'Endocardial Endothelial Cells',
    'FB': 'Fibroblasts',
    'L': 'Lymphocytes',
    'MESO': 'Mesothelial',
    'MP': 'Myeloid/Phagocytes'
}

# Get ONLY the cell type score columns (exclude nuclear, MT, etc.)
cell_types = list(cell_type_mapping.keys())
score_columns = [f'score_{ct}' for ct in cell_types]

print(f"\nUsing {len(cell_types)} valid cell types:")
for ct in cell_types:
    print(f"  - {ct}: {cell_type_mapping[ct]}")

# Verify all columns exist
missing_cols = [col for col in score_columns if col not in adata.obs.columns]
if missing_cols:
    print(f"\n⚠️  Warning: Missing columns: {missing_cols}")
    # Remove missing from lists
    for col in missing_cols:
        ct = col.replace('score_', '')
        cell_types.remove(ct)
        score_columns.remove(col)
        del cell_type_mapping[ct]

# ====================================================================
# Assign cell type based on highest score
# ====================================================================

print(f"\n{'='*60}")
print("Assigning Cell Types")
print(f"{'='*60}")

# Create a dataframe with ONLY valid cell type scores
scores_df = adata.obs[score_columns].copy()
scores_df.columns = cell_types

# Find cell type with maximum score for each cell
adata.obs['cell_type_assigned'] = scores_df.idxmax(axis=1)
adata.obs['cell_type_score'] = scores_df.max(axis=1)

# Add full names
adata.obs['cell_type_full'] = adata.obs['cell_type_assigned'].map(cell_type_mapping)

print(f"\nCell type distribution:")
cell_type_counts = adata.obs['cell_type_assigned'].value_counts()
for ct, count in cell_type_counts.items():
    full_name = cell_type_mapping.get(ct, ct)
    print(f"  {ct:15s} ({full_name:40s}): {count:6,} cells ({count/adata.n_obs*100:5.2f}%)")

# ====================================================================
# Calculate mean scores per cluster
# ====================================================================

print(f"\n{'='*60}")
print("Calculating Mean Scores per Cluster (Leiden res=1.0)")
print(f"{'='*60}")

# Group by leiden clusters and calculate mean scores
cluster_scores = adata.obs.groupby('leiden_res1.0')[score_columns].mean()
cluster_scores.columns = cell_types

# For each cluster, find the cell type with highest mean score
cluster_assignments = cluster_scores.idxmax(axis=1)
cluster_max_scores = cluster_scores.max(axis=1)

# Create summary table
cluster_summary = pd.DataFrame({
    'Cluster': cluster_scores.index,
    'Assigned_Cell_Type': cluster_assignments.values,
    'Max_Score': cluster_max_scores.values,
    'N_Cells': adata.obs.groupby('leiden_res1.0').size().values
})

# Add full names
cluster_summary['Cell_Type_Full'] = cluster_summary['Assigned_Cell_Type'].map(cell_type_mapping)

# Add all scores for reference
for ct in cell_types:
    cluster_summary[f'Score_{ct}'] = cluster_scores[ct].values

# Sort by cluster number
cluster_summary['Cluster'] = cluster_summary['Cluster'].astype(int)
cluster_summary = cluster_summary.sort_values('Cluster')

print("\nCluster Assignment Summary:")
print(cluster_summary[['Cluster', 'Assigned_Cell_Type', 'Cell_Type_Full', 'Max_Score', 'N_Cells']].to_string(index=False))

# Save full table
csv_file = os.path.join(results_dir, 'cluster_cell_type_assignments.csv')
cluster_summary.to_csv(csv_file, index=False)
print(f"\n✓ Full cluster assignment table saved: {csv_file}")

# ====================================================================
# Create heatmap of cluster scores
# ====================================================================

print(f"\n{'='*60}")
print("Creating Cluster Score Heatmap")
print(f"{'='*60}")

# Prepare data for heatmap
heatmap_data = cluster_scores.T  # Transpose so cell types are rows

fig, ax = plt.subplots(1, 1, figsize=(max(14, len(cluster_scores) * 0.5), 12))

# Plot heatmap
im = ax.imshow(heatmap_data, cmap='RdYlBu_r', aspect='auto')

# Set ticks
ax.set_xticks(np.arange(len(cluster_scores)))
ax.set_yticks(np.arange(len(cell_types)))
ax.set_xticklabels(cluster_scores.index, fontsize=9)
ax.set_yticklabels([f"{ct} - {cell_type_mapping[ct][:30]}" for ct in cell_types], fontsize=9)

# Rotate x labels
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

# Add text annotations - show only for top 3 scores per cluster
for j in range(len(cluster_scores)):
    # Get top 3 scores for this cluster
    col_values = heatmap_data.iloc[:, j]
    top3_indices = col_values.nlargest(3).index
    
    for i in range(len(cell_types)):
        value = heatmap_data.iloc[i, j]
        
        # Only annotate if in top 3
        if cell_types[i] in [ct for ct in top3_indices]:
            text_color = "white" if value > heatmap_data.values.mean() else "black"
            text = ax.text(j, i, f'{value:.1f}',
                          ha="center", va="center", color=text_color,
                          fontsize=7, fontweight='bold')

ax.set_xlabel('Leiden Cluster (res=1.0)', fontsize=12, fontweight='bold')
ax.set_ylabel('Cell Type', fontsize=12, fontweight='bold')
ax.set_title('Mean Cell Type Marker Scores per Cluster', fontsize=14, fontweight='bold', pad=20)

# Add colorbar
cbar = fig.colorbar(im, ax=ax, label='Mean Score')

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'cluster_celltype_score_heatmap.png')
pdf_file = os.path.join(figures_dir, 'cluster_celltype_score_heatmap.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Heatmap saved")

# ====================================================================
# Create UMAP with cell type annotations
# ====================================================================

print(f"\n{'='*60}")
print("Creating Cell Type UMAPs")
print(f"{'='*60}")

# Generate color palette for cell types
n_celltypes = len(cell_type_counts)
colors = plt.cm.tab20(np.linspace(0, 1, min(n_celltypes, 20)))
color_dict = dict(zip(cell_type_counts.index, [f'#{int(c[0]*255):02x}{int(c[1]*255):02x}{int(c[2]*255):02x}' for c in colors]))

# Create figure with two panels
fig = plt.figure(figsize=(22, 8))

# Panel 1: UMAP colored by cell type (left, larger)
ax1 = plt.subplot(1, 2, 1)
sc.pl.umap(
    adata,
    color='cell_type_assigned',
    palette=color_dict,
    ax=ax1,
    show=False,
    title='Cell Type Assignment (Based on Marker Scores)',
    frameon=False,
    legend_loc='none'
)

# Panel 2: Legend with full names (right)
ax2 = plt.subplot(1, 2, 2)
ax2.axis('off')

# Create legend with sorted cell types by count
legend_elements = []
for ct in cell_type_counts.index:
    count = cell_type_counts[ct]
    full_name = cell_type_mapping.get(ct, ct)
    label = f'{ct}: {full_name}\n    ({count:,} cells, {count/adata.n_obs*100:.1f}%)'
    legend_elements.append(mpatches.Patch(facecolor=color_dict[ct], label=label))

ax2.legend(
    handles=legend_elements,
    loc='center left',
    fontsize=10,
    frameon=True,
    title='Cell Types',
    title_fontsize=12
)

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'UMAP_cell_types_with_legend.png')
pdf_file = os.path.join(figures_dir, 'UMAP_cell_types_with_legend.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ UMAP with legend saved")

# Also create a version with legend on data
fig, ax = plt.subplots(1, 1, figsize=(14, 11))

sc.pl.umap(
    adata,
    color='cell_type_assigned',
    palette=color_dict,
    ax=ax,
    show=False,
    title='Cell Type Assignment',
    frameon=False,
    legend_loc='right margin',
    legend_fontsize=8
)

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'UMAP_cell_types_simple.png')
pdf_file = os.path.join(figures_dir, 'UMAP_cell_types_simple.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Simple UMAP saved")

# ====================================================================
# Create UMAP colored by cell type score
# ====================================================================

print(f"\n{'='*60}")
print("Creating Score Distribution UMAP")
print(f"{'='*60}")

fig, ax = plt.subplots(1, 1, figsize=(11, 9))

sc.pl.umap(
    adata,
    color='cell_type_score',
    cmap='viridis',
    ax=ax,
    show=False,
    title='Cell Type Assignment Score (Confidence)',
    frameon=False,
    vmin='p5',
    vmax='p95'
)

plt.tight_layout()

# Save
png_file = os.path.join(figures_dir, 'UMAP_cell_type_scores.png')
pdf_file = os.path.join(figures_dir, 'UMAP_cell_type_scores.pdf')
plt.savefig(png_file, dpi=300, bbox_inches='tight')
plt.savefig(pdf_file, bbox_inches='tight')
plt.close()

print(f"✓ Score UMAP saved")

# ====================================================================
# Save updated AnnData
# ====================================================================

print(f"\n{'='*60}")
print("Saving Updated AnnData with Cell Type Annotations")
print(f"{'='*60}")

output_file = os.path.join(data_dir, f'Hill_preprocessed_with_celltypes_{datetime.now().strftime("%Y%m%d")}.h5ad')
adata.write_h5ad(output_file)
print(f"✓ Saved: {output_file}")

# ====================================================================
# Summary
# ====================================================================

print(f"\n{'='*60}")
print("Cell Type Assignment Complete!")
print(f"{'='*60}")

print(f"\nNew columns added to adata.obs:")
print(f"  - cell_type_assigned: Short cell type name")
print(f"  - cell_type_full: Full cell type name")
print(f"  - cell_type_score: Assignment confidence score")

print(f"\nFiles created:")
print(f"  - cluster_cell_type_assignments.csv")
print(f"  - cluster_celltype_score_heatmap (PNG + PDF)")
print(f"  - UMAP_cell_types_with_legend (PNG + PDF)")
print(f"  - UMAP_cell_types_simple (PNG + PDF)")
print(f"  - UMAP_cell_type_scores (PNG + PDF)")

print(f"\nTotal files: 9 (1 CSV + 8 images)")

Cell Type Assignment Using Marker Scores

Using 12 valid cell types:
  - CM_nucl: Cardiomyocytes (Nuclear)
  - VEC: Vascular Endothelial Cells
  - PER: Pericytes
  - SMC: Smooth Muscle Cells
  - AD: Adipocytes
  - SC: Schwann Cells
  - N: Neuronal
  - EEC: Endocardial Endothelial Cells
  - FB: Fibroblasts
  - L: Lymphocytes
  - MESO: Mesothelial
  - MP: Myeloid/Phagocytes

Assigning Cell Types

Cell type distribution:
  FB              (Fibroblasts                             ): 69,912 cells (26.68%)
  CM_nucl         (Cardiomyocytes (Nuclear)                ): 53,788 cells (20.53%)
  MP              (Myeloid/Phagocytes                      ): 32,598 cells (12.44%)
  VEC             (Vascular Endothelial Cells              ): 30,119 cells (11.50%)
  AD              (Adipocytes                              ): 18,325 cells ( 6.99%)
  PER             (Pericytes                               ): 14,546 cells ( 5.55%)
  SMC             (Smooth Muscle Cells                     ): 13,567 cells

  cluster_scores = adata.obs.groupby('leiden_res1.0')[score_columns].mean()
  'N_Cells': adata.obs.groupby('leiden_res1.0').size().values


✓ Heatmap saved

Creating Cell Type UMAPs
✓ UMAP with legend saved
✓ Simple UMAP saved

Creating Score Distribution UMAP
✓ Score UMAP saved

Saving Updated AnnData with Cell Type Annotations
✓ Saved: /work/archive/farhadie/public_studies/Hill/QC_comparison_analysis/processed_data/Hill_preprocessed_with_celltypes_20251018.h5ad

Cell Type Assignment Complete!

New columns added to adata.obs:
  - cell_type_assigned: Short cell type name
  - cell_type_full: Full cell type name
  - cell_type_score: Assignment confidence score

Files created:
  - cluster_cell_type_assignments.csv
  - cluster_celltype_score_heatmap (PNG + PDF)
  - UMAP_cell_types_with_legend (PNG + PDF)
  - UMAP_cell_types_simple (PNG + PDF)
  - UMAP_cell_type_scores (PNG + PDF)

Total files: 9 (1 CSV + 8 images)


In [30]:
! head /work/archive/public_studies/Hill/cellranger_runs/GSE255612_AF_snRNA_MetaData.txt

NAME	biosample_id	donor_id	cell_type	sex	af	n_umi	n_genes	cellranger_percent_mito	exon_prop	entropy	doublet_score	species	species__ontology_label	disease	disease__ontology_label	organ	organ__ontology_label	library_preparation_protocol	library_preparation_protocol__ontology_label
TYPE	group	group	group	group	group	numeric	numeric	numeric	numeric	numeric	numeric	group	group	group	group	group	group	group	group
AAACCCAAGGCAATGC-1-0	1279_3n	P1279_3n	Neuronal	f	case	1240.0	967	0.020967742428183556	0.14444444444444443	8.155590143642492	0.039999999999999994	NCBITaxon_9606	Homo sapiens	MONDO:0004981	atrial fibrillation	UBERON:0002079	left cardiac atrium	EFO_0009922	10x 3' v3
AAACCCACAGCCGTCA-1-0	1279_3n	P1279_3n	Cardiomyocytes	f	case	2357.0	1250	0.005091217812150717	0.10105376925155364	7.385689439881429	0.03571428571428572	NCBITaxon_9606	Homo sapiens	MONDO:0004981	atrial fibrillation	UBERON:0002079	left cardiac atrium	EFO_0009922	10x 3' v3
AAACCCAGTGAATGTA-1-0	1279_3n	P1279_3n	Macrophages	f	cas

In [31]:
adata.obs

Unnamed: 0,fraction_unspliced,n_genes_by_counts,total_counts,total_counts_nuclear,pct_counts_nuclear,score_nuclear,total_counts_MT,pct_counts_MT,score_MT,total_counts_CM_cyto,...,soupx_25pct_flag,doublet_score,doubletfinder_flag,combined_qc_5pct_flag,combined_qc_10pct_flag,combined_qc_20pct_flag,combined_qc_25pct_flag,leiden_res0.5,leiden_res1.0,leiden_res1.5
AAACCCAAGGCAATGC_1279_3n,0.779151,997,1334.0,44.779037,1.956934,2.150000,26.0,1.949026,1.625000,5.0,...,True,-0.637075,False,False,False,False,True,16,15,24
AAACCCACAGCCGTCA_1279_3n,0.802689,1306,2575.0,35.081181,1.434621,5.870833,13.0,0.504854,0.604167,177.0,...,False,-0.402647,False,False,False,False,False,0,1,2
AAACCCAGTATGGTAA_1279_3n,0.168398,1020,2721.0,12.683649,0.695114,-1.579167,782.0,28.739435,59.716346,320.0,...,True,-0.495453,False,True,True,True,True,9,6,11
AAACCCAGTCTGTCAA_1279_3n,0.595269,2215,4132.0,61.395691,1.863954,7.566667,216.0,5.227493,15.552885,119.0,...,False,0.086166,False,True,True,True,True,9,6,11
AAACCCAGTGAATGTA_1279_3n,0.804840,1904,3224.0,45.965084,1.449157,4.908333,15.0,0.465261,0.362179,17.0,...,False,-0.117070,False,False,False,False,False,25,29,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCATTCGTT_1789_1n,0.773471,917,1716.0,46.531418,2.312834,3.253333,39.0,2.272727,2.500000,151.0,...,False,-0.487418,False,False,False,False,False,18,18,19
TTTGTTGTCCCAAGCG_1789_1n,0.794635,2048,3430.0,63.703690,1.952976,10.616667,37.0,1.078717,1.325321,12.0,...,False,0.193767,False,False,False,False,False,7,5,4
TTTGTTGTCCGGCAAC_1789_1n,0.726297,458,594.0,41.604969,3.012122,1.443333,30.0,5.050505,2.141026,7.0,...,True,-0.815685,False,True,True,True,True,3,9,17
TTTGTTGTCGCCGATG_1789_1n,0.790293,3283,7620.0,59.921028,1.580311,26.453333,36.0,0.472441,1.019231,27.0,...,False,1.219335,False,False,False,False,False,5,3,1
