# Codebook Visualization & Analysis

`01_vq_embedding_analysis.ipynb`에서 생성한 `adata_with_vq.h5ad`를 로드하여 codebook을 시각화하고 분석합니다.

## 분석 내용
1. **Codebook Embedding Space**: PCA/UMAP, dendrogram
2. **Codebook Clustering**: Leiden clustering → code-cluster mapping
3. **Cell-level Visualization**: code_cluster vs cell_cluster 비교
4. **Special Code Identification**: cross-tissue, disease-specific codes
5. **Code Specificity Scores**: organ/disease/study specificity
6. **Code-Sample Heatmap**: sample별 code abundance

## 1. Setup & Load

In [None]:
import sys
from pathlib import Path

# Add project root to path
# The project root is taken to be the parent of the current working directory,
# so this only resolves correctly when the notebook is started from a direct
# subdirectory of the project.
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

import json
import numpy as np
import pandas as pd
import torch
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and numerical warnings raised by later cells.
warnings.filterwarnings('ignore')

# Plot settings
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['figure.dpi'] = 100
sc.settings.verbosity = 2

# Echo the resolved root so a wrong working directory is visible immediately.
print(f"Project root: {PROJECT_ROOT}")

In [None]:
# ============================================================
# Configuration - paths must be adjusted for your environment
# ============================================================

# Input (from notebook 01)
ADATA_PATH = "/home/bmi-user/workspace/data/HSvsCD/scMILDQ_Cond/results/vq_analysis/adata_with_vq.h5ad"
CODEBOOK_STATS_PATH = "/home/bmi-user/workspace/data/HSvsCD/scMILDQ_Cond/results/vq_analysis/codebook_stats_whole.csv"

# Output
# The figure directory is created eagerly (including parents) when this cell runs.
OUTPUT_DIR = Path("/home/bmi-user/workspace/data/HSvsCD/scMILDQ_Cond/results/vq_analysis")
FIGURE_DIR = OUTPUT_DIR / "figures"
FIGURE_DIR.mkdir(parents=True, exist_ok=True)

# Column names
# Names of the adata.obs columns read by the cell-level plots and the
# sample x code heatmap.
STUDY_COL = "study"
STATUS_COL = "Status"
ORGAN_COL = "Organ"
SAMPLE_COL = "sample"

# Clustering parameters
LEIDEN_RESOLUTION = 0.5  # for codebook clustering
CELL_LEIDEN_RESOLUTION = 0.5  # for cell clustering

# Thresholds for special codes
# All comparisons against these thresholds are strict (> or <).
DISEASE_SPECIFIC_THRESHOLD = 0.8  # disease_ratio > 0.8 or < 0.2
TISSUE_SPECIFIC_THRESHOLD = 0.9   # organ_ratio > 0.9
CROSS_TISSUE_ENTROPY_THRESHOLD = 0.9  # normalized entropy > 0.9

print(f"Output directory: {OUTPUT_DIR}")
print(f"Figure directory: {FIGURE_DIR}")

In [None]:
# Read the cell-level AnnData written by notebook 01 and report its contents
print(f"Loading adata from: {ADATA_PATH}")
adata = sc.read_h5ad(ADATA_PATH)
print(f"Shape: {adata.n_obs} cells × {adata.n_vars} genes")

# One summary line per annotation container
for container_label, container_keys in (
    ("obs columns", adata.obs.columns),
    ("obsm keys", adata.obsm.keys()),
    ("uns keys", adata.uns.keys()),
):
    print(f"{container_label}: {list(container_keys)}")

In [None]:
# Fetch the codebook matrix from adata.uns and the per-code statistics table
codebook = adata.uns['codebook']
codebook_stats = pd.read_csv(CODEBOOK_STATS_PATH)

for shape_label, table in (("Codebook", codebook), ("Codebook stats", codebook_stats)):
    print(f"{shape_label} shape: {table.shape}")

# A code counts as active when at least one cell was assigned to it
active_mask = codebook_stats['n_cells'].gt(0)
active_codes = codebook_stats.loc[active_mask, 'code_idx'].values
print(f"Active codes: {len(active_codes)} / {len(codebook_stats)}")

In [None]:
# Wrap the codebook in an AnnData so scanpy tools (PCA, UMAP, Leiden) apply
code_labels = [f"code_{i}" for i in range(len(codebook))]
adata_codebook = sc.AnnData(X=codebook)
adata_codebook.obs_names = code_labels

# Attach every statistics column except the code index itself.
# NOTE(review): values are copied positionally, so row i of codebook_stats is
# assumed to describe codebook row i — confirm the CSV is ordered by code_idx.
stat_columns = [name for name in codebook_stats.columns if name != 'code_idx']
for stat_name in stat_columns:
    adata_codebook.obs[stat_name] = codebook_stats[stat_name].values

# Keep only the codes that are actually used by at least one cell
adata_codebook = adata_codebook[active_mask].copy()
print(f"Codebook AnnData shape: {adata_codebook.shape}")

## 2. Codebook Embedding Space

### 2.1 PCA & UMAP of Codebook

In [None]:
# PCA
# The component count is capped by BOTH matrix dimensions: the number of active
# codes (n_obs) and the codebook embedding dimension (n_vars). Capping by n_obs
# alone requests more components than exist when the embedding has <= 50
# dimensions, which the PCA solver rejects.
n_codebook_pcs = min(50, adata_codebook.n_obs - 1, adata_codebook.n_vars - 1)
sc.tl.pca(adata_codebook, n_comps=n_codebook_pcs)
print(f"PCA variance ratio (first 10): {adata_codebook.uns['pca']['variance_ratio'][:10]}")

In [None]:
# Neighbors & UMAP
# Never ask for more PCs than the PCA step actually stored in obsm['X_pca'];
# otherwise the neighbor search fails for low-dimensional codebooks.
n_available_pcs = adata_codebook.obsm['X_pca'].shape[1]
sc.pp.neighbors(adata_codebook, n_neighbors=15, n_pcs=min(30, n_available_pcs))
sc.tl.umap(adata_codebook)

In [None]:
# Plot: codebook UMAP coloured by usage statistics
# log1p compresses the heavy-tailed cell counts for colouring
adata_codebook.obs['log_n_cells'] = np.log1p(adata_codebook.obs['n_cells'])

usage_panels = [
    ('log_n_cells', 'Codebook UMAP - Usage (log n_cells)', {'cmap': 'viridis'}),
    ('n_samples', 'Codebook UMAP - Sample Diversity', {'cmap': 'viridis'}),
    ('is_single_sample', 'Codebook UMAP - Single Sample Codes', {}),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel_ax, (color_key, panel_title, extra_kwargs) in zip(axes, usage_panels):
    sc.pl.umap(adata_codebook, color=color_key, ax=panel_ax, show=False,
               title=panel_title, **extra_kwargs)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'codebook_umap_usage.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Plot: codebook UMAP coloured by organ composition
skin_ratio = adata_codebook.obs['organ_Skin']
colon_ratio = adata_codebook.obs['organ_Colon']

# Dominant organ: 'Skin' only when strictly larger; ties fall through to 'Colon'
adata_codebook.obs['dominant_organ'] = np.where(skin_ratio > colon_ratio, 'Skin', 'Colon')

organ_panels = [
    dict(color='organ_Skin', title='Codebook UMAP - Skin Ratio',
         cmap='Reds', vmin=0, vmax=1),
    dict(color='organ_Colon', title='Codebook UMAP - Colon Ratio',
         cmap='Blues', vmin=0, vmax=1),
    dict(color='dominant_organ', title='Codebook UMAP - Dominant Organ',
         palette={'Skin': 'red', 'Colon': 'blue'}),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel_ax, panel_kwargs in zip(axes, organ_panels):
    sc.pl.umap(adata_codebook, ax=panel_ax, show=False, **panel_kwargs)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'codebook_umap_organ.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Plot: codebook UMAP coloured by disease ratio (overall and per organ)
disease_panels = [
    ('disease_ratio_overall', 'Codebook UMAP - Disease Ratio (Overall)'),
    ('disease_ratio_skin', 'Codebook UMAP - Disease Ratio (Skin: HS)'),
    ('disease_ratio_colon', 'Codebook UMAP - Disease Ratio (Colon: CD)'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for panel_ax, (ratio_key, panel_title) in zip(axes, disease_panels):
    # Shared diverging scale fixed to [0, 1] so the three panels are comparable
    sc.pl.umap(adata_codebook, color=ratio_key, ax=panel_ax, show=False,
               title=panel_title, cmap='RdBu_r', vmin=0, vmax=1)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'codebook_umap_disease.png', dpi=150, bbox_inches='tight')
plt.show()

### 2.2 Dendrogram + Heatmap

In [None]:
# Ward linkage over the embedding vectors of the active codes
active_codebook = codebook[active_codes]
linkage_matrix = linkage(active_codebook, method='ward')

# Per-code features shown next to the dendrogram.
# 'log_n_cells' is created by the usage-UMAP cell, which must have run first.
# NOTE(review): rows here follow adata_codebook order while the linkage rows
# follow active_codes; both are presumed to be the same order — confirm.
heatmap_cols = (
    ['disease_ratio_overall', 'disease_ratio_skin', 'disease_ratio_colon']
    + ['organ_Skin', 'organ_Colon']
    + ['log_n_cells', 'n_samples']
)
heatmap_data = adata_codebook.obs[heatmap_cols].copy()

# Column-wise min-max scaling to [0, 1] for a shared colour scale
column_min = heatmap_data.min()
column_span = heatmap_data.max() - column_min
heatmap_data_norm = (heatmap_data - column_min) / column_span

In [None]:
# Dendrogram (left), feature heatmap (centre), colourbar (right) on manual axes
fig = plt.figure(figsize=(16, 10))
ax_dendro = fig.add_axes([0.05, 0.1, 0.15, 0.8])
ax_heatmap = fig.add_axes([0.22, 0.1, 0.65, 0.8])
ax_cbar = fig.add_axes([0.89, 0.1, 0.02, 0.8])

# Dendrogram drawn in a single colour, without ticks or frame
dendro = dendrogram(linkage_matrix, orientation='left', no_labels=True, ax=ax_dendro,
                    color_threshold=0, above_threshold_color='gray')
ax_dendro.set_xticks([])
ax_dendro.set_yticks([])
for side in ('top', 'right', 'bottom', 'left'):
    ax_dendro.spines[side].set_visible(False)

# Reorder heatmap rows to follow the dendrogram leaves
dendro_order = dendro['leaves']
heatmap_ordered = heatmap_data_norm.iloc[dendro_order]

# Heatmap
im = ax_heatmap.imshow(heatmap_ordered.values, aspect='auto', cmap='viridis')
ax_heatmap.set_xticks(range(len(heatmap_cols)))
ax_heatmap.set_xticklabels(heatmap_cols, rotation=45, ha='right')
ax_heatmap.set_yticks([])
ax_heatmap.set_xlabel('Features')
ax_heatmap.set_ylabel(f'Codes (n={len(active_codes)})')
ax_heatmap.set_title('Codebook Dendrogram + Feature Heatmap')

# Colorbar
plt.colorbar(im, cax=ax_cbar, label='Normalized Value')

plt.savefig(FIGURE_DIR / 'codebook_dendrogram_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Codebook Clustering

### 3.1 Leiden Clustering on Codebook

In [None]:
# Leiden clustering of the codebook neighbour graph; labels go to obs['code_cluster']
sc.tl.leiden(adata_codebook, resolution=LEIDEN_RESOLUTION, key_added='code_cluster')

codebook_cluster_labels = adata_codebook.obs['code_cluster']
n_clusters = codebook_cluster_labels.nunique()
codebook_cluster_sizes = codebook_cluster_labels.value_counts().sort_index()

print(f"Number of code clusters: {n_clusters}")
print(f"Cluster sizes:\n{codebook_cluster_sizes}")

In [None]:
# Left: codebook UMAP coloured by Leiden cluster; right: number of codes per cluster
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
umap_ax, bar_ax = axes

sc.pl.umap(adata_codebook, color='code_cluster', ax=umap_ax, show=False,
           title=f'Codebook UMAP - Leiden Clusters (res={LEIDEN_RESOLUTION})',
           legend_loc='right margin')

# Cluster sizes bar plot
cluster_counts = adata_codebook.obs['code_cluster'].value_counts().sort_index()
bar_ax.bar(cluster_counts.index, cluster_counts.values)
bar_ax.set_xlabel('Cluster')
bar_ax.set_ylabel('Number of Codes')
bar_ax.set_title('Codes per Cluster')

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'codebook_leiden_clusters.png', dpi=150, bbox_inches='tight')
plt.show()

### 3.2 Code-Cluster Mapping

In [None]:
# Map each active code to its Leiden cluster.
# adata_codebook holds active codes only, so inactive codes get no entry.
code_to_cluster = dict(zip(adata_codebook.obs_names, adata_codebook.obs['code_cluster']))

# Re-key by integer code index: obs names look like "code_<idx>"
code_idx_to_cluster = {
    int(code_label.split('_')[1]): cluster_label
    for code_label, cluster_label in code_to_cluster.items()
}

print(f"Mapped {len(code_idx_to_cluster)} codes to clusters")

In [None]:
# Give every cell the cluster of the code it was quantised to.
# NOTE(review): the lookup keys are Python ints, so obs['vq_code'] is presumed
# to hold integer code indices — confirm its dtype after loading.
cell_code_cluster = adata.obs['vq_code'].map(code_idx_to_cluster)

# Codes missing from the mapping (inactive codes) come back as NaN
unmapped = cell_code_cluster.isna().sum()
if unmapped > 0:
    print(f"Warning: {unmapped} cells have unmapped codes (setting to 'inactive')")
    cell_code_cluster = cell_code_cluster.fillna('inactive')

adata.obs['code_cluster'] = cell_code_cluster.astype(str)
print(f"Cell-level code_cluster distribution:\n{adata.obs['code_cluster'].value_counts().head(10)}")

### 3.3 Cluster Characteristics Summary

In [None]:
# Aggregate per-code statistics within each code cluster
mean_only_cols = [
    'n_samples',
    'disease_ratio_overall', 'disease_ratio_skin', 'disease_ratio_colon',
    'organ_Skin', 'organ_Colon',
]
aggregation = {'n_cells': ['sum', 'mean']}
aggregation.update({stat_col: 'mean' for stat_col in mean_only_cols})

cluster_summary = adata_codebook.obs.groupby('code_cluster').agg(aggregation).round(3)

# Flatten the (column, aggregation) pairs into "<column>_<aggregation>" names
cluster_summary.columns = ['_'.join(level_pair).strip() for level_pair in cluster_summary.columns]

# Number of codes per cluster, aligned on the cluster label
codes_per_cluster = adata_codebook.obs['code_cluster'].value_counts().sort_index()
cluster_summary['n_codes'] = codes_per_cluster
cluster_summary = cluster_summary.reset_index()

print("=== Cluster Summary ===")
display(cluster_summary)

In [None]:
# Heatmap of the per-cluster mean ratios (clusters as rows)
ratio_features = ['disease_ratio_overall', 'disease_ratio_skin', 'disease_ratio_colon',
                  'organ_Skin', 'organ_Colon']
plot_cols = [f"{feature}_mean" for feature in ratio_features]
plot_data = cluster_summary.set_index('code_cluster')[plot_cols]

plt.figure(figsize=(10, 6))
# Diverging colour map centred on 0.5, the balanced-ratio midpoint of [0, 1]
sns.heatmap(plot_data, annot=True, fmt='.2f', cmap='RdBu_r', center=0.5,
            vmin=0, vmax=1, linewidths=0.5)
plt.title('Code Cluster Characteristics')
plt.ylabel('Cluster')
plt.tight_layout()
plt.savefig(FIGURE_DIR / 'codebook_cluster_characteristics.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Cell-level Visualization

### 4.1 Cell UMAP (using X_vq)

In [None]:
# Cell-level neighbour graph and UMAP computed on the VQ embeddings (obsm['X_vq'])
MAX_CELLS_FOR_UMAP = 50000

# NOTE(review): the subsample is applied in place (copy=False), so every later
# cell — cell clustering, the sample x code heatmap and the saved
# adata_with_vq_clustered.h5ad — works on the reduced set of cells.
exceeds_cell_limit = adata.n_obs > MAX_CELLS_FOR_UMAP
if exceeds_cell_limit:
    print(f"Subsampling {MAX_CELLS_FOR_UMAP} cells for UMAP computation...")
    sc.pp.subsample(adata, n_obs=MAX_CELLS_FOR_UMAP, random_state=42, copy=False)
    print(f"Subsampled to {adata.n_obs} cells")

print("Computing neighbors...")
sc.pp.neighbors(adata, use_rep='X_vq', n_neighbors=15)

print("Computing UMAP...")
sc.tl.umap(adata)

In [None]:
# Cell UMAP: code cluster (left) next to organ (right)
cell_panels = [
    dict(color='code_cluster', title='Cell UMAP - Code Cluster', legend_loc='right margin'),
    dict(color=ORGAN_COL, title='Cell UMAP - Organ'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 7))
for panel_ax, panel_kwargs in zip(axes, cell_panels):
    sc.pl.umap(adata, ax=panel_ax, show=False, **panel_kwargs)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'cell_umap_code_cluster.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cell UMAP coloured by sample metadata: status (left) and study (right)
metadata_panels = [
    dict(color=STATUS_COL, title='Cell UMAP - Status'),
    dict(color=STUDY_COL, title='Cell UMAP - Study', legend_loc='right margin'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 7))
for panel_ax, panel_kwargs in zip(axes, metadata_panels):
    sc.pl.umap(adata, ax=panel_ax, show=False, **panel_kwargs)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'cell_umap_metadata.png', dpi=150, bbox_inches='tight')
plt.show()

### 4.2 Cell-level Leiden Clustering

In [None]:
# Leiden clustering on the cell neighbour graph; labels go to obs['cell_cluster']
sc.tl.leiden(adata, resolution=CELL_LEIDEN_RESOLUTION, key_added='cell_cluster')

cell_cluster_labels = adata.obs['cell_cluster']
n_cell_clusters = cell_cluster_labels.nunique()
print(f"Number of cell clusters: {n_cell_clusters}")

In [None]:
# Side-by-side cell UMAPs: codebook-derived clusters vs. direct cell clustering
comparison_panels = [
    ('code_cluster', 'Cell UMAP - Code Cluster'),
    ('cell_cluster', 'Cell UMAP - Cell Cluster (Leiden)'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 7))
for panel_ax, (cluster_key, panel_title) in zip(axes, comparison_panels):
    sc.pl.umap(adata, color=cluster_key, ax=panel_ax, show=False,
               title=panel_title, legend_loc='right margin')

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'cell_umap_cluster_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

### 4.3 Cluster Comparison Metrics

In [None]:
# Compare the two partitions on cells whose code belongs to a real cluster
valid_mask = adata.obs['code_cluster'] != 'inactive'
valid_obs = adata.obs.loc[valid_mask]
valid_code_cluster = valid_obs['code_cluster'].astype(int)
valid_cell_cluster = valid_obs['cell_cluster'].astype(int)

# Agreement between the partitions
ari = adjusted_rand_score(valid_code_cluster, valid_cell_cluster)
nmi = normalized_mutual_info_score(valid_code_cluster, valid_cell_cluster)

print("=== Code Cluster vs Cell Cluster Comparison ===")
print(f"Adjusted Rand Index (ARI): {ari:.4f}")
print(f"Normalized Mutual Information (NMI): {nmi:.4f}")
print(f"\nInterpretation:")
print(f"  ARI: 0=random, 1=perfect match")
print(f"  NMI: 0=no mutual info, 1=perfect correlation")

In [None]:
# Row-normalised contingency table: each code-cluster row sums to 1
confusion = pd.crosstab(
    index=adata.obs['code_cluster'],
    columns=adata.obs['cell_cluster'],
    normalize='index',
)

plt.figure(figsize=(12, 10))
sns.heatmap(confusion, cmap='Blues', annot=False)
plt.title('Code Cluster vs Cell Cluster (row-normalized)')
plt.xlabel('Cell Cluster')
plt.ylabel('Code Cluster')
plt.tight_layout()
plt.savefig(FIGURE_DIR / 'cluster_confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Special Code Identification

### 5.1 Code Specificity Scores

In [None]:
def compute_entropy(probs):
    """Compute the normalized Shannon entropy of a probability distribution.

    The entropy is divided by ``log2(number of categories)`` so that 0 means
    all mass sits in one category and 1 means mass is spread evenly over
    *every* category. Zero-probability categories contribute nothing to the
    entropy but still count as categories in the normalization; otherwise a
    distribution split evenly over 2 of 4 categories would score 1.0, the same
    as a fully uniform one.

    Parameters
    ----------
    probs : array-like of float
        Per-category probabilities. NaN entries are treated as missing
        categories and ignored.

    Returns
    -------
    float
        Normalized entropy in [0, 1]; 0.0 when fewer than two categories are
        available or at most one category has non-zero probability.
    """
    probs = np.asarray(probs, dtype=float)
    probs = probs[~np.isnan(probs)]   # drop unknown categories
    n_categories = len(probs)         # counted BEFORE zeros are removed
    probs = probs[probs > 0]          # 0 * log(0) is taken as 0
    if n_categories <= 1 or len(probs) <= 1:
        return 0.0
    entropy = -np.sum(probs * np.log2(probs))
    return entropy / np.log2(n_categories)

# Compute specificity scores for each code
def _two_sided_specificity(ratio):
    """Return |ratio - 0.5| * 2 (higher = more one-sided); NaN/None stays NaN."""
    return abs(ratio - 0.5) * 2 if pd.notna(ratio) else np.nan


# Resolve the 'study_*' columns once instead of once per row. 'study_entropy'
# is written back to adata_codebook.obs at the end of this cell, so it is
# excluded: otherwise re-running the cell would feed the previous result into
# its own entropy computation.
# NOTE(review): the remaining 'study_*' columns are presumed to be per-study
# cell fractions — confirm against notebook 01.
study_cols = [
    col for col in adata_codebook.obs.columns
    if col.startswith('study_') and col != 'study_entropy'
]

specificity_scores = []
for idx, row in adata_codebook.obs.iterrows():
    organ_probs = [row['organ_Skin'], row['organ_Colon']]

    specificity_scores.append({
        'code_name': idx,
        # Gap between the two organ shares (higher = more organ-specific)
        'organ_specificity': max(organ_probs) - min(organ_probs),
        # Normalized entropy of the organ shares (lower = more specific)
        'organ_entropy': compute_entropy(organ_probs),
        # Per-organ ratios may be absent or NaN; those scores stay NaN
        'disease_specificity_overall': _two_sided_specificity(row['disease_ratio_overall']),
        'disease_specificity_skin': _two_sided_specificity(row.get('disease_ratio_skin')),
        'disease_specificity_colon': _two_sided_specificity(row.get('disease_ratio_colon')),
        # Normalized entropy over studies (lower = concentrated in few studies)
        'study_entropy': compute_entropy([row[col] for col in study_cols]),
    })

specificity_df = pd.DataFrame(specificity_scores).set_index('code_name')

# Add to adata_codebook (aligned on the code name index)
for col in specificity_df.columns:
    adata_codebook.obs[col] = specificity_df[col]

specificity_df.describe()

In [None]:
# 2x2 grid of codebook UMAPs, one per specificity score.
# Entropy panels use the reversed colour map so "more specific" looks alike everywhere.
specificity_panels = [
    ('organ_specificity', 'Organ Specificity (higher=more specific)', 'viridis'),
    ('organ_entropy', 'Organ Entropy (lower=more specific)', 'viridis_r'),
    ('disease_specificity_overall', 'Disease Specificity (higher=more specific)', 'viridis'),
    ('study_entropy', 'Study Entropy (lower=more specific)', 'viridis_r'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
for panel_ax, (score_key, panel_title, color_map) in zip(axes.flat, specificity_panels):
    sc.pl.umap(adata_codebook, color=score_key, ax=panel_ax, show=False,
               title=panel_title, cmap=color_map)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'codebook_umap_specificity.png', dpi=150, bbox_inches='tight')
plt.show()

### 5.2 Identify Special Codes

In [None]:
# Threshold the per-code statistics into lists of special code names
codebook_obs = adata_codebook.obs
code_names = adata_codebook.obs_names

# Cross-tissue mixing codes (high organ entropy)
cross_tissue_mask = codebook_obs['organ_entropy'] > CROSS_TISSUE_ENTROPY_THRESHOLD
cross_tissue_codes = code_names[cross_tissue_mask].tolist()
print(f"Cross-tissue codes (organ_entropy > {CROSS_TISSUE_ENTROPY_THRESHOLD}): {len(cross_tissue_codes)}")

# Disease-specific (ratio above threshold) and control-specific (below 1 - threshold)
overall_disease_ratio = codebook_obs['disease_ratio_overall']
disease_high_mask = overall_disease_ratio > DISEASE_SPECIFIC_THRESHOLD
disease_low_mask = overall_disease_ratio < (1 - DISEASE_SPECIFIC_THRESHOLD)
disease_specific_codes = code_names[disease_high_mask].tolist()
control_specific_codes = code_names[disease_low_mask].tolist()
print(f"Disease-specific codes (ratio > {DISEASE_SPECIFIC_THRESHOLD}): {len(disease_specific_codes)}")
print(f"Control-specific codes (ratio < {1-DISEASE_SPECIFIC_THRESHOLD}): {len(control_specific_codes)}")

# Tissue-specific codes: one organ's share exceeds the threshold
skin_specific_mask = codebook_obs['organ_Skin'] > TISSUE_SPECIFIC_THRESHOLD
colon_specific_mask = codebook_obs['organ_Colon'] > TISSUE_SPECIFIC_THRESHOLD
skin_specific_codes = code_names[skin_specific_mask].tolist()
colon_specific_codes = code_names[colon_specific_mask].tolist()
print(f"Skin-specific codes (ratio > {TISSUE_SPECIFIC_THRESHOLD}): {len(skin_specific_codes)}")
print(f"Colon-specific codes (ratio > {TISSUE_SPECIFIC_THRESHOLD}): {len(colon_specific_codes)}")

In [None]:
# Create special code labels
def assign_special_label(row):
    """Return the comma-joined special labels for one codebook row.

    The row's name (the code name) is looked up in each of the special-code
    lists; every list that contains it contributes its tag, in a fixed order.
    Returns 'none' when the code is in none of the lists.
    """
    code_name = row.name
    tagged_lists = (
        ('cross_tissue', cross_tissue_codes),
        ('disease_high', disease_specific_codes),
        ('control_high', control_specific_codes),
        ('skin_specific', skin_specific_codes),
        ('colon_specific', colon_specific_codes),
    )
    labels = [tag for tag, members in tagged_lists if code_name in members]
    if not labels:
        return 'none'
    return ','.join(labels)

special_labels = adata_codebook.obs.apply(assign_special_label, axis=1)
adata_codebook.obs['special_label'] = special_labels
print(f"Special label distribution:\n{adata_codebook.obs['special_label'].value_counts()}")

In [None]:
# Collapse a (possibly multi-valued) special label into one plotting category
def simplify_label(label):
    """Map a comma-joined special label to a single category.

    'none' becomes 'normal'. Otherwise the first tag found in *label*, checked
    in priority order (cross_tissue, disease_high, control_high,
    skin_specific, colon_specific), decides the category. Labels containing
    none of the known tags map to 'other'.
    """
    if label == 'none':
        return 'normal'

    priority = (
        ('cross_tissue', 'cross_tissue'),
        ('disease_high', 'disease_specific'),
        ('control_high', 'control_specific'),
        ('skin_specific', 'skin_specific'),
        ('colon_specific', 'colon_specific'),
    )
    for tag, category in priority:
        if tag in label:
            return category
    return 'other'

# One plotting category per code, derived from its special label
simplified_categories = adata_codebook.obs['special_label'].map(simplify_label)
adata_codebook.obs['special_category'] = simplified_categories
print(f"Special category distribution:\n{adata_codebook.obs['special_category'].value_counts()}")

### 5.3 Visualize Special Codes

In [None]:
# Fixed colour per special category, reused by the cell-level plot
special_palette = dict(
    normal='lightgray',
    cross_tissue='purple',
    disease_specific='red',
    control_specific='blue',
    skin_specific='orange',
    colon_specific='green',
    other='gray',
)

# Codebook UMAP coloured by special category
fig, ax = plt.subplots(figsize=(10, 8))
sc.pl.umap(adata_codebook, color='special_category', ax=ax, show=False,
           title='Codebook UMAP - Special Codes', palette=special_palette,
           legend_loc='right margin')
plt.tight_layout()
plt.savefig(FIGURE_DIR / 'codebook_umap_special_codes.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Propagate the per-code special category to every cell.
# Integer code index ("code_<idx>" -> idx) -> special category
code_to_special = {
    int(code_label.split('_')[1]): category
    for code_label, category in zip(adata_codebook.obs_names,
                                    adata_codebook.obs['special_category'])
}

# Cells whose code has no entry (inactive codes) are tagged 'inactive'
cell_special_category = adata.obs['vq_code'].map(code_to_special)
adata.obs['special_category'] = cell_special_category.fillna('inactive')
print(f"Cell-level special category:\n{adata.obs['special_category'].value_counts()}")

In [None]:
# Special categories on the codebook UMAP (left) and the cell UMAP (right).
# The cell-level palette adds a colour for cells tagged 'inactive'.
cell_special_palette = {**special_palette, 'inactive': 'white'}

special_panels = [
    (adata_codebook, 'Codebook UMAP - Special Codes', special_palette),
    (adata, 'Cell UMAP - Special Codes', cell_special_palette),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 7))
for panel_ax, (panel_adata, panel_title, panel_palette) in zip(axes, special_panels):
    sc.pl.umap(panel_adata, color='special_category', ax=panel_ax, show=False,
               title=panel_title, palette=panel_palette)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'special_codes_both_umaps.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Code-Sample Heatmap

In [None]:
# Count cells per (sample, code) pair
sample_code_counts = pd.crosstab(adata.obs[SAMPLE_COL], adata.obs['vq_code'])

# Turn counts into within-sample frequencies (each row sums to 1)
cells_per_sample = sample_code_counts.sum(axis=1)
sample_code_freq = sample_code_counts.div(cells_per_sample, axis=0)

print(f"Sample × Code matrix shape: {sample_code_freq.shape}")

In [None]:
# One metadata row per sample, taken from the first cell of each sample
metadata_cols = [STATUS_COL, ORGAN_COL, STUDY_COL]
sample_meta = adata.obs.groupby(SAMPLE_COL).first()[metadata_cols]
sample_meta = sample_meta.loc[sample_code_freq.index]

# Rows: samples sorted by organ, then status
sample_order = sample_meta.sort_values([ORGAN_COL, STATUS_COL]).index
sample_code_freq_ordered = sample_code_freq.loc[sample_order]

# Columns: active codes sorted by code cluster ("code_<idx>" -> idx)
active_code_indices = [int(name.split('_')[1]) for name in adata_codebook.obs_names]
code_cluster_order = adata_codebook.obs['code_cluster'].sort_values().index
code_order = [int(name.split('_')[1]) for name in code_cluster_order]

# Keep only codes that actually occur as columns of the frequency matrix
observed_codes = sample_code_freq_ordered.columns
available_codes = [code for code in code_order if code in observed_codes]
sample_code_freq_final = sample_code_freq_ordered[available_codes]

In [None]:
# Colour annotations for the heatmap rows (one row per sample).
# NOTE(review): organ/status values missing from these dicts map to NaN —
# confirm they cover every value present in the data.
organ_colors = {'Skin': 'red', 'Colon': 'blue'}
status_colors = {'HS': 'darkred', 'CD': 'darkblue', 'ctrl_skin': 'lightcoral', 'ctrl_colon': 'lightblue'}

heatmap_sample_meta = sample_meta.loc[sample_code_freq_final.index]
row_colors = pd.DataFrame({
    'Organ': heatmap_sample_meta[ORGAN_COL].map(organ_colors),
    'Status': heatmap_sample_meta[STATUS_COL].map(status_colors),
})

In [None]:
# Sample x code frequency heatmap with metadata row colours.
# Row and column clustering are both disabled so the precomputed orders are kept.
clustermap_kwargs = dict(
    row_colors=row_colors,
    row_cluster=False,
    col_cluster=False,
    cmap='viridis',
    figsize=(20, 12),
    xticklabels=False,
    yticklabels=True,
    cbar_kws={'label': 'Frequency'},
)
g = sns.clustermap(sample_code_freq_final, **clustermap_kwargs)

g.ax_heatmap.set_xlabel('Codes (ordered by cluster)')
g.ax_heatmap.set_ylabel('Samples')
g.fig.suptitle('Sample × Code Abundance Heatmap', y=1.02)

plt.savefig(FIGURE_DIR / 'sample_code_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Save Results

In [None]:
# Write one row per active code: index, cluster and special labels
mapped_code_indices = [int(name.split('_')[1]) for name in adata_codebook.obs_names]
codebook_annotations = adata_codebook.obs

code_cluster_mapping = pd.DataFrame({
    'code_idx': mapped_code_indices,
    'code_cluster': codebook_annotations['code_cluster'].values,
    'special_category': codebook_annotations['special_category'].values,
    'special_label': codebook_annotations['special_label'].values,
})

code_cluster_mapping.to_csv(OUTPUT_DIR / 'code_cluster_mapping.csv', index=False)
print(f"Saved code_cluster_mapping.csv")

In [None]:
# Export the special code lists (as integer indices) plus the thresholds used
def _code_indices(code_names):
    """Convert 'code_<idx>' names to their integer indices."""
    return [int(name.split('_')[1]) for name in code_names]

special_codes = {
    'cross_tissue': _code_indices(cross_tissue_codes),
    'disease_specific': _code_indices(disease_specific_codes),
    'control_specific': _code_indices(control_specific_codes),
    'skin_specific': _code_indices(skin_specific_codes),
    'colon_specific': _code_indices(colon_specific_codes),
    'thresholds': {
        'disease_specific': DISEASE_SPECIFIC_THRESHOLD,
        'tissue_specific': TISSUE_SPECIFIC_THRESHOLD,
        'cross_tissue_entropy': CROSS_TISSUE_ENTROPY_THRESHOLD,
    },
}

with open(OUTPUT_DIR / 'special_codes.json', 'w') as f:
    json.dump(special_codes, f, indent=2)
print(f"Saved special_codes.json")

In [None]:
# Persist the per-code specificity table (indexed by code name)
specificity_csv_path = OUTPUT_DIR / 'code_specificity_scores.csv'
specificity_df.to_csv(specificity_csv_path)
print(f"Saved code_specificity_scores.csv")

In [None]:
# Persist the per-cluster summary table without the row index
cluster_summary_csv_path = OUTPUT_DIR / 'code_cluster_summary.csv'
cluster_summary.to_csv(cluster_summary_csv_path, index=False)
print(f"Saved code_cluster_summary.csv")

In [None]:
# Write the cell-level AnnData with the obs columns added in this notebook.
# NOTE(review): if the UMAP cell subsampled adata in place, only the
# subsampled cells are written here.
output_adata_path = OUTPUT_DIR.joinpath("adata_with_vq_clustered.h5ad")
print(f"Saving updated adata to: {output_adata_path}")
adata.write_h5ad(output_adata_path)
print("Done!")

In [None]:
# Write the codebook AnnData (active codes with their annotations and embeddings)
output_codebook_adata_path = OUTPUT_DIR.joinpath("codebook_adata.h5ad")
print(f"Saving codebook adata to: {output_codebook_adata_path}")
adata_codebook.write_h5ad(output_codebook_adata_path)
print("Done!")

---

## Summary

### 저장된 파일

| 파일 | 내용 |
|------|------|
| `adata_with_vq_clustered.h5ad` | code_cluster, cell_cluster, special_category 추가 (입력 세포 수가 50,000개를 초과하면 UMAP 단계에서 subsample된 세포만 포함) |
| `codebook_adata.h5ad` | codebook AnnData (UMAP, cluster 등 포함) |
| `code_cluster_mapping.csv` | code → cluster 매핑 |
| `special_codes.json` | 특수 code 목록 |
| `code_specificity_scores.csv` | organ/disease/study specificity |
| `code_cluster_summary.csv` | cluster별 특성 요약 |
| `figures/*.png` | 모든 시각화 결과 |

### 주요 분석 결과

1. **Codebook Clustering**: Leiden으로 code들을 그룹화
2. **Code vs Cell Cluster**: ARI/NMI로 비교
3. **Special Codes**: cross-tissue, disease-specific, tissue-specific codes 식별
4. **Sample-Code Heatmap**: sample별 code 분포 시각화