In [7]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

# ---- Paths ----
IN_H5AD  = "../data/processed/SKCM_GSE134388_aPD1_umap.h5ad"  # from Script 04
OUT_H5AD = "../data/processed/SKCM_GSE134388_aPD1_final_annotated.h5ad"

# adata.obs column that holds the cluster labels
CLUSTER_KEY = "leiden"

# Number of top-ranked genes kept per cluster for each output
TOPN_TABLE   = 50   # rows per cluster in the "top" marker CSV
TOPN_HEATMAP = 3    # genes per cluster intended for the heatmap
TOPN_DOTPLOT = 5    # genes per cluster shown in the dotplot

# Expression source for marker ranking and plots:
#   True  -> use adata.raw (recommended when adata.raw is properly set)
#   False -> use adata.X (also the automatic fallback when raw is missing)
USE_RAW = False

RESULTS_DIR = "../results/markers"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Save figures in this directory
sc.settings.figdir = RESULTS_DIR
sc.settings.set_figure_params(dpi=180, facecolor="white")
sc.settings.verbosity = 2

# Seed NumPy's global RNG for reproducibility. The previous
# `sc.settings.seed = 0` only attached an unused attribute to the scanpy
# settings object (scanpy documents no `seed` setting), so nothing was seeded.
SEED = 0
np.random.seed(SEED)

# LOAD
# Read the clustered AnnData and report which fields this notebook relies on.
adata = sc.read_h5ad(IN_H5AD)

has_clusters = CLUSTER_KEY in adata.obs.columns
has_raw = adata.raw is not None

print("Loaded:", IN_H5AD)
print("Shape:", adata.shape)
print("Has UMAP?", "X_umap" in adata.obsm)
print("Has clusters?", has_clusters)
print("Has raw?", has_raw)

# Cluster labels are mandatory for everything below, so fail early without them.
if not has_clusters:
    raise ValueError(f"'{CLUSTER_KEY}' not found in adata.obs. Run Script 04 and ensure Leiden clustering was saved.")

# USE_RAW=True cannot be honoured without adata.raw; fall back to adata.X.
if USE_RAW and not has_raw:
    print("WARNING: USE_RAW=True but adata.raw is missing. Switching USE_RAW=False.")
    USE_RAW = False

Loaded: ../data/processed/SKCM_GSE134388_aPD1_umap.h5ad
Shape: (3632, 2000)
Has UMAP? True
Has clusters? True
Has raw? True


In [8]:
# UMAP coloured by cluster, with the cluster numbers drawn on the embedding.
umap_labels_png = os.path.join(RESULTS_DIR, "umap_leiden_labels.png")

sc.pl.umap(adata, color=[CLUSTER_KEY], legend_loc="on data", show=False)
plt.savefig(umap_labels_png, bbox_inches="tight")
plt.close()  # release the figure once it is on disk
print("Saved: umap_leiden_labels.png")

Saved: umap_leiden_labels.png


In [9]:
# Per-cluster marker genes: Wilcoxon test of each cluster against the rest.
# pts=True additionally stores the fraction of cells expressing each gene.
# NOTE(review): a previous run of this cell emitted repeated warnings from
# scanpy's logfoldchanges np.log2 computation; presumably the matrix in use
# holds scaled (non log-normalized) values -- confirm before interpreting
# the logfoldchanges column.
sc.tl.rank_genes_groups(
    adata,
    groupby=CLUSTER_KEY,
    method="wilcoxon",
    use_raw=USE_RAW,
    pts=True,
)

# Standard scanpy rank plot (top 20 genes per cluster), saved to disk.
rank_plot_png = os.path.join(RESULTS_DIR, "rank_genes_groups_top20.png")
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, show=False)
plt.savefig(rank_plot_png, bbox_inches="tight")
plt.close()
print("Saved: rank_genes_groups_top20.png")

# Full marker table covering every cluster (group=None).
df_all = sc.get.rank_genes_groups_df(adata, group=None)
df_all.to_csv(os.path.join(RESULTS_DIR, "markers_all_clusters_full.csv"), index=False)

# Compact table: the TOPN_TABLE highest-scoring genes of each cluster.
ranked_markers = df_all.sort_values(by=["group", "scores"], ascending=[True, False])
df_top = ranked_markers.groupby("group").head(TOPN_TABLE)
df_top.to_csv(os.path.join(RESULTS_DIR, f"markers_top{TOPN_TABLE}_per_cluster.csv"), index=False)

print("Saved markers CSVs")

ranking genes
    finished (0:00:00)


  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "logfoldchanges"] = np.log2(


Saved: rank_genes_groups_top20.png
Saved markers CSVs


In [16]:
# Heatmap of the top marker genes per cluster.
# `genes_hm` was previously never defined in this notebook, so the cell failed
# with NameError on a fresh kernel. It is now derived here from the marker
# table (df_all, built in the marker-ranking cell): the TOPN_HEATMAP
# highest-scoring genes of each cluster, de-duplicated in order of appearance.
top_hm = (df_all.sort_values(["group", "scores"], ascending=[True, False])
                .groupby("group")
                .head(TOPN_HEATMAP))

genes_hm = top_hm["names"].dropna().unique().tolist()
print("Heatmap genes:", len(genes_hm))

n_clusters = adata.obs[CLUSTER_KEY].nunique()

sc.pl.heatmap(
    adata,
    var_names=genes_hm,
    groupby=CLUSTER_KEY,
    use_raw=USE_RAW,
    standard_scale="var",
    figsize=(14, 1.0 * n_clusters),   # wider + more vertical spacing
    dendrogram=False,
    show=False
)

# Tick styling is applied to pyplot's current axes.
plt.xticks(rotation=90, fontsize=8)
plt.yticks(fontsize=10)

# File name follows TOPN_HEATMAP (unchanged for the default of 3).
heatmap_png = f"heatmap_top{TOPN_HEATMAP}_per_cluster_final.png"
plt.savefig(
    os.path.join(RESULTS_DIR, heatmap_png),
    dpi=200,
    bbox_inches="tight"
)
plt.close()
print(f"Saved: {heatmap_png}")


In [17]:
# Dotplot of the TOPN_DOTPLOT highest-scoring marker genes of every cluster.
ranked_for_dotplot = df_all.sort_values(by=["group", "scores"], ascending=[True, False])
top_dp = ranked_for_dotplot.groupby("group").head(TOPN_DOTPLOT)

# Unique gene names in order of first appearance, skipping missing names.
genes_dp = top_dp["names"].dropna().unique().tolist()
print("Dotplot genes:", len(genes_dp))

dotplot_png = os.path.join(RESULTS_DIR, f"dotplot_top{TOPN_DOTPLOT}_per_cluster.png")

sc.pl.dotplot(
    adata,
    var_names=genes_dp,
    groupby=CLUSTER_KEY,
    use_raw=USE_RAW,
    standard_scale="var",
    show=False,
)
plt.savefig(dotplot_png, bbox_inches="tight")
plt.close()
print(f"Saved: dotplot_top{TOPN_DOTPLOT}_per_cluster.png")

Dotplot genes: 66
Saved: dotplot_top5_per_cluster.png


In [18]:
# Sanity check: plot a fixed panel of immune/tumor marker genes on the UMAP.
marker_panel = [
    "PTPRC",
    "CD3D", "CD3E", "TRAC",
    "CD8A", "NKG7", "GNLY",
    "MS4A1", "CD79A",
    "LYZ", "LST1", "S100A8", "S100A9",
    "DCN", "COL1A1",
    "EPCAM", "KRT8", "KRT18",
    "MKI67"
]

# Only genes present in the matrix being plotted (raw or X) can be coloured.
if USE_RAW and adata.raw is not None:
    varset = set(adata.raw.var_names)
else:
    varset = set(adata.var_names)

marker_present = [gene for gene in marker_panel if gene in varset]
print("Marker genes present:", len(marker_present), "/", len(marker_panel))

# Up to two figures: the first 12 present genes, then whatever remains.
# An empty slice means there is nothing to draw for that figure.
marker_figures = (
    (marker_present[:12], "umap_marker_panel_first12.png", "Saved: umap_marker_panel_first12.png"),
    (marker_present[12:], "umap_marker_panel_remaining.png", "Saved: umap_marker_panel_remaining.png"),
)
for panel_genes, panel_png, saved_msg in marker_figures:
    if not panel_genes:
        continue
    sc.pl.umap(adata, color=panel_genes, ncols=4, use_raw=USE_RAW, show=False)
    plt.savefig(os.path.join(RESULTS_DIR, panel_png), bbox_inches="tight")
    plt.close()
    print(saved_msg)

Marker genes present: 16 / 19
Saved: umap_marker_panel_first12.png
Saved: umap_marker_panel_remaining.png


In [None]:
# FINAL STEP: cluster -> cell type annotation. Every cluster starts as "Unknown".
cluster_labels = adata.obs[CLUSTER_KEY].astype(str)
cluster_ids = sorted(cluster_labels.unique(), key=int)  # numeric order, not lexicographic
print("Clusters:", cluster_ids)

# TODO: replace the placeholder values once the markers have been interpreted.
# The mapping below is a PLACEHOLDER -- it MUST be edited according to your results.
cluster_to_celltype = dict.fromkeys(cluster_ids, "Unknown")

# Example (edit these):
# cluster_to_celltype["0"] = "Fibroblast"
# cluster_to_celltype["1"] = "T cells"
# cluster_to_celltype["2"] = "Myeloid"
# cluster_to_celltype["3"] = "NK cells"
# cluster_to_celltype["4"] = "Tumor/Epithelial"
# cluster_to_celltype["5"] = "B cells"

# Translate each cell's cluster label into its cell type and store it as a categorical.
cell_types = cluster_labels.map(cluster_to_celltype)
adata.obs["cell_type"] = cell_types.astype("category")
print(adata.obs["cell_type"].value_counts(dropna=False))

# UMAP coloured by cell type, with the names drawn on the embedding.
cell_type_png = os.path.join(RESULTS_DIR, "umap_cell_type_labels.png")
sc.pl.umap(adata, color="cell_type", legend_loc="on data", show=False)
plt.savefig(cell_type_png, bbox_inches="tight")
plt.close()
print("Saved: umap_cell_type_labels.png")

Clusters: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13']
cell_type
Unknown    3632
Name: count, dtype: int64
Saved: umap_cell_type_labels.png


In [None]:
# ============================================================
# Additional plots
# ============================================================
# (A) Report which candidate metadata columns exist in adata.obs, so a
# UMAP can be split by one of them (patient, batch, sample, treatment, ...).
possible_cols = ["Patient", "sample", "Sample", "batch", "Batch", "treatment", "Treatment"]
obs_columns = adata.obs.columns
cols_found = [name for name in possible_cols if name in obs_columns]
print("Metadata columns found for split:", cols_found)

# (B) Optional: stacked barplot of cell type proportions per sample.
# Uncomment when adata.obs has a "Sample" column.
# if "Sample" in adata.obs.columns:
#     ct = pd.crosstab(adata.obs["Sample"], adata.obs["cell_type"], normalize="index")
#     ax = ct.plot(kind="bar", stacked=True, figsize=(10,4))
#     plt.tight_layout()
#     plt.savefig(os.path.join(RESULTS_DIR, "celltype_proportions_by_sample.png"), dpi=200)
#     plt.close()

In [None]:
# Write the annotated AnnData (now including obs["cell_type"]) to OUT_H5AD.
final_h5ad_path = OUT_H5AD
adata.write_h5ad(final_h5ad_path)
print("Saved final:", final_h5ad_path)

print("SCRIPT 05 COMPLETE")