In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
!pip install scanpy
!pip install --upgrade scipy scikit-learn celltypist



In [14]:
import celltypist
import scanpy as sc

# ---------- Step 1: 載入原始資料 ----------
adata = sc.read_h5ad('/content/drive/MyDrive/Colab Notebooks/MoRE/GSE153935.h5ad')

In [15]:
import numpy as np
from scipy import sparse

# ✅ 第一步：如果是 sparse matrix，直接清理 .X.data
if sparse.issparse(adata.X):
    adata.X.data = np.nan_to_num(adata.X.data, nan=0.0, posinf=0.0, neginf=0.0)
else:
    adata.X = np.nan_to_num(adata.X, nan=0.0, posinf=0.0, neginf=0.0)

In [16]:
# Normalize total counts
sc.pp.normalize_total(adata, target_sum=1e4)

# Log-transform
sc.pp.log1p(adata)

In [17]:
import scanpy as sc

# HVG selection
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,
    flavor='seurat'
)

In [18]:
import numpy as np

# 使用 np.expm1 對數組進行操作並求和
result = np.expm1(adata.X).sum(axis=1)

print(result)

[[ 711.48775778]
 [ 673.051458  ]
 [ 778.41949203]
 ...
 [1522.49066501]
 [ 506.65976548]
 [1019.98210339]]


In [19]:
adata.obs

Unnamed: 0,sample_id,Celltype_training,Data_source,Cancer_type
TL710T_GTTCAGCTTCTT,TL710T,Fibroblasts,GSE153935,NSCLC
TL710T_GCGCCCCATCAC,TL710T,Fibroblasts,GSE153935,NSCLC
TL710T_TGTCGGGGATCA,TL710T,Fibroblasts,GSE153935,NSCLC
TL710T_CCAATAATCTTA,TL710T,Fibroblasts,GSE153935,NSCLC
TL710T_CCTTTCCGTTTG,TL710T,Fibroblasts,GSE153935,NSCLC
...,...,...,...,...
TL1001T_CGTCCCTTGTAT,TL1001T,B cells,GSE153935,NSCLC
TL1001T_AGCAGATTCTGG,TL1001T,B cells,GSE153935,NSCLC
TL1001T_AGTGCGGGATCT,TL1001T,B cells,GSE153935,NSCLC
TL1001T_AGATCCGAGAGC,TL1001T,Epithelial cells,GSE153935,NSCLC


In [20]:
# 預測 + 自動合併結果
predictions = celltypist.annotate(
    adata,
    model='/content/drive/MyDrive/Colab Notebooks/Immune_All_Low.pkl',
    majority_voting=True
)
adata = predictions.to_adata(insert_labels=True)


# ---------- Step 4: 大分類 mapping 定義 ----------
mapping = {
    # B cells
    'Follicular B cells': 'B cells',
    'Memory B cells': 'B cells',
    'Naive B cells': 'B cells',
    'Germinal center B cells': 'B cells',
    'Plasma cells': 'B cells',
    'Cycling B cells': 'B cells',
    'Age-associated B cells': 'B cells',
    'Transitional B cells': 'B cells',
    'B cells': 'B cells',
    'Large pre-B cells': 'B cells',
    'Pro-B cells': 'B cells',
    'Plasmablasts': 'B cells',
    'Proliferative germinal center B cells': 'B cells',

    # T cells
    'CD8a/b(entry)': 'T cells',
    'CD8a/a': 'T cells',
    'Tcm/Naive helper T cells': 'T cells',
    'Tcm/Naive cytotoxic T cells': 'T cells',
    'Tem/Trm cytotoxic T cells': 'T cells',
    'Tem/Temra cytotoxic T cells': 'T cells',
    'Trm cytotoxic T cells': 'T cells',
    'Treg(diff)': 'T cells',
    'Regulatory T cells': 'T cells',
    'Tem/Effector helper T cells': 'T cells',
    'Type 1 helper T cells': 'T cells',
    'Type 17 helper T cells': 'T cells',
    'T(agonist)': 'T cells',
    'Follicular helper T cells': 'T cells',
    'CRTAM+ gamma-delta T cells': 'T cells',
    'Cycling T cells': 'T cells',
    'NKT cells': 'T cells',
    'MAIT cells': 'T cells',

    # NK cells / ILC
    'NK cells': 'NK/ILC',
    'CD16+ NK cells': 'NK/ILC',
    'CD16- NK cells': 'NK/ILC',
    'Cycling NK cells': 'NK/ILC',
    'Transitional NK': 'NK/ILC',
    'ILC1': 'NK/ILC',
    'ILC3': 'NK/ILC',
    'ILC2': 'NK/ILC',
    'ILC precursor': 'NK/ILC',
    'ILC': 'NK/ILC',

    # Monocytes & Macrophages
    'Classical monocytes': 'Monocytes',
    'Non-classical monocytes': 'Monocytes',
    'Monocytes': 'Monocytes',
    'Monocyte precursor': 'Monocytes',
    'Mono-mac': 'Macrophages',

    'Intermediate macrophages': 'Macrophages',
    'Alveolar macrophages': 'Macrophages',
    'Kupffer cells': 'Macrophages',
    'Macrophages': 'Macrophages',
    'Intestinal macrophages': 'Macrophages',
    'Erythrophagocytic macrophages': 'Macrophages',
    'Hofbauer cells': 'Macrophages',

    # Dendritic cells
    'DC': 'Dendritic cells',
    'DC1': 'Dendritic cells',
    'DC2': 'Dendritic cells',
    'DC3': 'Dendritic cells',
    'pDC precursor': 'Dendritic cells',
    'pDC': 'Dendritic cells',
    'DC precursor': 'Dendritic cells',
    'Migratory DCs': 'Dendritic cells',

    # Granulocytes
    'Neutrophils': 'Granulocytes',
    'Neutrophil-myeloid progenitor': 'Granulocytes',
    'Granulocytes': 'Granulocytes',
    'Mast cells': 'Granulocytes',

    # Erythroid
    'Early erythroid': 'Erythroid cells',
    'Mid erythroid': 'Erythroid cells',
    'Late erythroid': 'Erythroid cells',
    'Erythrocytes': 'Erythroid cells',

    # Progenitors
    'ELP': 'Progenitor',
    'MEMP': 'Progenitor',
    'HSC/MPP': 'Progenitor',
    'MNP': 'Progenitor',
    'Early MK': 'Progenitor',
    'Double-positive thymocytes': 'Progenitor',
    'Double-negative thymocytes': 'Progenitor',
    'Megakaryocyte-erythroid-mast cell progenitor': 'Progenitor',

    # 結構細胞
    'Epithelial cells': 'Structural cells',
    'Endothelial cells': 'Structural cells',
    'Fibroblasts': 'Structural cells',
}

mapping.update({
    'Myelocytes': 'Granulocytes',
    'Early lymphoid/T lymphoid': 'Progenitor',
    'Memory CD4+ cytotoxic T cells': 'T cells',
})

mapping.update({
    # Progenitor
    'Pre-pro-B cells': 'Progenitor',
    'Small pre-B cells': 'Progenitor',
    'ETP': 'Progenitor',
    'Promyelocytes': 'Progenitor',
    # Monocyte
    'Cycling monocytes': 'Monocyte',
    # Dendritic cell
    'Transitional DC': 'Dendritic cell',
    # Megakaryocyte/Platelet
    'Megakaryocytes/platelets': 'Megakaryocyte/Platelet',
    # T cells
    'gamma-delta T cells': 'T cells',
})

# ---------- Step 5: 建立 Celltype_training 欄位 ----------
adata.obs['Celltype_training'] = adata.obs['predicted_labels'].replace(mapping)

# （可選）印出 label 整併情況
print("✅ 合併後的大分類 Cell types：", adata.obs['Celltype_training'].unique())




✅ 合併後的大分類 Cell types： ['NK/ILC', 'Progenitor', 'T cells', 'Structural cells', 'Granulocytes', 'B cells', 'Macrophages', 'Monocytes', 'Dendritic cells']
Categories (9, object): ['B cells', 'NK/ILC', 'T cells', 'Dendritic cells', ...,
                         'Structural cells', 'Macrophages', 'Granulocytes', 'Monocytes']


In [21]:
print(adata.obs.columns)

Index(['sample_id', 'Celltype_training', 'Data_source', 'Cancer_type',
       'predicted_labels', 'over_clustering', 'majority_voting', 'conf_score'],
      dtype='object')


In [22]:
adata.write_h5ad("/content/drive/MyDrive/Colab Notebooks/MoRE/GSE153935_annotate.h5ad")