In [5]:
import pandas as pd
import numpy as np

# ==========================================
# 1. DEFINE YOUR CUSTOM MARKER DICTIONARY
# ==========================================
# Edit this dictionary with your known markers.
# Format: 'Cell Type Name': ['Gene1', 'Gene2', 'Gene3']
# MARKER_DB = {
#     'T-Cells': ['CD3D', 'CD3E', 'CD3G', 'CD247', 'IL7R'],
#     'B-Cells': ['MS4A1', 'CD79A', 'CD79B', 'CD19'],
#     'Monocytes': ['CD14', 'LYZ', 'FCGR3A', 'MS4A7'],
#     'NK Cells': ['GNLY', 'NKG7', 'KLRB1'],
#     'Dendritic Cells': ['FCER1A', 'CST3'],
#     'Platelets': ['PPBP', 'PF4'],
#     'Proliferation/Cycling': ['MKI67', 'TOP2A']
# }

# Reference markers per cell type. Entries are compared to the cluster's gene
# names by exact string match (see score_cluster), so they must be spelled the
# same way as the names in the input sheets.
# NOTE: "CD20" was removed from the B-Cell list: it is the CD/protein alias of
# MS4A1, which is already listed, so it either never matched or counted the
# same gene twice.
# NOTE: S100A8/S100A9 appear under both Monocyte and Neutrophil, so a cluster
# containing them adds to both scores.
MARKER_DB = {
    "T-Cell": ["CD3D", "CD3E", "CD3G", "CD4", "CD8A", "CD8B", "IL7R", "TRAC", "TCF7", "LEF1", "CD28"],
    "B-Cell": ["CD79A", "CD79B", "CD19", "MS4A1", "PAX5", "BANK1", "IGHM", "IGHD"],
    "Macrophage": ["CD68", "CD163", "MRC1", "CSF1R", "MARCO", "C1QA", "C1QB", "C1QC", "APOE", "LGALS3"],
    "Monocyte": ["CD14", "LYZ", "S100A8", "S100A9", "VCAN", "FCN1"],
    "Dendritic Cell": ["ITGAX", "CD1C", "CLEC9A", "FCER1A", "CD83", "CCR7", "LAMP3"],
    "NK Cell": ["NCAM1", "NKG7", "GNLY", "KLRB1", "KLRD1", "KLRF1", "PRF1", "GZMB"],
    "Neutrophil": ["CSF3R", "CXCR2", "FCGR3B", "S100A8", "S100A9"],
    "Mast Cell": ["KIT", "TPSAB1", "CPA3", "MS4A2", "HPGDS"],
    "Epithelial": ["EPCAM", "KRT8", "KRT18", "KRT19", "CDH1"],
    "Fibroblast": ["COL1A1", "COL1A2", "DCN", "LUM", "FAP", "PDGFRA"],
    "Endothelial": ["PECAM1", "CDH5", "VWF", "ENG", "EMCN"],
}

# ==========================================
# 2. CONFIGURATION
# ==========================================
# Workbook to annotate; main() reads every sheet in it as one cluster.
# Replace with your actual file name.
FILE_PATH = "rank_genes_groups_by_cluster_JP.xlsx"
# Rows are kept only when pvals_adj is strictly below this value.
PVAL_CUTOFF = 0.05
# Rows are kept only when logfoldchange is strictly above this value.
LOGFC_CUTOFF = 0.25

def score_cluster(cluster_genes, marker_db):
    """Rank candidate cell types by how many of their markers a cluster contains.

    Args:
        cluster_genes: Iterable of gene names reported for one cluster.
        marker_db: Mapping of cell-type name to its list of reference markers.

    Returns:
        A list of ``(cell_type, {'score': int, 'matches': list})`` tuples,
        highest score first. ``score`` is the number of reference markers
        found in ``cluster_genes`` and ``matches`` lists them in the order
        they appear in ``marker_db``. Cell types with no match are omitted;
        tied cell types keep their ``marker_db`` order.
    """
    # Set membership keeps each marker lookup O(1).
    observed = set(cluster_genes)

    hits_by_type = {}
    for label, reference in marker_db.items():
        hits = [marker for marker in reference if marker in observed]
        if not hits:
            continue
        # Plain match count is the score; no weighting by effect size.
        hits_by_type[label] = {'score': len(hits), 'matches': hits}

    # list.sort is stable, so equal scores stay in insertion order.
    ranked = list(hits_by_type.items())
    ranked.sort(key=lambda entry: -entry[1]['score'])
    return ranked

def main():
    """Annotate every cluster sheet in ``FILE_PATH`` and write a summary CSV.

    Each sheet of the workbook is treated as one cluster. A sheet must contain
    the columns ``gene``, ``pvals_adj`` and ``logfoldchange``; sheets without
    them are skipped with a message. Rows are kept when ``pvals_adj`` is below
    ``PVAL_CUTOFF`` and ``logfoldchange`` is above ``LOGFC_CUTOFF``; the
    strongest remaining genes are scored against ``MARKER_DB`` with
    ``score_cluster``. The best-scoring cell type per cluster ("Unknown" when
    nothing matches) is printed and saved to ``annotated_clusters.csv``.

    Errors are reported on stdout rather than raised, since this is the
    script's top-level entry point.
    """
    # Upper bound on how many of the highest-logfoldchange genes per cluster
    # are passed to the scorer, to limit noise from weak markers.
    top_n = 100
    required_cols = ['gene', 'pvals_adj', 'logfoldchange']

    try:
        print(f"Loading {FILE_PATH}...")
        results = []

        # The context manager closes the workbook handle even if a sheet fails.
        with pd.ExcelFile(FILE_PATH) as xls:
            for sheet_name in xls.sheet_names:
                print(f"Processing Group/Cluster: {sheet_name}...")

                # Load data for this cluster
                df = pd.read_excel(xls, sheet_name=sheet_name)

                # --------------------------------------
                # DATA HYGIENE / FILTERING
                # --------------------------------------
                # Column names are matched exactly (case-sensitive).
                if not all(col in df.columns for col in required_cols):
                    print(f"  -> Skipped (Missing columns). Found: {df.columns.tolist()}")
                    continue

                # Keep significant, upregulated genes only.
                filtered_df = df[
                    (df['pvals_adj'] < PVAL_CUTOFF) &
                    (df['logfoldchange'] > LOGFC_CUTOFF)
                ]

                # Drop rows with a missing gene name and force the rest to
                # str: a NaN or other non-string cell would otherwise make
                # the ", ".join below raise and abort the whole run.
                ranked_df = (
                    filtered_df
                    .dropna(subset=['gene'])
                    .sort_values(by='logfoldchange', ascending=False)
                    .head(top_n)
                )
                top_genes = ranked_df['gene'].astype(str).tolist()

                if not top_genes:
                    print("  -> No significant markers found after filtering.")
                    continue

                # --------------------------------------
                # ANNOTATION
                # --------------------------------------
                predictions = score_cluster(top_genes, MARKER_DB)

                best_match = "Unknown"
                match_details = ""

                if predictions:
                    # Only the top-ranked prediction is reported.
                    best_match, best_info = predictions[0]
                    count = best_info['score']
                    genes_found = ", ".join(best_info['matches'])
                    match_details = f"{count} markers found ({genes_found})"

                results.append({
                    'Cluster_Group': sheet_name,
                    'Predicted_Cell_Type': best_match,
                    'Evidence': match_details,
                    # Show top 5 genes for manual verification
                    'Top_Cluster_Genes': ", ".join(top_genes[:5]),
                })

        # --------------------------------------
        # OUTPUT
        # --------------------------------------
        results_df = pd.DataFrame(results)
        print("\n=== ANNOTATION RESULTS ===")
        print(results_df)

        # Save to CSV
        results_df.to_csv('annotated_clusters.csv', index=False)
        print("\nResults saved to 'annotated_clusters.csv'")

    except FileNotFoundError:
        print(f"Error: The file {FILE_PATH} was not found.")
    except Exception as e:
        # Top-level boundary of the script: report and exit instead of
        # surfacing a traceback.
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

Loading rank_genes_groups_by_cluster_JP.xlsx...
Processing Group/Cluster: Group_0...
Processing Group/Cluster: Group_1...
Processing Group/Cluster: Group_2 Macrophages...
Processing Group/Cluster: Group_3...
Processing Group/Cluster: Group_4...
Processing Group/Cluster: Group_5...
Processing Group/Cluster: Group_6...
Processing Group/Cluster: Group_7...
Processing Group/Cluster: Group_8...
Processing Group/Cluster: Group_9...
Processing Group/Cluster: Group_10...
Processing Group/Cluster: Group_11...
Processing Group/Cluster: Group_12...
Processing Group/Cluster: Group_13...
Processing Group/Cluster: Group_14...
Processing Group/Cluster: Group_15...
Processing Group/Cluster: Group_16...
Processing Group/Cluster: Group_17...

=== ANNOTATION RESULTS ===
          Cluster_Group Predicted_Cell_Type  \
0               Group_0              T-Cell   
1               Group_1          Fibroblast   
2   Group_2 Macrophages          Macrophage   
3               Group_3             Unknown   
4  