In [None]:
import scanpy as sc
import matplotlib.pyplot as plt
import sys
import celltypist
import pandas as pd
from typing import TextIO
# Import AnnData for cell type annotation
from anndata import AnnData
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb # <-- NEW IMPORT


def setup_logging(logfile: str) -> TextIO:
    """
    Redirects stdout and stderr to a log file.

    The file is opened in write mode (an existing file is truncated) and
    encoded as UTF-8 so that non-ASCII log output does not depend on the
    locale's default encoding.

    Args:
        logfile: The path to the log file.

    Returns:
        The file handle for the opened log file. Both sys.stdout and
        sys.stderr point to this handle when the function returns.
    """
    # This message still goes to the original stdout.
    print(f"Redirecting output to log file: {logfile}")
    log_file_handle = open(logfile, "w", encoding="utf-8")
    sys.stdout = log_file_handle
    sys.stderr = log_file_handle
    # First line written to the log file itself.
    print("Starting Scanpy pipeline...")
    return log_file_handle

def close_logging(log_file_handle: TextIO):
    """
    Closes the log file and restores stdout/stderr.

    A final status line is written to the log before it is closed. When the
    function is called while an exception is propagating (e.g. from a
    `finally` clause), the status line reports the failure instead of
    claiming success.

    Args:
        log_file_handle: The file handle returned by setup_logging.
    """
    # sys.exc_info() is non-empty while an exception is being handled or is
    # passing through a `finally` block.
    pipeline_failed = sys.exc_info()[0] is not None

    # Close the handle we were given rather than whatever sys.stdout happens
    # to be, and tolerate a handle that was already closed.
    if not log_file_handle.closed:
        if pipeline_failed:
            print("Analysis pipeline terminated with an error.", file=log_file_handle)
        else:
            print("Analysis pipeline completed successfully.", file=log_file_handle)
        log_file_handle.close()

    # Restore standard output
    # NOTE(review): sys.__stdout__ is the interpreter's original stream; in a
    # notebook this may differ from the stream that was active before
    # setup_logging() ran -- confirm this is the intended target.
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__

    # Report the actual log path instead of a hard-coded file name.
    log_name = getattr(log_file_handle, "name", "scanpy_analysis.log")
    print(f"Log file created. Check '{log_name}' for details.")

def load_data(data_dir: str, airrport_path: str, vdj_path: str, igblast_path: str) -> tuple[AnnData, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Loads every input table of the pipeline.

    - Loads the 10x Genomics data and strips the '-1' from the barcodes.
    - Loads the AIRRPORT data (parquet file) and cleans its 'CB' column.
    - Loads the VDJ-seq data (reference); all columns get a 'vdj_' prefix.
    - Loads the IgBlast data (columns get an 'IgBlast_' prefix) and
      left-joins it with the AIRRPORT data on the sequence id.

    Args:
        data_dir: Directory passed to sc.read_10x_mtx.
        airrport_path: Path to the AIRRPORT parquet file.
        vdj_path: Path to the VDJ contig annotation CSV file.
        igblast_path: Path to the tab-separated IgBlast output.

    Returns:
        A 6-tuple:
        (adata, df_airrport, df_vdj, df_airrport_igblast,
         cdr3_strict_conditions, df_airrport_igblast_unified)
        where cdr3_strict_conditions is the subset of the unified table that
        passes the V/J support and identity thresholds below.
    """
    print("--- Loading single cells Data ---")
    adata = sc.read_10x_mtx(data_dir, var_names='gene_symbols', cache=True)
    # Replace '-1' in the barcodes with an empty string
    adata.obs_names = adata.obs_names.str.replace('-1', '', regex=False)
    print(f"Loaded {adata.n_obs} cells and {adata.n_vars} genes.")

    ### AIRRPORT results ###
    print(f"--- Loading AIRRPORT data from {airrport_path} ---")
    df_airrport = pd.read_parquet(airrport_path)
    # Clean Cell Barcode (CB): drop the 'Z:' tag, then map '*' to '-1' and
    # strip every '-1', so both '*' and '-1' end up removed.
    df_airrport['CB'] = df_airrport['CB'].astype(str).str.replace('Z:', '', regex=False)
    df_airrport['CB'] = df_airrport['CB'].str.replace('*', '-1', regex=False)
    df_airrport['CB'] = df_airrport['CB'].str.replace('-1', '', regex=False)

    ### VDJ (CDR3 sequencing for reference) ###
    print(f"--- Loading VDJ data from {vdj_path} ---")
    df_vdj = pd.read_csv(vdj_path)
    # Add "vdj_" before each column name
    df_vdj = df_vdj.add_prefix('vdj_')
    # Only a trailing '-1' is removed here (anchored regex).
    df_vdj['vdj_barcode'] = df_vdj['vdj_barcode'].str.replace(r"-1$", "", regex=True)

    ### IgBlast for AIRRPORT results ###
    print(f"--- Loading IgBlast data from {igblast_path} ---")
    df_airrport_igblast = pd.read_csv(igblast_path, sep='\t')
    # Add "IgBlast_" before each column name
    df_airrport_igblast = df_airrport_igblast.add_prefix('IgBlast_')

    # Select summary columns
    df_airrport_igblast_summary = df_airrport_igblast[[
        "IgBlast_sequence_id", "IgBlast_sequence_aa", "IgBlast_sequence_alignment_aa",
        "IgBlast_cdr3", "IgBlast_cdr3_aa", "IgBlast_v_support", "IgBlast_j_support",
        "IgBlast_v_identity", "IgBlast_j_identity"
    ]]

    ### Left join of IgBlast with AIRRPORT ###
    # Every IgBlast row is kept; AIRRPORT columns are NaN where no seq_id matches.
    print("Joining IgBlast and AIRRPORT data...")
    df_airrport_igblast_unified = pd.merge(
        df_airrport_igblast_summary,
        df_airrport,
        left_on="IgBlast_sequence_id",
        right_on="seq_id",
        how="left"
    )

    ### Filter by conditions in order to remove noise ###
    print("Filtering results based on strict conditions...")
    cdr3_strict_conditions = df_airrport_igblast_unified[
        (df_airrport_igblast_unified['IgBlast_v_support'] < 1e-05) &
        (df_airrport_igblast_unified['IgBlast_v_identity'] >= 90) &
        (df_airrport_igblast_unified['IgBlast_j_support'] < 1e-05) &
        (df_airrport_igblast_unified['IgBlast_j_identity'] >= 90)
    ]

    print(f"Found {len(cdr3_strict_conditions)} high-confidence (strict conditions of IgBlast) AIRR-seq entries.")

    return adata, df_airrport, df_vdj, df_airrport_igblast, cdr3_strict_conditions, df_airrport_igblast_unified

def analysis_logs(adata, df_airrport, df_vdj, df_airrport_igblast, cdr3_strict_conditions, classifier_table):
    """
    Prints summary statistics about the AIRRPORT, VDJ and classifier tables.

    Args:
        adata: Unused; kept for interface compatibility.
        df_airrport: AIRRPORT table; its 'CDR3_match' column is summarised.
        df_vdj: VDJ reference table; its 'vdj_cdr3' column is summarised.
        df_airrport_igblast: Unused; kept for interface compatibility.
        cdr3_strict_conditions: Unused; kept for interface compatibility.
        classifier_table: Classifier table (may be None or empty). The CDR3
            column may be named 'cdr3' or 'CDR3_match' and the VDJ flag
            'is_in_vdj' or 'in_vdj'.
    """
    print("--- AIRRPORT Data Analysis ---")
    has_airrport_cdr3 = "CDR3_match" in df_airrport.columns
    if has_airrport_cdr3:
        unique_airrport_cdr3_count = df_airrport['CDR3_match'].nunique()
        print(f"Unique values in df_airrport['CDR3_match']: {unique_airrport_cdr3_count}")
        airrport_cdr3_set = set(df_airrport['CDR3_match'].dropna())
    else:
        print("Column 'CDR3_match' not found in df_airrport. Skipping count.")
        airrport_cdr3_set = set()

    print("--- VDJ Data Analysis ---")
    has_vdj_cdr3 = "vdj_cdr3" in df_vdj.columns
    if has_vdj_cdr3:
        unique_vdj_cdr3_count = df_vdj['vdj_cdr3'].nunique()
        print(f"Unique values in df_vdj['vdj_cdr3']: {unique_vdj_cdr3_count}")
        vdj_cdr3_set = set(df_vdj['vdj_cdr3'].dropna())
    else:
        print("Column 'vdj_cdr3' not found in df_vdj. Skipping count.")
        vdj_cdr3_set = set()

    print("--- Common Sequence Analysis ---")
    if has_airrport_cdr3 and has_vdj_cdr3:
        common_sequences = airrport_cdr3_set & vdj_cdr3_set
        print(f"Common sequences between df_airrport['CDR3_match'] and df_vdj['vdj_cdr3']: {len(common_sequences)}")
    else:
        print("Skipping common sequence count due to missing column(s).")

    # --- Classifier Table Analysis ---
    print("--- Classifier Table Analysis ---")
    if classifier_table is None or classifier_table.empty:
        print("classifier_table is empty or None.")
        return

    columns = classifier_table.columns
    # Resolve the column names once so that every check below uses the same
    # names; first match wins.
    cdr3_col = next((name for name in ('cdr3', 'CDR3_match') if name in columns), None)
    vdj_col = next((name for name in ('is_in_vdj', 'in_vdj') if name in columns), None)

    if cdr3_col is not None:
        # 1. Unique CDR3 count in classifier_table
        unique_cdr3_classifier = classifier_table[cdr3_col].nunique()
        print(f"Unique CDR3 sequences in classifier_table: {unique_cdr3_classifier}")

        # 2. Total number of unique cell barcodes in the table
        if 'CB' in columns:
            unique_cb_classifier = classifier_table['CB'].nunique()
            print(f"Unique Cell Barcodes (CB) in classifier_table: {unique_cb_classifier}")
    else:
        print("Warning: Neither 'cdr3' nor 'CDR3_match' found in classifier_table.")

    # 3. Count of unique CDR3s whose row is BOTH in the VDJ reference AND a T cell
    positive_rows = None
    if cdr3_col is not None and 'label' in columns:
        # A precomputed 'label' column takes precedence over recomputing it.
        positive_rows = classifier_table['label'] == True
    elif cdr3_col is not None and vdj_col is not None and 'cell_type' in columns:
        # Fallback if the 'label' column hasn't been created yet
        is_t_cell = classifier_table['cell_type'].str.contains("T cells", case=False, na=False)
        is_in_vdj = classifier_table[vdj_col] == True
        positive_rows = is_t_cell & is_in_vdj

    if positive_rows is None:
        print("Skipping T-cell/VDJ intersection count due to missing columns (need 'in_vdj', 'cell_type', and 'cdr3').")
    else:
        true_label_cdr3s = classifier_table.loc[positive_rows, cdr3_col].nunique()
        print(f"Unique CDR3s that are both 'in_vdj' and 'T cell': {true_label_cdr3s}")


def preprocess_sc_data(
    adata: AnnData,
    *,
    min_genes: int = 200,
    min_cells: int = 3,
    max_pct_mito: float = 20,
) -> AnnData:
    """
    Runs standard preprocessing (filtering, normalization, HVGs, scaling).

    Args:
        adata: The AnnData object to preprocess. The cell/gene filters and
            QC metrics are applied to it in place.
        min_genes: Cells with fewer detected genes are removed.
        min_cells: Genes detected in fewer cells are removed.
        max_pct_mito: Cells whose percentage of mitochondrial counts is not
            strictly below this value are removed.

    Returns:
        A new AnnData object restricted to the highly variable genes and
        scaled. Its `.raw` holds the normalized, log1p-transformed data for
        all genes that passed filtering.
    """
    print("--- Standard Preprocessing for Single cells ---")

    # Basic filtering (adjust thresholds as needed for your specific tissue)
    # Cells with very few detected genes are often dead cells, empty droplets
    # (background noise rather than a real cell), or failed library preparation.
    sc.pp.filter_cells(adata, min_genes=min_genes)
    # Very rarely expressed genes provide little statistical power for
    # clustering or differential expression and increase computational noise.
    sc.pp.filter_genes(adata, min_cells=min_cells)
    print(f"Data shape after filtering: {adata.n_obs} cells x {adata.n_vars} genes")

    # Mitochondrial gene filtering
    # A high mitochondrial percentage is a classic sign of a stressed or dying
    # cell, and these are usually removed from analysis.
    adata.var['mito_genes'] = adata.var_names.str.startswith('MT-')  # MT- for human
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mito_genes'], percent_top=None, log1p=False, inplace=True)

    # Filter cells based on mitochondrial content (e.g., < 10% or < 20%)
    # Adjust this threshold based on your data's distribution (plot with sc.pl.violin)
    print(f"Cells before mitochondrial gene filtering: {adata.n_obs}")
    # .copy() turns the slice into a real object; without it the in-place
    # normalization below would run on a view of the unfiltered data.
    adata = adata[adata.obs.pct_counts_mito_genes < max_pct_mito, :].copy()
    print(f"Cells after mitochondrial gene filtering: {adata.n_obs}")

    # Normalization and log transform
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Identify highly variable genes for dimensionality reduction
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

    # Keep the normalized/log1p data of all genes for later use.
    # This MUST be done AFTER normalization/log1p but BEFORE subsetting and scaling.
    adata.raw = adata

    # Subset to highly variable genes for clustering.
    # Again materialized with .copy() because sc.pp.scale modifies it in place.
    adata = adata[:, adata.var.highly_variable].copy()
    print(f"Subsetting to {adata.n_vars} highly variable genes.")

    # Scale data to unit variance and 0 mean.
    # This is done LAST, only on the highly variable genes.
    sc.pp.scale(adata, max_value=10)

    return adata

def cluster_and_embed(adata: AnnData) -> AnnData:
    """
    Computes PCA, the neighborhood graph, a UMAP embedding and Leiden clusters.

    Args:
        adata: The preprocessed AnnData object.

    Returns:
        The same AnnData object, after the four scanpy steps were run on it.
    """
    print("--- Clustering and Embedding ---")

    # Settings of the individual steps, gathered in one place.
    pca_options = {"svd_solver": 'arpack'}
    graph_options = {"n_neighbors": 10, "n_pcs": 40}
    leiden_options = {"resolution": 0.5}  # Adjust resolution for more/fewer clusters

    # Steps are executed strictly in this order.
    pipeline = (
        (sc.tl.pca, pca_options),
        (sc.pp.neighbors, graph_options),
        (sc.tl.umap, {}),
        (sc.tl.leiden, leiden_options),
    )
    for run_step, options in pipeline:
        run_step(adata, **options)

    return adata

def save_clustering_plots(adata: AnnData):
    """
    Writes the UMAP and PCA plots, both colored by Leiden cluster.

    Args:
        adata: The clustered AnnData object.
    """
    print("Saving clustering plots...")
    # Neither plot is shown interactively; both are only saved.
    quiet = {"show": False}
    sc.pl.umap(adata, color=['leiden'], save="_leiden_clusters.png", **quiet)
    sc.pl.pca(adata, color='leiden', save="_pca_clusters.png", **quiet)

def annotate_cell_types(adata: AnnData, model_name: str) -> AnnData:
    """
    Annotates cell types using CellTypist.

    Args:
        adata: The clustered AnnData object.
        model_name: The name of the CellTypist model to use.

    Returns:
        The AnnData object with 'predicted_cell_type' in .obs.
    """
    print("--- Cell Type Annotation (CellTypist) ---")

    # Download model (e.g., for immune cells)
    print(f"Downloading/loading model: {model_name}")
    # The call is made only to fetch the model file. Its return value must
    # not be forwarded as the model: passing it on would make annotate()
    # fall back to its default model and ignore `model_name`.
    celltypist.models.download_models(model=model_name)

    # Predict cell types; the model is selected explicitly by name.
    print("Running cell type prediction...")
    predictions = celltypist.annotate(adata, model=model_name, majority_voting=True)

    # Add predictions to AnnData object
    # We use the 'majority_voting' result for cleaner clusters
    adata.obs['predicted_cell_type'] = predictions.predicted_labels['majority_voting']
    return adata

def save_annotation_results(adata: AnnData):
    """
    Writes the cell-type UMAP plot and a CSV with the per-cell annotations.

    Args:
        adata: The annotated AnnData object.
    """
    print("Saving CellTypist annotation plot...")
    sc.pl.umap(
        adata,
        color='predicted_cell_type',
        save="_celltypist_annotation.png",
        show=False,
    )

    # One row per barcode with its cluster and predicted cell type,
    # for easy viewing later.
    print("Saving annotation CSV...")
    exported_columns = ['leiden', 'predicted_cell_type']
    annotation_frame = adata.obs[exported_columns]
    annotation_frame.to_csv("cell_type_annotations.csv")

def classifier_table(
    adata: AnnData,
    df_airrport_igblast_unified: pd.DataFrame,
    df_vdj: pd.DataFrame,
    publicness_file_path: str,
    cell_type_col: str = 'predicted_labels'
) -> pd.DataFrame:
    """
    Builds the classifier table by joining AIRRPORT/IgBlast, publicness,
    gene expression, cell type and VDJ-reference information.

    Args:
        adata: Preprocessed AnnData object; its expression matrix and
            .obs[cell_type_col] are joined on the cell barcode.
        df_airrport_igblast_unified: Joined IgBlast/AIRRPORT table; must
            contain the 'CDR3_match' and 'CB' columns.
        df_vdj: VDJ reference table with 'vdj_cdr3' and 'vdj_barcode'.
        publicness_file_path: Path to a CSV file with the columns
            'CDR3_match' and 'publicness_score'.
        cell_type_col: Name of the column in adata.obs with cell types.

    Returns:
        A new DataFrame with one row per row of df_airrport_igblast_unified
        plus 'publicness_score', one column per gene, 'cell_type' and the
        boolean 'is_in_vdj'.

    Side effects:
        Writes the first five rows to 'classifier_head.csv' and prints
        summary statistics.
    """
    print("\n--- Building Classifier Table ---")

    # --- Read and join publicness ---
    print("Read publicness data")
    publicness_df = pd.read_csv(publicness_file_path)
    print(publicness_df.head())

    # One score per CDR3 so the left join cannot multiply rows.
    publicness_subset = (
        publicness_df[['CDR3_match', 'publicness_score']]
        .drop_duplicates(subset=['CDR3_match'], keep='first')
    )

    # `table` (not `classifier_table`) so the function name is not shadowed.
    table = pd.merge(
        df_airrport_igblast_unified,
        publicness_subset,
        how='left',
        on='CDR3_match'
    )

    print("Add gene expression matrix to classifier table")
    # Naming the index before reset_index() yields a 'CB' column regardless
    # of whether the obs index already carried a name.
    expr_df = adata.to_df().rename_axis(index='CB').reset_index()
    table = pd.merge(
        table,
        expr_df,
        how='left',
        on='CB'
    )

    #  Add Cell Type from annotation
    print("Adding cell types")
    if cell_type_col in adata.obs.columns:
        # barcode -> cell type lookup
        cell_type_map = adata.obs[cell_type_col].to_dict()
        table['cell_type'] = table['CB'].map(cell_type_map)
    else:
        print(f"Warning: Cell type column '{cell_type_col}' not found in adata.obs. Skipping.")
        table['cell_type'] = None

    print("Creating concatenated of CDR3_match and CB and then check if it exist in VDJ reference...")
    # Keys are "<cdr3>_<barcode>". Rows with a missing part are excluded on
    # both sides; otherwise NaN would be stringified to "nan" and could
    # produce false matches.
    vdj_complete = df_vdj[['vdj_cdr3', 'vdj_barcode']].dropna()
    vdj_keys = set(
        vdj_complete['vdj_cdr3'].astype(str) + "_" + vdj_complete['vdj_barcode'].astype(str)
    )
    has_full_key = table['CDR3_match'].notna() & table['CB'].notna()
    pair_key = table['CDR3_match'].astype(str) + "_" + table['CB'].astype(str)
    table['is_in_vdj'] = pair_key.isin(vdj_keys) & has_full_key

    print("\nDone! Final table shape:", table.shape)
    print("Write classifier table head to csv file")
    table.head(5).to_csv("classifier_head.csv", index=False)

    print("\n--- Analysis of classifier_table ---")

    # 1. How many rows are found in the VDJ reference?
    # .sum() treats True as 1 and False as 0
    in_vdj_true_count = table['is_in_vdj'].sum()
    print(f"Total rows where 'in_vdj' is True: {in_vdj_true_count}")

    # 2. How many unique CDR3s are there for each cell type?
    print("Unique CDR3 count per cell type:")
    cdr3_per_cell_type = table.groupby('cell_type')['CDR3_match'].nunique()
    print(cdr3_per_cell_type)

    print("--- Counting unique CDR3s for T cells in VDJ ---")
    # na=False treats missing cell types as non-matching;
    # case=False makes the search case-insensitive.
    t_cell_filter = table['cell_type'].str.contains("T cells", na=False, case=False)
    in_vdj_filter = table['is_in_vdj'] == True
    unique_cdr3_count = table.loc[t_cell_filter & in_vdj_filter, 'CDR3_match'].nunique()
    print(f"Found {unique_cdr3_count} unique CDR3s that are 'in_vdj' and in cells containing 'T cells'.")

    return table

def build_model(classifier_table):
    """
    Trains and evaluates RandomForest and XGBoost classifiers that predict
    whether a row is both a T cell and present in the VDJ reference.

    Args:
        classifier_table: DataFrame with at least the 'cell_type' and
            'is_in_vdj' columns. It is modified in place: missing cell types
            are replaced by '' and a boolean 'label' column is added.

    Returns:
        Dict mapping the model name to the fitted estimator.

    Raises:
        ValueError: If either class has fewer than two rows, or if no usable
            numeric feature column is left.
    """
    print("classifier_table:", type(classifier_table))

    # --- STEP 1: CREATE THE LABEL ---
    print("Creating target label 'y'...")
    classifier_table['cell_type'] = classifier_table['cell_type'].fillna('')
    is_t_cell = classifier_table['cell_type'].str.contains("T cells", case=False)
    is_in_vdj = classifier_table['is_in_vdj'] == True
    classifier_table['label'] = is_t_cell & is_in_vdj

    print("Label distribution:")
    label_counts = classifier_table['label'].value_counts()
    print(label_counts)

    # Integer 0/1 labels are accepted by both estimators.
    y = classifier_table['label'].astype(int)
    n_positive = int(y.sum())
    n_negative = len(y) - n_positive
    if min(n_positive, n_negative) < 2:
        # A stratified split and the imbalance weight both need each class.
        raise ValueError(
            "build_model needs at least 2 rows of each class "
            f"(got {n_positive} positive and {n_negative} negative)."
        )

    # --- STEP 2: AUTOMATIC FEATURE SELECTION ---
    print("Automatically selecting features 'X'...")

    # Define columns to EXCLUDE from features
    exclude_cols = ['cdr3', 'CB', 'label', 'cell_type', 'is_in_vdj']

    # Select all columns that are numeric AND not in the exclude list
    X = classifier_table.drop(columns=exclude_cols, errors='ignore').select_dtypes(include=np.number)

    # --- STEP 3: TRAIN-TEST SPLIT ---
    # Split BEFORE imputing/scaling so that no statistic of the test rows
    # leaks into the preprocessing.
    print("Splitting data into train/test sets...")
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y  # Ensures both train and test get a similar % of True/False
    )

    # Columns without a single observed training value have no median; drop
    # them explicitly so that feature names stay aligned with the matrices.
    usable_columns = X_train_raw.columns[X_train_raw.notna().any(axis=0)]
    n_dropped = X_train_raw.shape[1] - len(usable_columns)
    if n_dropped:
        print(f"Dropping {n_dropped} feature(s) with no observed values in the training set.")
    if len(usable_columns) == 0:
        raise ValueError("build_model found no usable numeric feature columns.")
    X_train_raw = X_train_raw[usable_columns]
    X_test_raw = X_test_raw[usable_columns]

    # Save column names for later
    feature_names = usable_columns.tolist()
    print(f"Found {len(feature_names)} numerical features (e.g., {feature_names[:3]}...)")

    # --- STEP 4: HANDLE MISSING DATA (Imputation) ---
    # (Using median is robust to outliers)
    print("Imputing missing data (NaNs)...")
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(X_train_raw)
    X_test = imputer.transform(X_test_raw)

    # --- STEP 5: SCALE FEATURES ---
    print("Scaling features...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # --- STEP 6: DEFINE & TRAIN MODELS ---

    # Calculate scale_pos_weight for XGBoost to handle imbalance
    # (Count of negative class) / (Count of positive class)
    train_positive = int((y_train == 1).sum())
    train_negative = int((y_train == 0).sum())
    scale_pos_weight = train_negative / max(train_positive, 1)
    print(f"Using scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

    models = {
        "RandomForest": RandomForestClassifier(
            class_weight='balanced',  # Handles imbalance for RF
            random_state=42
        ),
        # NOTE(review): use_label_encoder is kept from the original call; it
        # may be deprecated in the installed XGBoost version -- confirm.
        "XGBoost": xgb.XGBClassifier(
            scale_pos_weight=scale_pos_weight,  # Handles imbalance for XGB
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        )
    }

    # --- STEP 7: EVALUATE MODELS ---
    # Both labels are passed explicitly so the reports keep their shape even
    # if one class is absent from the test split or the predictions.
    class_labels = [0, 1]

    for name, model in models.items():
        print(f"\n--- Training {name} ---")
        model.fit(X_train_scaled, y_train)

        print(f"--- Evaluating {name} ---")
        y_pred = model.predict(X_test_scaled)

        print(f"\nConfusion Matrix ({name}):")
        print(confusion_matrix(y_test, y_pred, labels=class_labels))

        print(f"\nClassification Report ({name}):")
        print(classification_report(
            y_test,
            y_pred,
            labels=class_labels,
            target_names=['Not T-Cell', 'Is T-Cell'],
            zero_division=0
        ))

        # Get Feature Importances
        print(f"\nFeature Importances ({name}):")
        importances = model.feature_importances_
        feature_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        })
        print(feature_df.sort_values(by='importance', ascending=False).head(10))
        print("-" * 30)

    return models


def main():
    """
    Main function to run the entire pipeline.

    Output is redirected to LOG_FILE for the duration of the run. On failure
    the full traceback is written to the log before the exception is
    re-raised; the log is closed and stdout/stderr are restored in all cases.
    """
    # Local import: only needed for the error path below.
    import traceback

    # --- Configuration ---
    SC_DATA_DIR = '/dsi/efroni-lab/AIRRPORT/test_env/sc/GEX/runs/SRX10124718_gex/outs/filtered_feature_bc_matrix/'
    AIRRPORT_PATH = '/home/ls/linoym/r_files/airrport/SRX10124718_sample/matched_SRX10124718_unaligned_reads_plusCBUB_trimmed_R2_collapsed.parquet'
    VDJ_PATH = '/home/ls/linoym/r_files/airrport/SRX10124718_sample/P4_LNM_vdj_filtered_contig_annotations.csv'
    IGBLAST_PATH = '/home/ls/linoym/r_files/airrport/SRX10124718_sample/igblast_SRX10124718.tsv'
    LOG_FILE = "scanpy_analysis.log"
    CELLTYPIST_MODEL = 'Immune_All_Low.pkl'
    FINAL_ADATA_FILE = "sc_rna_seq_processed.h5ad"
    PUBLICNESS_PATH = "/home/ls/linoym/r_files/airrport/SRX10124718_ppub_counts.csv"

    log_handle = setup_logging(LOG_FILE)

    try:
        # Run Pipeline
        sc_rna_seq, df_airrport, df_vdj, df_airrport_igblast, cdr3_strict_conditions, df_airrport_igblast_unified = load_data(
            SC_DATA_DIR,
            AIRRPORT_PATH,
            VDJ_PATH,
            IGBLAST_PATH
        )
        # Preprocessing
        sc_rna_seq = preprocess_sc_data(sc_rna_seq)
        # Dimensionality Reduction & Clustering
        sc_rna_seq = cluster_and_embed(sc_rna_seq)
        # Save clustering plots
        save_clustering_plots(sc_rna_seq)
        # Annotate
        sc_rna_seq = annotate_cell_types(sc_rna_seq, CELLTYPIST_MODEL)
        save_annotation_results(sc_rna_seq)
        # Final Save of the entire object
        sc_rna_seq.write(FINAL_ADATA_FILE)
        classifier_data = classifier_table(
            sc_rna_seq,
            df_airrport_igblast_unified,
            df_vdj,
            PUBLICNESS_PATH,
            cell_type_col='predicted_cell_type'
        )
        analysis_logs(sc_rna_seq, df_airrport, df_vdj, df_airrport_igblast, cdr3_strict_conditions, classifier_data)
        build_model(classifier_data)

    except Exception:
        # sys.stderr still points at the log file here, so the traceback
        # (which includes the exception message) ends up in the log.
        print("--- ERROR: Pipeline failed ---", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        # Re-raise the exception after logging it
        raise
    finally:
        # This block will run NO MATTER WHAT (success or error),
        # ensuring your log file is always closed properly.
        close_logging(log_handle)

if __name__ == "__main__":
    main()

  from scanpy import __version__ as scv


Redirecting output to log file: scanpy_analysis.log


... storing 'feature_types' as categorical




📂 Storing models in /home/ls/linoym/.celltypist/data/models
💾 Total models to download: 1
⏩ Skipping [1/1]: Immune_All_Low.pkl (file exists)
👀 Invalid expression matrix in `.X`, expect log1p normalized expression to 10000 counts per cell; will use `.raw.X` instead
🔬 Input data has 11084 cells and 25687 genes
🔗 Matching reference genes in the model
🧬 5618 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 10
🗳️ Majority voting the predictions
✅ Majority voting done!




... storing 'feature_types' as categorical


KeyboardInterrupt: 