## Cell 1: Imports and Setup

In [3]:

"""
Entity Classifier Workflow - Jupyter Notebook Version
Interactive version with cell-by-cell execution control
"""

###############
### Imports ###
###############
import sys
import pandas as pd
import numpy as np
import warnings
import logging
import multiprocessing as mp
import os
from datetime import datetime
from pathlib import Path
from contextlib import redirect_stdout, redirect_stderr

module_path = str(Path("../src/data").resolve())
if module_path not in sys.path:
    sys.path.append(module_path)

# Setup paths
current_script_path = Path.cwd()  # Use current working directory in Jupyter
project_root = current_script_path
module_path = project_root / "src" / "data"
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))

# Import configuration
from entity_model_settings_TEST import *

# Create output directory
project_root = os.path.abspath(os.getcwd())
output_dir = os.path.join(project_root, 'data', run_folder_name)
os.makedirs(output_dir, exist_ok=True)

print(f"✅ Setup completed")
print(f"📁 Output directory: {output_dir}")
print(f"🎯 Target class: {TARGET_CLASS}")
print(f"📊 Classification column: {CLASSIFIED_BY}")




✅ Setup completed
📁 Output directory: /home/lestrada/tumor_type_prediction/notebooks/data/CHDM_250824_results
🎯 Target class: ['CHDM']
📊 Classification column: code_oncotree


In [None]:
import importlib
importlib.reload(entity_model_settings_TEST)



## Cell 2: Logging Configuration

In [5]:
#####################
### Logging Setup ###
#####################

log_filename = f"classifier_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()  # Also log to notebook output
    ]
)

warnings.filterwarnings("default") 
warnings.showwarning = lambda message, category, filename, lineno, file=None, line=None: logging.warning(
    f"{category.__name__}: {message} (in {filename}:{lineno})"
)

mp.set_start_method('spawn', force=True)

print(f"📝 Logging configured - log file: {log_filename}")





📝 Logging configured - log file: classifier_log_20250824_102701.log



## Cell 3: Import Custom Modules


In [6]:

 
def import_custom_modules():
    """Import custom modules with error handling"""
    try:
        import preprocessing as prep
        import feature_selection as fs
        import model_fit as mf
        import graphs as grph
        return prep, fs, mf, grph
    except ImportError as e:
        print(f"❌ Error importing custom modules: {e}")
        print("Make sure the following modules are in src/data/:")
        print("- preprocessing.py") 
        print("- feature_selection.py")
        print("- model_fit.py")
        print("- graphs.py")
        raise

# Import modules
prep, fs, mf, grph = import_custom_modules()
print("✅ Custom modules imported successfully")
 




✅ Custom modules imported successfully



## Cell 4: Configuration Display

In [7]:
def print_configuration():
    """Print current configuration settings"""
    print("=" * 80)
    print("CURRENT CONFIGURATION")
    print("=" * 80)
    print(f"Target Class: {TARGET_CLASS}")
    print(f"Classification Column: {CLASSIFIED_BY}")
    print(f"Data Folder: {PROCESSED_DATA_FOLDER}")
    print(f"Split Size: {SPLIT_SIZE}")
    print(f"High Confidence Threshold: {HIGH_CONFIDENCE_THRESHOLD}")
    print(f"ElasticNet Parameters: L1_ratio={ELNET_L1_RATIO}, C={ELNET_C_VALUE}")
    print(f"Cross-validation: {ELNET_N_SPLITS} splits, {ELNET_N_REPEATS} repeats")
    print(f"Nested CV: {NESTED_CV_RANDOM_STATE_TRIES} tries, {NESTED_CV_N_SPLITS} splits")
    print("=" * 80)

print_configuration()
 


CURRENT CONFIGURATION
Target Class: ['CHDM']
Classification Column: code_oncotree
Data Folder: 2025.08.06_CJ_paper_final/
Split Size: 0.3
High Confidence Threshold: 0.7
ElasticNet Parameters: L1_ratio=0.5, C=1
Cross-validation: 3 splits, 67 repeats
Nested CV: 10 tries, 3 splits



## Cell 5: Data Loading


In [17]:

 
def load_data():
    """Load all required data files"""
    print("="*80)
    print("Loading data files...")
    print("="*80)
    
    # Construct file paths using configuration variables
    intensity_path_file = FOLDER_PATH + PROCESSED_DATA_FOLDER + PREPROCESSED_FP_INTENSITY
    z_scores_path_file = FOLDER_PATH + PROCESSED_DATA_FOLDER + PREPROCESSED_FP_Z_SCORES
    the_metadata_file = METADATA_PATH + METADATA_FILE
    
    print(f"📂 Loading intensity data from: {intensity_path_file}")
    print(f"📂 Loading z-scores data from: {z_scores_path_file}")
    print(f"📂 Loading metadata from: {the_metadata_file}")
    
    try:
        input_quantifications = prep.read_table_with_correct_sep(intensity_path_file)
        df_z_scores = prep.read_table_with_correct_sep(z_scores_path_file)
        input_metadata = pd.read_excel(the_metadata_file,
                                        usecols=['Sample name', 'code_oncotree', 'Tumor cell content', 'TCC_Bioinfo', 'TCC GROUP'],
                                        dtype={'Sample name': 'string', 'code_oncotree': 'string', 'Tumor cell content': 'float64', 'TCC_Bioinfo': 'float64', 'TCC GROUP': 'string'},
                                        na_values=['', 'NA', 'NaN', 'nan', 'N/A', 'n/a', 'None', 'TBD', 'notavailable', 'missing'])

        print("✅ Data files loaded successfully.")
        print(f"📊 Quantifications shape: {input_quantifications.shape}")
        print(f"📊 Z-scores shape: {df_z_scores.shape}")
        print(f"📊 Metadata shape: {input_metadata.shape}")

        return input_quantifications, df_z_scores, input_metadata

    except FileNotFoundError as e:
        print(f"❌ Error loading data files: {e}")
        print("Please check that all data files exist in the specified paths:")
        print(f"  - {intensity_path_file}")
        print(f"  - {z_scores_path_file}")
        print(f"  - {the_metadata_file}")
        raise

# Execute data loading
input_quantifications, df_z_scores, input_metadata = load_data()
 


Loading data files...
📂 Loading intensity data from: /media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2025.08.06_CJ_paper_final/preprocessed_fp.csv
📂 Loading z-scores data from: /media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2025.08.06_CJ_paper_final/full_proteome_measures_z.tsv
📂 Loading metadata from: /media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/LE_PROdict/paper_freeze_versions_22_08/METADATA_PANCANCER_PAPER_final.xlsx
✅ Data files loaded successfully.
📊 Quantifications shape: (13069, 5173)
📊 Z-scores shape: (13069, 1999)
📊 Metadata shape: (1998, 5)



## Cell 6: Data Preprocessing


In [20]:
input_metadata['TCC GROUP'].value_counts()

TCC GROUP
high            885
intermediate    633
low             286
very low        182
notdefined       12
Name: count, dtype: Int64

In [None]:
def preprocess_data(input_quantifications, df_z_scores, input_metadata):
    """Preprocess all data"""
    print("="*80)
    print("Preprocessing data...")
    print("="*80)
    
    # Protein quantification intensities post-processing
    input_quantifications = input_quantifications.set_index(input_quantifications.columns[0])
    peptides_quant_info = prep.post_process_meta_intensities(
        input_quantifications.iloc[:, int(input_quantifications.shape[1]/2):].T
    )
    proteins_quant = input_quantifications.iloc[:, :int(input_quantifications.shape[1]/2)].T
    print(f"🔬 Proteins quantifications columns: {proteins_quant.iloc[:,:10].columns.tolist()}")
    
    # Imputation with configurable parameters
    prot_quant_imputed = prep.impute_normal_down_shift_distribution(
        proteins_quant, 
        width=IMPUTATION_WIDTH, 
        downshift=IMPUTATION_DOWNSHIFT, 
        seed=IMPUTATION_SEED
    )
    na_columns = prot_quant_imputed.isna().any()
    na_columns_true = na_columns[na_columns].index.tolist()
    print("🚨 Proteins with empty values:", na_columns_true)

    # Cleaning sample names
    prot_quant_imputed.reset_index(inplace=True)
    prot_quant_imputed.rename(columns={'index': SAMPLES_COLUMN}, inplace=True)
    prot_quant_imputed[SAMPLES_COLUMN] = prot_quant_imputed[SAMPLES_COLUMN].str.replace('pat_', '').str.strip()
    
    # Dataset with protein intensities and metadata
    input_metadata['TCC'] = input_metadata['TCC_Bioinfo'].fillna(input_metadata['Tumor cell content'])
    samples_metadata = input_metadata[[SAMPLES_COLUMN, CLASSIFIED_BY, 'TCC', 'TCC GROUP']]
    samples_metadata[SAMPLES_COLUMN] = samples_metadata[SAMPLES_COLUMN].str.strip()
    initial_df = samples_metadata.merge(prot_quant_imputed, on=SAMPLES_COLUMN, how='left')
    
    # Peptides quantification to binary dataset
    peptides_df_binary = pd.DataFrame(
        np.where(peptides_quant_info > 1, 1, 0),
        index=peptides_quant_info.index,
        columns=peptides_quant_info.columns  
    )
    peptides_df_binary.reset_index(inplace=True)
    peptides_df_binary.replace('Identification metadata ', '', regex=True, inplace=True)
    peptides_df_binary['index'] = peptides_df_binary['index'].str.strip()
    peptides_df_binary['index'] = peptides_df_binary['index'].str.strip()
    peptides_df_binary2 = samples_metadata.merge(peptides_df_binary, left_on=SAMPLES_COLUMN, right_on='index')
    peptides_df_binary2.drop('index', axis=1, inplace=True)
    
    print(f"🧬 Peptides binary dataframe shape: {peptides_df_binary.shape}")
    
    # Process Z-scores
    z_scores_df = df_z_scores.transpose(copy=True) 
    print(f"📈 Z-scores dataframe shape before processing: {z_scores_df.shape}")
    z_scores_df = z_scores_df.reset_index()
    z_scores_df = z_scores_df.replace('zscore_','', regex=True) 
    z_scores_df.rename(columns = z_scores_df.iloc[0], inplace=True)
    z_scores_df.drop(axis=0, index=0, inplace=True)
    z_scores_df['Gene names'] = z_scores_df.iloc[:,0].str.replace('pat_', '')
    z_scores_df = z_scores_df.set_index('Gene names') 
    
    z_scores_imputed = prep.impute_normal_down_shift_distribution(
        z_scores_df,
        width=IMPUTATION_WIDTH, 
        downshift=IMPUTATION_DOWNSHIFT, 
        seed=IMPUTATION_SEED
    )
    z_scores_imputed.reset_index(inplace=True)
    z_scores_imputed.rename(columns={'Gene names': SAMPLES_COLUMN}, inplace=True)
    z_scores_imputed[SAMPLES_COLUMN] = z_scores_imputed[SAMPLES_COLUMN].str.strip()    z_scores_imputed[SAMPLES_COLUMN] = z_scores_imputed[SAMPLES_COLUMN].str.strip()
    z_scores_initial_df = samples_metadata.merge(z_scores_imputed, on=SAMPLES_COLUMN, how='left')
    
    print(f"📈 Z-scores initial dataframe shape: {z_scores_initial_df.shape}")
    print("✅ Preprocessing completed successfully")
    
    return initial_df, peptides_df_binary, z_scores_initial_df, z_scores_imputed, peptides_df_binary2

# Execute preprocessing
initial_df, peptides_df_binary, z_scores_initial_df, z_scores_imputed, peptides_df_binary2 = preprocess_data(
    input_quantifications, df_z_scores, input_metadata
)

# Display basic info about processed data
print(f"\n📋 Data Summary:")
print(f"Initial dataframe shape: {initial_df.shape}")
print(f"Classes in dataset: {initial_df[CLASSIFIED_BY].value_counts().to_dict()}")
 


Preprocessing data...
🔬 Proteins quantifications columns: ['SYMPK', 'NUP160', 'FARP1', 'UPF1', 'IGBP1', 'PSMA1', 'COL6A2', 'TXLNA', 'POGLUT1', 'EFTUD2', 'PI4KA', 'LRCH4', 'MYH14', 'UQCRC2', 'COL6A3', 'RBM17', 'SLC25A12', 'PICALM', 'MUL1', 'ATL3', 'MMAB', 'ANKFY1', 'MYOF', 'TFG', 'TINAGL1', 'HCLS1', 'FARSA', 'CS', 'SERPINA1', 'PPM1B']
(2586, 13069)
🚨 Proteins with empty values: ['ENPP7', 'SHOX2', 'CRYGA', 'HNRNPCL3;HNRNPCL4', 'MYBPHL']
🧬 Peptides binary dataframe shape: (2586, 13070)
📈 Z-scores dataframe shape before processing: (1999, 13069)
(1998, 13069)
📈 Z-scores initial dataframe shape: (1998, 13073)
✅ Preprocessing completed successfully

📋 Data Summary:
Initial dataframe shape: (1998, 13073)
Classes in dataset: {'BRCA': 238, 'CUPNOS': 112, 'CHDM': 102, 'SYNS': 84, 'LMS': 77, 'SARCNOS': 63, 'ACYC': 62, 'SFT': 52, 'MFH': 51, 'ARMS': 46, 'ES': 45, 'ACC': 39, 'IHCH': 35, 'OS': 35, 'ERMS': 29, 'DSRCT': 28, 'DDLS': 27, 'MRLS': 27, 'ULMS': 25, 'ASPS': 23, 'DIFG': 23, 'PANET': 21, 'PAAD'

In [12]:
initial_df['Sample name'][initial_df['Sample name'].str.contains('H021-CF522-T1-Q1')]

560    H021-CF522-T1-Q1
Name: Sample name, dtype: object


## Cell 7: Data Splitting


In [22]:

 
def split_data(initial_df, z_scores_initial_df):
    """Split data into training and held-out sets"""
    print("="*80)
    print("Splitting data...")
    print("="*80)
    
    nos_cases = initial_df[initial_df[CLASSIFIED_BY].str.endswith('NOS', na=False)][CLASSIFIED_BY].unique().tolist()
    cases_to_remove = nos_cases + OTHER_CASES
    print(f"🗑️ Removing undefined cases: {cases_to_remove}")

    # Removing samples not part of the train-validation set
    ml_initial_df = (
        initial_df
        .pipe(prep.remove_class, cases_to_remove, CLASSIFIED_BY, output_dir)
        .pipe(prep.remove_class, ['very low', 'missing'], 'TCC GROUP', output_dir)
        .loc[lambda df: df['TCC GROUP'].notna()]
    )

    # Splitting dataset into training and held-out sets
    training_df, held_out_df = prep.data_split(
        ml_initial_df,
        output_directory=output_dir, 
        split_size=SPLIT_SIZE, 
        classified_by=CLASSIFIED_BY, 
        export=True,
    )
    
    # Z_scores dataset
    z_scores_train_df = z_scores_initial_df[z_scores_initial_df['Sample name'].isin(training_df['Sample name'])]

    print(f"Samples match between Z-score and intesntity dataset: {set(training_df['Sample name']) == set(z_scores_train_df['Sample name'])}")
    print(f"🎯 Training set size: {training_df.shape}")
    print(f"🎯 Held-out set size: {held_out_df.shape}")
    print(f"📈 Z-scores training set size: {z_scores_train_df.shape}")
    print("✅ Data splitting completed")
    
    return training_df, held_out_df, z_scores_train_df

# Execute data splitting
training_df, held_out_df, z_scores_train_df = split_data(initial_df, z_scores_initial_df)

# Display split information
print(f"\n📊 Split Summary:")
print(f"Training classes: {training_df[CLASSIFIED_BY].value_counts().to_dict()}")
print(f"Held-out classes: {held_out_df[CLASSIFIED_BY].value_counts().to_dict()}")
 


Splitting data...
🗑️ Removing undefined cases: ['CUPNOS', 'NETNOS', 'SARCNOS', 'SCCNOS', 'RCSNOS', 'NECNOS', 'MBLNOS', 'missing']
Removed samples: 206
Remaining samples: 1792
Removed samples: 164
Remaining samples: 1628
Classes with only one sample: 66
Training set samples: 1159
Held-out set samples: 469
Samples match between Z-score and intesntity dataset: True
🎯 Training set size: (1159, 13073)
🎯 Held-out set size: (469, 13073)
📈 Z-scores training set size: (1159, 13073)
✅ Data splitting completed

📊 Split Summary:
Training classes: {'BRCA': 156, 'CHDM': 65, 'SYNS': 53, 'LMS': 50, 'ACYC': 41, 'SFT': 36, 'MFH': 32, 'ES': 32, 'ARMS': 32, 'ACC': 26, 'IHCH': 22, 'OS': 22, 'ERMS': 20, 'ULMS': 17, 'DSRCT': 17, 'DDLS': 15, 'PANET': 15, 'MRLS': 15, 'DIFG': 14, 'PAAD': 14, 'MEL': 13, 'SDCA': 13, 'ASPS': 13, 'LGFMS': 11, 'MPNST': 11, 'GIST': 11, 'CHS': 11, 'COAD': 11, 'THYC': 11, 'EPIS': 10, 'READ': 9, 'CCS': 9, 'UM': 9, 'DFSP': 8, 'BA': 8, 'RMS': 8, 'GINET': 8, 'THYM': 7, 'PLEMESO': 7, 'ANGS'

In [27]:
training_df['Sample name'][training_df['Sample name'].str.contains('H021-CF522-T1-Q1')]

Series([], Name: Sample name, dtype: object)

In [26]:
held_out_df['Sample name'][held_out_df['Sample name'].str.contains('H021-CF522-T1-Q1')]

Series([], Name: Sample name, dtype: object)

In [28]:
initial_df['TCC GROUP'].value_counts()

TCC GROUP
high            885
intermediate    633
low             286
very low        182
notdefined       12
Name: count, dtype: Int64

In [None]:

def split_data(initial_df, z_scores_initial_df):
    print("="*80)
    print(f"Initial rows: {len(initial_df)}")
    nos_cases = initial_df[initial_df[CLASSIFIED_BY].str.endswith('NOS', na=False)][CLASSIFIED_BY].unique().tolist()
    cases_to_remove = nos_cases + OTHER_CASES
    print(f"🗑️ Removing undefined cases: {cases_to_remove}")

    df1 = prep.remove_class(initial_df, cases_to_remove, CLASSIFIED_BY, output_dir)
    print(f"Rows after removing cases_to_remove: {len(df1)}")

    df2 = prep.remove_class(df1, ['very low', 'notdefined'], 'TCC GROUP', output_dir)
    print(f"Rows after removing 'very low'/'missing' TCC GROUP: {len(df2)}")

    df3 = df2.loc[df2['TCC GROUP'].notna()]
    print(f"Rows after dropping NA in TCC GROUP: {len(df3)}")

    ml_initial_df = df3

    training_df, held_out_df = prep.data_split(
        ml_initial_df,
        output_directory=output_dir, 
        split_size=SPLIT_SIZE, 
        classified_by=CLASSIFIED_BY, 
        export=False,
    )
    print(f"Rows in training set: {len(training_df)}")
    print(f"Rows in held-out set: {len(held_out_df)}")
    print(f"Sum of splits: {len(training_df) + len(held_out_df)}")
    print("="*80)
    return training_df, held_out_df, z_scores_initial_df[z_scores_initial_df['Sample name'].isin(training_df['Sample name'])]

training_df, held_out_df, z_scores_train_df = split_data(initial_df, z_scores_initial_df)


Initial rows: 1998
🗑️ Removing undefined cases: ['CUPNOS', 'NETNOS', 'SARCNOS', 'SCCNOS', 'RCSNOS', 'NECNOS', 'MBLNOS', 'missing']
Removed samples: 206
Remaining samples: 1792
Rows after removing cases_to_remove: 1792
Removed samples: 164
Remaining samples: 1628
Rows after removing 'very low'/'missing' TCC GROUP: 1628
Rows after dropping NA in TCC GROUP: 1618
Classes with only one sample: 66
Training set samples: 1152
Held-out set samples: 466
Rows in training set: 1152
Rows in held-out set: 466
Sum of splits: 1618


In [23]:

def split_data(initial_df, z_scores_initial_df):
    print("="*80)
    print(f"Initial rows: {len(initial_df)}")
    nos_cases = initial_df[initial_df[CLASSIFIED_BY].str.endswith('NOS', na=False)][CLASSIFIED_BY].unique().tolist()
    cases_to_remove = nos_cases + OTHER_CASES
    print(f"🗑️ Removing undefined cases: {cases_to_remove}")

    df1 = prep.remove_class(initial_df, cases_to_remove, CLASSIFIED_BY, output_dir)
    print(f"Rows after removing cases_to_remove: {len(df1)}")

    df2 = prep.remove_class(df1, ['very low', 'notdefined'], 'TCC GROUP', output_dir)
    print(f"Rows after removing 'very low'/'missing' TCC GROUP: {len(df2)}")

    df3 = df2.loc[df2['TCC GROUP'].notna()]
    print(f"Rows after dropping NA in TCC GROUP: {len(df3)}")

    ml_initial_df = df3

    training_df, held_out_df = prep.data_split(
        ml_initial_df,
        output_directory=output_dir, 
        split_size=SPLIT_SIZE, 
        classified_by=CLASSIFIED_BY, 
        export=False,
    )
    print(f"Rows in training set: {len(training_df)}")
    print(f"Rows in held-out set: {len(held_out_df)}")
    print(f"Sum of splits: {len(training_df) + len(held_out_df)}")
    print("="*80)
    return training_df, held_out_df, z_scores_initial_df[z_scores_initial_df['Sample name'].isin(training_df['Sample name'])]

training_df, held_out_df, z_scores_train_df = split_data(initial_df, z_scores_initial_df)


Initial rows: 1998
🗑️ Removing undefined cases: ['CUPNOS', 'NETNOS', 'SARCNOS', 'SCCNOS', 'RCSNOS', 'NECNOS', 'MBLNOS', 'missing']
Removed samples: 206
Remaining samples: 1792
Rows after removing cases_to_remove: 1792
Removed samples: 174
Remaining samples: 1618
Rows after removing 'very low'/'missing' TCC GROUP: 1618
Rows after dropping NA in TCC GROUP: 1618
Classes with only one sample: 66
Training set samples: 1152
Held-out set samples: 466
Rows in training set: 1152
Rows in held-out set: 466
Sum of splits: 1618



## Cell 8: Class-Specific Workflow


In [None]:

 
def class_specific_workflow(training_df, held_out_df, z_scores_train_df, peptides_df_binary):
    """Execute class-specific workflow for specified classification"""
    print("="*80)
    print(f"Starting class-specific workflow for {TARGET_CLASS}...")
    print("="*80)
    
    # Obtaining high confidence proteins by peptides
    target_proteins_by_peptides = fs.get_high_confidence_proteins(
        peptides_df_binary, TARGET_CLASS, CLASSIFIED_BY, threshold=HIGH_CONFIDENCE_THRESHOLD
    )
    
    print(f"🔬 Found {len(target_proteins_by_peptides)} high confidence proteins")
    
    # Binary labeling for specific class classification 
    target_training_df = fs.binary_labeling(training_df, classified_by=CLASSIFIED_BY, true_class=TARGET_CLASS)
    target_ho_df = fs.binary_labeling(held_out_df, classified_by=CLASSIFIED_BY, true_class=TARGET_CLASS)
    target_z_scores_train_df = fs.binary_labeling(z_scores_train_df, classified_by=CLASSIFIED_BY, true_class=TARGET_CLASS)
    
    # 1st Filter - Filtering training and held-out dataframes by proteins with peptides
    target_training_df = target_training_df.filter(items=[SAMPLES_COLUMN, CLASSIFIED_BY, 'Classifier'] + target_proteins_by_peptides)
    target_ho_df = target_ho_df.filter(items=[SAMPLES_COLUMN, CLASSIFIED_BY, 'Classifier'] + target_proteins_by_peptides)
    target_z_scores_train_df = target_z_scores_train_df.filter(items=[SAMPLES_COLUMN, CLASSIFIED_BY, 'Classifier'] + target_proteins_by_peptides)

    print(f"📊 Filtered training set shape: {target_training_df.shape}")
    print(f"📊 Filtered held-out set shape: {target_ho_df.shape}")
    print(f"📈 Filtered z-scores training set shape: {target_z_scores_train_df.shape}")
    
    # Display class distribution
    print(f"\n🎯 Binary Classification Distribution:")
    print(f"Training: {target_training_df['Classifier'].value_counts().to_dict()}")
    print(f"Held-out: {target_ho_df['Classifier'].value_counts().to_dict()}")
    
    print("✅ Class-specific workflow completed")
    
    return target_training_df, target_ho_df, target_z_scores_train_df

# Execute class-specific workflow
target_training_df, target_ho_df, target_z_scores_train_df = class_specific_workflow(
    training_df, held_out_df, z_scores_train_df, peptides_df_binary
)
 


In [None]:
target_training_df['code_oncotree'].value_counts()

In [None]:
target_z_scores_train_df['code_oncotree'].value_counts()


## Cell 9: Feature Selection


In [None]:

 
def feature_selection_workflow(target_z_scores_train_df):
    """Perform feature selection using ElasticNet"""
    print("="*80)
    print("Starting feature selection...")
    print("="*80)
    print(f"🔧 Using L1 ratios: {FEATURE_SELECTION_L1_RATIOS}")
    print(f"🔧 Using C values: {FEATURE_SELECTION_C_VALUES}")
    
    # Hyperparameters for ElasticNet
    print("-"*80)
    print("Defining hyperparameters for ElasticNet...")

    try:
        target_cv_results, target_best_params, target_best_score, target_grid_search_obj = fs.hparameter_grid_search(
            target_z_scores_train_df, GRID_SEARCH_N_SPLITS, FEATURE_SELECTION_L1_RATIOS, FEATURE_SELECTION_C_VALUES, classified_by=CLASSIFIED_BY
        )
        print(f"✅ Best parameters found: {target_best_params}")
        print(f"🎯 Best CV score: {target_best_score:.4f}")
        
    except Exception as e:
        print(f"⚠️ Warning: Hyperparameter search failed: {e}")
        print("Using configured default parameters...")
        target_best_params = {'l1_ratio': ELNET_L1_RATIO, 'C': ELNET_C_VALUE}

    # Feature Selection by ElasticNet Cross-Validation
    print("-"*80)
    print("Selecting features...")

    try:
        class_name = "_".join(TARGET_CLASS)
        target_cross_val_coeffs = fs.elnet_wrapper(
            target_z_scores_train_df, 
            classified_by=CLASSIFIED_BY, 
            tumor_type_name=f'{class_name}_features', 
            l1_ratio=target_best_params.get('l1_ratio'), 
            C=target_best_params.get('C'), 
            output_directory=output_dir,
            n_splits=ELNET_N_SPLITS, 
            n_repeats=ELNET_N_REPEATS, 
            n_jobs=ELNET_N_JOBS, 
            export=True
        )
        
        target_stats, target_proteins = fs.statistic_from_coefficients(target_cross_val_coeffs, TARGET_CLASS, output_dir)
        print(f"✅ Selected {len(target_proteins)} protein features")
        print(f"🔬 Top 10 selected proteins: {target_proteins[:10]}")
        
        return target_proteins, target_cross_val_coeffs, target_stats
        
    except Exception as e:
        print(f"❌ Feature selection failed: {e}")
        raise

# Execute feature selection
target_proteins, target_cross_val_coeffs, target_stats = feature_selection_workflow(target_z_scores_train_df)
 



## Cell 10: Model Fitting and Evaluation


In [None]:

 
def model_fitting_workflow(target_training_df, target_ho_df, target_proteins):
    """Fit the final model and evaluate"""
    print("="*80)
    print("Starting model fitting...")
    print("="*80)
    print(f"🔄 Using {NESTED_CV_RANDOM_STATE_TRIES} random state tries for nested CV")
    print(f"🔄 Using {NESTED_CV_N_SPLITS} splits for nested CV")
    
    # Reshaping dataset for training and test
    target_training_fs = fs.reshape_df_for_fitting(target_training_df, target_proteins)
    target_test_fs = fs.reshape_df_for_fitting(target_ho_df, target_proteins)
    
    print(f"📊 Training set shape after feature selection: {target_training_fs.shape}")
    print(f"📊 Test set shape after feature selection: {target_test_fs.shape}")

    # Hyperparameter Selection for Logistic Regression
    try:
        target_nested_cv_results = mf.wrapper_nested_cv(
            target_training_fs, 
            random_state_tries=NESTED_CV_RANDOM_STATE_TRIES, 
            n_splits=NESTED_CV_N_SPLITS, 
            classified_by=CLASSIFIED_BY
        )
        target_nested_hp = mf.nested_cv_hparameters_selection(target_nested_cv_results)
        hyperparameter_C = pd.DataFrame(target_nested_hp).T.sort_values(by='count', ascending=False).index.tolist()[0]
        print(f"🎯 Selected hyperparameter C: {hyperparameter_C}")
    except Exception as e:
        print(f"⚠️ Warning: Hyperparameter selection failed: {e}")
        hyperparameter_C = 1.0  # Default value
        print(f"🔧 Using default hyperparameter C: {hyperparameter_C}")

    # Model Fit
    try:
        target_log_reg_model = mf.logistic_regression_ridge(
            target_training_fs, 
            hyperparameter_C, 
            TARGET_CLASS, 
            CLASSIFIED_BY,
            output_dir,
        )
        
        # Get results
        target_coefficients, target_train_probabilities, target_test_probabilities = mf.logistic_regression_results(
            target_log_reg_model, 
            target_training_fs, 
            target_test_fs,  
            TARGET_CLASS, 
            CLASSIFIED_BY,
            output_dir
        )
        
        # Classification scores
        test_target_scores = mf.classification_scores(target_test_probabilities)
        
        print("✅ Model training and evaluation completed successfully!")
        print(f"📊 Test scores summary:")
        if hasattr(test_target_scores, 'describe'):
            print(test_target_scores.describe())
        
        return target_log_reg_model, target_coefficients, target_train_probabilities, target_test_probabilities, test_target_scores
        
    except Exception as e:
        print(f"❌ Error during model fitting: {e}")
        return None, None, None, None, None

# Execute model fitting
model_results = model_fitting_workflow(target_training_df, target_ho_df, target_proteins)
target_log_reg_model, target_coefficients, target_train_probabilities, target_test_probabilities, test_target_scores = model_results
 



## Cell 11: Results Visualization


In [None]:

 
def generate_visualizations(initial_df, test_target_scores, target_proteins):
    """Generate and save graphs for results exploration"""
    print("="*80)
    print("Generating visualizations...")
    print("="*80)
    
    try:
        # UMAP plot
        print("📊 Creating UMAP plot...")
        UMAP_plot = grph.create_umap_plot(
            df=initial_df, 
            output_directory=output_dir,
            feature_columns=target_proteins, 
            color_column=CLASSIFIED_BY, 
            metadata_cols=[SAMPLES_COLUMN, CLASSIFIED_BY, 'TCC GROUP'],
            n_neighbors=5,
        )
        
        # TCC vs Probability plot
        print("📈 Creating TCC vs Probability plot...")
        TCC_plot = grph.plot_tcc_vs_probability(initial_df, test_target_scores, output_dir)
        
        print("✅ Visualizations generated successfully!")
        return TCC_plot, UMAP_plot
        
    except Exception as e:
        print(f"⚠️ Warning: Visualization generation failed: {e}")
        return None, None

# Generate visualizations (only if model was successful)
if target_log_reg_model is not None:
    TCC_plot, UMAP_plot = generate_visualizations(initial_df, test_target_scores, target_proteins)
else:
    print("⚠️ Skipping visualizations due to model fitting errors")
 



## Cell 12: Final Summary and Results


In [None]:

 
def display_final_summary():
    """Display final summary of the workflow"""
    print("#" * 80)
    print("🏁 FINAL WORKFLOW SUMMARY")
    print("#" * 80)
    
    if target_log_reg_model is not None:
        print("✅ CLASSIFIER WORKFLOW COMPLETED SUCCESSFULLY!")
        print(f"🎯 Target class: {TARGET_CLASS}")
        print(f"🔬 Selected {len(target_proteins)} protein features")
        print(f"📁 Results exported to: {output_dir}")
        
        # Display some key metrics if available
        if test_target_scores is not None and not test_target_scores.empty:
            print(f"\n📊 Model Performance Summary:")
            if 'accuracy' in test_target_scores.columns:
                print(f"   Average Accuracy: {test_target_scores['accuracy'].mean():.4f}")
            if 'roc_auc' in test_target_scores.columns:
                print(f"   Average ROC AUC: {test_target_scores['roc_auc'].mean():.4f}")
                
        print(f"\n📋 Top 10 Selected Features:")
        print(target_proteins[:10] if len(target_proteins) >= 10 else target_proteins)
        
    else:
        print("❌ CLASSIFIER WORKFLOW COMPLETED WITH ERRORS")
        print("Please check the error messages in previous cells")
    
    print("#" * 80)

# Display final summary
display_final_summary()
 



## Cell 13: Optional - Detailed Results Exploration


In [None]:

 
# Optional cell for detailed exploration of results
print("="*80)
print("DETAILED RESULTS EXPLORATION")
print("="*80)

if target_log_reg_model is not None:
    print("📊 Available results for exploration:")
    print("- target_log_reg_model: Fitted logistic regression model")
    print("- target_coefficients: Model coefficients")
    print("- target_train_probabilities: Training set predictions")
    print("- target_test_probabilities: Test set predictions") 
    print("- test_target_scores: Performance metrics")
    print("- target_proteins: Selected protein features")
    print("- target_stats: Feature selection statistics")
    
    # Example explorations you can run:
    print(f"\n🔍 Quick exploration examples:")
    print(f"Model coefficients shape: {target_coefficients.shape if target_coefficients is not None else 'N/A'}")
    print(f"Test probabilities shape: {target_test_probabilities.shape if target_test_probabilities is not None else 'N/A'}")
    
    # Show feature importance if coefficients available
    if target_coefficients is not None and len(target_coefficients) > 0:
        print(f"\n🏆 Top 5 most important features (by absolute coefficient):")
        coeff_series = pd.Series(target_coefficients, index=target_proteins)
        top_features = coeff_series.abs().sort_values(ascending=False).head()
        for feature, coeff in top_features.items():
            print(f"   {feature}: {coeff:.4f}")

print("\n💡 You can now explore the results interactively in subsequent cells!")
 



## Usage Instructions

### How to use this notebook:

1. **Run cells sequentially** - Each cell depends on the previous ones
2. **Check outputs** - Each cell will show progress and results
3. **Debug easily** - If a cell fails, you can fix issues and re-run just that cell
4. **Explore results** - Use the final cells to explore your results interactively

### Key variables available after execution:
- `initial_df`: Original processed dataset
- `training_df`, `held_out_df`: Split datasets
- `target_proteins`: Selected protein features
- `target_log_reg_model`: Fitted model
- `test_target_scores`: Performance metrics
- `target_coefficients`: Model coefficients

### Benefits of this notebook structure:
- ✅ **Granular control**: Run one step at a time
- ✅ **Easy debugging**: Isolate and fix issues in specific steps
- ✅ **Interactive exploration**: Examine intermediate results
- ✅ **Flexible execution**: Skip or repeat steps as needed
- ✅ **Better visualization**: See progress and results clearly

### Next steps:
1. Save this content as `entity_classifier_notebook.ipynb`
2. Make sure your configuration file `entity_model_settings_TEST.py` is accessible
3. Run cells sequentially to execute your workflow
4. Use additional cells for custom analysis and exploration