## Cell 1: Imports and Setup

In [2]:

"""
Entity Classifier Workflow - Jupyter Notebook Version
Interactive version with cell-by-cell execution control
"""

###############
### Imports ###
###############
import sys
import pandas as pd
import numpy as np
import warnings
import logging
import multiprocessing as mp
import os
from datetime import datetime
from pathlib import Path
from contextlib import redirect_stdout, redirect_stderr

module_path = str(Path("../src/data").resolve())
if module_path not in sys.path:
    sys.path.append(module_path)

# Setup paths
current_script_path = Path.cwd()  # Use current working directory in Jupyter
project_root = current_script_path
module_path = project_root / "src" / "data"
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))

# Import configuration
from entity_model_settings_ACYC import *

# Create output directory
project_root = os.path.abspath(os.getcwd())
output_dir = os.path.join(project_root, 'data', run_folder_name)
os.makedirs(output_dir, exist_ok=True)

print(f"✅ Setup completed")
print(f"📁 Output directory: {output_dir}")
print(f"🎯 Target class: {TARGET_CLASS}")
print(f"📊 Classification column: {CLASSIFIED_BY}")




✅ Setup completed
📁 Output directory: /home/lestrada/tumor_type_prediction/notebooks/data/ACYC_250916_results
🎯 Target class: ['ACYC']
📊 Classification column: code_oncotree


In [None]:
import importlib
importlib.reload(entity_model_settings_TEST)



## Cell 2: Logging Configuration

In [3]:
#####################
### Logging Setup ###
#####################

log_filename = f"classifier_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()  # Also log to notebook output
    ]
)

warnings.filterwarnings("default") 
warnings.showwarning = lambda message, category, filename, lineno, file=None, line=None: logging.warning(
    f"{category.__name__}: {message} (in {filename}:{lineno})"
)

mp.set_start_method('spawn', force=True)

print(f"📝 Logging configured - log file: {log_filename}")



📝 Logging configured - log file: classifier_log_20250916_171800.log



## Cell 3: Import Custom Modules


In [4]:

def import_custom_modules():
    """Import custom modules with error handling"""
    try:
        import preprocessing as prep
        import feature_selection as fs
        import model_fit as mf
        import graphs as grph
        return prep, fs, mf, grph
    except ImportError as e:
        print(f"❌ Error importing custom modules: {e}")
        print("Make sure the following modules are in src/data/:")
        print("- preprocessing.py") 
        print("- feature_selection.py")
        print("- model_fit.py")
        print("- graphs.py")
        raise

# Import modules
prep, fs, mf, grph = import_custom_modules()
print("✅ Custom modules imported successfully")
 




✅ Custom modules imported successfully



## Cell 4: Configuration Display

In [5]:
def print_configuration():
    """Print current configuration settings"""
    print("=" * 80)
    print("CURRENT CONFIGURATION")
    print("=" * 80)
    print(f"Target Class: {TARGET_CLASS}")
    print(f"Classification Column: {CLASSIFIED_BY}")
    print(f"Data Folder: {PROCESSED_DATA_FOLDER}")
    print(f"Split Size: {SPLIT_SIZE}")
    print(f"High Confidence Threshold: {HIGH_CONFIDENCE_THRESHOLD}")
    print(f"ElasticNet Parameters: L1_ratio={ELNET_L1_RATIO}, C={ELNET_C_VALUE}")
    print(f"Cross-validation: {ELNET_N_SPLITS} splits, {ELNET_N_REPEATS} repeats")
    print(f"Nested CV: {NESTED_CV_RANDOM_STATE_TRIES} tries, {NESTED_CV_N_SPLITS} splits")
    print("=" * 80)

print_configuration()
 


CURRENT CONFIGURATION
Target Class: ['ACYC']
Classification Column: code_oncotree
Data Folder: 2025.08.06_CJ_paper_final/
Split Size: 0.3
High Confidence Threshold: 0.7
ElasticNet Parameters: L1_ratio=0.5, C=1
Cross-validation: 3 splits, 67 repeats
Nested CV: 10 tries, 3 splits



## Cell 5: Data Loading


In [6]:

 
def load_data():
    """Load all required data files"""
    print("="*80)
    print("Loading data files...")
    print("="*80)
    
    # Construct file paths using configuration variables
    intensity_path_file = FOLDER_PATH + PROCESSED_DATA_FOLDER + PREPROCESSED_FP_INTENSITY
    z_scores_path_file = FOLDER_PATH + PROCESSED_DATA_FOLDER + PREPROCESSED_FP_Z_SCORES
    the_metadata_file = METADATA_PATH + METADATA_FILE
    
    print(f"📂 Loading intensity data from: {intensity_path_file}")
    print(f"📂 Loading z-scores data from: {z_scores_path_file}")
    print(f"📂 Loading metadata from: {the_metadata_file}")
    
    try:
        input_quantifications = prep.read_table_with_correct_sep(intensity_path_file)
        df_z_scores = prep.read_table_with_correct_sep(z_scores_path_file)
        input_metadata = pd.read_excel(the_metadata_file,
                                        usecols=['Sample name', 'code_oncotree', 'Tumor cell content', 'TCC_Bioinfo', 'TCC GROUP'],
                                        dtype={'Sample name': 'string', 'code_oncotree': 'string', 'Tumor cell content': 'float64', 'TCC_Bioinfo': 'float64', 'TCC GROUP': 'string'},
                                        na_values=['', 'NA', 'NaN', 'nan', 'N/A', 'n/a', 'None', 'TBD', 'notavailable', 'missing'])

        print("✅ Data files loaded successfully.")
        print(f"📊 Quantifications shape: {input_quantifications.shape}")
        print(f"📊 Z-scores shape: {df_z_scores.shape}")
        print(f"📊 Metadata shape: {input_metadata.shape}")

        return input_quantifications, df_z_scores, input_metadata

    except FileNotFoundError as e:
        print(f"❌ Error loading data files: {e}")
        print("Please check that all data files exist in the specified paths:")
        print(f"  - {intensity_path_file}")
        print(f"  - {z_scores_path_file}")
        print(f"  - {the_metadata_file}")
        raise

# Execute data loading
input_quantifications, df_z_scores, input_metadata = load_data()
 


Loading data files...
📂 Loading intensity data from: /media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2025.08.06_CJ_paper_final/preprocessed_fp.csv
📂 Loading z-scores data from: /media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2025.08.06_CJ_paper_final/full_proteome_measures_z.tsv
📂 Loading metadata from: /media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/LE_PROdict/paper_freeze_versions_22_08/METADATA_PANCANCER_PAPER_final.xlsx
✅ Data files loaded successfully.
📊 Quantifications shape: (13069, 5173)
📊 Z-scores shape: (13069, 1999)
📊 Metadata shape: (1998, 5)



## Cell 6: Data Preprocessing


In [7]:
def preprocess_data(input_quantifications, df_z_scores, input_metadata):
    """Preprocess all data"""
    print("="*80)
    print("Preprocessing data...")
    print("="*80)
    
    # Protein quantification intensities post-processing
    input_quantifications = input_quantifications.set_index(input_quantifications.columns[0])
    peptides_quant_info = prep.post_process_meta_intensities(
        input_quantifications.iloc[:, int(input_quantifications.shape[1]/2):].T
    )
    proteins_quant = input_quantifications.iloc[:, :int(input_quantifications.shape[1]/2)].T
    print(f"🔬 Proteins quantifications columns: {proteins_quant.iloc[:,:10].columns.tolist()}")
    
    # Imputation with configurable parameters
    prot_quant_imputed = prep.impute_normal_down_shift_distribution(
        proteins_quant, 
        width=IMPUTATION_WIDTH, 
        downshift=IMPUTATION_DOWNSHIFT, 
        seed=IMPUTATION_SEED
    )
    na_columns = prot_quant_imputed.isna().any()
    na_columns_true = na_columns[na_columns].index.tolist()
    print("🚨 Proteins with empty values:", na_columns_true)

    # Cleaning sample names
    prot_quant_imputed.reset_index(inplace=True)
    prot_quant_imputed.rename(columns={'index': SAMPLES_COLUMN}, inplace=True)
    prot_quant_imputed[SAMPLES_COLUMN] = prot_quant_imputed[SAMPLES_COLUMN].str.replace('pat_', '').str.strip()
    
    # Dataset with protein intensities and metadata
    input_metadata['TCC'] = input_metadata['TCC_Bioinfo'].fillna(input_metadata['Tumor cell content'])
    samples_metadata = input_metadata[[SAMPLES_COLUMN, CLASSIFIED_BY, 'TCC', 'TCC GROUP']]
    samples_metadata[SAMPLES_COLUMN] = samples_metadata[SAMPLES_COLUMN].str.strip()
    initial_df = samples_metadata.merge(prot_quant_imputed, on=SAMPLES_COLUMN, how='left')
    
    # Peptides quantification to binary dataset
    peptides_df_binary = pd.DataFrame(
        np.where(peptides_quant_info > 1, 1, 0),
        index=peptides_quant_info.index,
        columns=peptides_quant_info.columns  
    )
    peptides_df_binary.reset_index(inplace=True)
    peptides_df_binary.replace('Identification metadata ', '', regex=True, inplace=True)
    peptides_df_binary['index'] = peptides_df_binary['index'].str.strip()
    peptides_df_binary['index'] = peptides_df_binary['index'].str.strip()
    peptides_df_binary2 = samples_metadata.merge(peptides_df_binary, left_on=SAMPLES_COLUMN, right_on='index')
    peptides_df_binary2.drop('index', axis=1, inplace=True)
    
    print(f"🧬 Peptides binary dataframe shape: {peptides_df_binary.shape}")
    
    # Process Z-scores
    z_scores_df = df_z_scores.transpose(copy=True) 
    print(f"📈 Z-scores dataframe shape before processing: {z_scores_df.shape}")
    z_scores_df = z_scores_df.reset_index()
    z_scores_df = z_scores_df.replace('zscore_','', regex=True) 
    z_scores_df.rename(columns = z_scores_df.iloc[0], inplace=True)
    z_scores_df.drop(axis=0, index=0, inplace=True)
    z_scores_df['Gene names'] = z_scores_df.iloc[:,0].str.replace('pat_', '')
    z_scores_df = z_scores_df.set_index('Gene names') 
    
    z_scores_imputed = prep.impute_normal_down_shift_distribution(
        z_scores_df,
        width=IMPUTATION_WIDTH, 
        downshift=IMPUTATION_DOWNSHIFT, 
        seed=IMPUTATION_SEED
    )
    z_scores_imputed.reset_index(inplace=True)
    z_scores_imputed.rename(columns={'Gene names': SAMPLES_COLUMN}, inplace=True)
    z_scores_imputed[SAMPLES_COLUMN] = z_scores_imputed[SAMPLES_COLUMN].str.strip()    
    z_scores_initial_df = samples_metadata.merge(z_scores_imputed, on=SAMPLES_COLUMN, how='left')
    
    print(f"📈 Z-scores initial dataframe shape: {z_scores_initial_df.shape}")
    print("✅ Preprocessing completed successfully")
    
    return initial_df, peptides_df_binary, z_scores_initial_df, z_scores_imputed, peptides_df_binary2

# Execute preprocessing
initial_df, peptides_df_binary, z_scores_initial_df, z_scores_imputed, peptides_df_binary2 = preprocess_data(
    input_quantifications, df_z_scores, input_metadata
)

# Display basic info about processed data
print(f"\n📋 Data Summary:")
print(f"Initial dataframe shape: {initial_df.shape}")
print(f"Classes in dataset: {initial_df[CLASSIFIED_BY].value_counts().to_dict()}")
 


Preprocessing data...
🔬 Proteins quantifications columns: ['SYMPK', 'NUP160', 'FARP1', 'UPF1', 'IGBP1', 'PSMA1', 'COL6A2', 'TXLNA', 'POGLUT1', 'EFTUD2']
(2586, 13069)
🚨 Proteins with empty values: ['ENPP7', 'SHOX2', 'CRYGA', 'HNRNPCL3;HNRNPCL4', 'MYBPHL']
🧬 Peptides binary dataframe shape: (2586, 13070)
📈 Z-scores dataframe shape before processing: (1999, 13069)
(1998, 13069)
📈 Z-scores initial dataframe shape: (1998, 13073)
✅ Preprocessing completed successfully

📋 Data Summary:
Initial dataframe shape: (1998, 13073)
Classes in dataset: {'BRCA': 238, 'CUPNOS': 112, 'CHDM': 102, 'SYNS': 84, 'LMS': 77, 'SARCNOS': 63, 'ACYC': 62, 'SFT': 52, 'MFH': 51, 'ARMS': 46, 'ES': 45, 'ACC': 39, 'IHCH': 35, 'OS': 35, 'ERMS': 29, 'DSRCT': 28, 'DDLS': 27, 'MRLS': 27, 'ULMS': 25, 'ASPS': 23, 'DIFG': 23, 'PANET': 21, 'PAAD': 21, 'MEL': 21, 'SDCA': 21, 'THYM': 19, 'COAD': 19, 'MPNST': 18, 'CHS': 16, 'LGFMS': 16, 'EPIS': 16, 'THYC': 16, 'GIST': 15, 'EHAE': 14, 'UM': 14, 'CCS': 14, 'ANGS': 13, 'READ': 13, 

In [11]:
initial_df['Sample name'][initial_df['Sample name'].str.contains('H021-CF522-T1-Q1')]

560    H021-CF522-T1-Q1
Name: Sample name, dtype: object


## Cell 7: Data Splitting


In [8]:

 
def split_data(initial_df, z_scores_initial_df):
    """Split data into training and held-out sets"""
    print("="*80)
    print("Splitting data...")
    print("="*80)
    
    nos_cases = initial_df[initial_df[CLASSIFIED_BY].str.endswith('NOS', na=False)][CLASSIFIED_BY].unique().tolist()
    cases_to_remove = nos_cases + OTHER_CASES
    print(f"🗑️ Removing undefined cases: {cases_to_remove}")

    # Removing samples not part of the train-validation set
    ml_initial_df = (
        initial_df
        .pipe(prep.remove_class, cases_to_remove, CLASSIFIED_BY, output_dir)
        .pipe(prep.remove_class, ['very low', 'missing'], 'TCC GROUP', output_dir)
        .loc[lambda df: df['TCC GROUP'].notna()]
    )

    # Splitting dataset into training and held-out sets
    training_df, held_out_df = prep.data_split(
        ml_initial_df,
        output_directory=output_dir, 
        split_size=SPLIT_SIZE, 
        classified_by=CLASSIFIED_BY, 
        export=True,
    )
    
    # Z_scores dataset
    z_scores_train_df = z_scores_initial_df[z_scores_initial_df['Sample name'].isin(training_df['Sample name'])]
    z_scores_test_df = z_scores_initial_df[z_scores_initial_df['Sample name'].isin(held_out_df['Sample name'])]

    print(f"Samples match between Z-score and intesntity dataset: {set(training_df['Sample name']) == set(z_scores_train_df['Sample name'])}")
    print(f"🎯 Training set size: {training_df.shape}")
    print(f"🎯 Held-out set size: {held_out_df.shape}")
    print(f"📈 Z-scores training set size: {z_scores_train_df.shape}")
    print("✅ Data splitting completed")
    
    return training_df, held_out_df, z_scores_train_df, z_scores_test_df

# Execute data splitting
training_df, held_out_df, z_scores_train_df, z_scores_test_df = split_data(initial_df, z_scores_initial_df)

# Display split information
print(f"\n📊 Split Summary:")
print(f"Training classes: {training_df[CLASSIFIED_BY].value_counts().to_dict()}")
print(f"Held-out classes: {held_out_df[CLASSIFIED_BY].value_counts().to_dict()}")
 


Splitting data...
🗑️ Removing undefined cases: ['CUPNOS', 'NETNOS', 'SARCNOS', 'SCCNOS', 'RCSNOS', 'NECNOS', 'MBLNOS', 'missing']
Removed samples: 206
Remaining samples: 1792
Removed samples: 164
Remaining samples: 1628
Classes with only one sample: 66
Training set samples: 1159
Held-out set samples: 469
Samples match between Z-score and intesntity dataset: True
🎯 Training set size: (1159, 13073)
🎯 Held-out set size: (469, 13073)
📈 Z-scores training set size: (1159, 13073)
✅ Data splitting completed

📊 Split Summary:
Training classes: {'BRCA': 156, 'CHDM': 65, 'SYNS': 53, 'LMS': 50, 'ACYC': 41, 'SFT': 36, 'MFH': 32, 'ES': 32, 'ARMS': 32, 'ACC': 26, 'IHCH': 22, 'OS': 22, 'ERMS': 20, 'ULMS': 17, 'DSRCT': 17, 'DDLS': 15, 'PANET': 15, 'MRLS': 15, 'DIFG': 14, 'PAAD': 14, 'MEL': 13, 'SDCA': 13, 'ASPS': 13, 'LGFMS': 11, 'MPNST': 11, 'GIST': 11, 'CHS': 11, 'COAD': 11, 'THYC': 11, 'EPIS': 10, 'READ': 9, 'CCS': 9, 'UM': 9, 'DFSP': 8, 'BA': 8, 'RMS': 8, 'GINET': 8, 'THYM': 7, 'PLEMESO': 7, 'ANGS'

In [27]:
training_df['Sample name'][training_df['Sample name'].str.contains('H021-CF522-T1-Q1')]

Series([], Name: Sample name, dtype: object)


## Cell 8: Class-Specific Workflow


In [None]:
def class_specific_workflow(training_df, held_out_df, z_scores_train_df, peptides_df_binary):
    """Execute class-specific workflow for specified classification"""
    print("="*80)
    print(f"Starting class-specific workflow for {TARGET_CLASS}...")
    print("="*80)
    
    # Obtaining high confidence proteins by peptides
    target_proteins_by_peptides = fs.get_high_confidence_proteins(
        peptides_df_binary, TARGET_CLASS, CLASSIFIED_BY, threshold=HIGH_CONFIDENCE_THRESHOLD
    )
    
    print(f"🔬 Found {len(target_proteins_by_peptides)} high confidence proteins")
    
    # Binary labeling for specific class classification 
    target_training_df = fs.binary_labeling(training_df, classified_by=CLASSIFIED_BY, true_class=TARGET_CLASS)
    target_ho_df = fs.binary_labeling(held_out_df, classified_by=CLASSIFIED_BY, true_class=TARGET_CLASS)
    target_z_scores_train_df = fs.binary_labeling(z_scores_train_df, classified_by=CLASSIFIED_BY, true_class=TARGET_CLASS)
    
    # 1st Filter - Filtering training and held-out dataframes by proteins with peptides
    target_training_df = target_training_df.filter(items=[SAMPLES_COLUMN, CLASSIFIED_BY, 'Classifier'] + target_proteins_by_peptides)
    target_ho_df = target_ho_df.filter(items=[SAMPLES_COLUMN, CLASSIFIED_BY, 'Classifier'] + target_proteins_by_peptides)
    target_z_scores_train_df = target_z_scores_train_df.filter(items=[SAMPLES_COLUMN, CLASSIFIED_BY, 'Classifier'] + target_proteins_by_peptides)

    print(f"📊 Filtered training set shape: {target_training_df.shape}")
    print(f"📊 Filtered held-out set shape: {target_ho_df.shape}")
    print(f"📈 Filtered z-scores training set shape: {target_z_scores_train_df.shape}")
    
    # Display class distribution
    print(f"\n🎯 Binary Classification Distribution:")
    print(f"Training: {target_training_df['Classifier'].value_counts().to_dict()}")
    print(f"Held-out: {target_ho_df['Classifier'].value_counts().to_dict()}")
    
    print("✅ Class-specific workflow completed")
    
    return target_training_df, target_ho_df, target_z_scores_train_df

# Execute class-specific workflow
target_training_df, target_ho_df, target_z_scores_train_df = class_specific_workflow(
    training_df, held_out_df, z_scores_train_df, peptides_df_binary
)
 



## Cell 9: Feature Selection


In [None]:
def feature_selection_workflow(target_z_scores_train_df):
    """Perform feature selection using ElasticNet"""
    print("="*80)
    print("Starting feature selection...")
    print("="*80)
    print(f"🔧 Using L1 ratios: {FEATURE_SELECTION_L1_RATIOS}")
    print(f"🔧 Using C values: {FEATURE_SELECTION_C_VALUES}")
    
    # Hyperparameters for ElasticNet
    print("-"*80)
    print("Defining hyperparameters for ElasticNet...")

    try:
        target_cv_results, target_best_params, target_best_score, target_grid_search_obj = fs.hparameter_grid_search(
            target_z_scores_train_df, GRID_SEARCH_N_SPLITS, FEATURE_SELECTION_L1_RATIOS, FEATURE_SELECTION_C_VALUES, classified_by=CLASSIFIED_BY
        )
        print(f"✅ Best parameters found: {target_best_params}")
        print(f"🎯 Best CV score: {target_best_score:.4f}")
        
    except Exception as e:
        print(f"⚠️ Warning: Hyperparameter search failed: {e}")
        print("Using configured default parameters...")
        target_best_params = {'l1_ratio': ELNET_L1_RATIO, 'C': ELNET_C_VALUE}

    # Feature Selection by ElasticNet Cross-Validation
    print("-"*80)
    print("Selecting features...")

    try:
        class_name = "_".join(TARGET_CLASS)
        target_cross_val_coeffs = fs.elnet_wrapper(
            target_z_scores_train_df, 
            classified_by=CLASSIFIED_BY, 
            tumor_type_name=f'{class_name}_features', 
            l1_ratio=target_best_params.get('l1_ratio'), 
            C=target_best_params.get('C'), 
            output_directory=output_dir,
            n_splits=ELNET_N_SPLITS, 
            n_repeats=ELNET_N_REPEATS, 
            n_jobs=ELNET_N_JOBS, 
            export=True
        )
        
        target_stats, target_proteins = fs.statistic_from_coefficients(target_cross_val_coeffs, TARGET_CLASS, output_dir)
        print(f"✅ Selected {len(target_proteins)} protein features")
        print(f"🔬 Top 10 selected proteins: {target_proteins[:10]}")
        
        return target_proteins, target_cross_val_coeffs, target_stats
        
    except Exception as e:
        print(f"❌ Feature selection failed: {e}")
        raise

# Execute feature selection
target_proteins, target_cross_val_coeffs, target_stats = feature_selection_workflow(target_z_scores_train_df)
 



## Cell 10: Model Fitting and Evaluation


In [9]:
ACYC_old = ['FABP7', 'IRF6', 'ITGA9', 'LAMC3', 'RFLNA', 'KRT23', 'AADAC', 'COBL',
       'TGFBR3', 'LAMC2', 'FOLR1', 'SOX9', 'MYO5B', 'ESRP1', 'MSI1', 'LRRC8D',
       'MET', 'LOXL4', 'DCHS1', 'NPNT', 'LLGL2', 'SPINT1', 'CRTAC1', 'CCDC8',
       'SPINT2', 'SMOC2', 'ZNF574', 'HPD', 'ERRFI1', 'TTC19', 'POR', 'UBE2N',
       'KRT5', 'CRMP1', 'JUP', 'EPPK1', 'SFN']
ACYC_new = ['VIT',
 'ART3',
 'MEGF6',
 'MATN1',
 'BCL2',
 'TRAF3IP3',
 'VTCN1',
 'SOX10',
 'VGLL4',
 'TFAP2C',
 'MIF4GD',
 'NOTCH1',
 'NTN1',
 'ESRP2',
 'GAS2',
 'FRMD4A',
 'ITGA9',
 'MT2A',
 'PDZK1',
 'ANXA8L1',
 'VANGL2',
 'RASA1',
 'NAV2',
 'MMP7',
 'LAMC3',
 'TTC39A',
 'KLC3',
 'RASAL1',
 'ZNF574',
 'CA8',
 'IGSF3',
 'CA13',
 'MCEE',
 'MCUR1',
 'APOF',
 'FABP7',
 'WNK2',
 'TJP3',
 'CLDN7',
 'GRHL2',
 'CELSR1',
 'HPGD',
 'AQP4',
 'MACROD2',
 'MET',
 'SLC34A2',
 'HOPX',
 'MYO5B',
 'PLEKHA7',
 'BSPRY',
 'PHYHD1',
 'STAT3',
 'ETV6',
 'SCEL',
 'SRPK2',
 'COMTD1',
 'PGAP1',
 'HOOK1',
 'FCER1G',
 'AFAP1',
 'STK26',
 'PLSCR4',
 'MYADM',
 'SEPTIN4',
 'COA5',
 'AP1M2',
 'CANT1',
 'TOR1AIP1']

In [33]:
z_target_training_df = fs.binary_labeling(z_scores_train_df, classified_by='code_oncotree', true_class=['ACYC'])
z_target_ho_df = fs.binary_labeling(z_scores_test_df, classified_by='code_oncotree', true_class=['ACYC'])


Number of samples per class:
Classifier
0    1118
1      41
Name: count, dtype: int64


Number of samples per class:
Classifier
0    451
1     18
Name: count, dtype: int64



In [24]:
def model_fitting_workflow(target_training_df, target_ho_df, target_proteins):
    """Fit the final model and evaluate"""
    print("="*80)
    print("Starting model fitting...")
    print("="*80)
    print(f"🔄 Using {NESTED_CV_RANDOM_STATE_TRIES} random state tries for nested CV")
    print(f"🔄 Using {NESTED_CV_N_SPLITS} splits for nested CV")
    
    # Reshaping dataset for training and test
    target_training_fs = fs.reshape_df_for_fitting(target_training_df, target_proteins)
    target_test_fs = fs.reshape_df_for_fitting(target_ho_df, target_proteins)
    
    print(f"📊 Training set shape after feature selection: {target_training_fs.shape}")
    print(f"📊 Test set shape after feature selection: {target_test_fs.shape}")

    # Hyperparameter Selection for Logistic Regression
    try:
        target_nested_cv_results = mf.wrapper_nested_cv(
            target_training_fs, 
            random_state_tries=NESTED_CV_RANDOM_STATE_TRIES, 
            n_splits=NESTED_CV_N_SPLITS, 
            classified_by=CLASSIFIED_BY
        )
        target_nested_hp = mf.nested_cv_hparameters_selection(target_nested_cv_results)
        hyperparameter_C = pd.DataFrame(target_nested_hp).T.sort_values(by='count', ascending=False).index.tolist()[0]
        print(f"🎯 Selected hyperparameter C: {hyperparameter_C}")
    except Exception as e:
        print(f"⚠️ Warning: Hyperparameter selection failed: {e}")
        hyperparameter_C = 1.0  # Default value
        print(f"🔧 Using default hyperparameter C: {hyperparameter_C}")

    # Model Fit
    try:
        target_log_reg_model = mf.logistic_regression_ridge(
            target_training_fs, 
            hyperparameter_C, 
            TARGET_CLASS, 
            CLASSIFIED_BY,
            output_dir,
        )
        
        # Get results
        target_coefficients, target_train_probabilities, target_test_probabilities = mf.logistic_regression_results(
            target_log_reg_model, 
            target_training_fs, 
            target_test_fs,  
            TARGET_CLASS, 
            CLASSIFIED_BY,
            output_dir
        )
        
        # Classification scores
        test_target_scores = mf.classification_scores(target_test_probabilities)
        
        print("✅ Model training and evaluation completed successfully!")
        print(f"📊 Test scores summary:")
        if hasattr(test_target_scores, 'describe'):
            print(test_target_scores.describe())
        
        return target_log_reg_model, target_coefficients, target_train_probabilities, target_test_probabilities, test_target_scores
        
    except Exception as e:
        print(f"❌ Error during model fitting: {e}")
        return None, None, None, None, None

# Execute model fitting
old_model_results = model_fitting_workflow(target_training_df, target_ho_df, ACYC_old)
old_target_log_reg_model, old_target_coefficients, old_target_train_probabilities, old_target_test_probabilities, old_test_target_scores = old_model_results


Starting model fitting...
🔄 Using 10 random state tries for nested CV
🔄 Using 3 splits for nested CV
📊 Training set shape after feature selection: (1159, 41)
📊 Test set shape after feature selection: (469, 41)
• Running for random_state=0
1 Inner fold best parameter={'C': 10}, Score=0.8252, Outer Validation MCC Score: 0.9648

Inner fold model did not converged.
2 Inner fold best parameter={'C': 10}, Score=0.8919, Outer Validation MCC Score: 0.9204

3 Inner fold best parameter={'C': 1}, Score=0.8797, Outer Validation MCC Score: 0.8299

Average MCC across all outer folds: 0.9050

--------------------------------------------------
• Running for random_state=1
1 Inner fold best parameter={'C': 10}, Score=0.8438, Outer Validation MCC Score: 0.8363

2 Inner fold best parameter={'C': 1}, Score=0.8315, Outer Validation MCC Score: 0.8978

3 Inner fold best parameter={'C': 1}, Score=0.8367, Outer Validation MCC Score: 0.9259

Average MCC across all outer folds: 0.8866

--------------------------

In [25]:
old_target_coefficients

Unnamed: 0,FABP7,IRF6,ITGA9,LAMC3,RFLNA,KRT23,AADAC,COBL,TGFBR3,LAMC2,...,KRT5,CRMP1,JUP,EPPK1,SFN,Intercept,F1_1,F1_0,F1_weighted,MCC_score
0,0.981299,-0.206681,2.25885,0.692839,0.501605,0.197962,-0.125862,-0.323243,0.048775,0.03725,...,1.101609,0.061149,0.297008,-0.143167,0.209594,-8.748542,0.837209,0.992179,0.986231,0.841917


In [14]:
def model_fitting_workflow(target_training_df, target_ho_df, target_proteins):
    """Fit the final model and evaluate"""
    print("="*80)
    print("Starting model fitting...")
    print("="*80)
    print(f"🔄 Using {NESTED_CV_RANDOM_STATE_TRIES} random state tries for nested CV")
    print(f"🔄 Using {NESTED_CV_N_SPLITS} splits for nested CV")
    
    # Reshaping dataset for training and test
    target_training_fs = fs.reshape_df_for_fitting(target_training_df, target_proteins)
    target_test_fs = fs.reshape_df_for_fitting(target_ho_df, target_proteins)
    
    print(f"📊 Training set shape after feature selection: {target_training_fs.shape}")
    print(f"📊 Test set shape after feature selection: {target_test_fs.shape}")

    # Hyperparameter Selection for Logistic Regression
    try:
        target_nested_cv_results = mf.wrapper_nested_cv(
            target_training_fs, 
            random_state_tries=NESTED_CV_RANDOM_STATE_TRIES, 
            n_splits=NESTED_CV_N_SPLITS, 
            classified_by=CLASSIFIED_BY
        )
        target_nested_hp = mf.nested_cv_hparameters_selection(target_nested_cv_results)
        hyperparameter_C = pd.DataFrame(target_nested_hp).T.sort_values(by='count', ascending=False).index.tolist()[0]
        print(f"🎯 Selected hyperparameter C: {hyperparameter_C}")
    except Exception as e:
        print(f"⚠️ Warning: Hyperparameter selection failed: {e}")
        hyperparameter_C = 1.0  # Default value
        print(f"🔧 Using default hyperparameter C: {hyperparameter_C}")

    # Model Fit
    try:
        target_log_reg_model = mf.logistic_regression_ridge(
            target_training_fs, 
            hyperparameter_C, 
            TARGET_CLASS, 
            CLASSIFIED_BY,
            output_dir,
        )
        
        # Get results
        target_coefficients, target_train_probabilities, target_test_probabilities = mf.logistic_regression_results(
            target_log_reg_model, 
            target_training_fs, 
            target_test_fs,  
            TARGET_CLASS, 
            CLASSIFIED_BY,
            output_dir
        )
        
        # Classification scores
        test_target_scores = mf.classification_scores(target_test_probabilities)
        
        print("✅ Model training and evaluation completed successfully!")
        print(f"📊 Test scores summary:")
        if hasattr(test_target_scores, 'describe'):
            print(test_target_scores.describe())
        
        return target_log_reg_model, target_coefficients, target_train_probabilities, target_test_probabilities, test_target_scores
        
    except Exception as e:
        print(f"❌ Error during model fitting: {e}")
        return None, None, None, None, None

# Execute model fitting
model_results = model_fitting_workflow(target_training_df, target_ho_df, ACYC_new)
target_log_reg_model, target_coefficients, target_train_probabilities, target_test_probabilities, test_target_scores = model_results
 


Starting model fitting...
🔄 Using 10 random state tries for nested CV
🔄 Using 3 splits for nested CV
📊 Training set shape after feature selection: (1159, 72)
📊 Test set shape after feature selection: (469, 72)
• Running for random_state=0
1 Inner fold best parameter={'C': 1}, Score=0.8915, Outer Validation MCC Score: 1.0000

2 Inner fold best parameter={'C': 1}, Score=0.9331, Outer Validation MCC Score: 0.9204

3 Inner fold best parameter={'C': 1}, Score=0.9074, Outer Validation MCC Score: 0.9648

Average MCC across all outer folds: 0.9617

--------------------------------------------------
• Running for random_state=1
1 Inner fold best parameter={'C': 0.1}, Score=0.9339, Outer Validation MCC Score: 0.9329

2 Inner fold best parameter={'C': 1}, Score=0.9315, Outer Validation MCC Score: 0.9623

3 Inner fold best parameter={'C': 1}, Score=0.9261, Outer Validation MCC Score: 1.0000

Average MCC across all outer folds: 0.9651

--------------------------------------------------
• Running fo

In [32]:
target_test_probabilities[target_test_probabilities['Classifier'] == 1]

Unnamed: 0,Sample name,code_oncotree,Classifier,Probability
54,H021-21A6R8-M1-E2,ACYC,1,0.999954
188,H021-52PSUG-M1-Q1,ACYC,1,0.999681
201,H021-5AU8H4-M2-E1,ACYC,1,0.999952
227,H021-5U1R7Q-T3-E2,ACYC,1,0.999789
272,H021-6KGVYB-T1-E2,ACYC,1,0.999906
283,H021-6RWA6L-M1-E1,ACYC,1,0.999983
332,H021-7SZUP1-T2-E1,ACYC,1,0.999949
510,H021-BK9AH6-T1-E2,ACYC,1,0.644141
654,H021-ERYP5D-M1-E2,ACYC,1,0.908075
878,H021-JJEMQJ-M3-E1,ACYC,1,0.999997


In [None]:
len(set(target_log_reg_model.feature_names_in_)) 
len(set(target_log_reg_model.feature_names_in_) & set(ACYC_old))
len(set(set(ACYC_old))

37

In [74]:
set(target_log_reg_model.feature_names_in_) & set(ACYC_old)

{'FABP7', 'ITGA9', 'LAMC3', 'MET', 'MYO5B', 'ZNF574'}

In [30]:
initial_df[ACYC_old].describe().T.sort_values(by='mean', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
POR,1998.0,9.082996,0.316002,7.96698,8.873387,9.024665,9.226168,10.4734
SFN,1998.0,8.753805,0.770487,6.71793,8.152268,8.676825,9.298307,11.243
KRT5,1998.0,8.64038,1.060702,5.993099,7.913428,8.35241,9.106713,11.9521
UBE2N,1998.0,8.535807,0.482642,7.43306,8.16576,8.41375,8.970282,9.82665
JUP,1998.0,8.532222,0.583603,7.06163,8.065532,8.507575,8.959535,10.5183
EPPK1,1998.0,8.510581,0.737045,6.70449,7.899172,8.479645,9.10627,10.5661
CRMP1,1998.0,8.164377,0.741755,6.61175,7.615742,8.012055,8.66244,10.2516
CRTAC1,1998.0,7.327832,0.509665,5.74141,7.050445,7.290125,7.5994,10.1822
SPINT1,1998.0,7.219318,0.845922,4.52812,6.390439,7.242695,8.002455,8.81805
LLGL2,1998.0,7.18409,0.645929,5.43548,6.558554,7.22125,7.720295,8.74218


#### Separate normalization Try

In [35]:
target_training_df = fs.binary_labeling(training_df, classified_by='code_oncotree', true_class=['ACYC'])
target_ho_df = fs.binary_labeling(held_out_df, classified_by='code_oncotree', true_class=['ACYC'])


Number of samples per class:
Classifier
0    1118
1      41
Name: count, dtype: int64


Number of samples per class:
Classifier
0    451
1     18
Name: count, dtype: int64



In [49]:
target_training_df#[ACYC_new]

Unnamed: 0,Sample name,code_oncotree,Classifier,TCC,TCC GROUP,SYMPK,NUP160,FARP1,UPF1,IGBP1,...,LRRC2,ZP4,TRIML2,TBX19,DNAI3,CSF3R,MSANTD3,SLITRK3,MEIOC,SIGLEC14
1,A26K-5SXQR3-T24-Q1,BRCA,0,42.0,intermediate,8.57806,8.72228,8.20552,9.10957,8.12756,...,5.916328,5.621991,2.777706,4.508408,5.424421,7.781280,5.362418,4.731108,6.115551,5.842130
2,A26K-9TET1N-T11-Q1,BRCA,0,52.0,intermediate,8.51226,8.57632,8.59788,9.22791,7.67448,...,5.869186,5.963717,2.909937,4.470184,5.502500,6.184025,5.260068,6.992130,6.108674,5.932879
3,A26K-ADUQXR-T11-Q1,BRCA,0,53.0,intermediate,8.40461,8.67921,8.51353,9.13640,7.82232,...,5.866144,5.549171,3.197355,4.661765,5.787210,7.341060,5.381826,5.046227,6.126885,5.722657
4,A26K-HS3BDB-T14-Q1,BRCA,0,80.0,high,8.47792,8.60176,8.34986,9.12525,8.13908,...,5.882700,5.870433,2.888137,4.295576,5.567577,6.084543,5.554687,4.623281,6.264008,6.301680
5,A26K-NBD9LE-T22-Q1,BRCA,0,61.0,intermediate,8.69256,8.81846,8.03475,9.32584,7.92077,...,5.899810,5.202308,2.966257,4.664798,6.836950,5.973390,5.292103,4.962125,6.086051,5.866266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1622,S033-33+026-T1-Q1,CHDM,0,80.0,high,8.25469,8.50641,8.34203,8.98411,8.16078,...,5.956619,5.425871,4.080696,4.291549,5.827766,8.326710,5.309285,4.430149,6.088870,5.794342
1623,S033-33+029-T1-Q1,CHDM,0,70.0,high,8.25754,8.43530,8.52874,9.00228,8.01648,...,5.990435,5.546583,3.584468,4.782279,6.001073,8.339490,5.241708,4.589465,6.096099,5.927313
1624,S033-33+030-T1-Q1,CHDM,0,80.0,high,8.53470,8.58667,8.40193,9.14908,8.12346,...,5.860580,5.692487,3.070138,4.601958,5.662872,7.108060,5.338191,4.331348,6.077551,5.885792
1625,S033-33+035-T1-Q1,CHDM,0,,notdefined,8.45643,8.71756,8.42200,9.15144,7.98480,...,5.993472,5.653452,3.426112,4.704912,5.741976,7.408820,5.499131,4.848742,6.040955,5.950844


In [67]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_train = scaler.fit_transform(target_training_df[ACYC_new])
scaled_train = pd.DataFrame(scaled_train, columns=ACYC_new, index=target_training_df.index)

scaled_test = scaler.transform(target_ho_df[ACYC_new])
scaled_test = pd.DataFrame(scaled_test, columns=ACYC_new, index=target_ho_df.index)

In [68]:
#scaled_train = scaled_train.merge(target_training_df[['Sample name', 'code_oncotree', 'Classifier', 'TCC', 'TCC GROUP']], left_index=True, right_index=True)
#scaled_test = scaled_test.merge(target_ho_df[['Sample name', 'code_oncotree', 'Classifier', 'TCC', 'TCC GROUP']], left_index=True, right_index=True)



scaled_train = pd.concat([target_training_df[['Sample name', 'code_oncotree', 'Classifier', 'TCC', 'TCC GROUP']], scaled_train], axis=1)
scaled_test = pd.concat([target_ho_df[['Sample name', 'code_oncotree', 'Classifier', 'TCC', 'TCC GROUP']], scaled_test], axis=1)
scaled_test

Unnamed: 0,Sample name,code_oncotree,Classifier,TCC,TCC GROUP,VIT,ART3,MEGF6,MATN1,BCL2,...,FCER1G,AFAP1,STK26,PLSCR4,MYADM,SEPTIN4,COA5,AP1M2,CANT1,TOR1AIP1
0,A26K-5SXQR3-T11-Q1,BRCA,0,30.0,low,-0.824851,-0.129040,-0.451441,-0.184433,-0.981578,...,-1.642551,0.276189,-0.120562,0.383366,0.197756,-0.098379,-1.171189,1.333678,1.970547,-0.371477
12,H021-192LV1-M2,DDLS,0,58.0,intermediate,-0.271287,-0.299569,-0.535320,0.100341,-0.738649,...,0.279278,1.177985,0.206072,-1.056758,-1.472929,-0.318754,-1.257405,-0.763001,0.364338,-0.405415
13,H021-1AJ4XJ-M4-E2,PADA,0,86.0,high,1.177807,0.769600,1.646426,1.374043,0.577825,...,1.106766,-0.726487,0.143615,1.244205,0.997779,0.215261,1.400323,1.647478,0.607649,0.218175
20,H021-1GB4TR-M1,NSGCT,0,45.0,intermediate,-1.049524,-0.213147,-1.043140,-0.351623,-0.892661,...,0.217886,0.576811,0.985511,-1.115552,0.888050,-0.269638,-0.573935,1.171830,0.714788,0.243712
21,H021-1GLAY6-T3,DDLS,0,20.0,low,-0.086123,-0.860246,1.384129,-0.494938,0.864456,...,0.452376,-1.100289,1.203509,-0.961462,1.508515,-0.212624,-1.686883,0.271183,1.257845,-1.786908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,S033-33+018-T1-Q1,CHDM,0,80.0,high,-0.895798,-0.427982,-0.454901,0.674410,-0.851987,...,0.885410,-0.589416,-1.412824,-0.960706,0.135106,-0.252618,-0.796648,-1.206986,-0.350941,-1.022870
1616,S033-33+020-T1-Q1,CHDM,0,60.0,intermediate,-0.293916,-0.838117,-0.526443,-0.514142,-1.042708,...,0.780616,0.429232,0.744304,-0.975559,0.295228,1.225773,-1.184550,-1.278906,0.917027,-0.103514
1617,S033-33+021-T1-Q1,CHDM,0,80.0,high,0.239940,0.452668,1.368181,3.246084,-0.628924,...,-1.081389,0.887385,-0.598678,-0.962086,-0.366243,-1.681244,-0.787904,-0.962008,0.848789,0.433685
1621,S033-33+025-T1-Q1,CHDM,0,70.0,high,-0.645462,-0.326541,-0.706010,-0.883635,-0.452019,...,0.783407,1.412595,-0.571193,-1.178058,0.736798,0.408870,-0.933539,0.098284,0.177260,-0.192945


In [69]:
scaled_model_results = model_fitting_workflow(scaled_train, scaled_test, ACYC_new)
scaled_target_log_reg_model, scaled_target_coefficients, scaled_target_train_probabilities, scaled_target_test_probabilities, scaled_test_target_scores = scaled_model_results

Starting model fitting...
🔄 Using 10 random state tries for nested CV
🔄 Using 3 splits for nested CV
📊 Training set shape after feature selection: (1159, 72)
📊 Test set shape after feature selection: (469, 72)
• Running for random_state=0
1 Inner fold best parameter={'C': 10}, Score=0.8827, Outer Validation MCC Score: 0.8931

2 Inner fold best parameter={'C': 1}, Score=0.9342, Outer Validation MCC Score: 0.8855

3 Inner fold best parameter={'C': 1}, Score=0.8937, Outer Validation MCC Score: 0.9648

Average MCC across all outer folds: 0.9145

--------------------------------------------------
• Running for random_state=1
1 Inner fold best parameter={'C': 1}, Score=0.9108, Outer Validation MCC Score: 1.0000

2 Inner fold best parameter={'C': 1}, Score=0.9315, Outer Validation MCC Score: 0.9285

3 Inner fold best parameter={'C': 1}, Score=0.9439, Outer Validation MCC Score: 0.9259

Average MCC across all outer folds: 0.9514

--------------------------------------------------
• Running for


## Cell 11: Results Visualization


In [None]:

 
def generate_visualizations(initial_df, test_target_scores, target_proteins):
    """Generate and save graphs for results exploration"""
    print("="*80)
    print("Generating visualizations...")
    print("="*80)
    
    try:
        # UMAP plot
        print("📊 Creating UMAP plot...")
        UMAP_plot = grph.create_umap_plot(
            df=initial_df, 
            output_directory=output_dir,
            feature_columns=target_proteins, 
            color_column=CLASSIFIED_BY, 
            metadata_cols=[SAMPLES_COLUMN, CLASSIFIED_BY, 'TCC GROUP'],
            n_neighbors=5,
        )
        
        # TCC vs Probability plot
        print("📈 Creating TCC vs Probability plot...")
        TCC_plot = grph.plot_tcc_vs_probability(initial_df, test_target_scores, output_dir)
        
        print("✅ Visualizations generated successfully!")
        return TCC_plot, UMAP_plot
        
    except Exception as e:
        print(f"⚠️ Warning: Visualization generation failed: {e}")
        return None, None

# Generate visualizations (only if model was successful)
if target_log_reg_model is not None:
    TCC_plot, UMAP_plot = generate_visualizations(initial_df, test_target_scores, target_proteins)
else:
    print("⚠️ Skipping visualizations due to model fitting errors")
 



## Cell 12: Final Summary and Results


In [13]:

 
def display_final_summary():
    """Display final summary of the workflow"""
    print("#" * 80)
    print("🏁 FINAL WORKFLOW SUMMARY")
    print("#" * 80)
    
    if target_log_reg_model is not None:
        print("✅ CLASSIFIER WORKFLOW COMPLETED SUCCESSFULLY!")
        print(f"🎯 Target class: {TARGET_CLASS}")
        print(f"🔬 Selected {len(target_proteins)} protein features")
        print(f"📁 Results exported to: {output_dir}")
        
        # Display some key metrics if available
        if test_target_scores is not None and not test_target_scores.empty:
            print(f"\n📊 Model Performance Summary:")
            if 'accuracy' in test_target_scores.columns:
                print(f"   Average Accuracy: {test_target_scores['accuracy'].mean():.4f}")
            if 'roc_auc' in test_target_scores.columns:
                print(f"   Average ROC AUC: {test_target_scores['roc_auc'].mean():.4f}")
                
        print(f"\n📋 Top 10 Selected Features:")
        print(target_proteins[:10] if len(target_proteins) >= 10 else target_proteins)
        
    else:
        print("❌ CLASSIFIER WORKFLOW COMPLETED WITH ERRORS")
        print("Please check the error messages in previous cells")
    
    print("#" * 80)

# Display final summary
display_final_summary()
 


################################################################################
🏁 FINAL WORKFLOW SUMMARY
################################################################################
✅ CLASSIFIER WORKFLOW COMPLETED SUCCESSFULLY!
🎯 Target class: ['ACYC']


NameError: name 'target_proteins' is not defined


## Cell 13: Optional - Detailed Results Exploration


In [None]:

 
# Optional cell for detailed exploration of results
print("="*80)
print("DETAILED RESULTS EXPLORATION")
print("="*80)

if target_log_reg_model is not None:
    print("📊 Available results for exploration:")
    print("- target_log_reg_model: Fitted logistic regression model")
    print("- target_coefficients: Model coefficients")
    print("- target_train_probabilities: Training set predictions")
    print("- target_test_probabilities: Test set predictions") 
    print("- test_target_scores: Performance metrics")
    print("- target_proteins: Selected protein features")
    print("- target_stats: Feature selection statistics")
    
    # Example explorations you can run:
    print(f"\n🔍 Quick exploration examples:")
    print(f"Model coefficients shape: {target_coefficients.shape if target_coefficients is not None else 'N/A'}")
    print(f"Test probabilities shape: {target_test_probabilities.shape if target_test_probabilities is not None else 'N/A'}")
    
    # Show feature importance if coefficients available
    if target_coefficients is not None and len(target_coefficients) > 0:
        print(f"\n🏆 Top 5 most important features (by absolute coefficient):")
        coeff_series = pd.Series(target_coefficients, index=target_proteins)
        top_features = coeff_series.abs().sort_values(ascending=False).head()
        for feature, coeff in top_features.items():
            print(f"   {feature}: {coeff:.4f}")

print("\n💡 You can now explore the results interactively in subsequent cells!")
 


### Key variables
- `initial_df`: Original processed dataset
- `training_df`, `held_out_df`: Split datasets
- `target_proteins`: Selected protein features
- `target_log_reg_model`: Fitted model
- `test_target_scores`: Performance metrics
- `target_coefficients`: Model coefficients
