## Imports

In [1]:
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Make the project's local modules importable: "../src/data" is resolved against the
# current working directory, and the guard avoids duplicate sys.path entries on re-runs.
module_path = str(Path("../src/data").resolve())
if module_path not in sys.path:
    sys.path.append(module_path)

# Local project modules; these imports rely on the sys.path entry added above.
import preprocessing as prep
import feature_selection as fs
import model_fit as mf

# Parent directory of the current working directory.
# NOTE(review): project_root is reassigned to the working directory itself in a later cell — confirm which one is intended.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

In [2]:
# Reload the preprocessing module so that the `prep` alias reflects its current source
# without restarting the kernel.
import importlib
importlib.reload(prep)


<module 'preprocessing' from '/home/lestrada/tumor_type_prediction/src/data/preprocessing.py'>

## Data Import

In [12]:
# --- Protein quantification intensities (CSV) ---------------------------------
processed_data = '2025.01.29_CJ_pancancer_273/'
folder_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
PREPROCESSED_FP_INTENSITY = 'preprocessed_fp_with_ref.csv'
intensity_path_file = f"{folder_path}{processed_data}{PREPROCESSED_FP_INTENSITY}"
input_quantifications = pd.read_csv(intensity_path_file)

# --- Protein quantification z-scores (tab-separated) --------------------------
preprocessed_fp_z_scores = 'full_proteome_measures_z.tsv'
z_scores_path_file = f"{folder_path}{processed_data}{preprocessed_fp_z_scores}"
df_z_scores = pd.read_csv(z_scores_path_file, sep='\t')

# --- Sample metadata (OncoTree classification, Excel) -------------------------
metadata_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/'
metadata_file = 'METADATA_PAN_CANCER_Batch300.xlsx'
the_metadata_file = f"{metadata_path}{metadata_file}"
input_metadata = pd.read_excel(the_metadata_file)



In [4]:
# Notebook outputs are placed beneath the directory the notebook is executed from.
project_root = os.path.abspath(os.getcwd())
output_subdirs = ('data', 'data_output', 'Notebook_output')
output_dir = os.path.join(project_root, *output_subdirs)
output_dir

'/home/lestrada/tumor_type_prediction/notebooks/data/data_output/Notebook_output'

## Data Preprocessing

In [13]:
# Protein and peptide quantification intensities post-processing.
# Promote the first column to the index only while the frame still carries the default
# RangeIndex produced by read_csv. Without this guard, re-running the cell would turn a
# real data column into the index and shift the half/half column split below.
if isinstance(input_quantifications.index, pd.RangeIndex):
    input_quantifications = input_quantifications.set_index(input_quantifications.columns[0])

# The first half of the columns is used as protein measurements and the second half as
# the per-sample peptide information; both are transposed so that samples become rows.
half_width = int(input_quantifications.shape[1] / 2)
peptides_quant_info = prep.post_process_meta_intensities(input_quantifications.iloc[:, half_width:].T)  # clean dataframe from regex characters
proteins_quant = input_quantifications.iloc[:, :half_width].T  # subset protein measurements from dataset

# Imputation of missing protein intensities (normal distribution down-shift method),
# followed by a report of the proteins that still contain missing values.
prot_quant_imputed = prep.impute_normal_down_shift_distribution(proteins_quant)
na_columns = prot_quant_imputed.isna().any()
na_columns_true = na_columns[na_columns].index.tolist()
print("Proteins with  empty values:", na_columns_true)

# Cleaning sample names: move the sample index into a 'Sample name' column and strip the 'pat_' prefix.
prot_quant_imputed = prot_quant_imputed.reset_index().rename(columns={'index': 'Sample name'})
prot_quant_imputed['Sample name'] = prot_quant_imputed['Sample name'].str.replace('pat_', '')

# Dataset with protein intensities and metadata (inner join on the sample name).
samples_metadata = input_metadata[["Sample name", "code_oncotree"]]  # sample name and OncoTree class
initial_df = samples_metadata.merge(prot_quant_imputed, on='Sample name')

# Peptide quantification to binary dataset: 1 where the value is > 1, otherwise 0.
peptides_df_binary = pd.DataFrame(
    np.where(peptides_quant_info > 1, 1, 0),
    index=peptides_quant_info.index,
    columns=peptides_quant_info.columns,
).reset_index()  # moves the index to an 'index' column so it can be used as the merge key
peptides_df_binary = peptides_df_binary.replace('Identification metadata ', '', regex=True)  # removes text from ids
# Merge with the metadata by sample name: sample, classification and binary peptide flags.
peptides_df_binary = (
    samples_metadata
    .merge(peptides_df_binary, left_on='Sample name', right_on='index')
    .drop(columns='index')
)

peptides_df_binary

(2360, 13074)
Proteins with  empty values: ['PTGER4', 'CD19', 'IAPP', 'FOXO4', 'CRYGA', 'HNRNPCL3;HNRNPCL4', 'MYBPHL']


Unnamed: 0,Sample name,code_oncotree,PHB,TBL2,SAA1,CDKN1B,SNRNP70,TARDBP,GALNT6,ADGRL2,...,B4GALNT4,VGLL1,CCDC18,ZDHHC12,EGLN2,GABARAP,GUCY1B2,ATOH1,IL37,MYEOV
0,H021-3RLVZS-T1-Q1,AASTR,1,1,0,1,1,1,1,1,...,0,0,1,0,0,0,0,0,0,0
1,H021-VFM3B1-T1-Q1,AASTR,1,1,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,H021-3RLVZS-T1-Q1-R2,AASTR,1,1,0,1,1,1,0,1,...,0,0,1,0,1,1,0,0,0,0
3,H021-XBLS3R-M1-Q1,AASTR,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,H021-M2MSRE-M1-Q1,ACBC,1,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1839,H021-BYHFHEU-M2-Q1,VMM,1,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1840,H021-VYS51F-M1-Q1,VSC,1,1,1,1,1,1,0,1,...,0,0,0,0,0,1,0,0,0,0
1841,H021-1B7R18-M1-Q1,VSC,1,1,1,1,1,1,0,1,...,0,0,0,0,0,1,0,0,0,0
1842,H021-FUFZFT-T1-Q1,VSC,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Reshape the z-score table: transpose it, move the former column labels into a regular
# column and strip the 'zscore_' prefix from every matching string cell.
z_scores_df = (
    df_z_scores
    .transpose(copy=True)
    .reset_index()
    .replace('zscore_', '', regex=True)
)
# Row 0 supplies the new column labels: apply it as the header, then discard that row.
z_scores_df = z_scores_df.rename(columns=z_scores_df.iloc[0]).drop(index=0)
# After the transposition the 'Gene names' column holds the sample identifiers.
z_scores_df['Gene names'] = z_scores_df['Gene names'].str.replace('pat_', '')
z_scores_df = z_scores_df.set_index('Gene names')

# Impute missing values, then expose the sample identifier as a 'Sample name' column.
z_scores_imputed = prep.impute_normal_down_shift_distribution(z_scores_df)
z_scores_imputed.reset_index(inplace=True)
z_scores_imputed.rename(columns={'Gene names': 'Sample name'}, inplace=True)

# Attach the sample metadata (inner join on the sample name).
z_scores_initial_df = samples_metadata.merge(z_scores_imputed, on='Sample name')
z_scores_initial_df

(1846, 13074)


Unnamed: 0,Sample name,code_oncotree,PHB,TBL2,SAA1,CDKN1B,SNRNP70,TARDBP,GALNT6,ADGRL2,...,B4GALNT4,VGLL1,CCDC18,ZDHHC12,EGLN2,GABARAP,GUCY1B2,ATOH1,IL37,MYEOV
0,H021-3RLVZS-T1-Q1,AASTR,0.570269,-1.954848,-0.108304,0.528920,-0.552978,-1.547589,-0.674703,-1.431098,...,-2.259616,-2.229895,1.711366,-1.893114,-0.914195,3.933296,-1.717553,-1.747781,-2.026128,-1.951108
1,H021-VFM3B1-T1-Q1,AASTR,-0.119897,-0.913546,-2.356393,-0.412755,-0.101366,-0.493058,-0.704860,-1.872432,...,-2.321418,-2.418372,1.284506,-1.944684,-2.061922,1.575512,-1.533850,-2.016380,-1.735505,-2.259470
2,H021-3RLVZS-T1-Q1-R2,AASTR,0.182432,-2.414916,-1.678159,0.802436,-1.726570,-1.213458,-1.876987,-1.315078,...,-1.811154,-2.516052,1.024149,-1.739299,0.406768,1.253458,-2.380925,-2.355762,-1.814242,-1.670028
3,H021-XBLS3R-M1-Q1,AASTR,-0.481549,-1.759738,-0.296455,0.976292,-0.733072,-0.810804,-1.327052,-0.500066,...,-1.982625,-1.891657,-0.934652,-2.052825,-1.745946,0.607305,-1.605360,-1.540036,-2.340112,-2.377034
4,H021-M2MSRE-M1-Q1,ACBC,-0.795462,-0.784477,0.288746,0.830664,1.627005,1.117730,-0.512298,0.071553,...,-2.597361,-2.609782,-1.005164,-1.348821,-0.205083,-1.273310,-1.476473,-1.751050,-2.302141,-1.765846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1839,H021-BYHFHEU-M2-Q1,VMM,0.393200,0.235131,-0.877088,0.801262,0.599496,-0.052041,-1.489120,-1.133797,...,-2.729675,-2.223789,1.170956,-1.411426,-0.330144,0.235525,-1.478528,-1.426303,-1.969946,-1.601865
1840,H021-VYS51F-M1-Q1,VSC,-0.109858,0.177969,0.999149,-0.416473,-0.454592,-0.972266,-1.999528,-2.062077,...,-2.012480,-3.238598,-0.047412,-1.186199,-2.171985,0.395960,-1.350378,-2.050263,-2.349782,-1.399920
1841,H021-1B7R18-M1-Q1,VSC,0.053191,-2.516390,0.668423,-1.445997,-1.593026,0.353921,-1.313306,-1.000417,...,-1.656409,-2.937084,-1.760477,-1.459320,-2.525625,-0.513694,-1.784874,-1.749608,-2.344643,-1.184619
1842,H021-FUFZFT-T1-Q1,VSC,-0.931686,0.048827,1.045210,0.620125,0.288494,-0.249849,1.396934,-0.084231,...,-2.060663,-1.817443,-1.373557,-1.131807,-1.800865,-0.534801,-2.060059,-1.987010,-1.375791,-1.685683


## Data Set Split

In [8]:
# Removing samples not part of the Oncotree classification:
# every class code ending in 'NOS' plus the explicit 'missing' label.
NOS_cases = initial_df[initial_df['code_oncotree'].str.endswith('NOS', na=False)]['code_oncotree'].unique().tolist()
other_cases = ['missing']
cases_to_remove = NOS_cases + other_cases
ml_initial_df = prep.remove_class(initial_df, cases_to_remove, 'code_oncotree')

# Splitting dataset into training and held-out sets
training_df, held_out_df = prep.data_split(ml_initial_df, split_size=0.25, classified_by='code_oncotree', export=False)

# Z-scores dataset for the training samples.
# Rows are matched by 'Sample name' instead of feeding training_df's index labels to
# .iloc: initial_df and z_scores_initial_df come from separate merges, so identical
# positions are not guaranteed to refer to the same sample. Rows are ordered like
# training_df, which reproduces the positional result whenever both frames are aligned.
training_order = {name: position for position, name in enumerate(training_df['Sample name'])}
z_scores_training_df = (
    z_scores_initial_df[z_scores_initial_df['Sample name'].isin(list(training_order))]
    .sort_values('Sample name', key=lambda names: names.map(training_order), kind='stable')
)
if len(z_scores_training_df) != len(training_df):
    print(
        f"Warning: z-score training set has {len(z_scores_training_df)} rows "
        f"but the intensity training set has {len(training_df)} rows"
    )

Removed samples: 203
Remaining samples: 1457
Classes with only one sample: 68
Training set samples: 1109
Held-out set samples: 348


# Class-Specific Workflow

In [9]:
# Set classification parameters shared by the cells below.
target_class = ['CHDM']  # class label(s) passed as `true_class` to the binary labelling step
classified_by = 'code_oncotree'  # name of the column that holds the class label
samples_column = 'Sample name'  # name of the column that holds the sample identifier


In [10]:

# High-confidence proteins for the target class, derived from the binary peptide table.
# NOTE(review): peptides_df_binary is not restricted to the training samples — confirm this is intended.
entity_proteins_by_peptides = fs.get_high_confidence_proteins(
    peptides_df_binary,
    target_class,
    classified_by,
    threshold=0.7,
)

# Binary labelling of each dataset against the target class.
entity_training_df = fs.binary_labeling(training_df, classified_by=classified_by, true_class=target_class)
entity_ho_df = fs.binary_labeling(held_out_df, classified_by=classified_by, true_class=target_class)
entity_z_scores_train_df = fs.binary_labeling(z_scores_training_df, classified_by=classified_by, true_class=target_class)

# 1st filter: keep the identifier/label columns plus the high-confidence proteins
# in the training, held-out and z-score training frames.
columns_to_keep = [samples_column, classified_by, 'Classifier'] + entity_proteins_by_peptides
entity_training_df = entity_training_df.filter(items=columns_to_keep)
entity_ho_df = entity_ho_df.filter(items=columns_to_keep)
entity_z_scores_train_df = entity_z_scores_train_df.filter(items=columns_to_keep)

 28 proteins identified in 70.0% of ['CHDM'] samples

Number of samples per class:
Classifier
0    1030
1      79
Name: count, dtype: int64


Number of samples per class:
Classifier
0    321
1     27
Name: count, dtype: int64


Number of samples per class:
Classifier
0    1030
1      79
Name: count, dtype: int64



## Feature Selection

### Hyperparameters for ElasticNet
Calculated on z-scores

In [11]:
# Grid search over the ElasticNet hyperparameters on the z-score training set.
# The label column comes from the shared `classified_by` setting instead of a repeated
# literal, so this call stays consistent with the other workflow steps.
entity_cv_results, entity_best_params, entity_best_score, entity_grid_search_obj = fs.hparameter_grid_search(
    entity_z_scores_train_df,
    4,
    l1_ratio_list=[0.7, 0.5],
    C_list=[0.1, 1],
    classified_by=classified_by,
)

Grid search completed in 1.78 seconds
Best parameters: {'C': 1, 'l1_ratio': 0.5, 'max_iter': 10000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best score: 0.6773657341657652


### Feature Selection by ElasticNet Cross-Validation

In [12]:
# Repeated cross-validated ElasticNet fit on the z-score training set, using the best
# hyperparameters found by the grid search.
# The tumor type name is derived from `target_class` rather than hard-coded, so the
# exported results are not labelled with the wrong class when the target changes.
entity_cross_val_coeffs = fs.elnet_wrapper(
    entity_z_scores_train_df,
    classified_by=classified_by,
    tumor_type_name='_'.join(target_class),
    l1_ratio=entity_best_params['l1_ratio'],
    C=entity_best_params['C'],
    n_splits=4,
    n_repeats=25,
    n_jobs=16,
    export=True,
)

Running Logistic Regression:   0%|          | 0/25 [00:00<?, ?iteration/s]Exception ignored in: <function ResourceTracker.__del__ at 0x7cf6e258dee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7a0066f8dee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91,

DataFrame exported to: /home/lestrada/tumor_type_prediction/notebooks/data/data_output/BRCA_250812_1030_results/feature_selection/CHDM_coefficients.xlsx


In [13]:
# Derive statistics from the cross-validated coefficients for the target class.
# The second return value (entity_proteins) is passed to fs.reshape_df_for_fitting further below.
entity_stats_try, entity_proteins = fs.statistic_from_coefficients(entity_cross_val_coeffs, target_class)

With  28  folds, the following statistics were obtained, from feature selection:
• Mean MCC score: 0.5628 ± 0.0529

----------------------------------------
• Top 3 proteins with highest coefficients:
           mean       std  Freq  Wald Chi-Square   p-value_corrected  \
TCAF2  1.370146  0.154238   1.0        78.913606            0.000000   
PFAS   0.910248  0.185453   1.0        24.090807            0.000003   
AMPD2  0.661815  0.199735   1.0        10.979075            0.002172   

       Significant  
TCAF2          1.0  
PFAS           1.0  
AMPD2          1.0  

----------------------------------------
• List of significant proteins: ['TCAF2', 'PFAS', 'AMPD2', 'KRT5', 'CCT2', 'MSH6', 'BIN1', 'PPA1', 'INTS3']
• Number of significant proteins: 9

----------------------------------------


## Model Fitting
Calculated on intensities

### Reshaping dataset for training and test

In [14]:
# Reshape the labelled intensity frames (training and held-out) for model fitting,
# passing the proteins returned by the feature-selection step.
# NOTE(review): entity_test_fs is not referenced by any later cell visible in this notebook — confirm it is still needed.
entity_training_fs = fs.reshape_df_for_fitting(entity_training_df, entity_proteins)
entity_test_fs = fs.reshape_df_for_fitting(entity_ho_df, entity_proteins)


### Hyperparameter Selection for Logistic Regression

In [15]:
# Run the nested cross-validation wrapper on the reshaped training set
# (5 random-state tries, 3 splits), then summarise the results per hyperparameter value.
entity_to_nest_cv_results = mf.wrapper_nested_cv(entity_training_fs, random_state_tries=5, n_splits=3, classified_by=classified_by)
entity_to_nest_hp = mf.nested_cv_hparameters_selection(entity_to_nest_cv_results)

• Running for random_state=0
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
1 Inner fold best parameter={'C': 10}, Score=0.5092, Outer Validation MCC Score: 0.4426

Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
2 Inner fold best parameter={'C': 1}, Score=0.4786, Outer Validation MCC Score: 0.4793

Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
3 

In [16]:
# Rank the candidate hyperparameter values by their 'avg' entry (highest first)
# and keep the label of the top-ranked one.
hp_summary = pd.DataFrame(entity_to_nest_hp).T
hp_ranked = hp_summary.sort_values(by='avg', ascending=False)
hyperparameter_C = hp_ranked.index.tolist()[0]

In [17]:
# Display the selected hyperparameter value.
hyperparameter_C

10

### Model Fit

In [18]:
# Fit the ridge logistic regression on the reshaped training set with the selected C.
entity_log_reg_model = mf.logistic_regression_ridge( entity_training_fs, hyperparameter_C, target_class, classified_by=classified_by) 

Model saved as CHDM_log_reg_ridge_model.pkl




In [19]:
# Collect the model coefficients and the train/test probabilities.
# NOTE(review): the held-out frame passed here is entity_ho_df, not the reshaped entity_test_fs — confirm this is what mf.logistic_regression_results expects.
entity_coefficients, entity_train_probabilities, entity_test_probabilities = mf.logistic_regression_results(entity_log_reg_model, entity_training_fs, entity_ho_df, target_class, classified_by=classified_by)

# of Iterations: [10000]
MCC train: 0.47319156771586096
MCC test: 0.4749539249389099
F1 Positive: 0.47058823529411764


In [20]:
# Compute the classification scores from the held-out (test) probabilities.
test_entity_scores = mf.classification_scores(entity_test_probabilities)

------------------------------------
•General Scores:
MCC Score: 0.4749539249389099
F1 Macro: 0.6898395721925134
F1 Micro: 0.8448275862068966
F1 Entity Score: 0.47058823529411764

------------------------------------
•Confusion Matrix:
  TN | FP
[[270  51]
 [  3  24]]
  FN | TP

------------------------------------
•False Positives:
              Sample name code_oncotree  Classifier  Probability  Predicted
0       H021-3RLVZS-T1-Q1         AASTR           0     0.873161        1.0
2       H021-T1B3YD-M1-Q1           ACC           0     0.986838        1.0
7       H021-ADE19T-M1-Q1           ACC           0     0.561760        1.0
16      H021-XQG7LH-T1-Q1          ANGS           0     0.695787        1.0
42         H021-1AZ5GH-T1            BA           0     0.907111        1.0
46     K26K-AKQTJ4-M12-P1          BRCA           0     0.999924        1.0
56     K26K-RTLHQJ-M11-Q1          BRCA           0     0.787424        1.0
68     K26K-YN54AN-M11-Q1          BRCA           0     0

Exception ignored in: <function ResourceTracker.__del__ at 0x76871d785ee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x768675391ee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/

# Graphs

### 3D UMAP with selected features

In [16]:
# Display the merged metadata + imputed protein intensity frame used as input for the UMAP below.
initial_df

Unnamed: 0,Sample name,code_oncotree,PHB,TBL2,SAA1,CDKN1B,SNRNP70,TARDBP,GALNT6,ADGRL2,...,B4GALNT4,VGLL1,CCDC18,ZDHHC12,EGLN2,GABARAP,GUCY1B2,ATOH1,IL37,MYEOV
0,H021-3RLVZS-T1-Q1,AASTR,8.984560,8.023966,6.753402,7.454177,8.925887,8.619639,6.503819,6.634201,...,3.908711,4.893061,8.384738,5.540645,5.610053,8.589079,4.280819,4.804935,5.081195,6.330591
1,H021-VFM3B1-T1-Q1,AASTR,8.718650,8.251288,4.339679,7.100625,9.011852,8.804410,6.485060,6.455182,...,3.716039,5.097237,8.154195,5.523128,4.333116,7.669777,4.017277,4.735047,5.042454,6.344058
2,H021-3RLVZS-T1-Q1-R2,AASTR,8.835052,7.923742,4.993241,7.556858,8.702717,8.678147,5.679800,6.681296,...,3.523717,4.570259,8.013464,5.413037,6.559988,7.543514,3.253285,4.820705,5.485397,6.428895
3,H021-XBLS3R-M1-Q1,AASTR,8.579244,8.066515,6.595761,7.622104,8.891614,8.748704,5.830286,7.012399,...,3.741589,5.271207,6.954656,5.456343,5.011769,7.289927,3.622817,4.718377,4.756875,6.355915
4,H021-M2MSRE-M1-Q1,ACBC,8.458278,8.279497,7.085902,7.567453,9.340441,9.086740,6.604867,7.244211,...,3.617362,5.161070,6.916521,5.458319,6.121038,5.577484,4.061102,4.686711,5.029859,6.337855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1839,H021-BYHFHEU-M2-Q1,VMM,8.916305,8.502172,6.109480,7.556417,9.145065,8.881749,5.766890,6.754905,...,3.986428,5.139966,8.092827,5.487697,6.030887,7.143923,4.014328,4.771435,5.033161,6.388024
1840,H021-VYS51F-M1-Q1,VSC,8.722520,8.489673,7.680883,7.099229,8.944614,8.720405,5.669458,6.378328,...,3.864893,4.952716,7.434742,5.426502,4.768099,7.206934,4.049730,4.749698,5.009383,6.308210
1841,H021-1B7R18-M1-Q1,VSC,8.785227,7.901660,7.403964,6.712853,8.728085,8.952848,6.006462,6.809079,...,3.886330,5.299532,6.508352,5.459801,4.192192,6.851469,3.747775,4.817958,5.058267,6.315013
1842,H021-FUFZFT-T1-Q1,VSC,8.405800,8.461433,7.719436,7.488420,9.085873,8.847058,7.791974,7.181439,...,3.604483,5.250035,6.717356,5.476761,4.972328,6.843180,4.012043,4.593288,5.307830,6.394320


In [18]:
# Protein columns used as input features for the 3D UMAP below.
CHDM_proteins = [
    'SCARA5', 'TRIL', 'OLFML2A', 'SUSD5', 'PDE1A', 'KRT80',
    'XYLT1', 'AKR1B10', 'GALNT3', 'ITGBL1', 'COL2A1', 'SPP2',
    'MGARP', 'LYST', 'TLR3', 'CHST3', 'ABLIM3', 'CDK18',
    'LGALSL', 'CD109', 'LPA', 'TUBA8', 'CRISP3', 'PADI4',
    'GPAT3', 'HVCN1', 'RAB3B', 'CA12', 'TUBB1', 'DOCK8',
    'CLMN', 'DOCK9', 'EPPK1', 'PODXL', 'NES', 'GLA',
    'GPRC5A',
]

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot
import umap
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def create_3d_umap_plot(df, feature_columns, color_column='code_oncotree', 
                       metadata_cols=['Sample name', 'code_oncotree', 'TCC'],
                       n_neighbors=15, min_dist=0.1, random_state=42,
                       title="3D UMAP Visualization",
                       filename="umap_3d_plot.html"):
    """
    Create a 3D UMAP visualization with Plotly and optionally save it as HTML.

    The feature columns are standardized, embedded into three UMAP dimensions and
    drawn as a 3D scatter plot colored by ``color_column``.

    Parameters:
        df : pandas.DataFrame
            Input dataframe
        feature_columns : list
            List of column names to use as features for UMAP
        color_column : str
            Column name to use for coloring points (default: 'code_oncotree')
        metadata_cols : list
            Column names to show on hover (default: ['Sample name', 'code_oncotree', 'TCC'])
        n_neighbors : int
            UMAP parameter for number of neighbors (default: 15)
        min_dist : float
            UMAP parameter for minimum distance (default: 0.1)
        random_state : int
            Random state for reproducibility (default: 42)
        title : str
            Plot title (default: "3D UMAP Visualization")
        filename : str or None
            Path of the HTML file the figure is written to (default: "umap_3d_plot.html").
            Pass None to skip saving.

    Returns:
        plotly.graph_objects.Figure
            The 3D UMAP plot figure

    Raises:
        KeyError
            If ``color_column`` or any of ``metadata_cols`` is not a column of ``df``.
    """
    # Work on a private copy of the hover columns; the default list is never mutated.
    metadata_cols = list(metadata_cols)

    # Fail early, before the embedding is computed, when a plotting column is missing.
    plot_columns = list(dict.fromkeys(metadata_cols + [color_column]))
    missing_columns = [col for col in plot_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Columns not found in dataframe: {missing_columns}")

    # Filter and prepare data
    print(f"Original dataframe shape: {df.shape}")
    feature_data = df[feature_columns]
    print(f"Feature data shape: {feature_data.shape}")

    # Standardize features
    print("Standardizing features...")
    feature_data_scaled = StandardScaler().fit_transform(feature_data)

    # Apply UMAP
    print("Applying UMAP...")
    umap_model = umap.UMAP(
        n_components=3,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=random_state,
        verbose=True
    )
    embedding_3d = umap_model.fit_transform(feature_data_scaled)

    # Prepare data for plotting: only the columns the figure needs are copied,
    # not the whole (potentially very wide) input frame.
    df_plot = df[plot_columns].copy()
    df_plot['UMAP_1'] = embedding_3d[:, 0]
    df_plot['UMAP_2'] = embedding_3d[:, 1]
    df_plot['UMAP_3'] = embedding_3d[:, 2]

    # Report how many distinct categories will be colored
    n_categories = len(df_plot[color_column].unique())
    print(f"Number of unique categories in {color_column}: {n_categories}")

    # custom_data fixes the order of the customdata entries, which the hover
    # template below addresses by position.
    fig = px.scatter_3d(
        df_plot,
        x='UMAP_1',
        y='UMAP_2',
        z='UMAP_3',
        color=color_column,
        custom_data=metadata_cols,
        title=title,
        opacity=0.8
    )

    # Update hover template for cleaner display
    fig.update_traces(
        hovertemplate='<br>'.join([f'{col}: %{{customdata[{i}]}}' 
                                    for i, col in enumerate(metadata_cols)]) + '<extra></extra>'
    )

    # Update layout
    fig.update_layout(
        title={
            'text': title,
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 16}
        },
        scene=dict(
            xaxis_title="UMAP 1",
            yaxis_title="UMAP 2",
            zaxis_title="UMAP 3",
            camera=dict(
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        ),
        width=900,
        height=700,
        margin=dict(l=0, r=0, b=0, t=50)
    )

    # Save the figure only when an output path is given
    if filename is not None:
        fig.write_html(filename, include_plotlyjs=True)
        print(f"Plot saved as: {filename}")
    return fig

def save_plot_as_html(fig, filename="umap_3d_plot.html"):
    """
    Save the plotly figure as an HTML file
    
    Parameters:
    -----------
    fig : plotly.graph_objects.Figure
        The plotly figure to save
    filename : str
        Output filename (default: "umap_3d_plot.html")
    """
    # Embed plotly.js in the file so the HTML can be opened on its own.
    fig.write_html(filename, include_plotlyjs=True)
    print(f"Plot saved as: {filename}")





IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



Sample dataframe created
Shape: (1000, 53)
Columns: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4']... (showing first 5 feature columns)
Original dataframe shape: (1000, 53)
Feature data shape: (1000, 50)
Standardizing features...
Applying UMAP...
UMAP(n_components=3, n_jobs=1, random_state=42, verbose=True)
Tue Aug 12 11:35:31 2025 Construct fuzzy simplicial set
Tue Aug 12 11:35:31 2025 Finding Nearest Neighbors
Tue Aug 12 11:35:36 2025 Finished Nearest Neighbor Search
Tue Aug 12 11:35:38 2025 Construct embedding


Epochs completed:   5%| ▌          26/500 [00:00]

	completed  0  /  500 epochs


Epochs completed:  24%| ██▍        120/500 [00:01]

	completed  50  /  500 epochs
	completed  100  /  500 epochs


Epochs completed:  38%| ███▊       191/500 [00:01]

	completed  150  /  500 epochs
	completed  200  /  500 epochs


Epochs completed:  57%| █████▋     286/500 [00:01]

	completed  250  /  500 epochs
	completed  300  /  500 epochs


Epochs completed:  75%| ███████▌   376/500 [00:02]

	completed  350  /  500 epochs


Epochs completed:  89%| ████████▉  445/500 [00:02]

	completed  400  /  500 epochs
	completed  450  /  500 epochs


Epochs completed: 100%| ██████████ 500/500 [00:02]


Tue Aug 12 11:35:41 2025 Finished embedding
Number of unique categories in code_oncotree: 20


Plot saved as: oncology_umap_3d.html


'\n# Load your dataframe\ndf = pd.read_csv("your_data.csv")  # or however you load your data\n\n# Define your feature columns (the ones to filter by)\nyour_feature_columns = ["gene1", "gene2", "gene3", ...]  # your actual feature column names\n\n# Create the plot\nfig = create_3d_umap_plot(\n    df=df,\n    feature_columns=your_feature_columns,\n    color_column=\'code_oncotree\',\n    hover_columns=[\'Sample name\', \'code_oncotree\', \'TCC\'],\n    title="Your Custom Title"\n)\n\n# Show in notebook\nfig.show()\n\n# Save as HTML\nsave_plot_as_html(fig, "your_plot_name.html")\n'

In [19]:
# Build the 3D UMAP for all samples in initial_df, using the CHDM_proteins columns as
# features and the OncoTree code for coloring; the call is the cell's last expression.
create_3d_umap_plot(df=initial_df, 
                       feature_columns=CHDM_proteins, 
                       color_column='code_oncotree', 
                       metadata_cols=['Sample name', 'code_oncotree'],
                       title="3D_UMAP_TEST")

Original dataframe shape: (1844, 13076)
Feature data shape: (1844, 37)
Standardizing features...
Applying UMAP...
UMAP(n_components=3, n_jobs=1, random_state=42, verbose=True)
Tue Aug 12 11:46:15 2025 Construct fuzzy simplicial set
Tue Aug 12 11:46:17 2025 Finding Nearest Neighbors
Tue Aug 12 11:46:17 2025 Finished Nearest Neighbor Search
Tue Aug 12 11:46:17 2025 Construct embedding


Epochs completed:   3%| ▎          15/500 [00:00]

	completed  0  /  500 epochs


Epochs completed:  14%| █▍         72/500 [00:00]

	completed  50  /  500 epochs


Epochs completed:  25%| ██▌        127/500 [00:00]

	completed  100  /  500 epochs


Epochs completed:  34%| ███▎       168/500 [00:01]

	completed  150  /  500 epochs


Epochs completed:  45%| ████▍      223/500 [00:01]

	completed  200  /  500 epochs


Epochs completed:  53%| █████▎     265/500 [00:02]

	completed  250  /  500 epochs


Epochs completed:  64%| ██████▍    321/500 [00:02]

	completed  300  /  500 epochs


Epochs completed:  75%| ███████▌   376/500 [00:02]

	completed  350  /  500 epochs


Epochs completed:  84%| ████████▎  418/500 [00:03]

	completed  400  /  500 epochs


Epochs completed:  95%| █████████▍ 473/500 [00:03]

	completed  450  /  500 epochs


Epochs completed: 100%| ██████████ 500/500 [00:03]


Tue Aug 12 11:46:21 2025 Finished embedding
Number of unique categories in code_oncotree: 221
