In [8]:
import pandas as pd # pyright: ignore[reportMissingModuleSource]
import numpy as np # type: ignore
from joblib import load

from sklearn.linear_model import LogisticRegression # pyright: ignore[reportMissingModuleSource]
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RepeatedStratifiedKFold, cross_val_score # pyright: ignore[reportMissingModuleSource]
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, matthews_corrcoef # pyright: ignore[reportMissingModuleSource]
from joblib import dump # type: ignore
from sklearn.exceptions import ConvergenceWarning # pyright: ignore[reportMissingModuleSource]


In [2]:
## Inputs
# CPTAC BRCA table, read from the preprocessed CSV export.
BRCA_cptac = pd.read_csv(
    '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2025.03.19_MT_cptac_BRCA/preprocessed_fp2.csv'
)

# Pickle locations of the BRCA model and of its standardization parameters.
# Only the paths are defined here; nothing is loaded in this cell.
brca_ml_model = '/home/lestrada/tumor_type_prediction/data/BRCA_250918_results/BRCA_log_reg_ridge_model.pkl'
brca_standarization_params = '/home/lestrada/tumor_type_prediction/data/BRCA_250918_results/BRCA_normalization_parameters.pkl'

In [9]:
## Deserialize the pickled BRCA artefacts with joblib:
## first the standardization parameters, then the model.
# NOTE(review): joblib's load unpickles arbitrary objects — only use trusted files here.
brca_standarization, brca_model = (
    load(pickle_path)
    for pickle_path in (brca_standarization_params, brca_ml_model)
)

In [23]:
# Tabulate the statistics stored on the loaded standardization object:
# one row per entry of `feature_names_in_`, with that feature's `mean_` and `var_`.
BRCA_standarization_data = {
    'mean': brca_standarization.mean_,
    'var': brca_standarization.var_,
}
# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(BRCA_standarization_data, index=brca_standarization.feature_names_in_)

Unnamed: 0,meaan,var
SYMPK,8.503260,0.027030
NUP160,8.586438,0.023294
FARP1,8.444909,0.080326
UPF1,9.140662,0.014446
IGBP1,7.986940,0.027420
...,...,...
CSF3R,6.726725,0.363805
MSANTD3,5.369325,0.020747
SLITRK3,4.717663,0.168111
MEIOC,6.137134,0.008976


In [None]:
# Feature names exposed by the loaded BRCA model, unpacked into a plain list.
# NOTE(review): presumably gene symbols, since they are later compared with 'Gene names' — confirm.
BRCA_proteins = [*brca_model.feature_names_in_]

In [21]:
## Extract the intensity columns produced by the processing pipeline
# Bare attribute lookup kept from the original cell; it is not the last
# expression, so nothing is displayed for it.
BRCA_cptac.columns

# Positional slice 198:385 picks the intensity block.
# NOTE(review): magic column positions — presumably the 'Reporter intensity corrected ...'
# columns; confirm against BRCA_cptac.columns before reusing on another export.
gene_name_column = BRCA_cptac['Gene names']
BRCA_intensities = BRCA_cptac.iloc[:, 198:385].merge(
    gene_name_column,
    left_index=True,
    right_index=True,
)

# Rotate the column list so the trailing column (the merged-in 'Gene names')
# comes first and every other column keeps its relative order.
BRCA_intensities_columns = BRCA_intensities.columns.to_list()
BRCA_intensities_columns_ordered = [
    *BRCA_intensities_columns[-1:],
    *BRCA_intensities_columns[:-1],
]
BRCA_intensities_ordered = BRCA_intensities.loc[:, BRCA_intensities_columns_ordered]

In [22]:
# Display the reordered intensity table ('Gene names' first, then the intensity columns).
BRCA_intensities_ordered

Unnamed: 0,Gene names,Reporter intensity corrected 1 BRCA_Batch01,Reporter intensity corrected 1 BRCA_Batch02,Reporter intensity corrected 1 BRCA_Batch03,Reporter intensity corrected 1 BRCA_Batch04,Reporter intensity corrected 1 BRCA_Batch05,Reporter intensity corrected 1 BRCA_Batch06,Reporter intensity corrected 1 BRCA_Batch07,Reporter intensity corrected 1 BRCA_Batch08,Reporter intensity corrected 1 BRCA_Batch09,...,Reporter intensity corrected 9 BRCA_Batch08,Reporter intensity corrected 9 BRCA_Batch09,Reporter intensity corrected 9 BRCA_Batch10,Reporter intensity corrected 9 BRCA_Batch11,Reporter intensity corrected 9 BRCA_Batch12,Reporter intensity corrected 9 BRCA_Batch13,Reporter intensity corrected 9 BRCA_Batch14,Reporter intensity corrected 9 BRCA_Batch15,Reporter intensity corrected 9 BRCA_Batch16,Reporter intensity corrected 9 BRCA_Batch17
0,LSM2,9.21480,9.30578,8.80714,9.26199,9.49711,9.20505,9.21393,9.16860,9.41695,...,9.26414,9.59997,9.26680,8.81895,9.17844,9.15370,8.85307,8.67793,9.44341,9.14825
1,EPB41L2,9.87133,10.08790,9.84679,9.69304,9.80991,9.83893,10.14170,9.80776,10.00080,...,10.01750,9.94675,9.94144,9.74556,9.90731,9.76807,9.92997,9.92960,9.57203,9.81919
2,PURA,9.53028,9.62064,9.50413,9.40133,9.22327,9.25705,9.71190,9.51396,9.63409,...,9.51850,9.18631,9.48080,9.51649,9.58805,9.38114,9.54134,9.51752,9.64835,9.42162
3,PPP1R7,9.70052,9.50598,9.29377,9.64103,9.60852,9.60474,9.68062,9.61106,9.58794,...,9.56717,9.48717,9.62993,9.55075,9.65160,9.56180,9.37612,9.55486,9.81326,9.43247
4,CAD,9.71965,9.61534,9.70392,9.92145,9.88771,9.79461,9.55604,9.69682,9.66599,...,9.72372,9.88960,9.74106,9.69358,9.84683,9.74009,9.43414,9.48270,9.73613,9.71804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,PDLIM1,9.90429,10.33440,10.09080,10.16970,9.95271,10.03670,10.23350,10.17090,10.11400,...,9.99221,9.89221,10.19770,9.96328,10.09770,10.11460,10.10420,10.13710,9.64936,10.16450
995,CALD1,10.75630,10.84200,10.78550,10.61110,10.65630,10.45240,10.73100,10.46550,10.86660,...,10.59420,10.07560,10.66920,10.59990,10.86820,10.49320,10.51900,10.33930,10.20720,10.78630
996,U2SURP,9.34122,9.31005,9.48783,9.39370,9.45714,9.42614,9.16163,9.33593,9.25681,...,9.31597,9.61979,9.26226,9.43577,9.38683,9.41355,8.78636,8.98561,9.51683,9.51210
997,CCT7,10.09030,10.23180,10.09110,10.30820,10.37950,10.23200,10.18940,10.27150,10.23540,...,10.13560,10.55210,10.21250,10.18860,10.32370,10.29370,10.16960,10.06190,10.27720,10.23430


In [31]:
# Number of model features with no exact match in the table's 'Gene names' column.
len(set(BRCA_proteins).difference(BRCA_intensities_ordered['Gene names']))

201

In [33]:
# Number of model features that do appear (exact match) in the 'Gene names' column.
len(set(BRCA_proteins).intersection(BRCA_intensities_ordered['Gene names']))

33

In [32]:
# Number of distinct feature names the model expects.
len({*BRCA_proteins})

234