In [2]:
import cptac
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [3]:
def Check_CNV_Status(cnv_val, threshold=0.2):
    """Flag whether a single copy-number value lies outside the neutral band.

    Parameters
    ----------
    cnv_val : scalar number
        One CNV measurement.
    threshold : float, default 0.2
        Half-width of the neutral band. Values strictly greater than
        ``threshold`` or strictly less than ``-threshold`` are flagged.

    Returns
    -------
    int
        1 if the value is outside the band, otherwise 0. NaN yields 0
        because both comparisons evaluate to False.
    """
    # Plain boolean `or` is the right operator for scalars (the original used
    # bitwise `|`, which gives the same answer here but reads like array code).
    if cnv_val > threshold or cnv_val < -threshold:
        return 1
    return 0
    
def Get_CNV_Totals(cnv_orig, threshold=0.2):
    """Count, per row, how many CNV values fall outside the neutral band.

    Parameters
    ----------
    cnv_orig : pandas.DataFrame
        Table of CNV values; each row is summed across its columns.
    threshold : float, default 0.2
        A value is counted when it is strictly greater than ``threshold`` or
        strictly less than ``-threshold``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) sharing ``cnv_orig``'s index
        and holding the per-row count. NaN entries are never counted because
        both comparisons are False for NaN.
    """
    # Vectorized comparison instead of calling a Python function on every
    # cell: same counts, far less overhead on a large matrix.
    altered = (cnv_orig > threshold) | (cnv_orig < -threshold)
    return altered.sum(axis=1).to_frame()

In [4]:
# Lowercase short names of the cohorts used throughout the notebook.
cancer_types = ['brca','ccrcc','coad','gbm','hnscc','lscc','luad','ov','pdac','ucec']
# One cptac dataset object per cohort, keyed by short name. Each cptac class
# used here is named after the capitalized short name (e.g. 'brca' -> cptac.Brca).
data_accessors = {cancer: getattr(cptac, cancer.capitalize())() for cancer in cancer_types}

In [12]:
def Get_Protein_Clinical_Dat(accessor,protein,with_proteomics):
    """Assemble one gene's expression, the CNV score and clinical covariates.

    Parameters
    ----------
    accessor : cptac dataset object
        Must provide ``get_clinical``, ``get_proteomics``,
        ``get_transcriptomics`` and ``get_CNV``.
    protein : str
        Column label (gene name) to pull from the expression table.
    with_proteomics : bool
        True to use ``get_proteomics('umich')``; False to use
        ``get_transcriptomics('washu')``.

    Returns
    -------
    pandas.DataFrame or None
        Expression column(s) for ``protein`` joined with ``CNV_Score`` and the
        clinical columns age, sex, race, histologic_type and
        tumor_stage_pathological. None when ``protein`` is not a column of the
        expression table.
    """
    if with_proteomics:
        gene_expression = accessor.get_proteomics('umich')
    else:
        gene_expression = accessor.get_transcriptomics('washu')

    # Bail out before the costly CNV/clinical work when the gene is absent.
    if protein not in gene_expression:
        return None

    proteomic_dat = gene_expression[[protein]]
    # Drop the second column level only when there is one; a table with flat
    # column labels would otherwise make droplevel raise.
    if proteomic_dat.columns.nlevels > 1:
        proteomic_dat = proteomic_dat.droplevel(1, axis=1)

    cnv_totals = Get_CNV_Totals(accessor.get_CNV('washu'))
    cnv_totals = cnv_totals.rename(columns={cnv_totals.columns[0]:'CNV_Score'})

    clinical = accessor.get_clinical('mssm')
    clinical_dat = clinical[['age', 'sex', 'race','histologic_type','tumor_stage_pathological']]

    # Both joins key on 'Patient_ID', which the original code also relied on
    # being available on the expression table.
    return proteomic_dat.join(cnv_totals, on='Patient_ID').join(clinical_dat, on='Patient_ID')

In [6]:
def Get_Linear_Regress_Model(protein,protein_clinical_dat):
    """Fit an OLS model of CNV_Score on one gene's expression and clinical covariates.

    Parameters
    ----------
    protein : str
        Name of the expression column in ``protein_clinical_dat``.
    protein_clinical_dat : pandas.DataFrame
        Must contain ``protein``, 'CNV_Score', 'age', 'sex', 'race',
        'histologic_type' and 'tumor_stage_pathological'. The frame is not
        modified.

    Returns
    -------
    Fitted statsmodels OLS results, with 'CNV_Score' as the response and every
    other column (after one-hot encoding, plus a constant) as a predictor.
    Rows with missing values are dropped by statsmodels (``missing='drop'``).
    """
    categorical_cols = ['sex','race','histologic_type','tumor_stage_pathological']

    # Work on a copy: the original assigned the converted columns back onto the
    # caller's frame, silently changing its dtypes.
    model_dat = protein_clinical_dat.copy()
    model_dat[protein] = model_dat[protein].astype('float')
    model_dat['CNV_Score'] = model_dat['CNV_Score'].astype('float')
    # Non-numeric age entries become NaN and are later dropped by OLS.
    model_dat['age'] = pd.to_numeric(model_dat['age'],errors='coerce')
    for col in categorical_cols:
        model_dat[col] = model_dat[col].astype('category')

    # NOTE(review): every level of each categorical gets its own dummy column
    # while a constant is also added below; this may make the design matrix
    # collinear. Consider drop_first=True after confirming how missing
    # categories should be treated.
    model_dat = pd.get_dummies(model_dat,columns=categorical_cols)

    y = model_dat['CNV_Score']
    # Remove the response by name; the original deleted "column index 1",
    # which is only correct when CNV_Score happens to be the second column.
    X = model_dat.drop(columns=['CNV_Score'])
    X = sm.add_constant(X)
    return sm.OLS(y,X.astype(float),missing='drop').fit()


In [7]:
# RBL1, proteomics: fit and print the regression for every cohort where
# Get_Protein_Clinical_Dat returns a table.
protein = 'RBL1'
for accessor, dataset in data_accessors.items():
    protein_clinical_dat = Get_Protein_Clinical_Dat(dataset,protein,True)
    if protein_clinical_dat is None:
        # Nothing to fit for this cohort.
        continue
    model = Get_Linear_Regress_Model(protein,protein_clinical_dat)
    rule = '================='
    # Three rule lines, the "<cohort>: <gene>" header, then one more rule.
    print(rule, rule, rule, f'{accessor}: {protein}', rule, sep='\n')
    print(model.summary())

brca: RBL1
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.315
Model:                            OLS   Adj. R-squared:                  0.191
Method:                 Least Squares   F-statistic:                     2.545
Date:                Mon, 03 Feb 2025   Prob (F-statistic):             0.0102
Time:                        13:03:56   Log-Likelihood:                -675.26
No. Observations:                  73   AIC:                             1375.
Df Residuals:                      61   BIC:                             1402.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.975]
--

  res = hypotest_fun_out(*samples, **kwds)


ucec: RBL1
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.422
Model:                            OLS   Adj. R-squared:                  0.184
Method:                 Least Squares   F-statistic:                     1.773
Date:                Mon, 03 Feb 2025   Prob (F-statistic):              0.158
Time:                        13:09:52   Log-Likelihood:                -233.17
No. Observations:                  25   AIC:                             482.3
Df Residuals:                      17   BIC:                             492.1
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------

  return np.sqrt(eigvals[0]/eigvals[-1])


In [8]:
# RBL2, proteomics: fit and print the regression for every cohort where
# Get_Protein_Clinical_Dat returns a table.
protein = 'RBL2'
for accessor, dataset in data_accessors.items():
    protein_clinical_dat = Get_Protein_Clinical_Dat(dataset,protein,True)
    if protein_clinical_dat is None:
        # Nothing to fit for this cohort.
        continue
    model = Get_Linear_Regress_Model(protein,protein_clinical_dat)
    rule = '================='
    # Three rule lines, the "<cohort>: <gene>" header, then one more rule.
    print(rule, rule, rule, f'{accessor}: {protein}', rule, sep='\n')
    print(model.summary())

brca: RBL2
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.158
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     1.130
Date:                Mon, 03 Feb 2025   Prob (F-statistic):              0.337
Time:                        13:43:04   Log-Likelihood:                -1121.3
No. Observations:                 120   AIC:                             2279.
Df Residuals:                     102   BIC:                             2329.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.975]
--

In [9]:
# CDK4, proteomics: fit and print the regression for every cohort where
# Get_Protein_Clinical_Dat returns a table.
protein = 'CDK4'
for accessor, dataset in data_accessors.items():
    protein_clinical_dat = Get_Protein_Clinical_Dat(dataset,protein,True)
    if protein_clinical_dat is None:
        # Nothing to fit for this cohort.
        continue
    model = Get_Linear_Regress_Model(protein,protein_clinical_dat)
    rule = '================='
    # Three rule lines, the "<cohort>: <gene>" header, then one more rule.
    print(rule, rule, rule, f'{accessor}: {protein}', rule, sep='\n')
    print(model.summary())

brca: CDK4
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.220
Model:                            OLS   Adj. R-squared:                  0.090
Method:                 Least Squares   F-statistic:                     1.694
Date:                Mon, 03 Feb 2025   Prob (F-statistic):             0.0556
Time:                        14:19:29   Log-Likelihood:                -1116.7
No. Observations:                 120   AIC:                             2269.
Df Residuals:                     102   BIC:                             2320.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.975]
--

In [10]:
# CDK6, proteomics: fit and print the regression for every cohort where
# Get_Protein_Clinical_Dat returns a table.
protein = 'CDK6'
for accessor, dataset in data_accessors.items():
    protein_clinical_dat = Get_Protein_Clinical_Dat(dataset,protein,True)
    if protein_clinical_dat is None:
        # Nothing to fit for this cohort.
        continue
    model = Get_Linear_Regress_Model(protein,protein_clinical_dat)
    rule = '================='
    # Three rule lines, the "<cohort>: <gene>" header, then one more rule.
    print(rule, rule, rule, f'{accessor}: {protein}', rule, sep='\n')
    print(model.summary())

brca: CDK6
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.158
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     1.130
Date:                Mon, 03 Feb 2025   Prob (F-statistic):              0.337
Time:                        14:19:56   Log-Likelihood:                -1121.3
No. Observations:                 120   AIC:                             2279.
Df Residuals:                     102   BIC:                             2329.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.975]
--

In [13]:
# RBL1, transcriptomics: same report as the proteomics run, but built from
# the transcriptomics table. 'lscc' is skipped with a message instead.
protein = 'RBL1'
for accessor, dataset in data_accessors.items():
    if accessor == 'lscc':
        print("No transcriptomics data for lscc")
        continue
    protein_clinical_dat = Get_Protein_Clinical_Dat(dataset,protein,False)
    if protein_clinical_dat is None:
        # Nothing to fit for this cohort.
        continue
    model = Get_Linear_Regress_Model(protein,protein_clinical_dat)
    rule = '================='
    # Three rule lines, the "<cohort>: <gene>" header, then one more rule.
    print(rule, rule, rule, f'{accessor}: {protein}', rule, sep='\n')
    print(model.summary())

brca: RBL1
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.300
Model:                            OLS   Adj. R-squared:                  0.182
Method:                 Least Squares   F-statistic:                     2.543
Date:                Mon, 03 Feb 2025   Prob (F-statistic):            0.00207
Time:                        14:30:01   Log-Likelihood:                -1101.5
No. Observations:                 119   AIC:                             2239.
Df Residuals:                     101   BIC:                             2289.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.975]
--

In [14]:
# RBL2, transcriptomics: same report as the proteomics run, but built from
# the transcriptomics table. 'lscc' is skipped with a message instead.
protein = 'RBL2'
for accessor, dataset in data_accessors.items():
    if accessor == 'lscc':
        print("No transcriptomics data for lscc")
        continue
    protein_clinical_dat = Get_Protein_Clinical_Dat(dataset,protein,False)
    if protein_clinical_dat is None:
        # Nothing to fit for this cohort.
        continue
    model = Get_Linear_Regress_Model(protein,protein_clinical_dat)
    rule = '================='
    # Three rule lines, the "<cohort>: <gene>" header, then one more rule.
    print(rule, rule, rule, f'{accessor}: {protein}', rule, sep='\n')
    print(model.summary())

brca: RBL2
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.160
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     1.129
Date:                Mon, 03 Feb 2025   Prob (F-statistic):              0.338
Time:                        14:34:14   Log-Likelihood:                -1112.3
No. Observations:                 119   AIC:                             2261.
Df Residuals:                     101   BIC:                             2311.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.975]
--

In [15]:
# CDK4, transcriptomics: same report as the proteomics run, but built from
# the transcriptomics table. 'lscc' is skipped with a message instead.
protein = 'CDK4'
for accessor, dataset in data_accessors.items():
    if accessor == 'lscc':
        print("No transcriptomics data for lscc")
        continue
    protein_clinical_dat = Get_Protein_Clinical_Dat(dataset,protein,False)
    if protein_clinical_dat is None:
        # Nothing to fit for this cohort.
        continue
    model = Get_Linear_Regress_Model(protein,protein_clinical_dat)
    rule = '================='
    # Three rule lines, the "<cohort>: <gene>" header, then one more rule.
    print(rule, rule, rule, f'{accessor}: {protein}', rule, sep='\n')
    print(model.summary())

brca: CDK4
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.180
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     1.302
Date:                Mon, 03 Feb 2025   Prob (F-statistic):              0.207
Time:                        14:34:44   Log-Likelihood:                -1110.9
No. Observations:                 119   AIC:                             2258.
Df Residuals:                     101   BIC:                             2308.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.975]
--

In [16]:
# CDK6, transcriptomics: same report as the proteomics run, but built from
# the transcriptomics table. 'lscc' is skipped with a message instead.
protein = 'CDK6'
for accessor, dataset in data_accessors.items():
    if accessor == 'lscc':
        print("No transcriptomics data for lscc")
        continue
    protein_clinical_dat = Get_Protein_Clinical_Dat(dataset,protein,False)
    if protein_clinical_dat is None:
        # Nothing to fit for this cohort.
        continue
    model = Get_Linear_Regress_Model(protein,protein_clinical_dat)
    rule = '================='
    # Three rule lines, the "<cohort>: <gene>" header, then one more rule.
    print(rule, rule, rule, f'{accessor}: {protein}', rule, sep='\n')
    print(model.summary())

brca: CDK6
                            OLS Regression Results                            
Dep. Variable:              CNV_Score   R-squared:                       0.197
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     1.456
Date:                Mon, 03 Feb 2025   Prob (F-statistic):              0.127
Time:                        14:34:59   Log-Likelihood:                -1109.6
No. Observations:                 119   AIC:                             2255.
Df Residuals:                     101   BIC:                             2305.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.975]
--

In [None]:
# Univariate check: regress CNV_Score on the single expression column plus a constant.
# NOTE(review): `protein` and `protein_clinical_dat` are whatever the most
# recently executed loop cell left behind (its last iteration) — confirm that
# is the intended gene/cohort, and that the table is not None.
XX = sm.add_constant(protein_clinical_dat[[protein]])
y = protein_clinical_dat['CNV_Score']
model = sm.OLS(y, XX.astype(float), missing='drop').fit()
print(model.summary())