In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFECV,SelectKBest,mutual_info_classif,SelectFromModel
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV,LeaveOneOut,StratifiedKFold,RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,auc,confusion_matrix
from mat4py import loadmat
import matplotlib.pyplot as plt 
import math
import random
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Case-based difficuty score
case_feat_normal = pd.read_csv('/Users/jessy/Desktop/笔记本/Radiomics Data Analysis/diffScore w_out 10 dep Cn/case_feat_diffScore/Square_norm_feat.csv')
case_feat_normal = pd.DataFrame(case_feat_normal).drop("Unnamed: 0",axis=1)
print('Whether there is missing value in normal mat: {}'.format(case_feat_normal.isnull().values.any())) 
case_feat_cancer = pd.read_csv('/Users/jessy/Desktop/笔记本/Radiomics Data Analysis/diffScore w_out 10 dep Cn/case_feat_case_cancer_diffScore/Square_norm_feat.csv')
case_feat_cancer = pd.DataFrame(case_feat_cancer).drop("Unnamed: 0",axis=1)
print('Whether there is missing value in cancer mat: {}'.format(case_feat_cancer.isnull().values.any())) 

Whether there is missing value in normal mat: False
Whether there is missing value in cancer mat: False


In [3]:
CC_Normal = case_feat_normal.loc[case_feat_normal['View']=="'CC'",]
MLO_Normal = case_feat_normal.loc[case_feat_normal['View']=="'MLO'",]
left_CC = CC_Normal.loc[CC_Normal["Side"]=="L",]
right_CC =CC_Normal.loc[CC_Normal["Side"]=="R",]
left_MLO = MLO_Normal.loc[MLO_Normal["Side"]=="L",]
right_MLO = MLO_Normal.loc[MLO_Normal["Side"]=="R",]
lesion_side = pd.read_csv("/Users/jessy/Desktop/笔记本/Radiomics Data Analysis/diffScore w_out 10 dep Cn/lesion_side.csv")
case_feat_cancer_ = case_feat_cancer.merge(lesion_side, how='left',on = "CaseName").drop("Unnamed: 0",axis=1)
CC_Cancer = case_feat_cancer_.loc[case_feat_cancer_["Side"]==case_feat_cancer_["LesionSide"] ,].loc[case_feat_cancer_["View"]=="'CC'",]
MLO_Cancer = case_feat_cancer_.loc[case_feat_cancer_["Side"]==case_feat_cancer_["LesionSide"] ,].loc[case_feat_cancer_["View"]=="'MLO'",]

CC_Cancer.drop(17, axis=0, inplace=True)
MLO_Cancer.drop(23, axis=0, inplace=True)
print('The shape of CC_Normal is {}'.format(CC_Normal.shape))
print('The shape of CC_CancerLesion is {}'.format(CC_Cancer.shape))
print('The shape of MLO_Normal is {}'.format(MLO_Normal.shape))
print('The shape of MLO_CancerLesion is {}'.format(MLO_Cancer.shape))

The shape of CC_Normal is (80, 210)
The shape of CC_CancerLesion is (20, 212)
The shape of MLO_Normal is (80, 210)
The shape of MLO_CancerLesion is (20, 212)


# algorithms to use: Logistic regression
def self_LR_pipe(X,y,pipe, param):
    random.seed(10)
    cv_outer = LeaveOneOut()
    y_true,y_pred = list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        X_train_, X_test = X[train_ix, :], X[test_ix, :]
        y_train_, y_test = y[train_ix], y[test_ix]

        # inner loop for feature selection and hyperparameter tuning 
        pipe_lr = pipe
        param_grid = param
        cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
        result = GridSearchCV(pipe_lr, param_grid=param, cv = cv_inner, n_jobs= -1,scoring = 'roc_auc',refit=True).fit(X_train_, y_train_.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out validation set
        yhat_proba = list(best_param.predict_proba(X_test)[:,1])
        y_pred.append(yhat_proba)
        y_true.append(list(y_test[0]))
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
        print('Selected features are: {}'.format(np.where(result.best_estimator_.named_steps['features'].get_support())[0]))
    
    # Calculate roc_auc on the hold out dataset
    print('auc: %.3f' % roc_auc_score(y_true, y_pred))  


In [11]:
# algorithms to use: Random Forest
def self_rf_pipe(X,y,pipe, param):
    features = {}
    row = 0
    random.seed(24)
    cv_outer = LeaveOneOut()
    y_true,y_pred,Predicted_class  = list(),list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        row+=1
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]

        ## inner loop for feature selection and hyperparameter tuning 

        cv_inner = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=22)
        result = GridSearchCV(pipe, param_grid=param, cv = cv_inner, scoring = 'roc_auc',n_jobs = -1,refit=True).fit(X_train, y_train.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out evaluation dataset
        
        yhat_proba = best_param.predict_proba(X_test)[:,1]# reture the probability of predicting '1'
        y_pred.append(yhat_proba[0])
        Predicted_class.append(best_param.predict(X_test))
        y_true.append(y_test[0])
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
        features[row] = np.where(result.best_estimator_.named_steps['features'].get_support())[0]

    
    Dict = {}
    for key in sorted(features):
        for num in features[key]:
            if num not in Dict:
                Dict[num] = 1
            else:
                Dict[num]+=1
    
    KEY = []

    for key in sorted(Dict):
        print(key,':',Dict[key])
        if Dict[key] >= len(X)/2:
            KEY.append(key)
    print(KEY) 

    # Calculate roc_auc on the hold out dataset
    AUC_score = roc_auc_score(y_true, y_pred)
    Accuracy = accuracy_score(y_true,Predicted_class)
    print('auc: %.3f' % AUC_score)
    print("Accuracy: ", Accuracy)
  
    print(y_true)
    print(y_pred)

    
    cm1 = confusion_matrix(y_true,Predicted_class)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    print('Sensitivity : ', sensitivity1 )

    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    return Dict,AUC_score,Accuracy,sensitivity1,specificity1

In [4]:
# ALGORITHM: to evaluate the entire pipeline
def pipeline_evaluate(X,y,pipe, param):
    random.seed(24)
    cv_outer = LeaveOneOut()
    y_true,y_pred, Predicted_class  = list(),list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]

        ## inner loop for feature selection and hyperparameter tuning 
        cv_inner = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=2)
        result = GridSearchCV(pipe, param_grid=param, cv = cv_inner, scoring = 'roc_auc',n_jobs = -1,refit=True).fit(X_train, y_train.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out evaluation dataset
        yhat_proba = best_param.predict_proba(X_test)[:,1]# reture the probability of predicting '1'
        y_pred.append(yhat_proba[0])
        Predicted_class.append(best_param.predict(X_test))
        y_true.append(y_test[0])
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
    return y_pred,Predicted_class, y_true


# Normal Cases

In [5]:
Side_View = {}
Side_View = {'left_CC':left_CC,
             'right_CC':right_CC,
             'left_MLO':left_MLO,
             'right_MLO':right_MLO}
rads = ['diffScore.CN','diffScore.AU']
#Bins = [[0, 0.60, 0.8, 1],[0, 0.69, 0.81, 1]] # whole dataset 

df = pd.DataFrame(X)
cor_matrix = df.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


array1 = np.where(upper_tri>0.9)[0]
array2 = np.where(upper_tri>0.9)[1]
to_drop = []
for i in range(len(array1)):
    if np.corrcoef(X[:,array1[i]],y)[1,0] < np.corrcoef(X[:,array2[i]],y)[1,0]:
        to_drop.append(array1[i])
    else:
         to_drop.append(array2[i])

to_drop = np.unique(to_drop)
df1 = df.drop(df.columns[to_drop], axis=1)
X = df1.to_numpy()
print(df1.shape)

## Chinese Normal

# logistic Regression 02 (log, pca + mutual_info, LR)
rad = rads[0]
bins = Bins[0]
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.cut(Normal_sort_copy[rad],bins, labels=[1,3,0])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,0:202]
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)

    # prepare data (log transform)
    from scipy.stats import skew
    skewness = X.apply(lambda x: skew(x))
    skewed_feats = skewness[skewness > 0.75]
    skewed_feats = skewed_feats.index
    minimum = X[skewed_feats].min(axis = 1).min()
    skewed_feats = np.log1p(X[skewed_feats]+ abs(minimum))
    X = X.to_numpy()

    pca = PCA(n_components = 0.9)
    selection = SelectKBest(mutual_info_classif,k=10)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    pipe_lr2 = Pipeline([
        ('scaler',MinMaxScaler()),
        ('features',combined_features),
        ('lr',LogisticRegression(solver='liblinear',random_state=42))])
    param_lr2 = {'features__univ_select__k':[5,10,20,30],
                'lr__penalty':['l1', 'l2'],
                'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 

    print('Below is the results from the LR02:')


    cv_outer = LeaveOneOut()
    y_true,y_pred = list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        X_train_, X_test = X[train_ix, :], X[test_ix, :]
        y_train_, y_test = y[train_ix], y[test_ix]

        # inner loop for feature selection and hyperparameter tuning 
        cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
        result = GridSearchCV(pipe_lr2, param_lr2, cv = cv_inner, n_jobs= -1,scoring = 'roc_auc',refit=True).fit(X_train_, y_train_.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out validation set
        yhat_proba = list(best_param.predict_proba(X_test)[:,1])
        y_pred.append(yhat_proba)
        y_true.append(list(y_test[0]))
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
       # Calculate roc_auc on the hold out dataset
    print('auc: %.3f' % roc_auc_score(y_true, y_pred))      

# For Chinese Rads Logistic regression 1 3 
print('This is for Chinese Radiologists: \n')
rad = rads[0]
bins = Bins[0]
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.cut(Normal_sort_copy[rad],bins, labels=[1,3,0])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,0:202]
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)

    # prepare data (log transform)
    from scipy.stats import skew
    skewness = X.apply(lambda x: skew(x))
    skewed_feats = skewness[skewness > 0.75]
    skewed_feats = skewed_feats.index
    minimum = X[skewed_feats].min(axis = 1).min()
    skewed_feats = np.log1p(X[skewed_feats]+ abs(minimum))
    X = X.to_numpy()

    # Logistic Regression 01 (log transform, Scaler, RFECV, LR)
    pipe01 = Pipeline([('scaler',MinMaxScaler()),
                     ('features',RFECV(estimator = LogisticRegression(solver='liblinear'),cv =3, scoring ='roc_auc')),
                     ('lr',LogisticRegression(solver='liblinear',random_state=42))])
    param01 = {'lr__penalty':['l1','l2'],
            'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 

    print('*'*30)
    print('Below is the results from the LR01:')
    self_LR_pipe(X,y,pipe01,param01) 
    print('*'*30)


    # Logistic Regression 03 (log transform, scaler, SelectFromModel, LR)
    selector = SelectFromModel(estimator=LogisticRegression(solver = 'liblinear'))
    pipe_lr03 = Pipeline([ 
            ('scaler',MinMaxScaler()),
            ('features',selector),
            ('lr',LogisticRegression(solver = 'liblinear',random_state=42))]) 
    param_lr03 = {'features__max_features':[20,30,40],
                    'lr__penalty':['l1', 'l2'],
                    'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
    print('*'*30)
    print('Below is the results from the LR03:') 
    self_LR_pipe(X,y,pipe=pipe_lr03,param=param_lr03)

# For Chinese Rads Random Forest 01
rad = rads[0]
bins = Bins[0]
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.cut(Normal_sort_copy[rad],bins, labels=[1,3,0])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,0:202].to_numpy()
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)

    # Random forest 01 (scaler, pca + mututal info classifier, RandomForest)
    pca = PCA(n_components = 0.9) 
    selection = SelectKBest(mutual_info_classif,k=10)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    pipe = Pipeline([('scaler',MinMaxScaler()),
                    ('features',combined_features),
                     ('rf',RandomForestClassifier(random_state=42))])

    param = {'features__univ_select__k':[10,20,30,40],
             'rf__max_features':['sqrt','log2'],
             'rf__n_estimators':[50,100,1000,2000]} 
    print('*'*50)
    print('Below is the results from the RF01:')

    cv_outer = LeaveOneOut()
    y_true,y_pred = list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]

        ## inner loop for feature selection and hyperparameter tuning 
        cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
        result = GridSearchCV(pipe, param, cv = cv_inner, scoring = 'roc_auc',n_jobs = -1,refit=True).fit(X_train, y_train.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out evaluation dataset
        yhat_proba = list(best_param.predict_proba(X_test)[:,1])# reture the probability of predicting '1'
        y_pred.append(yhat_proba)
        y_true.append(list(y_test[0]))
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
        # Calculate roc_auc on the hold out dataset
    print('auc: %.3f' % roc_auc_score(y_true, y_pred))
    print('*'*50)

In [6]:
# # Random Forest 02 (scaler, SelectFromModel, RandomForest)
# rad = rads[0]
# df_cn = dict()
# AUC = dict()
# ACU = dict()
# SEN = dict()
# SPE = dict()
# #bins = Bins[0]
# for key,value in Side_View.items():
#     Normal_sort = value.sort_values(by = rad)
#     Normal_sort_copy = Normal_sort.copy()
#     Normal_sort_copy['percentile']=pd.qcut(Normal_sort_copy[rad],3, labels=[1,3,0])
#     Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
#     print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
#     X = Normal_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
#     y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)
#     over = SMOTE(sampling_strategy="minority",random_state=2)
#     X,y = over.fit_resample(X,y)
  

#     selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 5)
#     pipe = Pipeline([ #('scaler',MinMaxScaler()),
#                      ('features',selector),
#                      ('rf',RandomForestClassifier(random_state=42))]) 
#     param = {'features__max_features':[5,10,15],
#             "rf__max_depth":[1,5,10],
#              "rf__max_samples":[0.1,0.5,1.0],
#                  'rf__n_estimators':[50,100,1000] } 
#     print('*'*50)
#     Dict,AUC_score,Accuracy,sensitivity1,specificity1 = self_rf_pipe(X,y,pipe,param)  
#     print('*'*50)
#     df_cn[key]=Dict
#     AUC[key]=AUC_score
#     ACU[key]=Accuracy
#     SEN[key]=sensitivity1
#     SPE[key]=specificity1

ThIS IS left_CC SIDE VIEW 


**************************************************
>est=0.947, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.940, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.953, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.960, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.927, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.973, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.933, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.953, cfg={'features__max_features': 15, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>e

**************************************************
>est=0.920, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.920, cfg={'features__max_features': 15, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.927, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.960, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.947, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.960, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.933, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.933, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.960, cfg={'fe

In [6]:
# PIPELINE EVALUATE FOR CHINESE 
rad = rads[0]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.qcut(Normal_sort_copy[rad],3, labels=[0,3,1])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==0],Normal_sort_copy[Normal_sort_copy['percentile']==1]],axis = 0)
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 5)
    pipe = Pipeline([('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,20],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.append(probability)
    PRED.append(pre_label)
    

ThIS IS left_CC SIDE VIEW 


>est=0.880, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.910, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.872, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.853, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.873, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.892, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.892, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.897, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.964, cfg={'features__max_features': 5,

>est=0.933, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.914, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.940, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.933, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.922, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.911, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.926, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.933, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.931, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__m

In [7]:
pipeline_prob_cn = np.array(PROB).reshape(4,-1)
agg_prob_cn = np.max(pipeline_prob_cn, axis=0)

label = []
for i in range(len(agg_prob_cn)):
    if agg_prob_cn[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])

print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, agg_prob_cn)
print("AUC for normal pipeline for Chinese readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(agg_prob_cn), len(agg_prob_cn))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], agg_prob_cn[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.6666666666666666
Sensitivity is : 0.6
Specificity is : 0.75
AUC for normal pipeline for Chinese readers is 0.747
Confidence interval for the score: [0.538 - 0.917]


## Australian Normal

# logistic Regression 02 (log, pca + mutual_info, LR) 
rad = rads[1]
bins = Bins[1]
print('Below is for Australian Radiologists: \n')
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.cut(Normal_sort_copy[rad],bins, labels=[1,3,0])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,0:202]
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)

    # prepare data (log transform)
    from scipy.stats import skew
    skewness = X.apply(lambda x: skew(x))
    skewed_feats = skewness[skewness > 0.75]
    skewed_feats = skewed_feats.index
    minimum = X[skewed_feats].min(axis = 1).min()
    skewed_feats = np.log1p(X[skewed_feats]+ abs(minimum))
    X = X.to_numpy()
    pca = PCA(n_components = 0.9)
    selection = SelectKBest(mutual_info_classif,k=10)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

    pipe_lr2 = Pipeline([
        ('scaler',MinMaxScaler()),
        ('features',combined_features),
        ('lr',LogisticRegression(solver='liblinear',random_state=42))])
    param_lr2 = {'features__univ_select__k':[5,10,20,30],
                'lr__penalty':['l1', 'l2'],
                'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 

    print('Below is the results from the LR02:')
    cv_outer = LeaveOneOut()
    y_true,y_pred = list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        X_train_, X_test = X[train_ix, :], X[test_ix, :]
        y_train_, y_test = y[train_ix], y[test_ix]

        # inner loop for feature selection and hyperparameter tuning 
        cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
        result = GridSearchCV(pipe_lr2, param_lr2, cv = cv_inner, n_jobs= -1,scoring = 'roc_auc',refit=True).fit(X_train_, y_train_.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out validation set
        yhat_proba = list(best_param.predict_proba(X_test)[:,1])
        y_pred.append(yhat_proba)
        y_true.append(list(y_test[0]))
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
       # Calculate roc_auc on the hold out dataset
    print('auc: %.3f' % roc_auc_score(y_true, y_pred))       

    print('*'*30)

# For Australian Rads Logistic regression 1 3
rad = rads[1]
bins = Bins[1]
print('Below is for Australian Radiologists: \n')
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.cut(Normal_sort_copy[rad],bins, labels=[1,3,0])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,0:202]
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)


    # prepare data (log transform)
    from scipy.stats import skew
    skewness = X.apply(lambda x: skew(x))
    skewed_feats = skewness[skewness > 0.75]
    skewed_feats = skewed_feats.index
    minimum = X[skewed_feats].min(axis = 1).min()
    skewed_feats = np.log1p(X[skewed_feats]+ abs(minimum))
    X = X.to_numpy()

    # Logistic Regression 01 (log transform, Scaler, RFECV, LR)
    pipe01 = Pipeline([('scaler',MinMaxScaler()),
                     ('features',RFECV(estimator = LogisticRegression(solver='liblinear'),cv =3, scoring ='roc_auc')),
                     ('lr',LogisticRegression(solver='liblinear',random_state=42))])
    param01 = {'lr__penalty':['l1','l2'],
            'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
    print('*'*30)
    print('Below is the results from the LR01:')
    self_LR_pipe(X,y,pipe01,param01)
    print('*'*30)


    # Logistic Regression 03 (log transform, scaler, SelectFromModel, LR)
    selector = SelectFromModel(estimator=LogisticRegression(solver = 'liblinear'))
    pipe_lr03 = Pipeline([ 
            ('scaler',MinMaxScaler()),
        ('features',selector),
        ('lr',LogisticRegression(solver = 'liblinear',random_state=42))]) 
    param_lr03 = {'features__max_features':[20,30,40],
                    'lr__penalty':['l1', 'l2'],
                    'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
    print('*'*30)
    print('Below is the results from the LR03:')               
    self_LR_pipe(X,y,pipe=pipe_lr03,param=param_lr03)
    print('*'*30)



# For Australian Rads Random Forest 01
rad = rads[1]
bins = Bins[1]
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.cut(Normal_sort_copy[rad],bins, labels=[1,3,0])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,0:202].to_numpy()
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)



    # Random forest 01 (scaler, pca + mututal info classifier, RandomForest)
    pca = PCA(n_components = 0.9) 
    selection = SelectKBest(mutual_info_classif,k=10)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    pipe = Pipeline([('scaler',MinMaxScaler()),
                    ('features',combined_features),
                     ('rf',RandomForestClassifier(random_state=42))])

    param = {'features__univ_select__k':[10,20,30,40],
             'rf__max_features':['sqrt','log2'],
             'rf__n_estimators':[50,100,1000,2000]} 


    print('*'*50)
    print('Below is the results from the RF01:')
    cv_outer = LeaveOneOut()
    y_true,y_pred = list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]

        ## inner loop for feature selection and hyperparameter tuning 
        cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
        result = GridSearchCV(pipe, param, cv = cv_inner, scoring = 'roc_auc',n_jobs = -1,refit=True).fit(X_train, y_train.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out evaluation dataset
        yhat_proba = list(best_param.predict_proba(X_test)[:,1])# reture the probability of predicting '1'
        y_pred.append(yhat_proba)
        y_true.append(list(y_test[0]))
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
        # Calculate roc_auc on the hold out dataset
    print('auc: %.3f' % roc_auc_score(y_true, y_pred))
    print('*'*50)

In [7]:
# # Random Forest 02 (scaler, SelectFromModel, RandomForest)
# rad = rads[1]
# df_au = dict()
# AUC = dict()
# ACU = dict()
# SEN = dict()
# SPE = dict()
# #bins = Bins[0]
# for key,value in Side_View.items():
#     Normal_sort = value.sort_values(by = rad)
#     Normal_sort_copy = Normal_sort.copy()
#     Normal_sort_copy['percentile']=pd.qcut(Normal_sort_copy[rad],3, labels=[1,3,0])
#     Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
#     print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
#     X = Normal_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
#     y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)
#     over = SMOTE(sampling_strategy="minority",random_state=2)
#     X,y = over.fit_resample(X,y)
  
#     selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 5)
#     pipe = Pipeline([ #('scaler',MinMaxScaler()),
#                      ('features',selector),
#                      ('rf',RandomForestClassifier(random_state=42))]) 
#     param = {'features__max_features':[5,10,20],
#             "rf__max_depth":[1,5,10],
#              "rf__max_samples":[0.1,0.5,1.0],
#                  'rf__n_estimators':[50,100,1000] } 
#     print('*'*50)
#     Dict,AUC_score,Accuracy,sensitivity1,specificity1 = self_rf_pipe(X,y,pipe,param)  
#     print('*'*50)
#     df_au[key]=Dict
#     AUC[key]=AUC_score
#     ACU[key]=Accuracy
#     SEN[key]=sensitivity1
#     SPE[key]=specificity1

ThIS IS left_CC SIDE VIEW 


**************************************************
>est=0.723, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.735, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.728, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.722, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.761, cfg={'features__max_features': 20, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.733, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.741, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.705, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators

>est=0.606, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.627, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.659, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.683, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.721, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.730, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.630, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.659, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.634, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__m

**************************************************
>est=0.749, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.706, cfg={'features__max_features': 20, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.773, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.704, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.708, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.790, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.720, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.749, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.690, cfg={'f

In [8]:
# PIPELINE EVALUATE FOR AUSTRALIAN 
rad = rads[1]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.qcut(Normal_sort_copy[rad],3, labels=[0,3,1])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==0],Normal_sort_copy[Normal_sort_copy['percentile']==1]],axis = 0)
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 5)
    pipe = Pipeline([('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,20],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
    PRED.append(pre_label)

ThIS IS left_CC SIDE VIEW 


>est=0.571, cfg={'features__max_features': 20, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.549, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.585, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.593, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.625, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.583, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.617, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.716, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.567, cfg={'features__max_features': 

>est=0.505, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.458, cfg={'features__max_features': 20, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.435, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.470, cfg={'features__max_features': 20, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.465, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.447, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.400, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.389, cfg={'features__max_features': 20, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.403, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf

In [9]:
PROB = np.array(PROB).reshape(4,-1)
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])

print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Australian readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.2
Sensitivity is : 0.21052631578947367
Specificity is : 0.18181818181818182
AUC for normal pipeline for Australian readers is 0.163
Confidence interval for the score: [0.032 - 0.34]


# Cancer Cases

In [12]:
# Cancer lesion side 
Side_View = {}
Side_View = {'Case_based_CC_CancerLesion':CC_Cancer,
             'Case_based_MLO_CancerLesion':MLO_Cancer}
rads = ['diffScore.CN','diffScore.AU']

## Case Based 
### Cancer Lesion Side

# CN and AU Logistic Regression 02
for index, rad in enumerate(rads):
    for key,value in Side_View.items():
        cancer_sort = value.sort_values(by = rad)
        cancer_sort_copy = cancer_sort.copy()
        cancer_sort_copy['percentile'] = pd.cut(cancer_sort_copy[rad], Bins[index], labels=[1,3,0])
        cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

        print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
        X = cancer_sort_drop.loc[:,0:202]
        y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)

        # prepare data (log transform)
        from scipy.stats import skew
        skewness = X.apply(lambda x: skew(x))
        skewed_feats = skewness[skewness > 0.75]
        skewed_feats = skewed_feats.index
        minimum = X[skewed_feats].min(axis = 1).min()
        skewed_feats = np.log1p(X[skewed_feats]+ abs(minimum))
        X = X.to_numpy()  

        pipe_lr2 = Pipeline([
            ('scaler',MinMaxScaler()),
            ('features',combined_features),
            ('lr',LogisticRegression(solver='liblinear',random_state=42))])
        param_lr2 = {'features__univ_select__k':[5,10,20,30],
                    'lr__penalty':['l1', 'l2'],
                    'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
        print('*'*30)
        print('Below is the results from the LR02:')
        cv_outer = LeaveOneOut()
        y_true,y_pred = list(),list()
        for train_ix, test_ix in cv_outer.split(X):
            X_train_, X_test = X[train_ix, :], X[test_ix, :]
            y_train_, y_test = y[train_ix], y[test_ix]

            # inner loop for feature selection and hyperparameter tuning 
            cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
            result = GridSearchCV(pipe_lr2, param_lr2, cv = cv_inner, n_jobs= -1,scoring = 'roc_auc',refit=True).fit(X_train_, y_train_.ravel())
            best_param = result.best_estimator_
            # evaluate model on the hold out validation set
            yhat_proba = list(best_param.predict_proba(X_test)[:,1])
            y_pred.append(yhat_proba)
            y_true.append(list(y_test[0]))
            # report progress
            print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
           # Calculate roc_auc on the hold out dataset
        print('auc: %.3f' % roc_auc_score(y_true, y_pred))       
        print('*'*30)

# CN and AU Logistic Regression 1 3 
for index, rad in enumerate(rads):
    for key,value in Side_View.items():
        cancer_sort = value.sort_values(by = rad)
        cancer_sort_copy = cancer_sort.copy()
        cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[1,3,0])
        cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

        print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
        X = cancer_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"]
        y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)


        # prepare data (log transform)
        from scipy.stats import skew
        skewness = X.apply(lambda x: skew(x))
        skewed_feats = skewness[skewness > 0.75]
        skewed_feats = skewed_feats.index
        minimum = X[skewed_feats].min(axis = 1).min()
        skewed_feats = np.log1p(X[skewed_feats]+ abs(minimum))
        X = X.to_numpy()
        
        # Logistic Regression 01 (log transform, Scaler, RFECV, LR)
        pipe01 = Pipeline([('scaler',MinMaxScaler()),
                         ('features',RFECV(estimator = LogisticRegression(solver='liblinear'),cv =3, scoring ='roc_auc')),
                         ('lr',LogisticRegression(solver='liblinear',random_state=42))])
        param01 = {'lr__penalty':['l1','l2'],
                'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
        print('*'*30)
        print('Below is the results from the LR01:')
        self_LR_pipe(X,y,pipe01,param01)
        print('*'*30)
        
        # Logistic Regression 03 (log transform, scaler, SelectFromModel, LR)
        selector = SelectFromModel(estimator=LogisticRegression(solver = 'liblinear'))
        pipe_lr03 = Pipeline([ 
                ('scaler',MinMaxScaler()),('features',selector),
            ('lr',LogisticRegression(solver = 'liblinear',random_state=42))]) 
        param_lr03 = {'features__max_features':[20,30,40],
                        'lr__penalty':['l1', 'l2'],
                        'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
        print('*'*30)
        print('Below is the results from the LR03:')               
        self_LR_pipe(X,y,pipe=pipe_lr03,param=param_lr03)
        print('*'*30)

# CN and AU Random Forest01

for index, rad in enumerate(rads):
    for key,value in Side_View.items():
        cancer_sort = value.sort_values(by = rad)
        cancer_sort_copy = cancer_sort.copy()
        cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[1,3,0])
        cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

        print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
        X = cancer_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
        y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)


        # Random forest 01 (scaler, pca + mututal info classifier, RandomForest)
        pca = PCA(n_components = 0.9) 
        selection = SelectKBest(mutual_info_classif,k=10)
        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
        pipe = Pipeline([ ('scaler',MinMaxScaler()),
                         ('features',combined_features),
                         ('rf',RandomForestClassifier(random_state=42))]) 
        param = {'features__univ_select__k':[10,20,30,40],
                 'rf__max_features':['sqrt','log2'],
                 'rf__n_estimators':[50,100,1000,2000]} 
        print('*'*50)
        print('Below is the results from the RF01:')
        cv_outer = LeaveOneOut()
        y_true,y_pred = list(),list()
        for train_ix, test_ix in cv_outer.split(X):
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]

            ## inner loop for feature selection and hyperparameter tuning 
            cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
            result = GridSearchCV(pipe, param, cv = cv_inner, scoring = 'roc_auc',n_jobs = -1,refit=True).fit(X_train, y_train.ravel())
            best_param = result.best_estimator_
            # evaluate model on the hold out evaluation dataset
            yhat_proba = list(best_param.predict_proba(X_test)[:,1])# reture the probability of predicting '1'
            y_pred.append(yhat_proba)
            y_true.append(list(y_test[0]))
            # report progress
            print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
            # Calculate roc_auc on the hold out dataset
        print('auc: %.3f' % roc_auc_score(y_true, y_pred))
        print('*'*50)
        


In [6]:
# CN and AU Random Forest 2
# df_cancer ={}
# AUC = dict()
# ACU = dict()
# SEN = dict()
# SPE = dict()
# for index, rad in enumerate(rads):
#     for key,value in Side_View.items():
#         cancer_sort = value.sort_values(by = rad)
#         cancer_sort_copy = cancer_sort.copy()
#         cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[1,3,0])
#         cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

#         print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
#         X = cancer_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
#         y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
#         over = SMOTE(sampling_strategy="minority",random_state=2)
#         X,y = over.fit_resample(X,y)
        
#         # Random Forest 02 (scaler, SelectFromModel, RandomForest)
#         selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
#         pipe = Pipeline([ #('scaler',MinMaxScaler()),
#                          ('features',selector),
#                          ('rf',RandomForestClassifier(random_state=42))]) 
#         param = {'features__max_features':[5,10,15],##################
#                          'rf__max_features':['sqrt','log2'], 
#                      'rf__n_estimators':[50,100,1000,2000] } 
#         print('*'*50)
#         print('Below is the results from the RF02:')
#         Dict,AUC_score,Accuracy,sensitivity1,specificity1  = self_rf_pipe(X,y,pipe,param)  
#         print('*'*50)
#         df_cancer[key]=Dict    
#         AUC[key]=AUC_score
#         ACU[key]=Accuracy
#         SEN[key]=sensitivity1
#         SPE[key]=specificity1

ThIS IS Case_based_CC_CancerLesion SIDE VIEW 


**************************************************
Below is the results from the RF02:
>est=0.472, cfg={'features__max_features': 5, 'rf__max_features': 'log2', 'rf__n_estimators': 50}
>est=0.444, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.759, cfg={'features__max_features': 5, 'rf__max_features': 'log2', 'rf__n_estimators': 100}
>est=0.648, cfg={'features__max_features': 15, 'rf__max_features': 'log2', 'rf__n_estimators': 2000}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_features': 'log2', 'rf__n_estimators': 50}
>est=0.713, cfg={'features__max_features': 5, 'rf__max_features': 'log2', 'rf__n_estimators': 50}
>est=0.593, cfg={'features__max_features': 10, 'rf__max_features': 'log2', 'rf__n_estimators': 50}
>est=0.620, cfg={'features__max_features': 5, 'rf__max_features': 'log2', 'rf__n_estimators': 50}
>est=0.639, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 

>est=0.809, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 2000}
>est=0.840, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.738, cfg={'features__max_features': 5, 'rf__max_features': 'log2', 'rf__n_estimators': 1000}
>est=0.682, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 2000}
>est=0.765, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 2000}
>est=0.889, cfg={'features__max_features': 10, 'rf__max_features': 'log2', 'rf__n_estimators': 1000}
>est=0.691, cfg={'features__max_features': 15, 'rf__max_features': 'log2', 'rf__n_estimators': 2000}
>est=0.741, cfg={'features__max_features': 15, 'rf__max_features': 'log2', 'rf__n_estimators': 2000}
>est=0.799, cfg={'features__max_features': 10, 'rf__max_features': 'log2', 'rf__n_estimators': 100}
>est=0.718, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50

In [13]:
# FOR CHINENSE READERS
rad = rads[0]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    cancer_sort = value.sort_values(by = rad)
    cancer_sort_copy = cancer_sort.copy()
    cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[0,3,1])
    cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==0],cancer_sort_copy[cancer_sort_copy['percentile']==1]],axis = 0)

    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = cancer_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
    y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
    pipe = Pipeline([ ('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,15],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
    PRED.append(pre_label)
    

ThIS IS Case_based_CC_CancerLesion SIDE VIEW 


>est=0.551, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.519, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.736, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.602, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.759, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.606, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.574, cfg={'features__max_

In [14]:
PROB = np.array(PROB).reshape(2,-1)
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])

print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Australian readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.4
Sensitivity is : 0.0
Specificity is : 0.8571428571428571
AUC for normal pipeline for Australian readers is 0.000
Confidence interval for the score: [0.000 - 0.0]


In [15]:
# FOR Australian READERS
rad = rads[1]
PROB = []
PRED = []
TRUE = []

for key,value in Side_View.items():
    cancer_sort = value.sort_values(by = rad)
    cancer_sort_copy = cancer_sort.copy()
    cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[0,3,1])
    cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==0],cancer_sort_copy[cancer_sort_copy['percentile']==1]],axis = 0)

    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = cancer_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
    y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
    pipe = Pipeline([ ('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,15],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
    PRED.append(pre_label)

ThIS IS Case_based_CC_CancerLesion SIDE VIEW 


>est=0.639, cfg={'features__max_features': 15, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.741, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.639, cfg={'features__max_features': 15, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.676, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.537, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.537, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.713, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.519, cfg={'features__max

In [16]:
PROB = np.array(PROB).reshape(2,-1)
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])

print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Australian readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.3125
Sensitivity is : 0.5
Specificity is : 0.0
AUC for normal pipeline for Australian readers is 0.067
Confidence interval for the score: [0.000 - 0.25]


## Location Based
### Cancer Lesion Side

In [17]:
pd.set_option('display.max_rows', 1200)
case_feat_location = pd.read_csv("/Users/jessy/Desktop/笔记本/Radiomics Data Analysis/diffScore w_out 10 dep Cn/case_feat_location_diffScore/Square_norm_feat.csv")
cancer_location = case_feat_location.merge(lesion_side.rename(columns={"LesionNumber":"LesionNum"}), how='left',on = ["CaseName","LesionNum"])
CC_CL_LC = cancer_location.loc[cancer_location["Side"]==cancer_location["LesionSide"],:].loc[cancer_location["View"]=="'CC'",:]
MLO_CL_LC = cancer_location.loc[cancer_location["Side"]==cancer_location["LesionSide"],:].loc[cancer_location["View"]=="'MLO'",:]

CC_CL_LC = CC_CL_LC[CC_CL_LC.LesionNum!=2]
MLO_CL_LC = MLO_CL_LC[MLO_CL_LC.LesionNum!=2]

In [18]:
# Cancer lesion side 
Side_View = {}
Side_View = {'Loc_based_CC_CancerLesion':CC_CL_LC,
             'Loc_based_MLO_CancerLesion': MLO_CL_LC}
rads = ['diffScore.CN','diffScore.AU']

# CN and AU Logistic Regression 02

for index, rad in enumerate(rads):
    for key,value in Side_View.items():
        cancer_sort = value.sort_values(by = rad)
        cancer_sort_copy = cancer_sort.copy()
        cancer_sort_copy['percentile'] = pd.cut(cancer_sort_copy[rad], Bins[index], labels=[1,3,0])
        cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

        print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
        X = cancer_sort_drop.loc[:,0:202]
        y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)



        # prepare data (log transform)
        from scipy.stats import skew
        skewness = X.apply(lambda x: skew(x))
        skewed_feats = skewness[skewness > 0.75]
        skewed_feats = skewed_feats.index
        minimum = X[skewed_feats].min(axis = 1).min()
        skewed_feats = np.log1p(X[skewed_feats]+ abs(minimum))
        X = X.to_numpy()  

        pipe_lr2 = Pipeline([
            ('scaler',MinMaxScaler()),
            ('features',combined_features),
            ('lr',LogisticRegression(solver='liblinear',random_state=42))])
        param_lr2 = {'features__univ_select__k':[5,10,20,30],
                    'lr__penalty':['l1', 'l2'],
                    'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
        print('*'*30)
        print('Below is the results from the LR02:')
        cv_outer = LeaveOneOut()
        y_true,y_pred = list(),list()
        for train_ix, test_ix in cv_outer.split(X):
            X_train_, X_test = X[train_ix, :], X[test_ix, :]
            y_train_, y_test = y[train_ix], y[test_ix]

            # inner loop for feature selection and hyperparameter tuning 
            cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
            result = GridSearchCV(pipe_lr2, param_lr2, cv = cv_inner, n_jobs= -1,scoring = 'roc_auc',refit=True).fit(X_train_, y_train_.ravel())
            best_param = result.best_estimator_
            # evaluate model on the hold out validation set
            yhat_proba = list(best_param.predict_proba(X_test)[:,1])
            y_pred.append(yhat_proba)
            y_true.append(list(y_test[0]))
            # report progress
            print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
           # Calculate roc_auc on the hold out dataset
        print('auc: %.3f' % roc_auc_score(y_true, y_pred))       
        print('*'*30)

# CN and AU Logistic Regression 1 3 
for index, rad in enumerate(rads):
    for key,value in Side_View.items():
        cancer_sort = value.sort_values(by = rad)
        cancer_sort_copy = cancer_sort.copy()
        cancer_sort_copy['percentile'] = pd.cut(cancer_sort_copy[rad], Bins[index], labels=[1,3,0])
        cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

        print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
        X = cancer_sort_drop.loc[:,0:202]
        y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)


        # prepare data (log transform)
        from scipy.stats import skew
        skewness = X.apply(lambda x: skew(x))
        skewed_feats = skewness[skewness > 0.75]
        skewed_feats = skewed_feats.index
        minimum = X[skewed_feats].min(axis = 1).min()
        skewed_feats = np.log1p(X[skewed_feats]+ abs(minimum))
        X = X.to_numpy()
        
        # Logistic Regression 01 (log transform, Scaler, RFECV, LR)
        pipe01 = Pipeline([('scaler',MinMaxScaler()),
                         ('features',RFECV(estimator = LogisticRegression(solver='liblinear'),cv =3, scoring ='roc_auc')),
                         ('lr',LogisticRegression(solver='liblinear',random_state=42))])
        param01 = {'lr__penalty':['l1','l2'],
                'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
        print('*'*30)
        print('Below is the results from the LR01:')
        self_LR_pipe(X,y,pipe01,param01)
        print('*'*30)
        
        # Logistic Regression 03 (log transform, scaler, SelectFromModel, LR)
        selector = SelectFromModel(estimator=LogisticRegression(solver = 'liblinear'))
        pipe_lr03 = Pipeline([ 
                ('scaler',MinMaxScaler()),('features',selector),
            ('lr',LogisticRegression(solver = 'liblinear',random_state=42))]) 
        param_lr03 = {'features__max_features':[20,30,40],
                        'lr__penalty':['l1', 'l2'],
                        'lr__C':[1000,100,10,1.0,0.1,0.01,0.001]} 
        print('*'*30)
        print('Below is the results from the LR03:')               
        self_LR_pipe(X,y,pipe=pipe_lr03,param=param_lr03)
        print('*'*30)

# CN and AU Random Forest01
for index, rad in enumerate(rads):
    for key,value in Side_View.items():
        cancer_sort = value.sort_values(by = rad)
        cancer_sort_copy = cancer_sort.copy()
        cancer_sort_copy['percentile'] = pd.cut(cancer_sort_copy[rad], Bins[index], labels=[1,3,0])
        cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

        print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
        X = cancer_sort_drop.loc[:,0:202].to_numpy()
        y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)


        
        # Random forest 01 (scaler, pca + mututal info classifier, RandomForest)
        pca = PCA(n_components = 0.9) 
        selection = SelectKBest(mutual_info_classif,k=10)
        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
        pipe = Pipeline([ ('scaler',MinMaxScaler()),
                         ('features',combined_features),
                         ('rf',RandomForestClassifier(random_state=42))]) 
        param = {'features__univ_select__k':[10,20,30,40],
                 'rf__max_features':['sqrt','log2'],
                 'rf__n_estimators':[50,100,1000,2000]} 
        print('*'*50)
        print('Below is the results from the RF01:')
        cv_outer = LeaveOneOut()
        y_true,y_pred = list(),list()
        for train_ix, test_ix in cv_outer.split(X):
            X_train, X_test = X[train_ix, :], X[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]

            ## inner loop for feature selection and hyperparameter tuning 
            cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
            result = GridSearchCV(pipe, param, cv = cv_inner, scoring = 'roc_auc',n_jobs = -1,refit=True).fit(X_train, y_train.ravel())
            best_param = result.best_estimator_
            # evaluate model on the hold out evaluation dataset
            yhat_proba = list(best_param.predict_proba(X_test)[:,1])# reture the probability of predicting '1'
            y_pred.append(yhat_proba)
            y_true.append(list(y_test[0]))
            # report progress
            print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
            # Calculate roc_auc on the hold out dataset
        print('auc: %.3f' % roc_auc_score(y_true, y_pred))
        print('*'*50)

In [9]:
# # CN and AU Random Forest 2
# df_location = {}
# AUC = dict()
# ACU = dict()
# SEN = dict()
# SPE = dict()
# for index, rad in enumerate(rads):
#     for key,value in Side_View.items():
#         cancer_sort = value.sort_values(by = rad)
#         cancer_sort_copy = cancer_sort.copy()
#         cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[1,3,0])
#         cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

#         print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
#         X = cancer_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
#         y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
#         over = SMOTE(sampling_strategy="minority",random_state=2)
#         X,y = over.fit_resample(X,y)
    

#         # Random Forest 02 (scaler, SelectFromModel, RandomForest)
#         selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
#         pipe = Pipeline([ #('scaler',MinMaxScaler()),
#                          ('features',selector),
#                          ('rf',RandomForestClassifier(random_state=42))]) 
#         param = {'features__max_features':[5,10,15],
#                          'rf__max_features':['sqrt'], 
#                      'rf__n_estimators':[50,100] } 
#         print('*'*50)
#         print('Below is the results from the RF02:')
#         Dict,AUC_score,Accuracy,sensitivity1,specificity1  = self_rf_pipe(X,y,pipe,param)  
#         print('*'*50) 
#         df_location[key]=Dict
#         AUC[key]=AUC_score
#         ACU[key]=Accuracy
#         SEN[key]=sensitivity1
#         SPE[key]=specificity1

ThIS IS Loc_based_CC_CancerLesion SIDE VIEW 


**************************************************
Below is the results from the RF02:
>est=0.681, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.764, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.611, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.713, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.546, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.495, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.463, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.676, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.634, cfg={'features__max_features': 10, 'rf__max_features': 'sqr

>est=0.546, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.577, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.457, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.605, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.568, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.568, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.435, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.389, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.556, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
4 : 2
5 : 5
6 : 1
8 : 1
9 : 2
15 : 1
17 : 1
21 : 1
22 : 1
32 : 1
36 : 1
38 : 1
40 : 2
41 : 1
55 : 1
69 : 3


In [20]:
# FOR CHINENSE READERS
rad = rads[0]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    cancer_sort = value.sort_values(by = rad)
    cancer_sort_copy = cancer_sort.copy()
    cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[0,3,1])
    cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = cancer_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
    y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
    pipe = Pipeline([ ('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,15],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
    PRED.append(pre_label)

ThIS IS Loc_based_CC_CancerLesion SIDE VIEW 


>est=0.644, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.574, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.690, cfg={'features__max_features': 15, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.611, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.560, cfg={'features__max_features': 15, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.731, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.676, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.755, cfg={'features__max_features': 15, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.694, cfg={'features_

In [21]:
PROB = np.array(PROB).reshape(2,-1)
# get the maximum prob among predictions of the four views
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Chinese readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.42857142857142855
Sensitivity is : 0.42857142857142855
Specificity is : 0.42857142857142855
AUC for normal pipeline for Chinese readers is 0.337
Confidence interval for the score: [0.044 - 0.677]


In [23]:
# FOR AU READERS
rad = rads[1]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    cancer_sort = value.sort_values(by = rad)
    cancer_sort_copy = cancer_sort.copy()
    cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[0,3,1])
    cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = cancer_sort_drop.loc[:,"feat.Square.Normalized.1":"feat.Square.Normalized.203"].to_numpy()
    y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
    pipe = Pipeline([ ('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,15],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
#     PRED.append(pre_label)
    

ThIS IS Loc_based_CC_CancerLesion SIDE VIEW 


>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.528, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features

In [24]:
PROB = np.array(PROB).reshape(2,-1)
# get the maximum prob among predictions of the four views
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Australian readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.5333333333333333
Sensitivity is : 0.7777777777777778
Specificity is : 0.16666666666666666
AUC for normal pipeline for Australian readers is 0.389
Confidence interval for the score: [0.000 - 0.769]
