In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import RFECV,SelectKBest,mutual_info_classif,SelectFromModel
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV,LeaveOneOut,RepeatedStratifiedKFold,RepeatedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score,auc,accuracy_score,confusion_matrix
from mat4py import loadmat
import matplotlib.pyplot as plt 
import math
import random
import warnings
warnings.filterwarnings("ignore")
import joblib

In [2]:
# Case-based difficuty score
case_feat_normal = pd.read_csv('/Users/jessy/Desktop/笔记本/Radiomics Data Analysis/diffScore w_out 10 dep Cn/case_feat_diffScore/whole_norm_feat.csv')
case_feat_normal = pd.DataFrame(case_feat_normal).drop("Unnamed: 0",axis=1)
print('Whether there is missing value in normal mat: {}'.format(case_feat_normal.isnull().values.any())) 
case_feat_cancer = pd.read_csv('/Users/jessy/Desktop/笔记本/Radiomics Data Analysis/diffScore w_out 10 dep Cn/case_feat_case_cancer_diffScore/whole_norm_feat.csv')
case_feat_cancer = pd.DataFrame(case_feat_cancer).drop("Unnamed: 0",axis=1)
print('Whether there is missing value in cancer mat: {}'.format(case_feat_cancer.isnull().values.any())) 

Whether there is missing value in normal mat: False
Whether there is missing value in cancer mat: False


In [3]:
CC_Normal = case_feat_normal.loc[case_feat_normal['View']=="'CC'",]
MLO_Normal = case_feat_normal.loc[case_feat_normal['View']=="'MLO'",]
left_CC = CC_Normal.loc[CC_Normal["Side"]=="L",]
right_CC =CC_Normal.loc[CC_Normal["Side"]=="R",]
left_MLO = MLO_Normal.loc[MLO_Normal["Side"]=="L",]
right_MLO = MLO_Normal.loc[MLO_Normal["Side"]=="R",]

In [4]:
lesion_side = pd.read_csv("/Users/jessy/Desktop/笔记本/Radiomics Data Analysis/diffScore w_out 10 dep Cn/lesion_side.csv")
case_feat_cancer_ = case_feat_cancer.merge(lesion_side, how='left',on = "CaseName").drop("Unnamed: 0",axis=1)
CC_Cancer = case_feat_cancer_.loc[case_feat_cancer_["Side"]==case_feat_cancer_["LesionSide"] ,].loc[case_feat_cancer_["View"]=="'CC'",]
MLO_Cancer = case_feat_cancer_.loc[case_feat_cancer_["Side"]==case_feat_cancer_["LesionSide"] ,].loc[case_feat_cancer_["View"]=="'MLO'",]

CC_Cancer.drop(17, axis=0, inplace=True)
MLO_Cancer.drop(23, axis=0, inplace=True)

In [5]:
print('The shape of CC_Normal is {}'.format(CC_Normal.shape))
print('The shape of CC_CancerLesion is {}'.format(CC_Cancer.shape))
print('The shape of MLO_Normal is {}'.format(MLO_Normal.shape))
print('The shape of MLO_CancerLesion is {}'.format(MLO_Cancer.shape))

The shape of CC_Normal is (80, 210)
The shape of CC_CancerLesion is (20, 212)
The shape of MLO_Normal is (80, 210)
The shape of MLO_CancerLesion is (20, 212)


In [6]:
# ALGORITHM: to evaluate the entire pipeline
def pipeline_evaluate(X,y,pipe, param):
    random.seed(24)
    cv_outer = LeaveOneOut()
    y_true,y_pred, Predicted_class  = list(),list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]

        ## inner loop for feature selection and hyperparameter tuning 
        cv_inner = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=2)
        result = GridSearchCV(pipe, param_grid=param, cv = cv_inner, scoring = 'roc_auc',n_jobs = -1,refit=True).fit(X_train, y_train.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out evaluation dataset
        yhat_proba = best_param.predict_proba(X_test)[:,1]# reture the probability of predicting '1'
        y_pred.append(yhat_proba[0])
        Predicted_class.append(best_param.predict(X_test))
        y_true.append(y_test[0])
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
    return y_pred,Predicted_class, y_true


In [7]:
# algorithms to use: Random Forest

def self_rf_pipe(X,y,pipe, param):
    features = {}
    row = 0
    random.seed(24)
    cv_outer = LeaveOneOut()
    y_true,y_pred, Predicted_class  = list(),list(),list()
    for train_ix, test_ix in cv_outer.split(X):
        row+=1
        X_train, X_test = X[train_ix, :], X[test_ix, :]
        y_train, y_test = y[train_ix], y[test_ix]

        ## inner loop for feature selection and hyperparameter tuning 

        cv_inner = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=2)
        result = GridSearchCV(pipe, param_grid=param, cv = cv_inner, scoring = 'roc_auc',n_jobs = -1,refit=True).fit(X_train, y_train.ravel())
        best_param = result.best_estimator_
        # evaluate model on the hold out evaluation dataset
        
        yhat_proba = best_param.predict_proba(X_test)[:,1]# reture the probability of predicting '1'
        y_pred.append(yhat_proba[0])
        Predicted_class.append(best_param.predict(X_test))
        y_true.append(y_test[0])
        # report progress
        print('>est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
        features[row] = np.where(result.best_estimator_.named_steps['features'].get_support())[0]

    
    Dict = {}
    for key in sorted(features):
        for num in features[key]:
            if num not in Dict:
                Dict[num] = 1
            else:
                Dict[num]+=1
    
    KEY = []

    for key in sorted(Dict):
        print(key,':',Dict[key])
        if Dict[key] >= len(X)/2:
            KEY.append(key)
    print(KEY) 

    # Calculate roc_auc on the hold out dataset
    AUC_score = roc_auc_score(y_true, y_pred)
    Accuracy = accuracy_score(y_true,Predicted_class)
    print('auc: %.3f' % AUC_score)
    print("Accuracy: ", Accuracy)
  
    print(y_true)
    print(y_pred)

    
    cm1 = confusion_matrix(y_true,Predicted_class)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    print('Sensitivity : ', sensitivity1 )

    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    return Dict,AUC_score,Accuracy,sensitivity1,specificity1

# Normal Cases

In [8]:
Side_View = {}
Side_View = {'left_CC':left_CC,
             'right_CC':right_CC,
             'left_MLO':left_MLO,
             'right_MLO':right_MLO}
rads = ['diffScore.CN','diffScore.AU']
#Bins = [[0, 0.60, 0.8, 1],[0, 0.69, 0.81, 1]]

1/3 : for Chinese 0.60 [0.6333, 0.8] 0.83
1/3 : for Australian 0.69 [0.75,0.81] 0.88

## Chinese Normal 

In [9]:
# Random Forest 02 (scaler, SelectFromModel, RandomForest)
# rad = rads[0]
# df_cn = dict()
# AUC = dict()
# ACU = dict()
# SEN = dict()
# SPE = dict()
# #bins = Bins[0]
# for key,value in Side_View.items():
#     Normal_sort = value.sort_values(by = rad)
#     Normal_sort_copy = Normal_sort.copy()
#     Normal_sort_copy['percentile']=pd.qcut(Normal_sort_copy[rad],3, labels=[0,3,1])
#     Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
#     print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
#     X = Normal_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
#     y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)
  
#     selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 5)
#     pipe = Pipeline([('scaler',MinMaxScaler()),
#                      ('features',selector),
#                      ('rf',RandomForestClassifier(random_state=42))]) 
#     param = {'features__max_features':[5,10,20],
#             "rf__max_depth":[1,5,10],
#              "rf__max_samples":[0.1,0.5,1.0],
#             'rf__n_estimators':[50,100,1000] } 
#     print('*'*50)
#     Dict,AUC_score,Accuracy,sensitivity1,specificity1 = self_rf_pipe(X,y,pipe,param)  
#     print('*'*50)
#     df_cn[key]=Dict
#     AUC[key]=AUC_score
#     ACU[key]=Accuracy
#     SEN[key]=sensitivity1
#     SPE[key]=specificity1
    

In [10]:
# PIPELINE EVALUATE FOR CHINESE 
rad = rads[0]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.qcut(Normal_sort_copy[rad],3, labels=[0,3,1])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==0],Normal_sort_copy[Normal_sort_copy['percentile']==1]],axis = 0)
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 5)
    pipe = Pipeline([('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,20],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.append(probability)
    PRED.append(pre_label)
    

ThIS IS left_CC SIDE VIEW 


>est=0.847, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.849, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.851, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.858, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.856, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.894, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.869, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.907, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.896, cfg={'features__max_features': 10

>est=0.807, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.803, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.828, cfg={'features__max_features': 20, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.778, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.812, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.791, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.802, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.785, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.829, cfg={'features__max_features': 5, 'rf__max_depth': 5

In [11]:
pipeline_prob_cn = np.array(PROB).reshape(4,-1)
agg_prob_cn = np.max(pipeline_prob_cn, axis=0)

label = []
for i in range(len(agg_prob_cn)):
    if agg_prob_cn[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])

print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)


# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, agg_prob_cn)
print("AUC for normal pipeline for Chinese readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(agg_prob_cn), len(agg_prob_cn))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], agg_prob_cn[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.6666666666666666
Sensitivity is : 0.4666666666666667
Specificity is : 0.9166666666666666
AUC for normal pipeline for Chinese readers is 0.786
Confidence interval for the score: [0.588 - 0.949]


## Austalian normal

In [12]:
# PIPELINE EVALUATE FOR AUSTRALIAN 
rad = rads[1]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    Normal_sort = value.sort_values(by = rad)
    Normal_sort_copy = Normal_sort.copy()
    Normal_sort_copy['percentile']=pd.qcut(Normal_sort_copy[rad],3, labels=[0,3,1])
    Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==0],Normal_sort_copy[Normal_sort_copy['percentile']==1]],axis = 0)
    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = Normal_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
    y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 5)
    pipe = Pipeline([('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,20],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
    PRED.append(pre_label)

ThIS IS left_CC SIDE VIEW 


>est=0.584, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.559, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.633, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.596, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.593, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 1000}
>est=0.627, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.640, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.688, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.616, cfg={'features__max_features': 10, '

>est=0.597, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.505, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.539, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.558, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.539, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 100}
>est=0.546, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.554, cfg={'features__max_features': 20, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.584, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.543, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__ma

In [13]:
PROB = np.array(PROB).reshape(4,-1)
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])

print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)


# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Australian readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.3333333333333333
Sensitivity is : 0.21052631578947367
Specificity is : 0.5454545454545454
AUC for normal pipeline for Australian readers is 0.309
Confidence interval for the score: [0.115 - 0.528]


In [9]:
# Random Forest 02 (scaler, SelectFromModel, RandomForest)
# rad = rads[1]
# df_au = dict()
# AUC = dict()
# ACU = dict()
# SEN = dict()
# SPE = dict()
# #bins = Bins[0]
# for key,value in Side_View.items():
#     Normal_sort = value.sort_values(by = rad)
#     Normal_sort_copy = Normal_sort.copy()
#     Normal_sort_copy['percentile']=pd.qcut(Normal_sort_copy[rad],3, labels=[1,3,0])
#     Normal_sort_drop = pd.concat([Normal_sort_copy[Normal_sort_copy['percentile']==1],Normal_sort_copy[Normal_sort_copy['percentile']==0]],axis = 0)
    
#     print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
#     X = Normal_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
#     y = Normal_sort_drop['percentile'].to_numpy().reshape(-1,1)
#     over = SMOTE(sampling_strategy="minority",random_state=2)
#     X,y = over.fit_resample(X,y)


#     selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 5)
#     pipe = Pipeline([ #('scaler',MinMaxScaler()),
#                      ('features',selector),
#                      ('rf',RandomForestClassifier(random_state=42))]) 
#     param = {'features__max_features':[5,10,20],
#             "rf__max_depth":[1,5,10],
#              "rf__max_samples":[0.1,0.5,1.0],
#                  'rf__n_estimators':[50,100,1000] } 
#     print('*'*50)
#     Dict,AUC_score,Accuracy,sensitivity1,specificity1 = self_rf_pipe(X,y,pipe,param)  
#     print('*'*50)
#     df_au[key]=Dict
#     AUC[key]=AUC_score
#     ACU[key]=Accuracy
#     SEN[key]=sensitivity1
#     SPE[key]=specificity1

ThIS IS left_CC SIDE VIEW 


**************************************************
>est=0.817, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.813, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.835, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.825, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.821, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.853, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.890, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.834, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimator

>est=0.806, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.824, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.903, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.925, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.858, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.870, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.883, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.722, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.749, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max

**************************************************
>est=0.868, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.792, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.721, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.775, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.719, cfg={'features__max_features': 20, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.807, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.631, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.672, cfg={'features__max_features': 20, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.638, cfg={

# Cancer Cases

In [14]:
# Cancer lesion side  case based
Side_View = {}
Side_View = {'Case_based_CC_CancerLesion':CC_Cancer,
             'Case_based_MLO_CancerLesion':MLO_Cancer}
rads = ['diffScore.CN','diffScore.AU']

Case Based : Chinese CancerLesion 0.50 [0.5444,0.777778] 0.80
Case Based : Chinese CancerNormal  0.4 [0.500000, 0.733333] 0.76
Case Based : AU CancerLesion  0.56 [0.75,0.90] 0.94
Case Based : AU CancerNormal  0.56 [0.75, 0.88] 0.94

Location Based : Chinese CL_LC 0.20 [0.246667,0.580000]
Location Based : Chinese CL_CN 0.17 [0.20,0.57]
Location Based : AU CL_LC     0.50 [0.54,0.77]
Location Based : AU  CL_CN    0.50 [0.56,0.75]

In [15]:
# FOR CHINENSE READERS
rad = rads[0]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    cancer_sort = value.sort_values(by = rad)
    cancer_sort_copy = cancer_sort.copy()
    cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[0,3,1])
    cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==0],cancer_sort_copy[cancer_sort_copy['percentile']==1]],axis = 0)

    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = cancer_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
    y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
    pipe = Pipeline([ ('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,15],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
    PRED.append(pre_label)
    

ThIS IS Case_based_CC_CancerLesion SIDE VIEW 


>est=0.657, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.565, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.546, cfg={'features__max_features': 15, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.657, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.583, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.731, cfg={'features__max_features': 15, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.620, cfg={'features__max_features': 10, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 1000}
>est=0.528, cfg={'feature

In [16]:
PROB = np.array(PROB).reshape(2,-1)
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)


# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Australian readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.3333333333333333
Sensitivity is : 0.0
Specificity is : 0.7142857142857143
AUC for normal pipeline for Australian readers is 0.000
Confidence interval for the score: [0.000 - 0.0]


In [17]:
## FOR AUSTRALIAN READERS
rad = rads[1]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    cancer_sort = value.sort_values(by = rad)
    cancer_sort_copy = cancer_sort.copy()
    cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[0,3,1])
    cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==0],cancer_sort_copy[cancer_sort_copy['percentile']==1]],axis = 0)

    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = cancer_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
    y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
    pipe = Pipeline([ ('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,15],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    PROB.extend(probability)
    PRED.append(pre_label)
    


ThIS IS Case_based_CC_CancerLesion SIDE VIEW 


>est=0.593, cfg={'features__max_features': 15, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.657, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.611, cfg={'features__max_features': 10, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.648, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.509, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.556, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.611, cfg={'features__max_features': 10, 'rf__max_depth': 1, 'rf__max_samples': 0.5, 'rf__n_estimators': 1000}
>est=0.500, cfg={'features_

In [18]:
PROB = np.array(PROB).reshape(2,-1)

# get the maximum prob among predictions of the four views
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Australian readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.375
Sensitivity is : 0.3
Specificity is : 0.5
AUC for normal pipeline for Australian readers is 0.383
Confidence interval for the score: [0.067 - 0.767]


In [9]:
# # CN and AU Random Forest 2
# df_cancer ={}
# AUC = dict()
# ACU = dict()
# SEN = dict()
# SPE = dict()
# for index, rad in enumerate(rads):
#     for key,value in Side_View.items():
#         cancer_sort = value.sort_values(by = rad)
#         cancer_sort_copy = cancer_sort.copy()
#         cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[1,3,0])
#         cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

#         print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
#         X = cancer_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
#         y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
# #         over = SMOTE(sampling_strategy="minority",random_state=2)
# #         X,y = over.fit_resample(X,y)

       
#         # Random Forest 02 (scaler, SelectFromModel, RandomForest)
#         selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
#         pipe = Pipeline([ #('scaler',MinMaxScaler()),
#                          ('features',selector),
#                          ('rf',RandomForestClassifier(random_state=42))]) 
#         param = {'features__max_features':[5,10,15],##################
#                          'rf__max_features':['sqrt'], 
#                      'rf__n_estimators':[50,100,1000] } 
#         print('*'*50)
#         print('Below is the results from the RF02:')
#         Dict,AUC_score,Accuracy,sensitivity1,specificity1  = self_rf_pipe(X,y,pipe,param)  
#         print('*'*50)
#         df_cancer[key]=Dict    
#         AUC[key]=AUC_score
#         ACU[key]=Accuracy
#         SEN[key]=sensitivity1
#         SPE[key]=specificity1

ThIS IS Case_based_CC_CancerLesion SIDE VIEW 


**************************************************
Below is the results from the RF02:
>est=0.731, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.537, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.574, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.694, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.537, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.722, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.565, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.778, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.676, cfg={'features__max_features': 5, 'rf__max_features': 

>est=0.799, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.809, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.827, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.759, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.830, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.793, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.818, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.773, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.920, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.806, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>e

## Location Based
### Cancer Lesion Side

In [19]:
pd.set_option('display.max_rows', 1200)
case_feat_location = pd.read_csv("/Users/jessy/Desktop/笔记本/Radiomics Data Analysis/diffScore w_out 10 dep Cn/case_feat_location_diffScore/whole_norm_feat.csv")
cancer_location = case_feat_location.merge(lesion_side.rename(columns={"LesionNumber":"LesionNum"}), how='left',on = ["CaseName","LesionNum"])
CC_CL_LC = cancer_location.loc[cancer_location["Side"]==cancer_location["LesionSide"],:].loc[cancer_location["View"]=="'CC'",:]
MLO_CL_LC = cancer_location.loc[cancer_location["Side"]==cancer_location["LesionSide"],:].loc[cancer_location["View"]=="'MLO'",:]

CC_CL_LC = CC_CL_LC[CC_CL_LC.LesionNum!=2]
MLO_CL_LC = MLO_CL_LC[MLO_CL_LC.LesionNum!=2]

In [20]:
# Cancer lesion side 
Side_View = {}
Side_View = {'Loc_based_CC_CancerLesion':CC_CL_LC,
             'Loc_based_MLO_CancerLesion': MLO_CL_LC}
rads = ['diffScore.CN','diffScore.AU']
#Bins = [[0,0.2,0.58,1],[0,0.5,0.77,1]] 
# For location 
# Chinese [0,0,25,0.7,1]
# Au [0,0.5625,0.7500,1]

In [21]:
# FOR CHINENSE READERS
rad = rads[0]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    cancer_sort = value.sort_values(by = rad)
    cancer_sort_copy = cancer_sort.copy()
    cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[0,3,1])
    cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = cancer_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
    y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
    pipe = Pipeline([ ('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,15],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
    PRED.append(pre_label)
    


ThIS IS Loc_based_CC_CancerLesion SIDE VIEW 


>est=0.676, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.639, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.546, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 0.5, 'rf__n_estimators': 50}
>est=0.713, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 0.5, 'rf__n_estimators': 100}
>est=0.579, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.727, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.565, cfg={'features__max_feat

In [22]:
PROB = np.array(PROB).reshape(2,-1)
# get the maximum prob among predictions of the four views
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Chinese readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.42857142857142855
Sensitivity is : 0.14285714285714285
Specificity is : 0.7142857142857143
AUC for normal pipeline for Chinese readers is 0.327
Confidence interval for the score: [0.062 - 0.673]


In [23]:
# FOR AU READERS
rad = rads[1]
PROB = []
PRED = []
TRUE = []
for key,value in Side_View.items():
    cancer_sort = value.sort_values(by = rad)
    cancer_sort_copy = cancer_sort.copy()
    cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[0,3,1])
    cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

    print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
    X = cancer_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
    y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
    selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
    pipe = Pipeline([ ('scaler',MinMaxScaler()),
                     ('features',selector),
                     ('rf',RandomForestClassifier(random_state=42))]) 
    param = {"features__max_features":[5,10,15],
            "rf__max_depth":[1,5,10],
             "rf__max_samples":[0.1,0.5,1.0],
            "rf__n_estimators":[50,100,1000] } 
    probability, pre_label,true_value = pipeline_evaluate(X,y,pipe, param)
    
    AUC_score = roc_auc_score(true_value, probability)
    Accuracy = accuracy_score(true_value,pre_label)
    
    cm1 = confusion_matrix(true_value,pre_label)
    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print('Specificity : ', specificity1)
    print("The AUC for this view is :", AUC_score)
    print("The ACC for this view is :", Accuracy)
    print("The sen for this view is :", sensitivity1)
    print("The spe for this view is :", specificity1)
    
    PROB.extend(probability)
#     PRED.append(pre_label)
    

ThIS IS Loc_based_CC_CancerLesion SIDE VIEW 


>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.528, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.574, cfg={'features__max_features': 5, 'rf__max_depth': 5, 'rf__max_samples': 1.0, 'rf__n_estimators': 50}
>est=0.546, cfg={'features__max_features': 5, 'rf__max_depth': 10, 'rf__max_samples': 1.0, 'rf__n_estimators': 100}
>est=0.500, cfg={'features__max_features': 5, 'rf__max_depth': 1, 'rf__max_samples': 0.1, 'rf__n_estimators': 50}
>est=0.500, cfg={'features__max_feature

In [24]:
PROB = np.array(PROB).reshape(2,-1)
# get the maximum prob among predictions of the four views
PROB_max = np.max(PROB,axis=0)

label = []
for i in range(len(PROB_max)):
    if PROB_max[i]>0.5:
        label.append(1)
    else:
        label.append(0)
         
# calculate sensitivity, specificity, accuracy
Accuracy = accuracy_score(true_value,label)
cm1 = confusion_matrix(true_value,label)
sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print("Accuray is :",Accuracy )
print("Sensitivity is :", sensitivity1)
print("Specificity is :", specificity1)

# calculate pipeline AUC
AUC_score = roc_auc_score(true_value, PROB_max)
print("AUC for normal pipeline for Australian readers is {:.3f}".format(AUC_score))

# get confidence interval
n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_scores = []

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(PROB_max), len(PROB_max))
    if len(np.unique(np.array(true_value)[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    score = roc_auc_score(np.array(true_value)[indices], PROB_max[indices])
    bootstrapped_scores.append(score)
#     print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))

sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
confidence_lower = sorted_scores[int(0.025 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.975 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
    confidence_lower, confidence_upper))

Accuray is : 0.4
Sensitivity is : 0.6666666666666666
Specificity is : 0.0
AUC for normal pipeline for Australian readers is 0.333
Confidence interval for the score: [0.000 - 0.692]


In [26]:

PROB

array([[0.28      , 0.06      , 0.28      , 0.28      , 0.28      ,
        0.26      , 0.8       , 0.32      , 0.32      , 0.32      ,
        0.32      , 0.32      , 0.32      , 0.32      , 0.32      ],
       [0.13152536, 0.48966667, 0.        , 0.22      , 0.405     ,
        0.484     , 0.60860245, 0.14      , 0.39      , 0.20275092,
        0.20391256, 0.38      , 0.50033333, 0.24      , 0.74112266]])

In [12]:
# # CN and AU Random Forest 2
# df_location = {}
# AUC = dict()
# ACU = dict()
# SEN = dict()
# SPE = dict()
# for index, rad in enumerate(rads):
#     for key,value in Side_View.items():
#         cancer_sort = value.sort_values(by = rad)
#         cancer_sort_copy = cancer_sort.copy()
#         cancer_sort_copy['percentile'] = pd.qcut(cancer_sort_copy[rad], 3, labels=[1,3,0])
#         cancer_sort_drop = pd.concat([cancer_sort_copy[cancer_sort_copy['percentile']==1],cancer_sort_copy[cancer_sort_copy['percentile']==0]],axis = 0)

#         print('ThIS IS {} SIDE VIEW'.format(key),'\n\n')
#         X = cancer_sort_drop.loc[:,"feat.Whole.Normalized.1":"feat.Whole.Normalized.203"].to_numpy()
#         y = cancer_sort_drop['percentile'].to_numpy().reshape(-1,1)
#         over = SMOTE(sampling_strategy="minority",random_state=2)
#         X,y = over.fit_resample(X,y)


#         # Random Forest 02 (scaler, SelectFromModel, RandomForest)
#         selector = SelectFromModel(estimator=RandomForestClassifier(),max_features = 20)
#         pipe = Pipeline([ #('scaler',MinMaxScaler()),
#                          ('features',selector),
#                          ('rf',RandomForestClassifier(random_state=42))]) 
#         param = {'features__max_features':[5,10,15],
#                          'rf__max_features':['sqrt'], 
#                      'rf__n_estimators':[50,100,1000] } 
#         print('*'*50)
#         print('Below is the results from the RF02:')
#         Dict,AUC_score,Accuracy,sensitivity1,specificity1  = self_rf_pipe(X,y,pipe,param)  
#         print('*'*50) 
#         df_location[key]=Dict
#         AUC[key]=AUC_score
#         ACU[key]=Accuracy
#         SEN[key]=sensitivity1
#         SPE[key]=specificity1

ThIS IS Loc_based_CC_CancerLesion SIDE VIEW 


**************************************************
Below is the results from the RF02:
>est=0.644, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.574, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.667, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.681, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.662, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.676, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.593, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.574, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.431, cfg={'features__max_features': 10, 'rf__max_features': '

>est=0.809, cfg={'features__max_features': 10, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.864, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.815, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
>est=0.642, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.735, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.796, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.691, cfg={'features__max_features': 15, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.586, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 1000}
>est=0.704, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.630, cfg={'features__max_features': 5, 'rf__max_features': 'sqrt', 'rf__n_estimators': 50}
>est=0.7