In [11]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve,ConfusionMatrixDisplay, classification_report, precision_recall_curve, auc, balanced_accuracy_score
import pandas as pd

In [13]:
# Re-assign labels from TCGA to custom NCIT labels and encode the data

D, tcga_maf, samples = pickle.load(open('/home/janaya2/Desktop/ATGC_paper/figures/tumor_classification/data/data.pkl', 'rb'))
del tcga_maf, D

# filtering the NCI-T labels (https://livejohnshopkins-my.sharepoint.com/:x:/r/personal/abaras1_jh_edu/_layouts/15/doc2.aspx?sourcedoc=%7B5f92f0fc-ec6c-40d5-ab17-0d3345f9f2c2%7D&action=edit&activeCell=%27Sheet1%27!B21&wdinitialsession=e072a38f-57c8-4c1f-885b-efaefcc81d35&wdrldsc=2&wdrldc=1&wdrldr=AccessTokenExpiredWarning%2CRefreshingExpiredAccessT)
ncit_labels_kept = ['Muscle-Invasive Bladder Carcinoma','Infiltrating Ductal Breast Carcinoma',
                    'Invasive Lobular Breast Carcinoma','Cervical Squamous Cell Carcinoma',
                    'Colorectal Adenocarcinoma','Glioblastoma','Head and Neck Squamous Cell Carcinoma',
                    'Clear Cell Renal Cell Carcinoma','Papillary Renal Cell Carcinoma','Astrocytoma',
                    'Oligoastrocytoma','Oligodendroglioma','Hepatocellular Carcinoma','Lung Adenocarcinoma',
                    'Lung Squamous Cell Carcinoma','Ovarian Serous Adenocarcinoma','Adenocarcinoma, Pancreas',
                    'Paraganglioma','Pheochromocytoma','Prostate Acinar Adenocarcinoma','Colorectal Adenocarcinoma',
                    'Desmoid-Type Fibromatosis','Leiomyosarcoma','Liposarcoma','Malignant Peripheral Nerve Sheath Tumor',
                    'Myxofibrosarcoma','Synovial Sarcoma','Undifferentiated Pleomorphic Sarcoma',
                    'Cutaneous Melanoma','Gastric Adenocarcinoma','Testicular Non-Seminomatous Germ Cell Tumor',
                    'Testicular Seminoma','Thyroid Gland Follicular Carcinoma','Thyroid Gland Papillary Carcinoma',
                    'Endometrial Endometrioid Adenocarcinoma','Endometrial Serous Adenocarcinoma']
ncit_samples = samples.loc[samples['NCI-T Label'].isin(ncit_labels_kept)]
PCPG_ncit = ['Paraganglioma','Pheochromocytoma']
SARC_ncit = ['Desmoid-Type Fibromatosis','Leiomyosarcoma','Liposarcoma','Malignant Peripheral Nerve Sheath Tumor',
             'Myxofibrosarcoma','Synovial Sarcoma','Undifferentiated Pleomorphic Sarcoma']
TGCT_ncit = ['Testicular Non-Seminomatous Germ Cell Tumor','Testicular Seminoma']
ncit_samples.loc[ncit_samples['NCI-T Label'].isin(PCPG_ncit), 'NCI-T Label'] = 'PCPG'
ncit_samples.loc[ncit_samples['NCI-T Label'].isin(SARC_ncit), 'NCI-T Label'] = 'SARC'
ncit_samples.loc[ncit_samples['NCI-T Label'].isin(TGCT_ncit), 'NCI-T Label'] = 'TGCT'
#print(ncit_samples.loc[ncit_samples['NCI-T Label'] == 'Testicular Seminoma']['NCI-T Label'])
#print(list(set(ncit_samples['NCI-T Label'])))

A = ncit_samples['NCI-T Label'].astype('category')
classes = A.cat.categories.values
##integer values for random forest
classes_onehot = np.eye(len(classes))[A.cat.codes]
y_label = classes_onehot

y_strat = np.argmax(y_label, axis=-1)
class_counts = dict(zip(*np.unique(y_strat, return_counts=True)))
y_weights = np.array([1 / class_counts[_] for _ in y_strat])
y_weights /= np.sum(y_weights)

# Comment gene and context predictions below if plotting for "both" data types predictions later
#test_idx, mil_predictions_gene = pickle.load(open('/home/mlee276/Desktop/TCGA-ML-main/results/mil_gene_predictions.pkl', 'rb'))
#test_idx, nn_predictions_gene, _ = pickle.load(open('/home/mlee276/Desktop/TCGA-ML-main/results/nn_gene_predictions.pkl', 'rb'))
#test_idx, rf_predictions_gene = pickle.load(open('/home/mlee276/Desktop/TCGA-ML-main/results/rf_gene_predictions.pkl', 'rb'))
#test_idx, mil_predictions_context = pickle.load(open('/home/mlee276/Desktop/TCGA-ML-main/results/mil_contexts_predictions.pkl', 'rb'))
#test_idx, nn_predictions_context = pickle.load(open('/home/mlee276/Desktop/TCGA-ML-main/results/nn_contexts_predictions.pkl', 'rb'))
#test_idx, rf_predictions_context = pickle.load(open('/home/mlee276/Desktop/TCGA-ML-main/results/rf_contexts_predictions.pkl', 'rb'))

# Uncomment for plotting for "both" data types predictions later.
test_idx, nn_predictions_both = pickle.load(open('/home/mlee276/Desktop/TCGA-ML-main/results/nn_both_predictions.pkl', 'rb'))
test_idx, rf_predictions_both = pickle.load(open('/home/mlee276/Desktop/TCGA-ML-main/results/rf_both_predictions.pkl', 'rb'))

# Comment gene and context predictions below if plotting for "both" data types predictions later
#rf_predictions_context = np.asarray(np.vstack(rf_predictions_context))
#rf_predictions_gene = np.asarray(np.vstack(rf_predictions_gene))

# Uncomment for plotting for "both" data types predictions later.
rf_predictions_both = np.asarray(np.vstack(rf_predictions_both))

correct = (y_strat[np.concatenate(test_idx)])
# one hot correct vals: [1,2,3] - > [[0,1,0,0],[0,0,1,0],[0,0,0,1]]
onehot = np.zeros((correct.size,correct.max()+1))
onehot[np.arange(correct.size),correct] = 1

In [3]:
# Produce a Table of the class AUC for all three models and gene & context data.
def table_AUC_gene_context(true, mil_pred_gene, mil_pred_context, nn_pred_gene, nn_pred_context, rf_pred_gene, rf_pred_context, classNames, notOneHot):
    %matplotlib
    
    pred_data = [mil_pred_gene, mil_pred_context, nn_pred_gene, nn_pred_context, rf_pred_gene, rf_pred_context]
    model_names = ["MIL gene", "MIL context", "NN gene", "NN context", "RF gene", "RF context"]
    
    AUCs = []
    
    for i in range(len(pred_data)):
        # AUC
        temp = []
        for j in range(onehot.shape[1]):
            temp.append(roc_auc_score(onehot[:, j], pred_data[i][:, j]))
        temp = ["%.2f" % value for value in temp]
        AUCs.append(temp)
    
    # Construct Table: Pandas Dataframe
    df = pd.DataFrame(list(zip(AUCs[5], AUCs[4], AUCs[3],
                               AUCs[2], AUCs[1], AUCs[0])))
    columns = [('AUC','MIL gene'), ('AUC','MIL context'), ('AUC','NN gene'),
               ('AUC','NN context'), ('AUC','RF gene'), ('AUC','RF context')]
    df.columns = pd.MultiIndex.from_tuples(columns)
    df.insert(loc=0, column='Cancer Type', value=classNames)
    
    # Visualize:
    display(df)

In [4]:
table_AUC_gene_context(onehot,mil_predictions_gene, mil_predictions_context, 
                       nn_predictions_gene, nn_predictions_context, 
                       rf_predictions_gene, rf_predictions_context, classes,correct)

Using matplotlib backend: Qt5Agg


Unnamed: 0_level_0,Cancer Type,AUC,AUC,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,Unnamed: 1_level_1,MIL gene,MIL context,NN gene,NN context,RF gene,RF context
0,"Adenocarcinoma, Pancreas",0.61,0.64,0.6,0.63,0.59,0.66
1,Astrocytoma,0.62,0.64,0.64,0.66,0.66,0.66
2,Cervical Squamous Cell Carcinoma,0.57,0.5,0.55,0.53,0.53,0.52
3,Clear Cell Renal Cell Carcinoma,0.6,0.59,0.61,0.59,0.6,0.59
4,Colorectal Adenocarcinoma,0.67,0.64,0.69,0.67,0.67,0.68
5,Cutaneous Melanoma,0.65,0.59,0.66,0.6,0.66,0.59
6,Endometrial Endometrioid Adenocarcinoma,0.54,0.58,0.54,0.59,0.55,0.59
7,Endometrial Serous Adenocarcinoma,0.56,0.52,0.59,0.54,0.59,0.62
8,Gastric Adenocarcinoma,0.61,0.57,0.64,0.63,0.64,0.6
9,Glioblastoma,0.82,0.8,0.82,0.8,0.83,0.82


In [5]:
# Produce a Table of the class Precision-Recall AUC for all three models and gene & context data.
def table_PrecisionRecall_AUC_gene_context(true, mil_pred_gene, mil_pred_context, nn_pred_gene, nn_pred_context, rf_pred_gene, rf_pred_context, classNames, notOneHot):
    %matplotlib
    
    pred_data = [mil_pred_gene, mil_pred_context, nn_pred_gene, nn_pred_context, rf_pred_gene, rf_pred_context]
    model_names = ["MIL gene", "MIL context", "NN gene", "NN context", "RF gene", "RF context"]
    
    AUCs = []
    
    for i in range(len(pred_data)):
        # Precision Recall AUC
        temp = []
        for j in range(onehot.shape[1]):
            #temp.append(roc_auc_score(onehot[:, j], pred_data[i][:, j])) 
            precision, recall, thresholds = precision_recall_curve(onehot[:, j], pred_data[i][:, j])
            temp.append(auc(recall, precision)) 
        temp = ["%.2f" % value for value in temp] 
        AUCs.append(temp)
    
    # Construct Table: Pandas Dataframe
    df = pd.DataFrame(list(zip(AUCs[0], AUCs[1], AUCs[2],
                               AUCs[3], AUCs[4], AUCs[5])))
    columns = [('Precision Recall AUC','MIL gene'), ('Precision Recall AUC','MIL context'), ('Precision Recall AUC','NN gene'),
               ('Precision Recall AUC','NN context'), ('Precision Recall AUC','RF gene'), ('Precision Recall AUC','RF context')]
    df.columns = pd.MultiIndex.from_tuples(columns)
    df.insert(loc=0, column='Cancer Type', value=classNames)
    
    # Visualize:
    display(df)


Using matplotlib backend: Qt5Agg


Unnamed: 0_level_0,Cancer Type,Precision Recall AUC,Precision Recall AUC,Precision Recall AUC,Precision Recall AUC,Precision Recall AUC,Precision Recall AUC
Unnamed: 0_level_1,Unnamed: 1_level_1,MIL gene,MIL context,NN gene,NN context,RF gene,RF context
0,"Adenocarcinoma, Pancreas",0.09,0.07,0.09,0.04,0.11,0.04
1,Astrocytoma,0.05,0.05,0.06,0.04,0.07,0.04
2,Cervical Squamous Cell Carcinoma,0.04,0.04,0.04,0.04,0.03,0.04
3,Clear Cell Renal Cell Carcinoma,0.09,0.1,0.08,0.09,0.09,0.09
4,Colorectal Adenocarcinoma,0.17,0.16,0.16,0.15,0.16,0.15
5,Cutaneous Melanoma,0.15,0.14,0.17,0.15,0.14,0.15
6,Endometrial Endometrioid Adenocarcinoma,0.07,0.06,0.07,0.06,0.07,0.06
7,Endometrial Serous Adenocarcinoma,0.02,0.02,0.02,0.02,0.04,0.01
8,Gastric Adenocarcinoma,0.12,0.14,0.12,0.14,0.09,0.13
9,Glioblastoma,0.31,0.38,0.35,0.29,0.23,0.31


In [None]:
table_PrecisionRecall_AUC_gene_context(onehot,mil_predictions_gene, mil_predictions_context, nn_predictions_gene, 
                            nn_predictions_context, rf_predictions_gene, rf_predictions_context,classes,correct)

In [6]:
# # Produce a Table of the balanced accuracies for all three models and gene & context data.
def table_balancedAccuracy_gene_context(true, mil_pred_gene, mil_pred_context, nn_pred_gene, nn_pred_context, rf_pred_gene, rf_pred_context, classNames, notOneHot):
    %matplotlib
    
    pred_data = [mil_pred_gene, mil_pred_context, nn_pred_gene, nn_pred_context, rf_pred_gene, rf_pred_context]
    model_names = ["MIL gene", "MIL context", "NN gene", "NN context", "RF gene", "RF context"]
    
    balanced_accuracies = [] 
    
    for i in range(len(pred_data)):
        # Precision Recall AUC
        temp = [] 
        #for j in range(onehot.shape[1]):
        balanced_accuracy = balanced_accuracy_score(correct, pred_data[i].argmax(axis=1)) 
        temp.append(balanced_accuracy) 
        temp = ["%.2f" % value for value in temp] 
        balanced_accuracies.append(temp)
    
    print(balanced_accuracies)
    # Construct Table: Pandas Dataframe
    df = pd.DataFrame(list(zip(balanced_accuracies[0], balanced_accuracies[1], balanced_accuracies[2],
                               balanced_accuracies[3], balanced_accuracies[4], balanced_accuracies[5])))
    columns = [('Balanced Accuracy','MIL gene'), ('Balanced Accuracy','MIL context'), ('Balanced Accuracy','NN gene'),
               ('Balanced Accuracy','NN context'), ('Balanced Accuracy','RF gene'), ('Balanced Accuracy','RF context')]
    df.columns = pd.MultiIndex.from_tuples(columns)
    #df.insert(loc=0, column='Cancer Type', value=classNames)
     
    # Visualize:
    display(df)


Using matplotlib backend: Qt5Agg
[['0.17'], ['0.19'], ['0.17'], ['0.17'], ['0.16'], ['0.17']]


Unnamed: 0_level_0,Balanced Accuracy,Balanced Accuracy,Balanced Accuracy,Balanced Accuracy,Balanced Accuracy,Balanced Accuracy
Unnamed: 0_level_1,MIL gene,MIL context,NN gene,NN context,RF gene,RF context
0,0.17,0.19,0.17,0.17,0.16,0.17


In [None]:
table_balancedAccuracy_gene_context(onehot,mil_predictions_gene, mil_predictions_context, nn_predictions_gene, 
                            nn_predictions_context, rf_predictions_gene, rf_predictions_context,classes,correct)

In [6]:
# Produces a visual the confusion matrix, class precision, and class recall for a selected model.
def nature_plot(true, pred, classNames, notOneHot):
    %matplotlib
    
    cluster_rows = True

    fig, axs = plt.subplots(ncols=2, nrows=2, gridspec_kw= {'width_ratios':[8, .5], 'height_ratios':[0.5,8]})#dict(width_ratios=[1,4,0.2]))
    
    # Get class sizes
    class_sizes = [0]*len(classNames)
    for c in notOneHot:
        class_sizes[c] += 1
    temp = {classNames[i]: class_sizes[i] for i in range(len(classNames))}

    # Confusion Matrix 
    font = {'family' : 'normal',
            'size'   : 7}
    plt.rc('font', **font)
    ax_cm = axs[1,0]
    conf_mat_raw = confusion_matrix(true.argmax(axis=1), pred.argmax(axis=1), normalize='true')
    # reorder rows based on row clustering
    df = pd.DataFrame(conf_mat_raw)
    clustermap = sns.clustermap(df, col_cluster=False)
    reordered_rows = clustermap.dendrogram_row.reordered_ind
    reordered_row_clustered_labels = []
    for i in range(len(classNames)):
        reordered_row_clustered_labels.append(classNames[reordered_rows[i]])
    if cluster_rows == True:
        classNames = reordered_row_clustered_labels 
        conf_mat = confusion_matrix(true.argmax(axis=1), pred.argmax(axis=1), normalize='true', labels=reordered_rows)
        conf_mat = np.asarray([[round(j*100) for j in i] for i in conf_mat]) 
    else:
        #not clustered
        conf_mat = np.asarray([[round(j*100) for j in i] for i in conf_mat_raw]) 
    
    # Construct list with class names and sizes.
    classes_and_sizes = [0]*len(classNames)
    for i in range(len(classNames)):
        classes_and_sizes[i] = classNames[i] + " (" + str(temp[classNames[i]]) + ")"
    
    # Precision and Recall
    clf = classification_report(notOneHot, pred.argmax(axis=1),target_names=classNames,output_dict=True)
    precision = []
    recall = []
    for j in clf:
        if (j!="accuracy" and j!="weighted avg" and j!="macro avg"):
            precision.append(round(float("%.2f" % clf[j]["precision"])*100))
            recall.append(round(float("%.2f" % clf[j]["recall"])*100))
    p_temp = []
    r_temp = []
    if cluster_rows == True:
        for i in range(len(precision)): 
            p_temp.append(precision[reordered_rows[i]])
            r_temp.append(recall[reordered_rows[i]])
        precision = p_temp
        recall = r_temp           
    
    # To dataframe
    confusion_df = pd.DataFrame(conf_mat)
    precision_df = pd.DataFrame(precision) 
    recall_df = pd.DataFrame(recall).T 
    
    # Plotting
    sns.heatmap(confusion_df, annot=True, cbar=False, ax=axs[1,0], cmap=plt.cm.Blues)
    axs[1,0].set(yticks=np.arange(len(classNames)), yticklabels=classes_and_sizes, xticks=np.arange(len(classNames)), xticklabels=classNames)
    #axs[1,0].set_xticklabels(classNames, rotation=90)
    #axs[1,0].set_yticklabels(classes_and_sizes, rotation=0)
    axs[1,0].title.set_text('Confusion Matrix')
    sns.heatmap(precision_df, annot=True, yticklabels=False, cbar=False, ax=axs[1,1], cmap=plt.cm.Blues)
    axs[1,1].title.set_text('Precision')
    axs[1,1].set_xticks([])
    sns.heatmap(recall_df, annot=True, yticklabels=False, cbar=False, ax=axs[0,0], cmap=plt.cm.Blues)
    axs[0,0].title.set_text('Recall')
    axs[0,0].set_xticks([])
    
    # center tick marks
    div = [item + 0.5 for item in range(0, len(classes_and_sizes))]
    axs[1,0].set_yticklabels('') # Hide major tick labels
    axs[1,0].set_yticks(div,      minor=True) # Customize minor tick labels
    axs[1,0].set_yticklabels(classes_and_sizes, minor=True) 
    axs[1,0].set_xticklabels('', rotation=90) # Hide major tick labels
    axs[1,0].set_xticks(div,      minor=True) # Customize minor tick labels
    axs[1,0].set_xticklabels(classNames, minor=True, rotation=90)
    
    # extra formatting
    fig.delaxes(axs[0,1])
    
    fig.suptitle('MIL (with modified NCI-T Lables and context data)', fontsize=16)
    plt.show()
    fig.tight_layout()

In [10]:
# Only run one line at a time.
nature_plot(onehot,nn_predictions_both,classes,correct)
nature_plot(onehot,rf_predictions_both,classes,correct)

Using matplotlib backend: Qt5Agg
