In [1]:
from tensorflow import keras
from tensorflow.keras import datasets, layers, models, optimizers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import numpy as np
from sklearn.metrics import *
from tensorflow.keras import callbacks
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras import backend as K

from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scipy.stats import ttest_ind
import matplotlib

2024-07-04 15:40:38.529377: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
rna_idx=pd.read_csv("../data/dataBulk/common_rna.csv",index_col=0).values.squeeze().tolist()

# prepare training data
files=["95233","26440","57065","4607","8121","9692","13904","26378","28750"]
data = []
label = []

for file in files:
    if file == "13904":
        df=pd.read_csv("../data/dataBulk/exp.gene.mRNA.GSE"+file+'.txt' , sep="\t").T
        df_label=pd.read_csv("../data/dataBulk/label_GSE"+file+'.txt' , sep="\t",header=None)
        x=df[rna_idx].T
        x=x[df_label[0]].T.values
        data.append(x)
        label.append(df_label[2].values>0)
    else:
        df=pd.read_csv("../data/dataBulk/exp.gene.mRNA.GSE"+file+'.txt' , sep="\t").T
        data.append(df[rna_idx].values)
        df_label=pd.read_csv("../data/dataBulk/label_GSE"+file+'.txt' , sep="\t",header=None)[2].values
        label.append(df_label>0)

In [3]:
for i,j in zip(data,label):
    print(i.shape,j.shape)

(124, 2869) (124,)
(130, 2869) (130,)
(107, 2869) (107,)
(123, 2869) (123,)
(75, 2869) (75,)
(45, 2869) (45,)
(70, 2869) (70,)
(103, 2869) (103,)
(30, 2869) (30,)


In [15]:
data[5].max(),data[5].min()

(77.0997, 0.0123)

In [16]:
mean,std = 9.435935231248593e-19, 1.0000000000000007
data = [(arr - mean) / std for arr in data]
data[5].max(),data[5].min()

(77.09969999999994, 0.01229999999999999)

In [9]:
df_rna_idx = pd.Series(rna_idx)
df_rna_idx.name='gene'

dfseq1 = pd.read_csv('../data/dataBulk/RNAseq/tpm_185263.txt',sep = '\t')
dfseq1.index.name='gene'

dfseq1 = pd.merge(df_rna_idx, dfseq1, on='gene', how = 'left').fillna(0)
dfseq1 = dfseq1.drop(columns='gene').T
seq1 = dfseq1.values

seq1_label = pd.read_csv('../data/dataBulk/RNAseq/185263_label.txt').values.squeeze()
seq1_label.shape

(392,)

In [11]:
seq1.max(),seq1.min()

(257664.306277662, 0.0)

In [7]:
mean,std = 9.435935231248593e-19, 1.0000000000000007

In [12]:
seq1 = (seq1 - mean) / std
seq1.max(),seq1.min()

(257664.30627766182, -9.435935231248587e-19)

In [13]:
seq_train,seq_val, seqlabel_train,seqlabel_val = train_test_split(seq1,seq1_label,test_size=0.7,random_state=42)
print(seqlabel_train.shape,seqlabel_val.shape)

(117,) (275,)


In [7]:
print(sum(seqlabel_train))
print(sum(seqlabel_val))

103
245


In [5]:
def loadmodel():
    org_model = keras.models.load_model("../modelsave/0623weightmodel_domaindpec")
    org_model = models.Model(inputs=org_model.input, outputs=org_model.get_layer('dense_6').output) # do not include final classifier
    return org_model

In [6]:
def transfer(org_model,sample,label):
    xtr,xval, ytr,yval = train_test_split(sample,label,test_size=0.2,random_state=42)
    cw = class_weight.compute_class_weight(class_weight='balanced',classes=np.unique(ytr),y=ytr)
    cw = {0:cw[0],1:cw[1]}
    
    org_model.trainable = False
    inputs = keras.Input(shape=(2869,1))
    x = org_model(inputs, training=False)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.fit(xtr, ytr, batch_size=10, epochs=30, validation_data=(xval, yval),shuffle=True,class_weight=cw)

    org_model.trainable = True
    model.compile(optimizer=keras.optimizers.Adam(1e-5),  # Very low learning rate
              loss=keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[keras.metrics.BinaryAccuracy()])
    earlystopping = callbacks.EarlyStopping(monitor ="val_binary_accuracy", mode ="min", patience = 5, 
                                        restore_best_weights = True)
    model.fit(xtr, ytr, batch_size=10, epochs=30,validation_data=(xval, yval),
               shuffle=True,class_weight=cw)

    return model

In [12]:
def drawsota(roc1,file_train):
    cohorts = roc1['Cohort'].unique().tolist()
    model_names = roc1['Model'].unique().tolist()

    model_aucs = []
    for model in model_names:
        model_aucs.append(roc1[roc1['Model']==model]['AUC'].tolist())

    num_models = len(model_names)

    shift = num_models // 2 # the middle point to shift

    x = np.arange(len(cohorts))*2  # the label locations
    width = 0.4  # the width of the bars

    fig, ax = plt.subplots(figsize=(10,6))

    model_axs = []
    for i in range(num_models):
        model_axs.append(ax.bar(x + (i-shift)*width, model_aucs[i], width, label=model_names[i],color = colors[i]))
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('AUROC Score',size = 20)
    ax.set_title('AUROC on microarray compared with biomarkers',size=20)
    ax.set_ylim(ymin=0.4,ymax=1.1)
    ax.set_xticks(x, cohorts)
    ax.legend()

    for model_ax in model_axs:
        ax.bar_label(model_ax, padding=-45, fmt='%.3f', rotation='vertical',size=15)

    plt.legend(loc='lower right',prop={'size':15})

    fig.tight_layout()

    plt.savefig('./'+file_train+'sota0606.png',dpi=100)

    plt.show()

In [20]:
colors = ['#9e2a2b','#897966', '#EAB69F',  '#8F5D5D',
         '#3D405B', '#5F797B', '#81B29A', '#BABF95', '#F2CC8F']
names = ["K-Nearest Neighbors", "DecisionTree",
         "Random Forest", "Naive Bayes", "Quadratic Discriminant Analysis"]
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=7),
    RandomForestClassifier(random_state=7),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [14]:
def drawML(roc2,file_train):
    cohorts = roc2['Cohort'].unique().tolist()
    model_names = roc2['Model'].unique().tolist()

    model_aucs = []
    for model in model_names:
        model_aucs.append(roc2[roc2['Model']==model]['AUC'].tolist())

    num_models = len(model_names)

    shift = num_models // 2 # the middle point to shift

    x = np.arange(len(cohorts))*4  # the label locations
    width = 0.4  # the width of the bars

    fig, ax = plt.subplots(figsize=(15,5))

    model_axs = []
    colors = ['#9e2a2b','#897966', '#EAB69F',  '#8F5D5D','#3D405B', '#5F797B', '#81B29A', '#BABF95', '#F2CC8F']

    for i in range(num_models):
        model_axs.append(ax.bar(x + (i-shift)*width, model_aucs[i], width, label=model_names[i],color = colors[i]))
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('AUROC Score',size=15)
    ax.set_title('AUROC on microarray data',size=15)
    ax.set_xticks(x, cohorts,fontsize=14)
    ax.set_ylim(ymin=0.3,ymax=1.1)
    ax.legend()

    for model_ax in model_axs:
        ax.bar_label(model_ax, padding=-40, fmt='%.3f', rotation='vertical', size=13)

    plt.legend(loc='lower left',bbox_to_anchor = (0,-0.3), ncol=5,prop={'size':13})

    fig.tight_layout()
    plt.savefig('./'+file_train+'ML0613.png',dpi=100)

    plt.show()

In [19]:
x_train = np.append(np.append(data[0], data[1], axis=0),data[2], axis=0)
y_train = np.append(np.append(label[0],label[1], axis=0),label[2], axis=0)
file_train = ["95233","26440","57065"]
save_name = "95233+26440+57065"
results_cat = []
org_model = loadmodel()
model = transfer(org_model,x_train,y_train)
# model = keras.models.load_model("./model/transferred_model")
results = pd.DataFrame(columns = ['Cohort','AUC','PRC','Model'])

# test CapTrans
for x_test,y_test,file_test in zip(data[3:],label[3:],files[3:]):
        testresult=model.predict(x_test)
        fpr, tpr, _ = roc_curve(y_test,testresult)
        roc_auc = auc(fpr,tpr)
        precision, recall, _ = precision_recall_curve(y_test,testresult)
        prc_auc = auc(recall,precision)
        print(roc_auc)
        results = results.append({'Cohort':'GSE'+file_test,'AUC':roc_auc,'PRC':prc_auc,'Model':'CaT'},ignore_index=True)
        results_cat.append({'Cohort': 'GSE' + file_test, 'Model': 'CaT', 'y_test': y_test, 'testresult': testresult})

results_CaT = pd.DataFrame(results_cat)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
0.9827160493827161
0.981111111111111
0.9844444444444445
0.9433760683760684
0.9994192799070847
1.0


In [20]:
results

Unnamed: 0,Cohort,AUC,PRC,Model
0,GSE4607,0.982716,0.997747,CaT
1,GSE8121,0.981111,0.995861,CaT
2,GSE9692,0.984444,0.993606,CaT
3,GSE13904,0.943376,0.98338,CaT
4,GSE26378,0.999419,0.999852,CaT
5,GSE28750,1.0,1.0,CaT


In [None]:
results_Domain_specific_Normalization1 = results

In [21]:
results_CaT['y_test'] = results_CaT['y_test'].apply(lambda x: [int(item) for item in x])
results_CaT['testresult'] = results_CaT['testresult'].apply(lambda x: [item for sublist in x for item in sublist])
results_CaT

Unnamed: 0,Cohort,Model,y_test,testresult
0,GSE4607,CaT,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.034010068, 0.020908948, 0.03866634, 0.00753..."
1,GSE8121,CaT,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.014820819, 0.014146972, 0.011832056, 0.0045..."
2,GSE9692,CaT,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.99930644, 0.9945129, 0.9999981, 0.99960285,..."
3,GSE13904,CaT,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.036681905, 0.13805662, 0.053169984, 0.03146..."
4,GSE26378,CaT,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0.9992724, 0.9934, 0.99974173, 0.97568583, 0...."
5,GSE28750,CaT,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0.945032, 0.9050596, 0.99984944, 0.99989486, ..."


In [24]:
results_cat1 = []
            
seq_train,seq_val, seqlabel_train,seqlabel_val = train_test_split(seq1,seq1_label,test_size=0.7,random_state=42)
file_train = ["185263"]
save_name = "185263"

org_model = loadmodel()
model = transfer(org_model,seq_train,seqlabel_train)
# model = keras.models.load_model("./model/transferred_model")

# test CapTrans
testresult=model.predict(seq_val)
fpr, tpr, _ = roc_curve(seqlabel_val,testresult)
roc_auc = auc(fpr,tpr)
precision, recall, _ = precision_recall_curve(seqlabel_val,testresult)
prc_auc = auc(recall,precision)
print(roc_auc)
results = results.append({'Cohort':'GSE'+'185263','AUC':roc_auc,'PRC':prc_auc,'Model':'CaT'},ignore_index=True)
results_cat1.append({'Cohort': 'GSE' + '185263', 'Model': 'CaT', 'y_test': seqlabel_val, 'testresult': testresult})
results_CaT1 = pd.DataFrame(results_cat1)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
0.9669387755102041


In [25]:
#results_Domain_specific_Normalization
results

Unnamed: 0,Cohort,AUC,PRC,Model
0,GSE4607,0.982716,0.997747,CaT
1,GSE8121,0.981111,0.995861,CaT
2,GSE9692,0.984444,0.993606,CaT
3,GSE13904,0.943376,0.98338,CaT
4,GSE26378,0.999419,0.999852,CaT
5,GSE28750,1.0,1.0,CaT
6,GSE185263,0.966939,0.995788,CaT


In [59]:
results_CaT1.at[0, 'testresult'] = [item for sublist in results_CaT1.at[0, 'testresult'] for item in sublist]
results_CaT1

Unnamed: 0,Cohort,Model,y_test,testresult
0,GSE185263,CaT,"[1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, ...","[0.87594706, 0.22732697, 0.70470345, 0.7983318..."
