In [None]:
import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the three semicolon-separated GSE137140 exports (colData, rowData, assay).
coldf=pd.read_csv('colData_GSE137140.csv',sep=';')
rowdf=pd.read_csv('rowData_GSE137140.csv',sep=';')
assaydf=pd.read_csv('assay_GSE137140.csv',sep=';')
# Preview the first rows of each table, in the same order they were loaded.
for frame in (coldf,rowdf,assaydf):
    display(frame.head())

In [None]:
# Swap rows and columns of the assay table; the next cells turn the first
# transposed row into the header.
assaydf=assaydf.transpose()
assaydf

In [None]:
# Use the first row as the column labels and keep only the remaining rows as data.
header_row=assaydf.iloc[0]
assaydf=assaydf.iloc[1:]
assaydf.columns=header_row
assaydf

In [None]:
# Keep only the sample accession and the disease-state label.
# .copy() makes label_df an independent frame, so re-indexing it later does not
# operate on a selection of coldf.
label_df=coldf[["geo_accession","disease.state.ch1"]].copy()
label_df.head()

In [None]:
# (rows, columns) of the colData table.
coldf.shape

In [None]:
# Label the row index "geo_accession" so it lines up with the key column used
# for the labels.
assaydf=assaydf.rename_axis("geo_accession")
assaydf.head()

In [None]:
# Replace decimal commas with decimal points so the values can be cast to float.
# A single vectorised replace handles every column at once; with regex=True the
# substitution is applied inside string cells only, so missing or already-numeric
# cells are left untouched instead of raising, and the cell can be re-run safely.
assaydf=assaydf.replace(',','.',regex=True)
assaydf.head()

In [None]:
# Cast every column to 32-bit floats, then print the frame summary to confirm.
assaydf=assaydf.astype('float32')
assaydf.info()

In [None]:
# Index the labels by sample accession so they can be joined to the assay
# matrix on the index. set_index returns a new frame (no in-place mutation),
# and the guard makes the cell safe to run more than once.
if label_df.index.name!='geo_accession':
    label_df=label_df.set_index('geo_accession')
label_df.head()

In [None]:
# Join measurements and labels on their shared index (default inner join, so
# only accessions present in both frames are kept).
df=assaydf.merge(label_df,left_index=True,right_index=True)
df.head()

In [None]:
# Separate the feature matrix from the target column.
y=df.loc[:,'disease.state.ch1']
X=df.drop(columns=['disease.state.ch1'])

In [None]:
# Summary (dtypes, non-null counts, memory) of the feature matrix.
X.info()

In [None]:
from scipy.stats import kstest

In [None]:
# Kolmogorov-Smirnov normality check, one feature at a time.
# kstest(x, 'norm') without extra arguments compares against the *standard*
# normal N(0, 1), so unscaled data is rejected whatever its shape. Each column
# is therefore compared with a normal distribution that has the column's own
# mean and standard deviation.
# NOTE: estimating the parameters from the same sample makes the KS p-value
# conservative; a Lilliefors-type correction would be the stricter choice.
p_list=[]
for col in X.columns:
    values=X[col].to_numpy(dtype=np.float64)
    spread=values.std(ddof=1)
    if np.isfinite(spread) and spread>0:
        _,p_value=kstest(values,'norm',args=(values.mean(),spread))
        k='Not Normally Distributed' if p_value<0.05 else 'Normally Distributed'
    else:
        # A constant (or non-finite) column cannot be matched to a normal distribution.
        k='Not Normally Distributed'
    p_list.append({'col_name':col,'status':k})
p_list=pd.DataFrame(p_list)
p_list['status'].unique()

Semuanya tidak berdistribusi normal

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80 / 10 / 10 split: hold out 20% first, then halve the hold-out into
# validation and test. Both splits are stratified on the label so every subset
# keeps the class proportions of the full data set (this requires at least two
# samples of every class in the data being split).
x_train,x_valid,y_train,y_valid=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
x_valid,x_test,y_valid,y_test=train_test_split(x_valid,y_valid,test_size=0.5,random_state=42,stratify=y_valid)

In [None]:
# Peek at the first training rows.
x_train.head()

In [None]:
# Shapes of the train / validation / test feature matrices.
(x_train.shape,x_valid.shape,x_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the training split in the next cell.
standard=StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to every split.
standard.fit(x_train)
x_train_standard,x_test_standard,x_valid_standard=(
    standard.transform(part) for part in (x_train,x_test,x_valid)
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# PCA with 390 components, used below only to inspect the cumulative explained variance.
pca1=PCA(n_components=390,random_state=42)

In [None]:
# Fit PCA on the standardised training data and plot how much variance the
# leading components explain cumulatively.
pca1.fit(x_train_standard)
fig,ax=plt.subplots()
ax.plot(np.cumsum(pca1.explained_variance_ratio_))
ax.set_xlabel('n_components')
ax.set_xticks(np.arange(0,395,step=5))
ax.set_ylabel('Cumulative Explained variance')
plt.show()

The cumulative explained variance ratio is poor (< 0.8), so a decomposition method other than PCA will be used.

In [None]:
from sklearn.decomposition import KernelPCA
from sklearn.metrics import mean_squared_error
import optuna

In [None]:
def objective(trial):
    """Optuna objective that scores a KernelPCA setting by reconstruction error.

    The trial chooses ``n_components`` (50-450 in steps of 50) and ``kernel``.
    KernelPCA is fitted on ``x_train_standard`` with the inverse transform
    enabled, the training data is projected and mapped back, and the negative
    RMSE between original and reconstruction is returned (so higher is better).
    """
    kpca=KernelPCA(
        n_components=trial.suggest_int("n_components",50,450,step=50),
        kernel=trial.suggest_categorical("kernel",["poly","rbf","sigmoid","linear"]),
        random_state=42,
        n_jobs=-1,
        fit_inverse_transform=True,
    )
    reconstruction=kpca.inverse_transform(kpca.fit_transform(x_train_standard))
    rmse=np.sqrt(mean_squared_error(x_train_standard,reconstruction))
    return -1*rmse

In [None]:
# Seeded TPE search (30 trials) that maximises the KernelPCA objective above,
# i.e. minimises the reconstruction RMSE.
sampler=optuna.samplers.TPESampler(seed=42)
study=optuna.create_study(sampler=sampler,direction='maximize')
study.optimize(objective,n_trials=30)

In [None]:
# Best objective value found (negative reconstruction RMSE).
study.best_value

In [None]:
# Tuned KernelPCA settings plus the fixed seed and parallelism used during the search.
kpca_params={**study.best_params,'random_state':42,'n_jobs':-1}
kpca_params

In [None]:
# KernelPCA with the tuned settings. fit_inverse_transform is not part of
# kpca_params, so it keeps the estimator's default here.
kpca1=KernelPCA(**kpca_params)

In [None]:
# Fit the tuned KernelPCA on the training split only, then project the test and
# validation splits with that fitted transform.
x_train_preprocessed=kpca1.fit_transform(x_train_standard)
x_test_preprocessed=kpca1.transform(x_test_standard)
x_valid_preprocessed=kpca1.transform(x_valid_standard)
# Shapes of the reduced train / test / validation matrices.
(x_train_preprocessed.shape,x_test_preprocessed.shape,x_valid_preprocessed.shape)

In [None]:
# Wrap each reduced array in a DataFrame whose columns carry the KernelPCA
# output feature names.
x_train_preprocessed,x_test_preprocessed,x_valid_preprocessed=(
    pd.DataFrame(reduced,columns=kpca1.get_feature_names_out())
    for reduced in (x_train_preprocessed,x_test_preprocessed,x_valid_preprocessed)
)
# Preview train, validation and test, in that order.
for frame in (x_train_preprocessed,x_valid_preprocessed,x_test_preprocessed):
    display(frame.head())

In [None]:
from sklearn.preprocessing import LabelBinarizer
# Encoder that turns the class labels into indicator columns; fitted in the next cell.
LB=LabelBinarizer()

In [None]:
# Learn the class set from the training labels and encode them as indicator columns.
# Note: y_train is overwritten, so this cell is not meant to be run twice.
LB.fit(y_train)
y_train=LB.transform(y_train)
y_train.shape

In [None]:
# Encode the held-out labels with the binarizer that was fitted on the training
# labels. Re-fitting on each split (fit_transform) would rebuild the class list
# from that split alone: a class missing from it would change the number and
# meaning of the indicator columns, and LB would no longer describe the
# training encoding when it is used later for inverse_transform / transform.
y_test=LB.transform(y_test)
y_valid=LB.transform(y_valid)
(y_test.shape,y_valid.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
def objective(trial):
    """Optuna objective for the random forest: macro F1 on the validation split.

    The trial samples the forest size, split criterion, depth, leaf limit and
    feature-subsampling rule. The forest is fitted on the KernelPCA features
    with the binarized ``y_train`` and scored against ``y_valid``.

    NOTE(review): ``y_train`` is the LabelBinarizer output, so the forest is
    fitted on an indicator matrix rather than on class labels; confirm that
    this multi-output formulation is intended.
    """
    params={
        "n_estimators":trial.suggest_int("n_estimators",100,900,step=100),
        "criterion":trial.suggest_categorical("criterion",['gini','entropy','log_loss']),
        "max_depth":trial.suggest_int("max_depth",6,20,step=1),
        "max_leaf_nodes":trial.suggest_int("max_leaf_nodes",12,40,step=2),
        "max_features":trial.suggest_categorical("max_features",['sqrt','log2']),
    }
    forest=RandomForestClassifier(**params,n_jobs=-1,random_state=42)
    forest.fit(x_train_preprocessed,y_train)
    predicted=forest.predict(x_valid_preprocessed)
    return f1_score(y_valid,predicted,average='macro')

In [None]:
# Seeded TPE search (30 trials) maximising the random-forest validation macro F1.
sampler=optuna.samplers.TPESampler(seed=42)
study=optuna.create_study(sampler=sampler,direction='maximize')
study.optimize(objective,n_trials=30)

In [None]:
# Best validation macro F1 found by the random-forest search.
study.best_value

In [None]:
# Tuned random-forest settings plus the fixed seed and parallelism used during the search.
rfc_params={**study.best_params,'random_state':42,'n_jobs':-1}
rfc_params

In [None]:
# Refit a random forest with the tuned settings on the training split.
# NOTE(review): y_train is the binarized label matrix, so this is fitted as a
# multi-output problem -- confirm that is intended.
rfc1=RandomForestClassifier(**rfc_params)
rfc1.fit(x_train_preprocessed,y_train)

In [None]:
# Random-forest predictions for the validation split.
y_pred_val=rfc1.predict(x_valid_preprocessed)
y_pred_val.shape

In [None]:
# Random-forest predictions for the test split.
y_pred_test=rfc1.predict(x_test_preprocessed)
y_pred_test.shape

In [None]:
from sklearn.metrics import multilabel_confusion_matrix,ConfusionMatrixDisplay,classification_report

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, test split.
cm_test=multilabel_confusion_matrix(y_test,y_pred_test)
t1,t2,t3=(ConfusionMatrixDisplay(cm_test[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_test, y_pred_test))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, validation split.
cm_valid=multilabel_confusion_matrix(y_valid,y_pred_val)
t1,t2,t3=(ConfusionMatrixDisplay(cm_valid[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_valid, y_pred_val))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
# (macro F1, accuracy) on the test split for the predictions currently in y_pred_test.
(f1_score(y_test,y_pred_test,average='macro'),accuracy_score(y_test,y_pred_test))

In [None]:
# (macro F1, accuracy) on the validation split for the predictions currently in y_pred_val.
(f1_score(y_valid,y_pred_val,average='macro'),accuracy_score(y_valid,y_pred_val))

In [None]:
from lightgbm import LGBMClassifier
def objective(trial):
    """Optuna objective for LightGBM: macro F1 on the validation split.

    The trial samples tree shape, ensemble size, row/column subsampling,
    L1/L2 regularisation, learning rate and ``subsample_for_bin``. The model is
    fitted on class labels recovered with ``LB.inverse_transform(y_train)``;
    its predictions are binarized again before scoring against ``y_valid``.

    NOTE(review): LightGBM documents that row subsampling only takes effect
    when ``subsample_freq`` > 0, which is not set here -- confirm whether
    tuning ``subsample`` has any effect.
    """
    params={
        "max_depth":trial.suggest_int("max_depth",6,20,step=1),
        "n_estimators":trial.suggest_int("n_estimators",100,900,step=100),
        "subsample":trial.suggest_float("subsample",0.5,1,step=0.1),
        "colsample_bytree":trial.suggest_float("colsample_bytree",0.5,1,step=0.1),
        "reg_alpha":trial.suggest_float("reg_alpha",0,2e-1,step=0.025),
        "reg_lambda":trial.suggest_float("reg_lambda",0,2e-1,step=0.025),
        "num_leaves":trial.suggest_int("num_leaves",12,40,step=2),
        "learning_rate":trial.suggest_float("learning_rate",0.05,0.3,step=0.05),
        "subsample_for_bin":trial.suggest_int("subsample_for_bin",200000,400000,step=16),
    }
    booster=LGBMClassifier(**params,n_jobs=-1,random_state=42)
    booster.fit(x_train_preprocessed,LB.inverse_transform(y_train))
    predicted=LB.transform(booster.predict(x_valid_preprocessed))
    return f1_score(y_valid,predicted,average='macro')

In [None]:
# Seeded TPE search (30 trials) maximising the LightGBM validation macro F1.
sampler=optuna.samplers.TPESampler(seed=42)
study=optuna.create_study(sampler=sampler,direction='maximize')
study.optimize(objective,n_trials=30)

In [None]:
# Best validation macro F1 found by the LightGBM search.
study.best_value

In [None]:
# Tuned LightGBM settings plus the fixed seed and parallelism used during the search.
best_params={**study.best_params,'random_state':42,'n_jobs':-1}
best_params

In [None]:
# Refit LightGBM with the tuned settings; the indicator matrix is mapped back to
# class labels first, as in the objective.
lgb1=LGBMClassifier(**best_params)
lgb1.fit(x_train_preprocessed,LB.inverse_transform(y_train))

In [None]:
# Predict validation labels and binarize them to match the layout of y_valid.
y_pred_val=LB.transform(lgb1.predict(x_valid_preprocessed))
y_pred_val.shape

In [None]:
# Predict test labels and binarize them to match the layout of y_test.
y_pred_test=LB.transform(lgb1.predict(x_test_preprocessed))
y_pred_test.shape

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, test split.
cm_test=multilabel_confusion_matrix(y_test,y_pred_test)
t1,t2,t3=(ConfusionMatrixDisplay(cm_test[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_test, y_pred_test))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, validation split.
cm_valid=multilabel_confusion_matrix(y_valid,y_pred_val)
t1,t2,t3=(ConfusionMatrixDisplay(cm_valid[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_valid, y_pred_val))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
# (macro F1, accuracy) on the test split for the predictions currently in y_pred_test.
(f1_score(y_test,y_pred_test,average='macro'),accuracy_score(y_test,y_pred_test))

In [None]:
# (macro F1, accuracy) on the validation split for the predictions currently in y_pred_val.
(f1_score(y_valid,y_pred_val,average='macro'),accuracy_score(y_valid,y_pred_val))

In [None]:
from sklearn.svm import SVC
def objective(trial):
    """Optuna objective for the SVM: macro F1 on the validation split.

    The trial samples ``C``, the kernel, the polynomial degree and the gamma
    rule. The SVC is fitted on class labels recovered with
    ``LB.inverse_transform(y_train)``; its predictions are binarized again
    before scoring against ``y_valid``.
    """
    svc=SVC(
        C=trial.suggest_float("C",0.05,3,step=0.05),
        kernel=trial.suggest_categorical("kernel",['linear','poly','rbf','sigmoid']),
        degree=trial.suggest_int('degree',2,5,step=1),
        gamma=trial.suggest_categorical('gamma',['scale','auto']),
        random_state=42,
    )
    svc.fit(x_train_preprocessed,LB.inverse_transform(y_train))
    predicted=LB.transform(svc.predict(x_valid_preprocessed))
    return f1_score(y_valid,predicted,average='macro')

In [None]:
# Seeded TPE search (30 trials) maximising the SVM validation macro F1.
sampler=optuna.samplers.TPESampler(seed=42)
study=optuna.create_study(sampler=sampler,direction='maximize')
study.optimize(objective,n_trials=30)

In [None]:
# Best validation macro F1 found by the SVM search.
study.best_value

In [None]:
# Tuned SVM settings plus the fixed seed used during the search.
best_params={**study.best_params,'random_state':42}
best_params

In [None]:
# SVC configured with the tuned settings.
svc1=SVC(**best_params)

In [None]:
# Fit on the KernelPCA features; the indicator matrix is mapped back to class labels first.
svc1.fit(x_train_preprocessed,LB.inverse_transform(y_train))

In [None]:
# Predict validation labels and binarize them to match the layout of y_valid.
y_pred_val=LB.transform(svc1.predict(x_valid_preprocessed))
y_pred_val.shape

In [None]:
# Predict test labels and binarize them to match the layout of y_test.
y_pred_test=LB.transform(svc1.predict(x_test_preprocessed))
y_pred_test.shape

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, test split.
cm_test=multilabel_confusion_matrix(y_test,y_pred_test)
t1,t2,t3=(ConfusionMatrixDisplay(cm_test[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_test, y_pred_test))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, validation split.
cm_valid=multilabel_confusion_matrix(y_valid,y_pred_val)
t1,t2,t3=(ConfusionMatrixDisplay(cm_valid[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_valid, y_pred_val))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
# (macro F1, accuracy) on the test split for the predictions currently in y_pred_test.
(f1_score(y_test,y_pred_test,average='macro'),accuracy_score(y_test,y_pred_test))

In [None]:
# (macro F1, accuracy) on the validation split for the predictions currently in y_pred_val.
(f1_score(y_valid,y_pred_val,average='macro'),accuracy_score(y_valid,y_pred_val))

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Wrap the standardised features and binarized labels in batched tf.data pipelines.
# Only the training set is shuffled (seeded, reshuffled every epoch) so that the
# mini-batch composition changes between epochs. Validation and test keep their
# original order so model.predict output stays aligned with y_valid / y_test.
train_dataset=(tf.data.Dataset.from_tensor_slices((x_train_standard,y_train))
               .shuffle(len(x_train_standard),seed=42,reshuffle_each_iteration=True)
               .batch(36)
               .prefetch(tf.data.AUTOTUNE))
test_dataset=tf.data.Dataset.from_tensor_slices((x_test_standard,y_test)).batch(36).prefetch(tf.data.AUTOTUNE)
valid_dataset=tf.data.Dataset.from_tensor_slices((x_valid_standard,y_valid)).batch(36).prefetch(tf.data.AUTOTUNE)

In [None]:
def block_1(x,filter):
    """Residual block with an identity shortcut.

    Applies Conv1D(3) -> BatchNorm -> ReLU -> Conv1D(3) -> BatchNorm to ``x``
    and adds the untouched input back on. ``filter`` is the number of
    convolution filters; the addition requires it to match the channel count
    of ``x``.
    """
    main_path=(
        keras.layers.Conv1D(filter,3,padding='same'),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('relu'),
        keras.layers.Conv1D(filter,3,padding='same'),
        keras.layers.BatchNormalization(),
    )
    out=x
    for layer in main_path:
        out=layer(out)
    return keras.layers.Add()([x,out])

In [None]:
def block_2(x,filter):
    """Residual block that downsamples with stride 2.

    The shortcut is a stride-2, kernel-size-1 linear convolution to ``filter``
    channels. The main path is Conv1D(3, stride 2) -> BatchNorm -> ReLU ->
    Conv1D(3) -> BatchNorm, and the two paths are summed.
    """
    # The shortcut convolution is created first, matching the original layer order.
    shortcut=keras.layers.Conv1D(filter,1,2,padding='same',activation='linear')(x)
    main_path=(
        keras.layers.Conv1D(filter,3,2,padding='same'),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('relu'),
        keras.layers.Conv1D(filter,3,padding='same'),
        keras.layers.BatchNormalization(),
    )
    out=x
    for layer in main_path:
        out=layer(out)
    return keras.layers.Add()([shortcut,out])


In [None]:
def build_model():
    """Build and compile the 1-D residual CNN classifier.

    The input width is read from the notebook-level feature matrix ``X``. Each
    sample is reshaped to a single-channel sequence, passed through a
    convolution/pooling stem and four residual stages (64, 128, 256 and 512
    filters), globally average-pooled, and classified by a 1000-unit dense
    layer followed by a 3-way softmax.

    Returns:
        A ``keras.Model`` compiled with Adam (learning rate 8e-4), categorical
        cross-entropy loss and categorical accuracy.
    """
    n_features=X.shape[1]
    inputs=keras.Input((n_features,))
    net=keras.layers.Reshape((n_features,1))(inputs)

    # Stem: strided convolution followed by max pooling.
    net=keras.layers.Conv1D(64,kernel_size=7,strides=2,padding='same')(net)
    net=keras.layers.BatchNormalization()(net)
    net=keras.layers.Activation('relu')(net)
    net=keras.layers.MaxPooling1D(3,strides=2,padding='same')(net)

    # Stage 1: three identity blocks at 64 filters (no downsampling).
    for _ in range(3):
        net=block_1(net,64)

    # Stages 2-4: one downsampling block, then the given number of identity blocks.
    for width,n_identity in ((128,3),(256,5),(512,2)):
        net=block_2(net,width)
        for _ in range(n_identity):
            net=block_1(net,width)

    net=keras.layers.GlobalAveragePooling1D()(net)
    net=keras.layers.Dense(1000,activation='relu')(net)
    output=keras.layers.Dense(3,activation='softmax')(net)

    model=keras.Model(inputs=inputs,outputs=output)
    model.compile(keras.optimizers.Adam(learning_rate=8e-4),
                  loss=keras.losses.CategoricalCrossentropy(),
                  metrics=[keras.metrics.CategoricalAccuracy()])
    return model

In [None]:
# Build the network and print its layer-by-layer summary.
model=build_model()
model.summary()

In [None]:
# Draw the model graph with tensor shapes and layer activations.
# NOTE(review): plot_model presumably needs pydot/graphviz installed -- confirm in the target environment.
keras.utils.plot_model(model,show_shapes=True,show_layer_activations=True)

In [None]:
# Halve the learning rate after 3 epochs without val_loss improvement; stop
# after 10 such epochs and restore the best weights.
reduce_lr=keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                            factor=0.5,
                                            patience=3,
                                            verbose=1)
early_stop=keras.callbacks.EarlyStopping(monitor='val_loss',
                                         patience=10,
                                         verbose=1,
                                         restore_best_weights=True)

In [None]:
# Train for up to 200 epochs; the callbacks lower the learning rate and stop
# early based on the validation loss.
history=model.fit(train_dataset,
                  epochs=200,
                  callbacks=[reduce_lr,early_stop],
                  validation_data=valid_dataset)

In [None]:
# Training curves: loss on the left panel, categorical accuracy on the right.
_, ax = plt.subplots(1, 2, figsize=(20,7))
epochs = range(len(history.history['loss']))
panels = (
    (ax[0], ['loss', 'val_loss'], 'upper right'),
    (ax[1], ['categorical_accuracy', 'val_categorical_accuracy'], 'lower right'),
)
for axis, keys, corner in panels:
    for key in keys:
        axis.plot(epochs, history.history[key])
    axis.legend(keys, loc=corner)

In [None]:
# Softmax probabilities for the validation split, converted to one-hot predictions.
# Rounding the probabilities is not a valid decision rule for a softmax output:
# a row such as [0.4, 0.35, 0.25] rounds to all zeros (no class predicted).
# Taking the arg-max always yields exactly one predicted class per sample.
y_pred_val=model.predict(valid_dataset)
y_pred_val=np.eye(y_pred_val.shape[1],dtype=int)[np.argmax(y_pred_val,axis=1)]

In [None]:
# Softmax probabilities for the test split, converted to one-hot predictions.
# Rounding the probabilities is not a valid decision rule for a softmax output:
# a row such as [0.4, 0.35, 0.25] rounds to all zeros (no class predicted).
# Taking the arg-max always yields exactly one predicted class per sample.
y_pred_test=model.predict(test_dataset)
y_pred_test=np.eye(y_pred_test.shape[1],dtype=int)[np.argmax(y_pred_test,axis=1)]

In [None]:
# Shapes of the network's validation and test predictions.
y_pred_val.shape,y_pred_test.shape

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, test split.
cm_test=multilabel_confusion_matrix(y_test,y_pred_test)
t1,t2,t3=(ConfusionMatrixDisplay(cm_test[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_test, y_pred_test))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, validation split.
cm_valid=multilabel_confusion_matrix(y_valid,y_pred_val)
t1,t2,t3=(ConfusionMatrixDisplay(cm_valid[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_valid, y_pred_val))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
# (macro F1, accuracy) on the test split for the predictions currently in y_pred_test.
(f1_score(y_test,y_pred_test,average='macro'),accuracy_score(y_test,y_pred_test))

In [None]:
# (macro F1, accuracy) on the validation split for the predictions currently in y_pred_val.
(f1_score(y_valid,y_pred_val,average='macro'),accuracy_score(y_valid,y_pred_val))

In [None]:
# Persist the trained network to disk.
# NOTE(review): the .h5 suffix presumably selects the legacy HDF5 format -- confirm for the installed Keras version.
model.save('Conv1DRes34.h5')

In [None]:
from xgboost import XGBClassifier
def objective(trial):
    """Optuna objective for XGBoost: macro F1 on the validation split.

    The trial samples tree shape, ensemble size, row/column subsampling,
    L1/L2 regularisation, learning rate and ``max_bin``; ``grow_policy`` is
    fixed to ``'lossguide'``. The model is fitted on the KernelPCA features
    with the binarized ``y_train`` and scored against ``y_valid``.

    NOTE(review): ``y_train`` is the LabelBinarizer output, so the model is
    fitted on an indicator matrix rather than on class labels; confirm that
    this formulation is intended and supported by the installed XGBoost.
    """
    params={
        "max_depth":trial.suggest_int("max_depth",6,20,step=1),
        "n_estimators":trial.suggest_int("n_estimators",100,900,step=100),
        "subsample":trial.suggest_float("subsample",0.5,1,step=0.1),
        "colsample_bytree":trial.suggest_float("colsample_bytree",0.5,1,step=0.1),
        "reg_alpha":trial.suggest_float("reg_alpha",0,2e-1,step=0.025),
        "reg_lambda":trial.suggest_float("reg_lambda",0,2e-1,step=0.025),
        "max_leaves":trial.suggest_int("max_leaves",12,40,step=2),
        "learning_rate":trial.suggest_float("learning_rate",0.05,0.3,step=0.05),
        "max_bin":trial.suggest_int("max_bin",256,4096,step=16),
    }
    booster=XGBClassifier(**params,
                          grow_policy='lossguide',
                          random_state=42,
                          n_jobs=-1)
    booster.fit(x_train_preprocessed,y_train)
    predicted=booster.predict(x_valid_preprocessed)
    return f1_score(y_valid,predicted,average='macro')

In [None]:
# Seeded TPE search maximising the XGBoost validation macro F1
# (10 trials here, versus 30 for the other models).
sampler=optuna.samplers.TPESampler(seed=42)
study=optuna.create_study(sampler=sampler,direction='maximize')
study.optimize(objective,n_trials=10)

In [None]:
# Best validation macro F1 found by the XGBoost search.
study.best_value

In [None]:
# Tuned XGBoost settings plus the fixed options that the objective also used.
best_params={**study.best_params,
             'random_state':42,
             'n_jobs':-1,
             'grow_policy':'lossguide'}
best_params

In [None]:
# Refit XGBoost with the tuned settings on the training split.
# NOTE(review): y_train is the binarized label matrix, not class labels -- confirm this is intended.
xgb1=XGBClassifier(**best_params)
xgb1.fit(x_train_preprocessed,y_train)

In [None]:
# XGBoost predictions for the validation split.
y_pred_val=xgb1.predict(x_valid_preprocessed)
y_pred_val.shape

In [None]:
# XGBoost predictions for the test split.
y_pred_test=xgb1.predict(x_test_preprocessed)
y_pred_test.shape

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, test split.
cm_test=multilabel_confusion_matrix(y_test,y_pred_test)
t1,t2,t3=(ConfusionMatrixDisplay(cm_test[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_test, y_pred_test))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
# Per-class (one-vs-rest) confusion matrices and classification report, validation split.
cm_valid=multilabel_confusion_matrix(y_valid,y_pred_val)
t1,t2,t3=(ConfusionMatrixDisplay(cm_valid[idx]) for idx in range(3))
print("\nClassification report")
print(classification_report(y_valid, y_pred_val))
# Titles are hard-coded; presumably they follow the binarizer's class order -- verify.
for panel,title in ((t1,'Lung cancer, post-operation'),
                    (t2,'Lung cancer, pre-operation'),
                    (t3,'Non-cancer control')):
    panel.plot()
    panel.ax_.set_title(title)

In [None]:
# (macro F1, accuracy) on the test split for the predictions currently in y_pred_test.
(f1_score(y_test,y_pred_test,average='macro'),accuracy_score(y_test,y_pred_test))

In [None]:
# (macro F1, accuracy) on the validation split for the predictions currently in y_pred_val.
(f1_score(y_valid,y_pred_val,average='macro'),accuracy_score(y_valid,y_pred_val))