In [None]:
# @title Library Import
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides scikit-learn diagnostics such as
# ConvergenceWarning (plausible with the MLP grid's max_iter=100) — consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')


In [None]:
# @title Google Drive

# Colab-only step: mounts Google Drive at /content/drive.
# NOTE(review): `google.colab` is imported here rather than in the import cell,
# presumably because it only exists on Colab — this cell fails elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [32]:
# @title Common Parameters

# How many principal components the PCA step keeps
No_Features = 10

# How many folds the cross-validation loops use
num_folds = 5

# Small, shallow random forest shared by the classifier cells; it is refit on
# every fold to pseudo-label the unlabeled cohort.
rf_classifier = RandomForestClassifier(
    random_state=42,
    bootstrap=True,
    criterion='gini',
    n_estimators=5,
    max_depth=2,
    min_samples_leaf=3,
    min_samples_split=5,
)

In [75]:
# @title Load data

# All three workbooks are read without a header row, so columns are labelled
# by integer position (0, 1, 2, ...). The outcome series is column index 2 of
# the outcome workbook and therefore carries the name 2, which the classifier
# cells rely on when they select `pd.DataFrame(LC_Y_train)[2]`.
# The ".../" prefixes are placeholders to be replaced with real paths.
LC_imaging_features = pd.read_excel(".../LC_RF_PT.xlsx" , engine='openpyxl',header=None)   ## Load your Lung Cancer dataset (imaging features)
HC_imaging_features = pd.read_excel(".../HC_RF_PT.xlsx" , engine='openpyxl',header=None)   ## Load your Head and Neck Cancer dataset (imaging features, used unlabeled)
lung_outcome = pd.read_excel(".../LC_outcome.xlsx" , engine='openpyxl',header=None).iloc[:, 2]   ## Load your Lung Cancer outcomes (third column)

In [111]:
# @title Split lung data for five fold cross validation and external testing

# Hold out 20% of the lung cohort as the external test set; the remaining 80%
# feeds the five-fold cross-validation in the classifier cells.
# The split is seeded (same seed as KFold / the random forest) so that
# Restart & Run All reproduces the same partition and the same reported scores.
# NOTE(review): consider `stratify=lung_outcome` if the outcome classes are
# imbalanced — not enabled here because it changes the partition semantics.
LC_X_train, LC_X_test, LC_Y_train, LC_Y_test = train_test_split(
    LC_imaging_features,
    lung_outcome,
    test_size=0.20,
    random_state=42,
)

In [77]:
# @title Concatenate Datasets for scaling and PCA

# Stack the lung training rows on top of the head-and-neck rows (row-wise), so
# that the first LC_X_train.shape[0] rows of the result are the labeled lung
# samples and the rest are the unlabeled head-and-neck samples.
concatinated_imaging_features = np.concatenate(
    [LC_X_train, HC_imaging_features],
    axis=0,
)

In [78]:
# @title Normalize datasets

# Learn the per-feature mean / standard deviation on the stacked
# (lung-train + head-and-neck) matrix only; the held-out lung test set is never
# used for fitting.
scaler = StandardScaler().fit(concatinated_imaging_features)

# Apply the same fitted scaling to both matrices.
scaled_concatinated_imaging_features = scaler.transform(concatinated_imaging_features)
scaled_LC_X_test = scaler.transform(LC_X_test)

In [79]:
# @title  Reduce feature size by PCA

# Fit PCA on the scaled (lung-train + head-and-neck) matrix and keep
# No_Features components. `random_state` is fixed because the default
# svd_solver='auto' may select the randomized SVD solver, whose output
# otherwise differs from run to run.
pca = PCA(n_components=No_Features, random_state=42)
pca.fit(scaled_concatinated_imaging_features)

# Project both the fitting matrix and the held-out lung test set onto the
# same components.
pca_scaled_concatinated_imaging_features = pca.transform(scaled_concatinated_imaging_features)
pca_scaled_LC_X_test = pca.transform(scaled_LC_X_test)

In [None]:
# @title  Print total variance explained by the PCA components

# Sum of the per-component explained-variance ratios, as a percentage.
explained_variance_ratio = pca.explained_variance_ratio_
total_explained_variance = np.sum(explained_variance_ratio) * 100

# Report the component count actually used by the fitted PCA instead of a
# hard-coded "10", so the message stays correct when No_Features changes.
print(f"Total variance explained by {pca.n_components_} components: {total_explained_variance:.2f}%")

In [None]:
# @title MLP Classifier

# Multi-layer Perceptron (MLP)
# `learning_rate` is intentionally not searched: scikit-learn only uses it when
# solver='sgd', and the default 'adam' solver is used here, so listing
# ['constant', 'invscaling', 'adaptive'] tripled the search cost for identical
# models (the default, 'constant', is what the search selected on ties anyway).
# Add 'solver': ['sgd'] together with 'learning_rate' to search it for real.
mlp_param_space = {
    'max_iter' : [100],
    'activation' :['logistic', 'tanh', 'relu']
}

# The first LC_X_train.shape[0] rows of the PCA matrix are the labeled lung
# training samples; the remaining rows are the unlabeled head-and-neck samples.
labeled_LC_X = pca_scaled_concatinated_imaging_features[:LC_X_train.shape[0]]
unlabeled_HC_X = pca_scaled_concatinated_imaging_features[LC_X_train.shape[0]:]

labeled_LC_X = pd.DataFrame(labeled_LC_X).reset_index(drop=True)

# Column label 2 is the name the outcome series carries from the load cell.
# A positional 0..n-1 index is needed so that .iloc fold indices line up with
# labeled_LC_X.
LC_Y_train = pd.DataFrame(LC_Y_train)[2].reset_index(drop=True)

# Per-fold accuracies: internal validation fold (no suffix) and held-out lung
# test set (_EX), for the semi-supervised (SSL) and supervised (SL) models.
cv_scores_SSL= []
cv_scores_SL= []
cv_scores_SSL_EX = []
cv_scores_SL_EX = []
# Per-fold AUC on the held-out test set. These values were previously computed
# but overwritten on every fold and never reported.
auc_scores_SSL_EX = []
auc_scores_SL_EX = []

# Initialize KFold with shuffling
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform cross-validation
for i, (train_index, val_index) in enumerate(kf.split(labeled_LC_X)):

    # Split data into training and validation sets
    X_train_fold, X_val_fold = labeled_LC_X.iloc[train_index], labeled_LC_X.iloc[val_index]
    y_train_fold, y_val_fold = LC_Y_train.iloc[train_index], LC_Y_train.iloc[val_index]

    # Train the pseudo-labeler on this fold's labeled lung data only
    rf_classifier.fit(X_train_fold, y_train_fold)

    # Pseudo-label the unlabeled head-and-neck samples
    predictions_HC = rf_classifier.predict(unlabeled_HC_X)

    # Concatenate pseudo-labeled HC patients with the labeled LC fold
    Concatinated_X_Folds = np.concatenate((X_train_fold, unlabeled_HC_X), axis=0)
    Concatinated_Y_Folds = np.concatenate((y_train_fold, predictions_HC), axis=0)

    ### Semisupervised ###
    # random_state makes weight initialisation reproducible across runs.
    Semisupervised_grid_search = GridSearchCV(estimator=MLPClassifier(random_state=42), param_grid=mlp_param_space, cv=5)
    Semisupervised_grid_search.fit(Concatinated_X_Folds, Concatinated_Y_Folds)
    # GridSearchCV (refit=True by default) has already refit best_estimator_ on
    # the full Concatinated_*_Folds data, so no second .fit() is needed.
    MLP_Semisupervised_best_regressor = Semisupervised_grid_search.best_estimator_

    #Internal
    y_pred = MLP_Semisupervised_best_regressor.predict(X_val_fold)
    accuracy_SSL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SSL.append(accuracy_SSL)

    #External
    y_pred = MLP_Semisupervised_best_regressor.predict(pca_scaled_LC_X_test)
    # Probability of the second class in classes_ (used as the ROC score)
    accuracy_SSL_proba = MLP_Semisupervised_best_regressor.predict_proba(pca_scaled_LC_X_test)[:, 1]
    accuracy_SSL = accuracy_score(LC_Y_test, y_pred)
    cv_scores_SSL_EX.append(accuracy_SSL)
    auc_SSL = roc_auc_score(LC_Y_test, accuracy_SSL_proba)
    auc_scores_SSL_EX.append(auc_SSL)

    ### Supervised ###
    Supervised_grid_search = GridSearchCV(estimator=MLPClassifier(random_state=42), param_grid=mlp_param_space, cv=5)
    Supervised_grid_search.fit(X_train_fold, y_train_fold)
    # Already refit on the full training fold by GridSearchCV.
    MLP_Supervised_best_regressor = Supervised_grid_search.best_estimator_

    #Internal
    y_pred = MLP_Supervised_best_regressor.predict(X_val_fold)
    accuracy_SL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SL.append(accuracy_SL)

    #External
    y_pred = MLP_Supervised_best_regressor.predict(pca_scaled_LC_X_test)
    accuracy_SL_proba = MLP_Supervised_best_regressor.predict_proba(pca_scaled_LC_X_test)[:, 1]
    accuracy_SL = accuracy_score(LC_Y_test, y_pred)
    cv_scores_SL_EX.append(accuracy_SL)
    auc_SL = roc_auc_score(LC_Y_test, accuracy_SL_proba)
    auc_scores_SL_EX.append(auc_SL)

# Each line prints [mean, standard deviation] over the folds.
print("Mean accuracy_SSL:", [(round(np.mean(cv_scores_SSL),2)),round(np.std(cv_scores_SSL),2)])
print("Mean accuracy_SL:", [round(np.mean(cv_scores_SL),2),round(np.std(cv_scores_SL),2)])
print("Mean accuracy_NestedXT_SSL:", [round(np.mean(cv_scores_SSL_EX),2),round(np.std(cv_scores_SSL_EX),2)])
print("Mean accuracy_NestedXT_SL:", [round(np.mean(cv_scores_SL_EX),2),round(np.std(cv_scores_SL_EX),2)])
print("Mean AUC_NestedXT_SSL:", [round(np.mean(auc_scores_SSL_EX),2),round(np.std(auc_scores_SSL_EX),2)])
print("Mean AUC_NestedXT_SL:", [round(np.mean(auc_scores_SL_EX),2),round(np.std(auc_scores_SL_EX),2)])

In [None]:
# @title SVM Classifier

# Support Vector Machine (SVM)
# `degree` is intentionally not searched: SVC only uses it for kernel='poly'
# and ignores it for the 'linear' and 'sigmoid' kernels listed here, so
# searching [3, 5] doubled the number of fits for identical models (the
# default, 3, is what the search selected on ties anyway). Add 'degree' back
# together with 'poly' if polynomial kernels are wanted.
svc_param_space = {
    'C': [0.001, 0.1, 1],
    'kernel': ['linear', 'sigmoid'],
    'probability':[True]
}

# The first LC_X_train.shape[0] rows of the PCA matrix are the labeled lung
# training samples; the remaining rows are the unlabeled head-and-neck samples.
labeled_LC_X = pca_scaled_concatinated_imaging_features[:LC_X_train.shape[0]]
unlabeled_HC_X = pca_scaled_concatinated_imaging_features[LC_X_train.shape[0]:]

labeled_LC_X = pd.DataFrame(labeled_LC_X).reset_index(drop=True)

# Column label 2 is the name the outcome series carries from the load cell.
# A positional 0..n-1 index is needed so that .iloc fold indices line up with
# labeled_LC_X.
LC_Y_train = pd.DataFrame(LC_Y_train)[2].reset_index(drop=True)

# Per-fold accuracies: internal validation fold (no suffix) and held-out lung
# test set (_EX), for the semi-supervised (SSL) and supervised (SL) models.
cv_scores_SSL= []
cv_scores_SL= []
cv_scores_SSL_EX = []
cv_scores_SL_EX = []
# Per-fold AUC on the held-out test set. These values were previously computed
# but overwritten on every fold and never reported.
auc_scores_SSL_EX = []
auc_scores_SL_EX = []

# Initialize KFold with shuffling
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform cross-validation
for i, (train_index, val_index) in enumerate(kf.split(labeled_LC_X)):

    # Split data into training and validation sets
    X_train_fold, X_val_fold = labeled_LC_X.iloc[train_index], labeled_LC_X.iloc[val_index]
    y_train_fold, y_val_fold = LC_Y_train.iloc[train_index], LC_Y_train.iloc[val_index]

    # Train the pseudo-labeler on this fold's labeled lung data only
    rf_classifier.fit(X_train_fold, y_train_fold)

    # Pseudo-label the unlabeled head-and-neck samples
    predictions_HC = rf_classifier.predict(unlabeled_HC_X)

    # Concatenate pseudo-labeled HC patients with the labeled LC fold
    Concatinated_X_Folds = np.concatenate((X_train_fold, unlabeled_HC_X), axis=0)
    Concatinated_Y_Folds = np.concatenate((y_train_fold, predictions_HC), axis=0)

    ### Semisupervised ###
    # random_state fixes the internal shuffling SVC uses for its probability
    # estimates (probability=True), making predict_proba / AUC reproducible.
    Semisupervised_grid_search = GridSearchCV(estimator=SVC(random_state=42), param_grid=svc_param_space, cv=5)
    Semisupervised_grid_search.fit(Concatinated_X_Folds, Concatinated_Y_Folds)
    # GridSearchCV (refit=True by default) has already refit best_estimator_ on
    # the full Concatinated_*_Folds data, so no second .fit() is needed.
    SVC_Semisupervised_best_regressor = Semisupervised_grid_search.best_estimator_

    #Internal
    y_pred = SVC_Semisupervised_best_regressor.predict(X_val_fold)
    accuracy_SSL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SSL.append(accuracy_SSL)

    #External
    y_pred = SVC_Semisupervised_best_regressor.predict(pca_scaled_LC_X_test)
    # Probability of the second class in classes_ (used as the ROC score)
    accuracy_SSL_proba = SVC_Semisupervised_best_regressor.predict_proba(pca_scaled_LC_X_test)[:, 1]
    accuracy_SSL = accuracy_score(LC_Y_test, y_pred)
    cv_scores_SSL_EX.append(accuracy_SSL)
    auc_SSL = roc_auc_score(LC_Y_test, accuracy_SSL_proba)
    auc_scores_SSL_EX.append(auc_SSL)

    ### Supervised ###
    Supervised_grid_search = GridSearchCV(estimator=SVC(random_state=42), param_grid=svc_param_space, cv=5)
    Supervised_grid_search.fit(X_train_fold, y_train_fold)
    # Already refit on the full training fold by GridSearchCV.
    SVC_Supervised_best_regressor = Supervised_grid_search.best_estimator_

    #Internal
    y_pred = SVC_Supervised_best_regressor.predict(X_val_fold)
    accuracy_SL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SL.append(accuracy_SL)

    #External
    y_pred = SVC_Supervised_best_regressor.predict(pca_scaled_LC_X_test)
    accuracy_SL_proba = SVC_Supervised_best_regressor.predict_proba(pca_scaled_LC_X_test)[:, 1]
    accuracy_SL = accuracy_score(LC_Y_test, y_pred)
    cv_scores_SL_EX.append(accuracy_SL)
    auc_SL = roc_auc_score(LC_Y_test, accuracy_SL_proba)
    auc_scores_SL_EX.append(auc_SL)

# Each line prints [mean, standard deviation] over the folds.
print("Mean accuracy_SSL:", [(round(np.mean(cv_scores_SSL),2)),round(np.std(cv_scores_SSL),2)])
print("Mean accuracy_SL:", [round(np.mean(cv_scores_SL),2),round(np.std(cv_scores_SL),2)])
print("Mean accuracy_NestedXT_SSL:", [round(np.mean(cv_scores_SSL_EX),2),round(np.std(cv_scores_SSL_EX),2)])
print("Mean accuracy_NestedXT_SL:", [round(np.mean(cv_scores_SL_EX),2),round(np.std(cv_scores_SL_EX),2)])
print("Mean AUC_NestedXT_SSL:", [round(np.mean(auc_scores_SSL_EX),2),round(np.std(auc_scores_SSL_EX),2)])
print("Mean AUC_NestedXT_SL:", [round(np.mean(auc_scores_SL_EX),2),round(np.std(auc_scores_SL_EX),2)])

In [None]:
# @title KNN Classifier

# k-nearest neighbors (KNN)
knn_param_space = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# The first LC_X_train.shape[0] rows of the PCA matrix are the labeled lung
# training samples; the remaining rows are the unlabeled head-and-neck samples.
labeled_LC_X = pca_scaled_concatinated_imaging_features[:LC_X_train.shape[0]]
unlabeled_HC_X = pca_scaled_concatinated_imaging_features[LC_X_train.shape[0]:]

labeled_LC_X = pd.DataFrame(labeled_LC_X).reset_index(drop=True)

# Column label 2 is the name the outcome series carries from the load cell.
# A positional 0..n-1 index is needed so that .iloc fold indices line up with
# labeled_LC_X.
LC_Y_train = pd.DataFrame(LC_Y_train)[2].reset_index(drop=True)

# Per-fold accuracies: internal validation fold (no suffix) and held-out lung
# test set (_EX), for the semi-supervised (SSL) and supervised (SL) models.
cv_scores_SSL= []
cv_scores_SL= []
cv_scores_SSL_EX = []
cv_scores_SL_EX = []
# Per-fold AUC on the held-out test set. These values were previously computed
# but overwritten on every fold and never reported.
auc_scores_SSL_EX = []
auc_scores_SL_EX = []

# Initialize KFold with shuffling
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform cross-validation
for i, (train_index, val_index) in enumerate(kf.split(labeled_LC_X)):

    # Split data into training and validation sets
    X_train_fold, X_val_fold = labeled_LC_X.iloc[train_index], labeled_LC_X.iloc[val_index]
    y_train_fold, y_val_fold = LC_Y_train.iloc[train_index], LC_Y_train.iloc[val_index]

    # Train the pseudo-labeler on this fold's labeled lung data only
    rf_classifier.fit(X_train_fold, y_train_fold)

    # Pseudo-label the unlabeled head-and-neck samples
    predictions_HC = rf_classifier.predict(unlabeled_HC_X)

    # Concatenate pseudo-labeled HC patients with the labeled LC fold
    Concatinated_X_Folds = np.concatenate((X_train_fold, unlabeled_HC_X), axis=0)
    Concatinated_Y_Folds = np.concatenate((y_train_fold, predictions_HC), axis=0)

    ### Semisupervised ###
    Semisupervised_grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_param_space, cv=5)
    Semisupervised_grid_search.fit(Concatinated_X_Folds, Concatinated_Y_Folds)
    # GridSearchCV (refit=True by default) has already refit best_estimator_ on
    # the full Concatinated_*_Folds data, so no second .fit() is needed.
    KNN_Semisupervised_best_regressor = Semisupervised_grid_search.best_estimator_

    #Internal
    y_pred = KNN_Semisupervised_best_regressor.predict(X_val_fold)
    accuracy_SSL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SSL.append(accuracy_SSL)

    #External
    y_pred = KNN_Semisupervised_best_regressor.predict(pca_scaled_LC_X_test)
    # Probability of the second class in classes_ (used as the ROC score)
    accuracy_SSL_proba = KNN_Semisupervised_best_regressor.predict_proba(pca_scaled_LC_X_test)[:, 1]
    accuracy_SSL = accuracy_score(LC_Y_test, y_pred)
    cv_scores_SSL_EX.append(accuracy_SSL)
    auc_SSL = roc_auc_score(LC_Y_test, accuracy_SSL_proba)
    auc_scores_SSL_EX.append(auc_SSL)

    ### Supervised ###
    Supervised_grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_param_space, cv=5)
    Supervised_grid_search.fit(X_train_fold, y_train_fold)
    # Already refit on the full training fold by GridSearchCV.
    KNN_Supervised_best_regressor = Supervised_grid_search.best_estimator_

    #Internal
    y_pred = KNN_Supervised_best_regressor.predict(X_val_fold)
    accuracy_SL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SL.append(accuracy_SL)

    #External
    y_pred = KNN_Supervised_best_regressor.predict(pca_scaled_LC_X_test)
    accuracy_SL_proba = KNN_Supervised_best_regressor.predict_proba(pca_scaled_LC_X_test)[:, 1]
    accuracy_SL = accuracy_score(LC_Y_test, y_pred)
    cv_scores_SL_EX.append(accuracy_SL)
    auc_SL = roc_auc_score(LC_Y_test, accuracy_SL_proba)
    auc_scores_SL_EX.append(auc_SL)

# Each line prints [mean, standard deviation] over the folds.
print("Mean accuracy_SSL:", [(round(np.mean(cv_scores_SSL),2)),round(np.std(cv_scores_SSL),2)])
print("Mean accuracy_SL:", [round(np.mean(cv_scores_SL),2),round(np.std(cv_scores_SL),2)])
print("Mean accuracy_NestedXT_SSL:", [round(np.mean(cv_scores_SSL_EX),2),round(np.std(cv_scores_SSL_EX),2)])
print("Mean accuracy_NestedXT_SL:", [round(np.mean(cv_scores_SL_EX),2),round(np.std(cv_scores_SL_EX),2)])
print("Mean AUC_NestedXT_SSL:", [round(np.mean(auc_scores_SSL_EX),2),round(np.std(auc_scores_SSL_EX),2)])
print("Mean AUC_NestedXT_SL:", [round(np.mean(auc_scores_SL_EX),2),round(np.std(auc_scores_SL_EX),2)])

In [None]:
# @title EV Classifier

# Ensemble Voting (EV)
# The base estimators are the tuned models left behind by the MLP, SVM and KNN
# cells (from their last CV fold), so those three cells must be run first.
# VotingClassifier fits clones of the estimators it is given, so only their
# hyper-parameters are reused here; each fold below retrains them from scratch.
estimators_SL = [
    ("mlp_classifier_SL", MLP_Supervised_best_regressor),
    ("svm_clf_SL", SVC_Supervised_best_regressor),
    ("KNN_clf_SL", KNN_Supervised_best_regressor)]

# The semi-supervised members were previously labelled with the "_SL" suffix
# (copy-paste slip); they now carry "_SSL" so named_estimators_ is unambiguous.
estimators_SSL = [
    ("mlp_classifier_SSL", MLP_Semisupervised_best_regressor),
    ("svm_clf_SSL", SVC_Semisupervised_best_regressor),
    ("KNN_clf_SSL", KNN_Semisupervised_best_regressor)]

# The first LC_X_train.shape[0] rows of the PCA matrix are the labeled lung
# training samples; the remaining rows are the unlabeled head-and-neck samples.
labeled_LC_X = pca_scaled_concatinated_imaging_features[:LC_X_train.shape[0]]
unlabeled_HC_X = pca_scaled_concatinated_imaging_features[LC_X_train.shape[0]:]

labeled_LC_X = pd.DataFrame(labeled_LC_X).reset_index(drop=True)

# Column label 2 is the name the outcome series carries from the load cell.
# A positional 0..n-1 index is needed so that .iloc fold indices line up with
# labeled_LC_X.
LC_Y_train = pd.DataFrame(LC_Y_train)[2].reset_index(drop=True)

# Per-fold accuracies: internal validation fold (no suffix) and held-out lung
# test set (_EX), for the semi-supervised (SSL) and supervised (SL) ensembles.
cv_scores_SSL= []
cv_scores_SL= []
cv_scores_SSL_EX = []
cv_scores_SL_EX = []

# Initialize KFold with shuffling
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform cross-validation
for i, (train_index, val_index) in enumerate(kf.split(labeled_LC_X)):

    # Split data into training and validation sets
    X_train_fold, X_val_fold = labeled_LC_X.iloc[train_index], labeled_LC_X.iloc[val_index]
    y_train_fold, y_val_fold = LC_Y_train.iloc[train_index], LC_Y_train.iloc[val_index]

    # Train the pseudo-labeler on this fold's labeled lung data only
    rf_classifier.fit(X_train_fold, y_train_fold)

    # Pseudo-label the unlabeled head-and-neck samples
    predictions_HC = rf_classifier.predict(unlabeled_HC_X)

    # Concatenate pseudo-labeled HC patients with the labeled LC fold
    Concatinated_X_Folds = np.concatenate((X_train_fold, unlabeled_HC_X), axis=0)
    Concatinated_Y_Folds = np.concatenate((y_train_fold, predictions_HC), axis=0)

    ### Semisupervised ###
    # Default voting='hard': majority vote over the members' predicted labels.
    Semisupervised_best_regressor = VotingClassifier(estimators_SSL)

    #Internal
    Semisupervised_best_regressor.fit(Concatinated_X_Folds, Concatinated_Y_Folds)
    y_pred = Semisupervised_best_regressor.predict(X_val_fold)
    accuracy_SSL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SSL.append(accuracy_SSL)

    #External
    y_pred = Semisupervised_best_regressor.predict(pca_scaled_LC_X_test)
    accuracy_SSL = accuracy_score(LC_Y_test, y_pred)
    cv_scores_SSL_EX.append(accuracy_SSL)

    ### Supervised ###
    Supervised_best_regressor = VotingClassifier(estimators_SL)

    #Internal
    Supervised_best_regressor.fit(X_train_fold, y_train_fold)
    y_pred = Supervised_best_regressor.predict(X_val_fold)
    accuracy_SL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SL.append(accuracy_SL)

    #External
    y_pred = Supervised_best_regressor.predict(pca_scaled_LC_X_test)
    accuracy_SL = accuracy_score(LC_Y_test, y_pred)
    cv_scores_SL_EX.append(accuracy_SL)

# Each line prints [mean, standard deviation] over the folds.
print("Mean accuracy_SSL:", [(round(np.mean(cv_scores_SSL),2)),round(np.std(cv_scores_SSL),2)])
print("Mean accuracy_SL:", [round(np.mean(cv_scores_SL),2),round(np.std(cv_scores_SL),2)])
print("Mean accuracy_NestedXT_SSL:", [round(np.mean(cv_scores_SSL_EX),2),round(np.std(cv_scores_SSL_EX),2)])
print("Mean accuracy_NestedXT_SL:", [round(np.mean(cv_scores_SL_EX),2),round(np.std(cv_scores_SL_EX),2)])

In [None]:
# @title XGB Classifier

# eXtreme Gradient Boosting (XGB)
xgb_params = {
                'objective': 'binary:logistic',
                'eval_metric': 'logloss',
                'max_depth': 10,
                'eta': 0.9,
                'seed': 42}

# The first LC_X_train.shape[0] rows of the PCA matrix are the labeled lung
# training samples; the remaining rows are the unlabeled head-and-neck samples.
labeled_LC_X = pca_scaled_concatinated_imaging_features[:LC_X_train.shape[0]]
unlabeled_HC_X = pca_scaled_concatinated_imaging_features[LC_X_train.shape[0]:]

labeled_LC_X = pd.DataFrame(labeled_LC_X).reset_index(drop=True)

# Column label 2 is the name the outcome series carries from the load cell.
# A positional 0..n-1 index is needed so that .iloc fold indices line up with
# labeled_LC_X.
LC_Y_train = pd.DataFrame(LC_Y_train)[2].reset_index(drop=True)

# Per-fold accuracies: internal validation fold (no suffix) and held-out lung
# test set (_EX), for the semi-supervised (SSL) and supervised (SL) models.
cv_scores_SSL= []
cv_scores_SL= []
cv_scores_SSL_EX = []
cv_scores_SL_EX = []

# The external test set does not change between folds, so its shifted labels
# (1 -> 0, 2 -> 1) and its DMatrix are built once instead of twice per fold.
LC_Y_test_n = (LC_Y_test - 1)
dtest = xgb.DMatrix(pca_scaled_LC_X_test, label=LC_Y_test_n)

# Initialize KFold with shuffling
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform cross-validation
for i, (train_index, val_index) in enumerate(kf.split(labeled_LC_X)):

    # Split data into training and validation sets
    X_train_fold, X_val_fold = labeled_LC_X.iloc[train_index], labeled_LC_X.iloc[val_index]
    y_train_fold, y_val_fold = LC_Y_train.iloc[train_index], LC_Y_train.iloc[val_index]

    # Train the pseudo-labeler on this fold's labeled lung data only
    # (uses the original label coding, before the shift below)
    rf_classifier.fit(X_train_fold, y_train_fold)

    # Pseudo-label the unlabeled head-and-neck samples
    predictions_HC = rf_classifier.predict(unlabeled_HC_X)

    # Concatenate pseudo-labeled HC patients with the labeled LC fold
    Concatinated_X_Folds = np.concatenate((X_train_fold, unlabeled_HC_X), axis=0)
    Concatinated_Y_Folds = np.concatenate((y_train_fold, predictions_HC), axis=0)

    # Change 1 -> 0 and 2 -> 1 (binary:logistic expects 0/1 labels)
    Concatinated_Y_Folds = (Concatinated_Y_Folds - 1)
    y_val_fold = (y_val_fold - 1)
    y_train_fold = (y_train_fold - 1)

    # Every DMatrix is built from a plain ndarray. A DMatrix built from a
    # DataFrame carries column-derived feature names while one built from an
    # ndarray carries none, and XGBoost checks feature names at predict time,
    # so mixing the two for the same booster can raise a ValueError.
    dval = xgb.DMatrix(X_val_fold.to_numpy(), label=y_val_fold)

    ### Semisupervised ###
    ssl_train = xgb.DMatrix(Concatinated_X_Folds, label=Concatinated_Y_Folds)
    XGB_Semisupervised_best_regressor = xgb.train(params=xgb_params, dtrain=ssl_train, num_boost_round=100)

    #Internal
    # predict() returns probabilities; threshold at 0.5 to get class labels
    y_pred = (XGB_Semisupervised_best_regressor.predict(dval)> 0.5).astype(int)
    accuracy_SSL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SSL.append(accuracy_SSL)

    #External
    y_pred = (XGB_Semisupervised_best_regressor.predict(dtest)> 0.5).astype(int)
    accuracy_SSL = accuracy_score(LC_Y_test_n, y_pred)
    cv_scores_SSL_EX.append(accuracy_SSL)

    ### Supervised ###
    sl_train = xgb.DMatrix(X_train_fold.to_numpy(), label=y_train_fold)
    XGB_Supervised_best_regressor = xgb.train(params=xgb_params, dtrain=sl_train, num_boost_round=100)

    #Internal
    y_pred = (XGB_Supervised_best_regressor.predict(dval)> 0.5).astype(int)
    accuracy_SL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SL.append(accuracy_SL)

    #External
    y_pred = (XGB_Supervised_best_regressor.predict(dtest)> 0.5).astype(int)
    accuracy_SL = accuracy_score(LC_Y_test_n, y_pred)
    cv_scores_SL_EX.append(accuracy_SL)


# Each line prints [mean, standard deviation] over the folds.
print("Mean accuracy_SSL:", [(round(np.mean(cv_scores_SSL),2)),round(np.std(cv_scores_SSL),2)])
print("Mean accuracy_SL:", [round(np.mean(cv_scores_SL),2),round(np.std(cv_scores_SL),2)])
print("Mean accuracy_NestedXT_SSL:", [round(np.mean(cv_scores_SSL_EX),2),round(np.std(cv_scores_SSL_EX),2)])
print("Mean accuracy_NestedXT_SL:", [round(np.mean(cv_scores_SL_EX),2),round(np.std(cv_scores_SL_EX),2)])

In [None]:
# @title LGB Classifier

# Light Gradient Boosting Machine (LGB)
lgb_params = {
                'objective': 'binary',
                'metric': 'binary_logloss',
                'boosting_type': 'gbdt',
                'num_leaves': 10,
                'learning_rate': 0.5,
                'feature_fraction': 0.5,
                'seed': 42}

# The first LC_X_train.shape[0] rows of the PCA matrix are the labeled lung
# training samples; the remaining rows are the unlabeled head-and-neck samples.
labeled_LC_X = pca_scaled_concatinated_imaging_features[:LC_X_train.shape[0]]
unlabeled_HC_X = pca_scaled_concatinated_imaging_features[LC_X_train.shape[0]:]

labeled_LC_X = pd.DataFrame(labeled_LC_X).reset_index(drop=True)

# Column label 2 is the name the outcome series carries from the load cell.
# A positional 0..n-1 index is needed so that .iloc fold indices line up with
# labeled_LC_X.
LC_Y_train = pd.DataFrame(LC_Y_train)[2].reset_index(drop=True)

# Per-fold accuracies: internal validation fold (no suffix) and held-out lung
# test set (_EX), for the semi-supervised (SSL) and supervised (SL) models.
cv_scores_SSL= []
cv_scores_SL= []
cv_scores_SSL_EX = []
cv_scores_SL_EX = []

# The external test labels do not change between folds, so the 1 -> 0 / 2 -> 1
# shift is done once. (The lgb.Dataset wrappers previously built for the test
# and validation data were never used — predict() takes the raw matrices — so
# they have been dropped.)
LC_Y_test_n = (LC_Y_test - 1)

# Initialize KFold with shuffling
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform cross-validation
for i, (train_index, val_index) in enumerate(kf.split(labeled_LC_X)):

    # Split data into training and validation sets
    X_train_fold, X_val_fold = labeled_LC_X.iloc[train_index], labeled_LC_X.iloc[val_index]
    y_train_fold, y_val_fold = LC_Y_train.iloc[train_index], LC_Y_train.iloc[val_index]

    # Train the pseudo-labeler on this fold's labeled lung data only
    # (uses the original label coding, before the shift below)
    rf_classifier.fit(X_train_fold, y_train_fold)

    # Pseudo-label the unlabeled head-and-neck samples
    predictions_HC = rf_classifier.predict(unlabeled_HC_X)

    # Concatenate pseudo-labeled HC patients with the labeled LC fold
    Concatinated_X_Folds = np.concatenate((X_train_fold, unlabeled_HC_X), axis=0)
    Concatinated_Y_Folds = np.concatenate((y_train_fold, predictions_HC), axis=0)

    # Change 1 -> 0 and 2 -> 1 (the 'binary' objective expects 0/1 labels)
    Concatinated_Y_Folds = (Concatinated_Y_Folds - 1)
    y_val_fold = (y_val_fold - 1)
    y_train_fold = (y_train_fold - 1)

    ### Semisupervised ###
    ssl_train = lgb.Dataset(Concatinated_X_Folds, label=Concatinated_Y_Folds)
    LGB_Semisupervised_best_regressor = lgb.train(params=lgb_params, train_set=ssl_train, num_boost_round=100)

    #Internal
    # predict() returns probabilities; threshold at 0.5 to get class labels
    y_pred = (LGB_Semisupervised_best_regressor.predict(X_val_fold)> 0.5).astype(int)
    accuracy_SSL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SSL.append(accuracy_SSL)

    #External
    y_pred = (LGB_Semisupervised_best_regressor.predict(pca_scaled_LC_X_test)> 0.5).astype(int)
    accuracy_SSL = accuracy_score(LC_Y_test_n, y_pred)
    cv_scores_SSL_EX.append(accuracy_SSL)

    ### Supervised ###
    sl_train = lgb.Dataset(X_train_fold, label=y_train_fold)
    LGB_Supervised_best_regressor = lgb.train(params=lgb_params, train_set=sl_train, num_boost_round=100)

    #Internal
    y_pred = (LGB_Supervised_best_regressor.predict(X_val_fold)> 0.5).astype(int)
    accuracy_SL = accuracy_score(y_val_fold, y_pred)
    cv_scores_SL.append(accuracy_SL)

    #External
    y_pred = (LGB_Supervised_best_regressor.predict(pca_scaled_LC_X_test)> 0.5).astype(int)
    accuracy_SL = accuracy_score(LC_Y_test_n, y_pred)
    cv_scores_SL_EX.append(accuracy_SL)


# Each line prints [mean, standard deviation] over the folds.
print("Mean accuracy_SSL:", [(round(np.mean(cv_scores_SSL),2)),round(np.std(cv_scores_SSL),2)])
print("Mean accuracy_SL:", [round(np.mean(cv_scores_SL),2),round(np.std(cv_scores_SL),2)])
print("Mean accuracy_NestedXT_SSL:", [round(np.mean(cv_scores_SSL_EX),2),round(np.std(cv_scores_SSL_EX),2)])
print("Mean accuracy_NestedXT_SL:", [round(np.mean(cv_scores_SL_EX),2),round(np.std(cv_scores_SL_EX),2)])

In [None]:
# @title ROC Plotting
# Template cell: overlays two ROC curves (semi-supervised vs. supervised) with
# their AUC in the legend. Every `...` below is a literal Python Ellipsis used
# as a placeholder and must be replaced with the predicted scores of the model
# being plotted before this cell can run.
# NOTE(review): `Y_test_L_EX` is not defined in any cell shown above —
# presumably the held-out lung labels (`LC_Y_test`); confirm and rename.
fpr, tpr, thresholds = roc_curve(Y_test_L_EX, ...)
auc_1 = roc_auc_score(Y_test_L_EX, ...)

# Semi-supervised curve
plt.plot(fpr, tpr, label=f"... SSL ROC (AUC = {auc_1:.2f}) ")

# fpr / tpr / thresholds are reused for the second model
fpr, tpr, thresholds = roc_curve(Y_test_L_EX, ...)
auc_2 = roc_auc_score(Y_test_L_EX, ...)

# Supervised curve
plt.plot(fpr, tpr, label=f"... SL ROC (AUC = {auc_2:.2f}) ")

# Plot the ROC curve
plt.plot([0, 1], [0, 1], color='grey', linestyle='--', lw=1)  # Diagonal line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('(ROC) Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()

In [None]:
# @title Read curated data and Plot *it should be modified based on results.xlsx
# Template cell: reads the "Total" sheet of a curated results workbook and
# draws one grouped bar chart (semi-supervised vs. supervised accuracy with
# error bars) per row, saving each figure as a PNG named after the row's
# 'Feature type' and 'Modalities' values.
# NOTE(review): `x`, `width`, `sales_2019`, `sales_2020`, `std_2019`,
# `std_2020` and `stores` are not defined in any cell shown above — they must
# be derived from `row` / the workbook before this cell can run.
sup_results = pd.read_excel(".../Results.xlsx",sheet_name = "Total",header=0)

# Setting bold font globally
plt.rcParams['font.weight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titleweight'] = 'bold'

for i,row in sup_results.iterrows():

  # Creating the bars with error bars
  fig, ax = plt.subplots()
  bars1 = ax.bar(x - width/2, sales_2019, width, label='Semi-supervised', yerr=std_2019, capsize=2, color='red')
  bars2 = ax.bar(x + width/2, sales_2020, width, label='Supervised', yerr=std_2020, capsize=2, color='green')

  # Adding some text for labels, title, and custom x-axis tick labels, etc.
  ax.set_ylabel('Accuracy', fontsize=20)  # Increased font size for y-axis label
  ax.set_title(row['Feature type']+"-"+row['Modalities'], fontsize=20)  # Increased title font size for consistency
  ax.set_xticks(x)
  ax.set_xticklabels(stores, fontsize=16, fontweight='bold')  # Maintaining increased font size for x-tick labels

  # Set y-axis limits (accuracy is a fraction in [0, 1])
  ax.set_ylim(0, 1)

  # Increase font size of y-axis tick labels
  ax.tick_params(axis='y', labelsize=14)  # Set font size for y-axis tick labels

  # Adding grid
  ax.grid(True, linestyle='--', which='major', color='grey', alpha=0.5)

  # Show the plot
  # NOTE(review): the bars are given labels but ax.legend() is never called,
  # and figures are not closed inside the loop — confirm whether intended.
  plt.show()
  fig.savefig(row['Feature type']+"_"+row['Modalities']+'.png')