In [None]:
# import packages
import pandas as pd
from collections import defaultdict
import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import balanced_accuracy_score, accuracy_score, roc_auc_score, f1_score, recall_score, precision_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Single seed reused by the estimators and the cross-validation splitters below.
seed = 2024
# Feature table read from the working directory; one row per sample.
dataframe = pd.read_csv("radiomics.csv")

In [None]:
# Target vector: the 'label' column.
y = dataframe["label"]
# Feature matrix: every remaining column except the sample identifier.
X = dataframe.drop(columns=["id", "label"])

In [None]:
# Seed the stdlib and NumPy global generators with the shared seed.
import random
random.seed(seed)
np.random.seed(seed)

# Classifiers compared in every experiment, keyed by display name.
# The linear models are wrapped in a pipeline so features are standardised
# on the data passed to fit() before the classifier sees them.
models = {
    'RandomForest': RandomForestClassifier(random_state=seed, max_depth=10),
    'AdaBoost': AdaBoostClassifier(random_state=seed),
    'L-SVM': make_pipeline(
        StandardScaler(),
        # probability=True is required so predict_proba is available.
        SVC(kernel='linear', probability=True, random_state=seed),
    ),
    'LR': make_pipeline(
        StandardScaler(),
        LogisticRegression(random_state=seed),
    ),
}

In [None]:
# Baseline experiment: evaluate every model on the full feature set with
# repeated stratified 3-fold cross-validation (100 repeats).
kfold = RepeatedStratifiedKFold(n_splits=3, n_repeats=100, random_state=seed)
metrics_all = defaultdict(list)  # '<model>/<metric>' -> one score per fold
preds_all = defaultdict(lambda: defaultdict(list))  # model -> sample position -> scores
importances = []  # not filled in this cell
# Materialise the arrays once instead of on every fold.
X_values, y_values = X.values, y.values
for train_index, test_index in tqdm.tqdm(kfold.split(X, y), total=kfold.get_n_splits(X, y)):
    x_train, x_test = X_values[train_index], X_values[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]
    for name, model in models.items():
        model.fit(x_train, y_train)
        y_pred_pb = model.predict_proba(x_test)
        pred_max, pred_argmax = y_pred_pb.max(1), y_pred_pb.argmax(1)
        # Continuous score for the class in column 1 of predict_proba
        # (assumes a two-class problem with labels encoded as 0/1 -- TODO confirm).
        y_pred = np.where(pred_argmax == 1, pred_max, 1 - pred_max)
        # Keep every out-of-fold score per sample; a distinct loop variable
        # avoids shadowing any outer counter.
        for pos, idx in enumerate(test_index):
            preds_all[name][idx].append(y_pred[pos])
        # Threshold the scores once and reuse the labels for all label-based metrics.
        y_label = [round(p) for p in y_pred]
        metrics_all[f'{name}/recall'].append(recall_score(y_test, y_label))
        metrics_all[f'{name}/precision'].append(precision_score(y_test, y_label))
        metrics_all[f'{name}/f1'].append(f1_score(y_test, y_label))
        # AUC is the only metric computed on the continuous scores.
        metrics_all[f'{name}/auc'].append(roc_auc_score(y_test, y_pred))
        metrics_all[f'{name}/accuracy'].append(accuracy_score(y_test, y_label))
        metrics_all[f'{name}/balanced_accuracy'].append(balanced_accuracy_score(y_test, y_label))

In [None]:
# Summarise the baseline run: mean +/- standard deviation of each metric over the folds.
for name, scores in metrics_all.items():
    mean_score, std_score = np.mean(scores), np.std(scores)
    print(f'{name : <40} \t {mean_score:.2f} +/- {std_score:.2f}')

In [None]:
# Univariate selection experiment: keep the k features with the highest ANOVA
# F-score (fitted on the training fold only) and evaluate every model on them.
features = defaultdict(int)  # feature name -> number of folds in which it was selected
k=10
kfold = RepeatedStratifiedKFold(n_splits=3, n_repeats=100, random_state=seed)
metrics_Fscore = defaultdict(list)  # '<model>/<metric>' -> one score per fold
preds_Fscore = defaultdict(lambda: defaultdict(list))  # model -> sample position -> scores
# Materialise the arrays once instead of on every fold.
X_values, y_values = X.values, y.values
for train_index, test_index in tqdm.tqdm(kfold.split(X, y), total=kfold.get_n_splits(X, y)):
    x_train, x_test = X_values[train_index], X_values[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]
    s = SelectKBest(f_classif, k=k)
    x_train_reduced = s.fit_transform(x_train, y_train)
    x_test_reduced = s.transform(x_test)
    # Count exactly the columns the selector kept. Ranking s.scores_ by hand
    # would place NaN scores first after the flip and may break ties
    # differently from SelectKBest, so the counts could disagree with the
    # features the models are actually trained on.
    for f in X.columns[s.get_support()]:
        features[f] += 1
    for name, model in models.items():
        model.fit(x_train_reduced, y_train)
        y_pred_pb = model.predict_proba(x_test_reduced)
        pred_max, pred_argmax = y_pred_pb.max(1), y_pred_pb.argmax(1)
        # Continuous score for the class in column 1 of predict_proba
        # (assumes a two-class problem with labels encoded as 0/1 -- TODO confirm).
        y_pred = np.where(pred_argmax == 1, pred_max, 1 - pred_max)
        # Distinct loop variable: avoids shadowing any outer counter.
        for pos, idx in enumerate(test_index):
            preds_Fscore[name][idx].append(y_pred[pos])
        # Threshold the scores once and reuse the labels for all label-based metrics.
        y_label = [round(p) for p in y_pred]
        metrics_Fscore[f'{name}/recall'].append(recall_score(y_test, y_label))
        metrics_Fscore[f'{name}/precision'].append(precision_score(y_test, y_label))
        metrics_Fscore[f'{name}/f1'].append(f1_score(y_test, y_label))
        # AUC is the only metric computed on the continuous scores.
        metrics_Fscore[f'{name}/auc'].append(roc_auc_score(y_test, y_pred))
        metrics_Fscore[f'{name}/accuracy'].append(accuracy_score(y_test, y_label))
        metrics_Fscore[f'{name}/balanced_accuracy'].append(balanced_accuracy_score(y_test, y_label))

In [None]:
# Summarise the F-score selection run: mean +/- standard deviation of each metric over the folds.
for name, scores in metrics_Fscore.items():
    mean_score, std_score = np.mean(scores), np.std(scores)
    print(f'{name : <40} \t {mean_score:.2f} +/- {std_score:.2f}')

In [None]:
from functools import partial
from sklearn.feature_selection import mutual_info_classif
# Univariate selection experiment: keep the k features with the highest mutual
# information with the label (fitted on the training fold only) and evaluate
# every model on them.
features = defaultdict(int)  # feature name -> number of folds in which it was selected
k=10
kfold = RepeatedStratifiedKFold(n_splits=3, n_repeats=100, random_state=seed)
metrics_mi = defaultdict(list)  # '<model>/<metric>' -> one score per fold
preds_mi = defaultdict(lambda: defaultdict(list))  # model -> sample position -> scores
# mutual_info_classif is randomised; pin its random_state so the selection
# does not depend on the global NumPy RNG state (and hence on the order in
# which cells were executed).
mi_score = partial(mutual_info_classif, random_state=seed)
# Materialise the arrays once instead of on every fold.
X_values, y_values = X.values, y.values
for train_index, test_index in tqdm.tqdm(kfold.split(X, y), total=kfold.get_n_splits(X, y)):
    x_train, x_test = X_values[train_index], X_values[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]
    s = SelectKBest(mi_score, k=k)
    x_train_reduced = s.fit_transform(x_train, y_train)
    x_test_reduced = s.transform(x_test)
    # Count exactly the columns the selector kept. Ranking s.scores_ by hand
    # may break ties (e.g. several scores of 0) differently from SelectKBest,
    # so the counts could disagree with the features the models are trained on.
    for f in X.columns[s.get_support()]:
        features[f] += 1
    for name, model in models.items():
        model.fit(x_train_reduced, y_train)
        y_pred_pb = model.predict_proba(x_test_reduced)
        pred_max, pred_argmax = y_pred_pb.max(1), y_pred_pb.argmax(1)
        # Continuous score for the class in column 1 of predict_proba
        # (assumes a two-class problem with labels encoded as 0/1 -- TODO confirm).
        y_pred = np.where(pred_argmax == 1, pred_max, 1 - pred_max)
        # Distinct loop variable: avoids shadowing any outer counter.
        for pos, idx in enumerate(test_index):
            preds_mi[name][idx].append(y_pred[pos])
        # Threshold the scores once and reuse the labels for all label-based metrics.
        y_label = [round(p) for p in y_pred]
        metrics_mi[f'{name}/recall'].append(recall_score(y_test, y_label))
        metrics_mi[f'{name}/precision'].append(precision_score(y_test, y_label))
        metrics_mi[f'{name}/f1'].append(f1_score(y_test, y_label))
        # AUC is the only metric computed on the continuous scores.
        metrics_mi[f'{name}/auc'].append(roc_auc_score(y_test, y_pred))
        metrics_mi[f'{name}/accuracy'].append(accuracy_score(y_test, y_label))
        metrics_mi[f'{name}/balanced_accuracy'].append(balanced_accuracy_score(y_test, y_label))

In [None]:
# Summarise the mutual-information run: mean +/- standard deviation of each metric over the folds.
for name, scores in metrics_mi.items():
    mean_score, std_score = np.mean(scores), np.std(scores)
    print(f'{name : <40} \t {mean_score:.2f} +/- {std_score:.2f}')

In [None]:
from sklearn.feature_selection import RFE
# Wrapper selection experiment: recursive feature elimination down to k
# features, run separately for every model inside each training fold.
# NOTE(review): `features` is reset here but never populated in this cell.
features = defaultdict(int)
k=10
kfold = RepeatedStratifiedKFold(n_splits=3, n_repeats=100, random_state=seed)
metrics_rfe = defaultdict(list)  # '<model>/<metric>' -> one score per fold
preds_rfe = defaultdict(lambda: defaultdict(list))  # model -> sample position -> scores
# Where RFE reads feature importances for each model. Keyed by model name so
# the pairing cannot drift if `models` is reordered or extended (a positional
# zip would silently mis-pair or truncate); an unknown model raises KeyError.
importance_getters = {
    'RandomForest': 'auto',
    'AdaBoost': 'auto',
    'L-SVM': 'named_steps.svc.coef_',
    'LR': 'named_steps.logisticregression.coef_',
}
# Materialise the arrays once instead of on every fold.
X_values, y_values = X.values, y.values
for train_index, test_index in tqdm.tqdm(kfold.split(X, y), total=kfold.get_n_splits(X, y)):
    x_train, x_test = X_values[train_index], X_values[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]
    for name, model in models.items():
        # step=5: drop five features per elimination round.
        selector = RFE(model, n_features_to_select=k, importance_getter=importance_getters[name], step=5)
        selector.fit(x_train, y_train)
        y_pred_pb = selector.predict_proba(x_test)
        pred_max, pred_argmax = y_pred_pb.max(1), y_pred_pb.argmax(1)
        # Continuous score for the class in column 1 of predict_proba
        # (assumes a two-class problem with labels encoded as 0/1 -- TODO confirm).
        y_pred = np.where(pred_argmax == 1, pred_max, 1 - pred_max)
        # Distinct loop variable: avoids shadowing any outer counter.
        for pos, idx in enumerate(test_index):
            preds_rfe[name][idx].append(y_pred[pos])
        # Threshold the scores once and reuse the labels for all label-based metrics.
        y_label = [round(p) for p in y_pred]
        metrics_rfe[f'{name}/recall'].append(recall_score(y_test, y_label))
        metrics_rfe[f'{name}/precision'].append(precision_score(y_test, y_label))
        metrics_rfe[f'{name}/f1'].append(f1_score(y_test, y_label))
        # AUC is the only metric computed on the continuous scores.
        metrics_rfe[f'{name}/auc'].append(roc_auc_score(y_test, y_pred))
        metrics_rfe[f'{name}/accuracy'].append(accuracy_score(y_test, y_label))
        metrics_rfe[f'{name}/balanced_accuracy'].append(balanced_accuracy_score(y_test, y_label))

In [None]:
# Summarise the RFE run: mean +/- standard deviation of each metric over the folds.
for name, scores in metrics_rfe.items():
    mean_score, std_score = np.mean(scores), np.std(scores)
    print(f'{name : <40} \t {mean_score:.2f} +/- {std_score:.2f}')