In [19]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

In [2]:
core_path = '/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/github/myeloma_standal/phenotyping/ensemble/standard'
data = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/github/myeloma_standal/phenotyping/manual_phenotypes_standard.csv')

In [3]:
data.drop(columns=["index", "Y_centroid", "X_centroid"], inplace=True)
# phenotypes['distance_to_bone'] = phenotypes['distance_to_bone'].replace(-999, np.nan)

In [4]:
transformed = np.arcsinh(data.iloc[:, 0:32, ])
data.drop(columns=data.columns[0:32], inplace=True)
data = pd.concat([transformed, data], axis=1)

In [5]:
rs = 20240611

In [6]:
y = data['phenotype']
X = data.drop(columns=['phenotype'])  # Adjust the column name as per your dataset
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
label_mapping = pd.DataFrame({
    'OriginalLabel': label_encoder.classes_,
    'EncodedLabel': range(len(label_encoder.classes_))
})
#label_mapping.to_csv('label_mapping.csv', index=False)

In [None]:
label_mapping

In [7]:
classes = np.unique(y_encoded)
weights = compute_class_weight('balanced', classes=classes, y=y_encoded)
class_weights = dict(zip(classes, weights))

In [8]:
kf = KFold(n_splits = 5, shuffle=True, random_state=rs)

In [9]:
adaboost = AdaBoostClassifier(n_estimators=100, learning_rate=0.5, algorithm='SAMME.R', random_state=rs)
svm = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, class_weight=class_weights, random_state=rs)
random_forest = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=2, max_features=6, class_weight=class_weights, random_state=rs)

In [10]:
meta_features = np.zeros((X.shape[0], len(classes) * 3))
meta_targets = np.zeros(y_encoded.shape[0])

In [11]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y_encoded, test_size=0.10, random_state=rs)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=1/9, random_state=rs)

In [12]:
fold = 0
for train_index, test_index in kf.split(X_train_val, y_train_val):
    X_train_base, X_test_base = X_train_val.iloc[train_index], X_train_val.iloc[test_index]
    y_train_base, y_test_base = y_train_val[train_index], y_train_val[test_index]

    for clf, meta_start in zip([adaboost, svm, random_forest], range(0, meta_features.shape[1], len(classes))):
        clf_clone = clone(clf)
        clf_clone.fit(X_train_base, y_train_base)
        meta_features[test_index, meta_start:meta_start+len(classes)] = clf_clone.predict_proba(X_test_base)

    meta_targets[test_index] = y_test_base
    print(f"Completed fold {fold + 1}")
    fold += 1



Completed fold 1




Completed fold 2




Completed fold 3




Completed fold 4




Completed fold 5


In [13]:
sample_weights = np.array([class_weights[class_label] for class_label in y_train])

In [24]:
meta_cv_scores = []
meta_f1_scores = []
all_meta_predictions = []
all_y_test_meta = []

In [25]:
%%capture 
xgb_meta = XGBClassifier(n_estimators=600, max_depth=8, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='mlogloss', random_state=rs)

# Use cross-validation for the meta-classifier

for train_index, test_index in kf.split(meta_features):
    X_train_meta, X_test_meta = meta_features[train_index], meta_features[test_index]
    y_train_meta, y_test_meta = meta_targets[train_index], meta_targets[test_index]
    X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(X_train_meta, y_train_meta, test_size=0.20, random_state=rs)

    # Define the validation set for early stopping
    eval_set = [(X_val_fold, y_val_fold)]
    xgb_meta.fit(X_train_meta, y_train_meta, sample_weight = sample_weights, early_stopping_rounds=60, eval_set=eval_set)
    meta_predictions = xgb_meta.predict(X_test_meta)
    meta_cv_scores.append(accuracy_score(y_test_meta, meta_predictions))
    meta_f1_scores.append(f1_score(y_test_meta, meta_predictions, average='weighted'))
    all_meta_predictions.extend(meta_predictions)
    all_y_test_meta.extend(y_test_meta)
# Display the results
print("10-fold CV Accuracy for Meta Classifier:")
print(meta_cv_scores)
print("Mean Accuracy:", np.mean(meta_cv_scores))

In [27]:
conf_matrix = confusion_matrix(all_y_test_meta, all_meta_predictions)
class_report = classification_report(all_y_test_meta, all_meta_predictions)

print("Confusion Matrix:")
print(conf_matrix)
print('\n')
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[12048     1     1    13     1     0    47     0     1    19     6     1
    252   127     0]
 [    7  1635   235   130   204     9     8    37    52   103     1     1
     39    10    14]
 [    2   192  3983    85   151    21    39    30    18    66     5     4
     50     7    32]
 [   29    92    33  4923    99    29    31   153   108   556    17     1
    797   161    77]
 [    0   190   125    82  7446    16    21    51    66   160     2     1
     64     4    32]
 [    0     7    35    85    40  1154    20    40    14    95     5     5
     62     4   181]
 [   51     7    34    40    44    17  2053   267    32   109    12     5
    296   200    19]
 [    2    48    51   234   119    39   267  1693    61   195    11     1
     71    19    27]
 [    2    45    22   164   103     8    35    53  3347   519     3     3
    156     4   151]
 [   13    58    51   523   138    36    98   136   361  9814    14     2
    718    88   130]
 [    4     1    10     9     3 