In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import warnings
from sklearn.feature_selection import VarianceThreshold, RFE
import pickle

# Globally silence all Python warnings raised after this point.
# NOTE(review): this also hides pandas/scikit-learn diagnostics (e.g. deprecation
# or copy warnings) emitted by the cells below.
warnings.filterwarnings('ignore')


In [None]:
# Loading data
# Read the comma-separated radiomic feature table into `df`.
df = pd.read_csv("data/Radiomic_features_all.csv" , sep = ",")
# Call head() so the cell shows a preview of the first rows; a bare `df.head`
# only echoes the bound method object without calling it.
df.head()

In [None]:
# Setting parameter
# Seed passed as random_state to train_test_split and to every
# RandomForestClassifier created in this notebook.
rand_seed = 5

In [None]:
## Data split
# Separate the target column from the predictor columns.
Y = df['label']               # Target variable
X = df.drop(columns='label')  # Features (every column except 'label')

# Hold out 25% of the rows as the test split; Y is passed as the
# stratification variable and rand_seed fixes the shuffling.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    stratify=Y,
    random_state=rand_seed,
)

# Report the (rows, columns) shape of each split.
print(f'Train: {X_train.shape} | Test: {X_test.shape}')


In [None]:
# Classifying data using RandomForest
from sklearn.ensemble import RandomForestClassifier
# Baseline model: only random_state is set (all other hyper-parameters are the
# library defaults); it is trained on every feature column of X_train.
base_model = RandomForestClassifier(random_state=rand_seed)
# Left as the last expression so the notebook displays the fitted estimator.
base_model.fit(X_train, y_train)

In [None]:
# Check 1. base model
# Mean accuracy of the baseline forest on each split, computed up front and
# then reported with four decimals.
baseline_train_acc = base_model.score(X_train, y_train)
baseline_test_acc = base_model.score(X_test, y_test)

print("Baseline Accuray without Feature Selection")
print(f"Train: {baseline_train_acc:.4f}")
print(f"Test : {baseline_test_acc:.4f}")

In [None]:
# 2. Variance Thresholding
# Learn the per-feature variances on the training split only, then apply the
# same column filter to both splits. Features with variance of 0.1 or less
# are removed.
variance_thresh = VarianceThreshold(threshold=0.1)
variance_thresh.fit(X_train)
X_train_var = variance_thresh.transform(X_train)
X_test_var = variance_thresh.transform(X_test)

# Inspect which features were kept
var_keep_mask = variance_thresh.get_support()
var_selected_features = X_train.columns[var_keep_mask]
print(f"Selected Features after Variance Thresholding: {list(var_selected_features)}")

# Train a fresh forest on the kept features and report its accuracy
var_model = RandomForestClassifier(random_state=rand_seed)
var_model.fit(X_train_var, y_train)
var_train_acc = var_model.score(X_train_var, y_train)
var_test_acc = var_model.score(X_test_var, y_test)
print("Accuray with Feature Selection: Varience thresholding")
print(f"Train: {var_train_acc:.4f}")
print(f"Test : {var_test_acc:.4f}")

In [None]:
# 3. Recursive Feature Elimination (RFE)
# A seeded random forest ranks the features; RFE is asked to keep the top 10.
rfe_estimator = RandomForestClassifier(random_state=rand_seed)
rfe = RFE(estimator=rfe_estimator, n_features_to_select=10)
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Inspect which features were kept
rfe_keep_mask = rfe.get_support()
rfe_selected_features = X_train.columns[rfe_keep_mask]
print(f"Selected Features after RFE: {list(rfe_selected_features)}")

# Train a fresh forest on the kept features and report its accuracy
rfe_model = RandomForestClassifier(random_state=rand_seed)
rfe_model.fit(X_train_rfe, y_train)
rfe_train_acc = rfe_model.score(X_train_rfe, y_train)
rfe_test_acc = rfe_model.score(X_test_rfe, y_test)
print("Accuray with Feature Selection: Recursive Feature Elimination")
print(f"Train: {rfe_train_acc:.4f}")
print(f"Test : {rfe_test_acc:.4f}")

In [None]:
# 4. Tree-based Feature Importance
# Refit on the full training matrix first, so the importances always cover
# every column even when this cell is re-run after base_model was fitted on
# the reduced set (which happens at the end of this cell).
base_model.fit(X_train, y_train)
importances = base_model.feature_importances_
threshold = importances.mean()  # keep features whose importance exceeds the mean

above_mean_mask = importances > threshold
tree_selected_features = X_train.columns[above_mean_mask]

# Inspect which features were kept
print(f"Selected Features after Tree-based Feature Importance: {list(tree_selected_features)}")

# Restrict both splits to the kept columns
X_train_tree = X_train[tree_selected_features]
X_test_tree = X_test[tree_selected_features]

# Apply the selection.
# NOTE(review): base_model is refitted in place on the reduced feature set, so
# from here on it no longer accepts the full X_train / X_test matrices.
base_model.fit(X_train_tree, y_train)
tree_train_acc = base_model.score(X_train_tree, y_train)
tree_test_acc = base_model.score(X_test_tree, y_test)
print("Accuray with Feature Selection: Tree-based Feature Importance")
print(f"Train: {tree_train_acc:.4f}")
print(f"Test : {tree_test_acc:.4f}")

In [None]:
# Save the model: base_model is pickled to model_rf.pkl in the working directory.
with open('model_rf.pkl', 'wb') as f:
    pickle.dump(base_model, f)

# Save the dataset restricted to the selected features, with the label column
# appended last. assign() builds a new frame, which avoids assigning a column
# onto an object derived from `df` by indexing (the pattern that triggers
# pandas' SettingWithCopyWarning).
selected_df = df[tree_selected_features].assign(label=df['label'])
selected_df.to_csv('./data/selected_tree.csv', index=False)

In [None]:
# Check the performance results
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay

# Predict once and reuse the result, so the report and the confusion matrix
# are guaranteed to describe the same predictions.
test_predictions = base_model.predict(X_test_tree)
print(classification_report(y_test, test_predictions))

# Check the confusion matrix
cm = confusion_matrix(y_test, test_predictions)
# NOTE(review): the display labels assume exactly two classes whose sorted
# order is (benign, malignant) -- confirm against the values in df['label'].
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=['benign', 'malignant'])
disp.plot()

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score, auc, roc_curve, RocCurveDisplay

# Hard class labels are used only for the text report.
y_pred_labels = base_model.predict(X_test_tree)
print(classification_report(y_test, y_pred_labels))

# One indicator column per class. With only two classes LabelBinarizer emits a
# single column: the indicator of label_binarizer.classes_[1].
label_binarizer = LabelBinarizer().fit(y_train)
y_onehot_test = label_binarizer.transform(y_test)

# ROC analysis needs continuous scores: binarized hard predictions collapse
# every curve to a single operating point. `y_pred` therefore holds the
# predicted class probabilities, shaped like y_onehot_test. The forest and the
# binarizer were both fitted on y_train, so their class/column order matches.
y_pred = base_model.predict_proba(X_test_tree)
if y_onehot_test.shape[1] == 1 and y_pred.shape[1] == 2:
    # Binary case: keep only the positive-class probability so the score
    # matrix lines up with the single indicator column.
    y_pred = y_pred[:, 1:]

n_classes = y_onehot_test.shape[-1]

# store the fpr, tpr, and roc_auc for all averaging strategies
fpr, tpr, roc_auc = dict(), dict(), dict()

# Micro-average: pool every (sample, class) decision into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(y_onehot_test.ravel(), y_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['micro']:.2f}")

# Macro-average: compute one one-vs-rest curve per class, linearly interpolate
# each onto a common FPR grid, and average the interpolated TPR values.
fpr_grid = np.linspace(0.0, 1.0, 1000)
mean_tpr = np.zeros_like(fpr_grid)

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_onehot_test[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    mean_tpr += np.interp(fpr_grid, fpr[i], tpr[i])  # linear interpolation

# Average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = fpr_grid
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{roc_auc['macro']:.2f}")

In [None]:
from itertools import cycle

fig, ax = plt.subplots(figsize=(6, 6))

# Both averaged curves share the same thick dotted look.
averaged_curve_style = {"linestyle": ":", "linewidth": 4}

# Micro-averaged curve
ax.plot(
    fpr["micro"],
    tpr["micro"],
    label=f"micro-average ROC curve (AUC = {roc_auc['micro']:.2f})",
    color="deeppink",
    **averaged_curve_style,
)

# Macro-averaged curve
ax.plot(
    fpr["macro"],
    tpr["macro"],
    label=f"macro-average ROC curve (AUC = {roc_auc['macro']:.2f})",
    color="navy",
    **averaged_curve_style,
)

# One one-vs-rest curve per class, drawn on the same axes; colours repeat
# after three classes.
colors = cycle(["aqua", "darkorange", "cornflowerblue"])
for class_id in range(n_classes):
    color = next(colors)
    RocCurveDisplay.from_predictions(
        y_onehot_test[:, class_id],
        y_pred[:, class_id],
        name=f"ROC curve for {class_id+1}",
        color=color,
        ax=ax,
    )

# Axis labels and title; assigning to `_` keeps the cell from echoing the
# return value of ax.set.
_ = ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Extension of Receiver Operating Characteristic\nto One-vs-Rest multiclass",
)