In [None]:
%pip install xgboost lightgbm

In [None]:
# ============== ML Models ==============
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
)


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# ============== Data Preprocessing ==============
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
# ============== Model Evaluation and Tuning ==============
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

import matplotlib.pyplot as plt


# Ignore all warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
FIGSIZE = (15, 9)
FONTSIZE = 15


class MLClassifierPipeline:
    """
    A pipeline for training multiple machine learning classification models,
    comparing their performance, and selecting the best model.

    Typical use: ``standardize_data()`` (optional), ``split_data()``,
    ``train_models()``, then ``plot_accuracies()``, ``plot_confusion_matrix()``
    or ``get_best_model()``.
    """

    def __init__(self, X, y, seed=None):
        """
        Initialize the pipeline with data and optional random seed.

        Args:
            X: Feature matrix (array-like, one row per sample).
            y: Target labels aligned with the rows of ``X``.
            seed: Value passed to ``np.random.seed`` before each model is
                fitted. ``None`` (the default) leaves training unseeded.
        """
        self.X = X
        self.y = y
        self.models = {}  # Dictionary for trained models
        self.accuracies = {}  # Dictionary accuracies
        self.seed = seed
        self.scaler = None  # RobustScaler fitted on the training split only
        self._X_unscaled = None  # Raw features, kept once scaling is requested

    def standardize_data(self):
        """
        Enable robust scaling of the features.

        ``self.X`` is replaced by a ``RobustScaler`` transform fitted on all
        rows, exactly as before, so code reading ``self.X`` sees scaled data.
        The train/test split, however, is scaled with a scaler fitted on the
        training rows only (stored in ``self.scaler``), so that test-set
        statistics do not leak into model evaluation. This holds whether the
        method is called before or after ``split_data()``. Calling it more
        than once has no further effect.
        """
        if self._X_unscaled is None:
            self._X_unscaled = self.X
            self.X = RobustScaler().fit_transform(self.X)

        # A split made before scaling was requested still holds raw features.
        already_split = getattr(self, 'X_train', None) is not None
        if already_split and self.scaler is None:
            self._scale_split()

    def _scale_split(self):
        """Fit the scaler on the training features and apply it to both splits."""
        self.scaler = RobustScaler()
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)

    def split_data(self, test_size=0.25, random_state=None):
        """
        Splits the data into training and test sets.

        If ``standardize_data()`` has been called, the raw features are split
        and then scaled with statistics from the training part only.

        Args:
            test_size: Passed to ``train_test_split``.
            random_state: Passed to ``train_test_split``.
        """
        features = self.X if self._X_unscaled is None else self._X_unscaled
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            features, self.y, test_size=test_size, random_state=random_state
        )
        if self._X_unscaled is not None:
            self._scale_split()

    def train_models(self):
        """
        Trains various classification models and stores their accuracies.

        Every model is fitted on the training split and scored with
        ``accuracy_score`` on the test split; results go to ``self.accuracies``
        under the same key as the model in ``self.models``.
        """
        self.models.update({
            'Logistic Regression': LogisticRegression(),
            'K-Nearest Neighbors': KNeighborsClassifier(),
            'Decision Tree': DecisionTreeClassifier(),
            'Random Forest': RandomForestClassifier(),
            'SVM': SVC(),
            'Gradient Boosting': GradientBoostingClassifier(),
            'AdaBoost': AdaBoostClassifier(),
            'LDA': LinearDiscriminantAnalysis(),
            'QDA': QuadraticDiscriminantAnalysis(),
            'Naive Bayes': GaussianNB(),
            'Extra Trees': ExtraTreesClassifier(),
            'SGD': SGDClassifier(),
            'Perceptron': Perceptron(),
            'XGBoost': XGBClassifier(),
            'LightGBM': LGBMClassifier(verbose=-1),
            'Neural Network': MLPClassifier(
                max_iter=1000, learning_rate_init=0.002, early_stopping=True
            ),
        })

        # Train each model and compute its accuracy
        for name, model in self.models.items():
            # Reset NumPy's global RNG before every fit so each model starts
            # from the same seed regardless of training order.
            # NOTE(review): this relies on estimators with random_state=None
            # drawing from the global NumPy RNG - confirm for XGBoost/LightGBM.
            np.random.seed(self.seed)
            model.fit(self.X_train, self.y_train)
            predictions = model.predict(self.X_test)
            self.accuracies[name] = accuracy_score(self.y_test, predictions)

    def plot_accuracies(self, palette='Purples_r', figsize=FIGSIZE):
        """
        Draw a horizontal bar chart of model accuracies, best model first.

        The sorted mapping is also stored in ``self.sorted_accuracies``.

        Args:
            palette: Seaborn palette name for the bars.
            figsize: Matplotlib figure size.
        """
        # Sort accuracies in descending order
        self.sorted_accuracies = dict(
            sorted(self.accuracies.items(), key=lambda item: item[1], reverse=True)
        )
        model_names = list(self.sorted_accuracies.keys())
        scores = list(self.sorted_accuracies.values())

        # Plot accuracies
        plt.figure(figsize=figsize)
        ax = sns.barplot(y=model_names, x=scores, palette=palette, orient='h')
        # Bars are horizontal: accuracy is on the x axis, model names on y.
        plt.xlabel('Accuracy')
        plt.ylabel('Model')
        plt.title('Model Accuracy Comparison')

        # Write each accuracy value just past the end of its bar.
        for bar, value in zip(ax.patches, scores):
            ax.annotate(f"{value:.3f}", (bar.get_width(), bar.get_y() + bar.get_height() / 2),
                        ha='left', va='center', xytext=(5, 0), textcoords='offset points', fontsize=10)
        plt.grid(alpha=0.25)
        plt.show()

    def plot_confusion_matrix(self, figsize=(11, 12), cmap='Blues', fontsize=12):
        """
        Plot the test-set confusion matrix of the most accurate model.

        The predictions used are stored in ``self.y_pred``.

        Args:
            figsize: Matplotlib figure size.
            cmap: Colormap for the heatmap.
            fontsize: Font size of the cell annotations.

        Raises:
            ValueError: If no model has been trained yet.
        """
        # Getting best model
        _, best_model, _ = self.get_best_model()

        # Generate predictions
        self.y_pred = best_model.predict(self.X_test)

        # Generate the confusion matrix
        cm = confusion_matrix(self.y_test, self.y_pred)

        # Plot the confusion matrix
        plt.figure(figsize=figsize)
        sns.heatmap(cm, annot=True, fmt='g', cmap=cmap, annot_kws={"size": fontsize}, cbar=False)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted Labels')
        plt.ylabel('True Labels')
        plt.grid(alpha=0)
        plt.show()

    def get_best_model(self):
        """
        Return the model with the highest recorded accuracy.

        Returns:
            Tuple ``(name, fitted_model, accuracy)``. On ties the model that
            was recorded first wins.

        Raises:
            ValueError: If no accuracies have been recorded yet.
        """
        if not self.accuracies:
            raise ValueError(
                "No accuracies recorded; call train_models() before get_best_model()."
            )

        # Find model with highest accuracy
        best_model_name = max(self.accuracies, key=self.accuracies.get)
        best_model = self.models[best_model_name]

        return best_model_name, best_model, self.accuracies[best_model_name]

In [None]:
# Loading data
df = pd.read_csv("data/Radiomic_features_all.csv", sep=",")
# Call head() so the notebook shows the first rows; a bare `df.head` only
# displays the bound method's repr.
df.head()

In [None]:
# Setting parameters
rand_seed = 42  # seed used for the train/test split and model training
n_fold = 5      # fold count (presumably for cross-validation; not used in the cells shown here)

In [None]:
# Preprocessing data

## Removing rows with missing values, then duplicate rows (first occurrence kept)
df = df.dropna().drop_duplicates()

## Data split
X = df.drop(columns='label').values  # Features

# Target variable. LabelEncoder maps whatever labels the column holds to
# 0..n_classes-1, so no prior cast is needed; casting to uint8 first would
# silently wrap values outside 0-255 (e.g. -1 -> 255).
le = LabelEncoder()
Y = le.fit_transform(df['label'].values)

# Initialize pipeline
pipeline = MLClassifierPipeline(X, Y, rand_seed)

# Data prep: scale the features, then hold out 25% of the rows for testing
pipeline.standardize_data()
pipeline.split_data(test_size=0.25, random_state=rand_seed)

# Train and evaluate models
pipeline.train_models()

# Plot accuracies
pipeline.plot_accuracies('Blues_r')

# Get the best model
best_model_name, best_model, best_accuracy = pipeline.get_best_model()
print(best_model_name, best_accuracy)
