# Data Analysis and Predictive Modeling with SHAP and XGBoost

## Title  
**Integrating Machine Learning with Metabolic Models for Precision Trauma Care: Personalized ENDOTYPE Stratification and Metabolic Target Identification**

## Authors  
- **Igor Marin de Mas** (Copenhagen University Hospital, Rigshospitalet)  
- **Lincoln Moura** (Universidade Federal do Ceará)  
- **Fernando Luiz Marcelo Antunes** (Universidade Federal do Ceará)  
- **Josep Maria Guerrero** (Aalborg University)  
- **Pär Ingemar Johansson** (Copenhagen University Hospital, Rigshospitalet)  

## Description:
This script classifies patients into their metabolic groups with an XGBoost classifier, using only the most important features as ranked by SHAP values. The workflow includes:

1. **Loading Preprocessed Data**: Reads patient data from CSV files.
2. **Combining Data**: Merges individual patient datasets into a single DataFrame, attaching each patient's target label and filling any missing values with column means.
3. **Feature Selection**: Uses a precomputed global SHAP ranking (`Ranking_global_sharp.csv`) to pick the most relevant features for classification.
4. **Model Training and Evaluation**: Trains an XGBoost model on selected features and evaluates performance using confusion matrices and classification reports.
5. **Feature Importance Visualization**: Plots the most influential features to enhance model interpretability.

The script iterates through different feature subsets (top 50, 30, 20, 10) ranked by SHAP importance; for each subset it trains and evaluates a model 10 times on random 75/25 train/test splits.

In [None]:
# Standard libraries
import os
import glob
import warnings
import re

# Third-party libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import scikitplot as skplt

# Suppress warnings
warnings.filterwarnings("ignore")

In [None]:
# -----------------------------
# Utility Functions
# -----------------------------

def extract_patient_index(file_path):
    """
    Derive a zero-based patient index from the number in a file name.

    Only the base name of the path is inspected, and the first run of
    digits found in it is used. That number is treated as one-based and is
    therefore decremented by one before being returned.

    Args:
        file_path (str): Path to the file.

    Returns:
        int: The first number in the file name, minus one.

    Raises:
        ValueError: If the file name contains no digits.
    """
    file_name = os.path.basename(file_path)
    digits = re.search(r'(\d+)', file_name)

    # Guard clause: nothing numeric in the name means there is no index.
    if digits is None:
        raise ValueError(f"Cannot extract index from file name: {file_path}")

    return int(digits.group(1)) - 1

def load_preprocessed_data(preprocessed_path, num_patients):
    """
    Load preprocessed patient data from CSV files.

    Every ``*.csv`` file directly inside ``preprocessed_path`` is read with
    its first column as the index and stored under the zero-based patient
    index derived from its file name. Files are visited in sorted path order
    so the returned index list is reproducible across runs and platforms.
    Files that cannot be parsed are reported on stdout and skipped.

    Args:
        preprocessed_path (str): Path to the directory with preprocessed data.
        num_patients (int): Number of patients; keys ``0 .. num_patients - 1``
            are pre-populated with ``None``.

    Returns:
        dict: Dictionary of patient DataFrames indexed by patient number
            (``None`` for patients without a successfully loaded file).
        list: List of indices for successfully loaded patients, each index
            listed once.
    """
    patients = {x: None for x in range(num_patients)}
    test_indices = []

    # glob order is filesystem-dependent; sort for a deterministic result.
    csv_paths = sorted(glob.glob(os.path.join(preprocessed_path, '*.csv')))

    for file_path in csv_paths:
        try:
            index = extract_patient_index(file_path)
            frame = pd.read_csv(file_path, index_col=0)
        except ValueError as e:
            print(f"[ERROR] {e}")
            continue
        except Exception as e:
            print(f"[ERROR] Could not load file: {file_path}. Error: {e}")
            continue

        # Record the index only after the file was actually read, so every
        # entry in test_indices is guaranteed to map to a real DataFrame.
        patients[index] = frame
        # Two files resolving to the same index must not list it twice,
        # otherwise the patient would be duplicated downstream.
        if index not in test_indices:
            test_indices.append(index)

    return patients, test_indices

def combine_patient_data(patients, test_indices, target):
    """
    Stack the selected patients into one labelled DataFrame.

    Each selected patient's DataFrame is transposed, given a ``target``
    column holding that patient's label, and the results are concatenated
    row-wise with a fresh 0..n-1 index. If the combined frame contains any
    missing values, they are replaced by the mean of their column and a
    warning is printed.

    Args:
        patients (dict): Dictionary of patient DataFrames.
        test_indices (list): List of indices for patients.
        target (list): Target labels for patients.

    Returns:
        pd.DataFrame: Combined DataFrame with all patient data and target labels.
    """
    def _labelled(patient_idx):
        # Transpose the patient's frame and tag every resulting row.
        frame = patients[patient_idx].T
        frame["target"] = target[patient_idx]
        return frame

    stacked = pd.concat([_labelled(idx) for idx in test_indices], axis=0)
    combined = stacked.reset_index(drop=True)

    missing_cells = combined.isna().sum().sum()
    if missing_cells > 0:
        print("[WARNING] Missing values detected. Filling with column means.")
        combined.fillna(combined.mean(), inplace=True)

    return combined

def conf_matrix(clf, X_train, X_test, y_train, y_test):
    """
    Plot confusion matrices for training and testing predictions.

    Predicts on both splits with ``clf`` and draws two side-by-side panels
    with raw (non-normalized) counts: training on the left, testing on the
    right. The figure is shown immediately; nothing is returned.

    Args:
        clf: Trained classifier exposing ``predict``.
        X_train: Training data.
        X_test: Testing data.
        y_train: Training labels.
        y_test: Testing labels.
    """
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    _, axes = plt.subplots(1, 2, figsize=(15, 6))

    # scikit-plot's signature is plot_confusion_matrix(y_true, y_pred, ...):
    # the ground truth goes first. Passing the predictions first transposes
    # the matrix relative to its "True label" / "Predicted label" axes.
    skplt.metrics.plot_confusion_matrix(
        y_train, train_pred, normalize=False,
        title="Training Confusion Matrix", cmap="Oranges", ax=axes[0],
    )
    skplt.metrics.plot_confusion_matrix(
        y_test, test_pred, normalize=False,
        title="Testing Confusion Matrix", cmap="Purples", ax=axes[1],
    )
    plt.show()

def train_and_evaluate(feature_set, num_iterations=10, *, data=None):
    """
    Train and evaluate an XGBoost model on a given feature set.

    For each repetition the data is split 75/25 at random, an
    ``XGBClassifier`` with the ``multi:softmax`` objective is fitted on the
    training part, and three outputs are produced: a printed classification
    report for the test part, confusion matrices for both parts, and a bar
    chart of the (up to 50) highest model feature importances.

    Args:
        feature_set (list): List of feature names to be used in training.
        num_iterations (int): Number of times to repeat training.
        data (pd.DataFrame, optional): Frame holding the feature columns and
            a ``target`` column. Defaults to the module-level ``df``.
    """
    if data is None:
        # Backward-compatible default: the combined frame built at module level.
        data = df

    # Features and labels are identical for every repetition, so build them once.
    X = data.loc[:, feature_set]
    # Shift labels down by one; presumably they are 1-based in the source
    # file, while XGBoost requires class ids that start at 0.
    y = data['target'] - 1

    for i in range(num_iterations):
        # No fixed seed: every repetition sees a different random split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

        model = XGBClassifier(objective='multi:softmax')
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

        conf_matrix(model, X_train, X_test, y_train, y_test)

        importance_col = f"Importance_{i}"
        importance_df = pd.DataFrame(
            model.feature_importances_, index=feature_set, columns=[importance_col]
        )
        # Draw on an explicit axes. Calling plt.figure() first and then
        # DataFrame.plot.bar() without ``ax`` leaves an empty extra figure,
        # because pandas opens its own.
        _, ax = plt.subplots(figsize=(20, 3))
        importance_df.sort_values(by=importance_col, ascending=False).head(50).plot.bar(ax=ax)
        plt.show()

In [None]:
# -----------------------------
# Load Data
# -----------------------------

# Input location and cohort size used by this cell
preprocessed_path = "preprocess_PCA/"
num_patients = 95

# Read the per-patient group labels from the semicolon-separated table
print("Loading target data...")
target_df = pd.read_csv("Patient_Trauma_Groups.csv", sep=";")
target = target_df["Metabo-group"].values

# Read every preprocessed patient file found in the input directory
print("Loading patient data...")
patients, test_indices = load_preprocessed_data(
    preprocessed_path=preprocessed_path,
    num_patients=num_patients,
)

# Stack all loaded patients into one labelled DataFrame
df = combine_patient_data(
    patients=patients,
    test_indices=test_indices,
    target=target,
)

In [None]:
# Read the global feature ranking (features as index, score in column 'Soma')
df_global = pd.read_csv("Ranking_global_sharp.csv", index_col=0)

# Order the features by descending score once, then slice the leading k names
# for each subset size.
ranked_features = df_global.sort_values(by='Soma', ascending=False)
feature_sets = {
    f"top{k}": ranked_features.head(k).index
    for k in (50, 30, 20, 10)
}

# -----------------------------
# Model Training and Evaluation
# -----------------------------

# Fit and assess models for every feature subset, largest first
for feature_name, features in feature_sets.items():
    print(f"\nTraining model with {feature_name} features...")
    train_and_evaluate(features)