In [27]:
import numpy as np
import os 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import (
    StratifiedKFold, RandomizedSearchCV, GridSearchCV, cross_val_score
)
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier,StackingClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
# ---- Column selector: lets each pipeline work on its own feature subset ----
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns named in ``cols``.

    Nothing is learned during ``fit``; the selection is applied by indexing
    the input with ``cols`` in ``transform``.

    Parameters
    ----------
    cols
        Key(s) used to index ``X`` (``X[cols]``) in :meth:`transform`.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """No-op fit: returns ``self`` without inspecting ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return the subset of ``X`` obtained with ``X[self.cols]``."""
        subset = X[self.cols]
        return subset
# ---- Silence warnings and library logging for the rest of the notebook ----
import warnings
warnings.filterwarnings('ignore')  # Suppress all Python warnings, from every module
# NOTE(review): presumably meant to quieten xgboost's native logging. xgboost
# has already been imported earlier in this cell, so confirm this environment
# variable is still honoured when set at this point.
os.environ['DMLC_LOG_LEVEL'] = '3'

import xgboost as xgb
# Globally set XGBoost verbosity to zero (no INFO/WARNING)
xgb.set_config(verbosity=0)
# Remaining imports used by the modelling cell (feature selection, probability
# calibration and the stacking meta-learner).
from sklearn_mrmr.mrmr import MRMRFeatureSelector
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

In [28]:
# Everything is resolved relative to the kernel's current working directory.
BASE_DIR = os.getcwd()

# Name of the sub-folder of data/ that holds the feature set to analyse.
# Shape + texture features
FEATURES_TO_ANALYSE = "shape_firstorder_glcm_glrlm_features"
DATA_DIR = os.path.join(BASE_DIR, "data", FEATURES_TO_ANALYSE)

# Despite the *_DIR suffix, these three names are paths to CSV files.
TRAIN_DIR, LABEL_DIR, TEST_DIR = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "TrainningDataset_reduced.csv",
        "TrainningDatasetCategory.csv",
        "TestingDataset_reduced.csv",
    )
)



In [29]:
# ---- Load training features, training labels and test features ----
# NOTE(review): the original note states that X holds basic features (volumes
# and borders of each segmentation) plus each person's weight and age --
# confirm against the CSV header.
X = pd.read_csv(TRAIN_DIR)
y = pd.read_csv(LABEL_DIR)
X_test = pd.read_csv(TEST_DIR)

# Normalise identifiers to zero-padded, 3-character strings so features and
# labels are keyed the same way.
X["Id"] = X["Id"].astype(str).str.zfill(3)
y["Id"] = y["Id"].astype(str).str.zfill(3)

# A repeated Id in the label file would make the `.loc` lookup below return
# extra rows and silently break the row-for-row correspondence with X.
assert y["Id"].is_unique, "duplicate Ids in the label file"

# Shuffle the rows of X (fixed seed, so the order is reproducible).
X = X.sample(frac=1, random_state=42).reset_index(drop=True)

# Re-order y so that its rows follow exactly the (shuffled) order of X.
y = y.set_index("Id").loc[X["Id"]].reset_index()
assert len(y) == len(X), "labels and features are not aligned row-for-row"
y_train = y["Category"]

# Feature matrix: everything except the identifier.
X_train = X.drop(columns=["Id"])

# The test identifier is not a feature either; rows keep their file order.
X_test = X_test.drop(columns=["Id"])



In [32]:



# Modelling cell: select features with mRMR, tune three base pipelines with a
# grid search, calibrate them, combine them in a stacking classifier, report
# its cross-validated accuracy and predict the test set (`y_test_pred`).

# ---- Define feature groups ----
# A column belongs to the shape group when its name contains "shape".
shape_cols      = [c for c in X_train.columns if "shape" in c]
all_cols        = X_train.columns.tolist()

# ---- mRMR feature selection ----
# The selector is fitted twice: once on the shape columns only, once on every
# column; each time only the names of the retained columns are kept.
# NOTE(review): selection is fitted on the whole training set *before* any
# cross-validation, so every CV score below has already "seen" the held-out
# folds through the selected features and is optimistically biased. Moving the
# selector inside each pipeline would remove that leak.
mrmr_sel = MRMRFeatureSelector(n_features_to_select=20, method='mi_quotient')
shape_selected = list(mrmr_sel.fit_transform(X_train[shape_cols], y_train).columns)

all_selected = list(mrmr_sel.fit_transform(X_train, y_train).columns)

print("Shape features selected (20):", shape_selected)
print("All features selected (20):", all_selected)

# ---- Define pipelines ----
# Random forest works on the selected shape features, unscaled.
rf_pipe = Pipeline([
    ("sel", ColumnSelector(shape_selected)),
    ("clf", RandomForestClassifier(class_weight="balanced", random_state=0, n_jobs=-1))
])
# SVM and linear XGBoost work on the features selected from all columns,
# standardised first.
svm_pipe = Pipeline([
    ("sel",   ColumnSelector(all_selected)),
    ("scale", StandardScaler()),
    ("clf",   SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=0))
])
xgb_pipe = Pipeline([
    ("sel",   ColumnSelector(all_selected)),
    ("scale", StandardScaler()),
    ("clf",   XGBClassifier(
                   booster='gblinear',
                   objective="multi:softprob",
                   eval_metric="mlogloss",
                   verbosity=0,
                   random_state=0,
                   n_jobs=-1
               ))
])

# ---- Hyperparameter grids ----
# Keys use the "<step>__<param>" convention to reach the "clf" pipeline step.
rf_param_grid = {
    'clf__n_estimators':    [100, 300, 500],
    'clf__max_depth':       [None, 5, 10],
    'clf__min_samples_leaf':[1, 2, 4],
}
svm_param_grid = {
    'clf__C':     [0.1, 1, 10, 5, 6, 3],
    'clf__gamma': ['scale', 0.01, 0.1, 0.05]
}
xgb_param_grid = {
    'clf__reg_alpha':  [0.0, 0.1, 1.0],
    'clf__reg_lambda': [0.1, 1.0, 10.0],
    'clf__learning_rate': [0.01, 0.05, 0.1]
}

# ---- CV setup ----
# The splitter is seeded: with shuffle=True and no random_state every call to
# split() draws different folds, so the three searches and the final
# evaluation were neither comparable with each other nor reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---- GridSearchCV tuning ----
rf_search = GridSearchCV(rf_pipe, rf_param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
svm_search = GridSearchCV(svm_pipe, svm_param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
xgb_search = GridSearchCV(xgb_pipe, xgb_param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

rf_search.fit(X_train, y_train)
svm_search.fit(X_train, y_train)
xgb_search.fit(X_train, y_train)

best_rf  = rf_search.best_estimator_
best_svm = svm_search.best_estimator_
best_xgb = xgb_search.best_estimator_

# ---- Calibrate base learners ----
cal_rf  = CalibratedClassifierCV(best_rf, cv=5)
cal_svm = CalibratedClassifierCV(best_svm, cv=5)
cal_xgb = CalibratedClassifierCV(best_xgb, cv=5)

# ---- True stacking ensemble ----
# The meta-learner is a logistic regression fed with the calibrated class
# probabilities of the three base learners.
stack = StackingClassifier(
    estimators=[
        ('rf',  cal_rf),
        ('svm', cal_svm),
        ('xgb', cal_xgb)
    ],
    final_estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
    stack_method='predict_proba',
    cv=5,
    n_jobs=-1
)

# ---- Evaluate stacking ----
# NOTE(review): hyperparameters were tuned on this same training set, so this
# score is an optimistic estimate rather than a nested-CV estimate.
scores = cross_val_score(stack, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"Stacking CV accuracy: {scores.mean():.3f} ± {scores.std():.3f}")

# ---- Fit and predict ----
stack.fit(X_train, y_train)
y_test_pred = stack.predict(X_test)


Shape features selected (20): ['ED_LV_shape_VoxelVolume_over_ES_LV_shape_VoxelVolume', 'ES_RV_shape_SurfaceVolumeRatio', 'ED_LV_shape_Sphericity', 'ES_RV_shape_Maximum2DDiameterRow', 'ED_RV_shape_VoxelVolume_over_ED_LV_shape_VoxelVolume', 'ED_LV_shape_LeastAxisLength', 'ES_RV_shape_VoxelVolume_over_ED_MY_shape_VoxelVolume', 'ES_LV_shape_SurfaceVolumeRatio', 'ES_RV_shape_Sphericity', 'ED_RV_shape_VoxelVolume_over_ED_MY_shape_VoxelVolume', 'ES_LV_shape_VoxelVolume_over_ES_RV_shape_VoxelVolume', 'ES_RV_shape_MajorAxisLength', 'ED_MY_shape_VoxelVolume_over_ES_LV_shape_VoxelVolume', 'ES_LV_shape_Sphericity', 'ED_MY_shape_Flatness', 'ES_RV_shape_VoxelVolume_over_ED_RV_shape_VoxelVolume', 'ED_RV_shape_VoxelVolume_over_ES_LV_shape_VoxelVolume', 'ES_RV_shape_Maximum2DDiameterSlice', 'ES_RV_shape_VoxelVolume_over_ED_LV_shape_VoxelVolume', 'ED_LV_shape_VoxelVolume_over_ED_RV_shape_VoxelVolume']
All features selected (20): ['ED_LV_shape_VoxelVolume_over_ES_LV_shape_VoxelVolume', 'ED_LV_firstorder_

In [33]:
# ---- Write the test-set predictions to a submission CSV ----
submission_name = "submission_ensemble_stacking_v1.csv"

# Read the identifiers from the test file itself rather than assuming they are
# the contiguous range 101, 102, ...  X_test was loaded from this same file
# without any re-ordering, so prediction i belongs to row i of this column.
test_ids = pd.read_csv(TEST_DIR, usecols=["Id"])["Id"]
assert len(test_ids) == len(y_test_pred), "expected exactly one prediction per test row"

submission_dataframe = pd.DataFrame({"Id": test_ids, "Category": y_test_pred})
submission_dataframe.to_csv(os.path.join(os.getcwd(), submission_name), index=False)

print("File saved")

File saved
