In [152]:
import numpy as np
import os 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import (
    StratifiedKFold, RandomizedSearchCV, GridSearchCV, cross_val_score
)
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier,StackingClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
# ---- Column selector to give each pipeline its own features ----
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.cols]
import warnings
warnings.filterwarnings('ignore')  # Suppress all warnings
os.environ['DMLC_LOG_LEVEL'] = '3'

import xgboost as xgb
# Globally set XGBoost verbosity to zero (no INFO/WARNING)
xgb.set_config(verbosity=0)
from sklearn_mrmr.mrmr import MRMRFeatureSelector
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, VotingClassifier, StackingClassifier,
    ExtraTreesClassifier, BaggingClassifier
)
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.random_projection import GaussianRandomProjection
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score


In [153]:
BASE_DIR = os.getcwd()

# look in the data folder and take the feature you want to reduced.
FEATURES_TO_ANALYSE = "shape_firstorder_glcm_glrlm_features"
# Shape + texture features 
DATA_DIR = os.path.join(BASE_DIR,"data",FEATURES_TO_ANALYSE)

TRAIN_DIR = os.path.join(DATA_DIR,"TrainningDataset_reduced.csv")
LABEL_DIR = os.path.join(DATA_DIR,"TrainningDatasetCategory.csv")
TEST_DIR = os.path.join(DATA_DIR,"TestingDataset_reduced.csv")


In [154]:
# X contain basic features, volumes and border of each segmentation as well as weight and age of each person.
X = pd.read_csv(TRAIN_DIR)
y = pd.read_csv(LABEL_DIR)
X_test = pd.read_csv(TEST_DIR)

# Assurez-vous que les identifiants sont bien formatés avec des zéros
X["Id"] = X["Id"].astype(str).str.zfill(3)
y["Id"] = y["Id"].astype(str).str.zfill(3)

# Mélange des lignes de X
X = X.sample(frac=1, random_state=42).reset_index(drop=True)

# Réalignement parfait de y sur l’ordre de X
y = y.set_index("Id").loc[X["Id"]].reset_index()
y_train = y["Category"]

# Séparation des features
X_train = X.drop(columns=["Id"])

X_test["Id"] = X_test["Id"].astype(str).str.zfill(3)
X_test = X_test.drop(columns=["Id"])


In [155]:
# ---- 1. Définition des groupes de features ----
shape_cols   = [c for c in X_train.columns if "shape" in c]
noshape_cols = [c for c in X_train.columns if c not in shape_cols]
all_cols     = X_train.columns.tolist()

# ---- 2. Sélection mRMR de 20 features sur l'ensemble ----
mrmr_sel   = MRMRFeatureSelector(n_features_to_select=20, method='mi_quotient')
mrmr_feats = list(mrmr_sel.fit_transform(X_train, y_train).columns)

# ---- 3. Paramètres partagés et CV ----
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
lr_grid = {'clf__C': [0.01, 0.1, 1, 10], 'clf__penalty': ['l2'], 'clf__solver': ['lbfgs']}
rf_grid = {'clf__n_estimators': [100, 300, 500], 'clf__max_depth': [None, 5, 10], 'clf__min_samples_leaf': [1, 2, 4]}
svm_grid = {'clf__C': [0.1, 1, 10], 'clf__gamma': ['scale', 0.01, 0.1]}

def make_best_estimators(feature_list):
    """Retourne (best_lr, best_rf, best_svm) via GridSearchCV sur feature_list."""
    lr = Pipeline([
        ("sel",   ColumnSelector(feature_list)),
        ("scale", StandardScaler()),
        ("clf",   LogisticRegression(class_weight="balanced", max_iter=1000, random_state=0))
    ])
    svm = Pipeline([
        ("sel",   ColumnSelector(feature_list)),
        ("scale", StandardScaler()),
        ("clf",   SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=0))
    ])
    best_lr  = GridSearchCV(lr,  lr_grid,  cv=cv, scoring='accuracy', n_jobs=-1).fit(X_train, y_train).best_estimator_
    best_rf  = GridSearchCV(rf,  rf_grid,  cv=cv, scoring='accuracy', n_jobs=-1).fit(X_train, y_train).best_estimator_
    best_svm = GridSearchCV(svm, svm_grid, cv=cv, scoring='accuracy', n_jobs=-1).fit(X_train, y_train).best_estimator_
    return best_lr, best_rf, best_svm

# ---- 4. Voting sur shape-only ----
best_lr_sh, best_rf_sh, best_svm_sh = make_best_estimators(shape_cols)
vote_shape = VotingClassifier(
    estimators=[("lr_sh", best_lr_sh), ("rf_sh", best_rf_sh), ("svm_sh", best_svm_sh)],
    voting="soft", n_jobs=-1
)

# ---- 5. Voting sur no-shape ----
best_lr_ns, best_rf_ns, best_svm_ns = make_best_estimators(noshape_cols)
vote_noshape = VotingClassifier(
    estimators=[("lr_ns", best_lr_ns), ("rf_ns", best_rf_ns), ("svm_ns", best_svm_ns)],
    voting="soft", n_jobs=-1
)

# ---- 6. Voting sur mRMR-features ----
best_lr_m, best_rf_m, best_svm_m = make_best_estimators(mrmr_feats)
vote_mrmr = VotingClassifier(
    estimators=[("lr_m", best_lr_m), ("rf_m", best_rf_m), ("svm_m", best_svm_m)],
    voting="soft", n_jobs=-1
)

# ---- 7. Voting sur TOUTES les features ----
best_lr_f, best_rf_f, best_svm_f = make_best_estimators(all_cols)
vote_full = VotingClassifier(
    estimators=[("lr_f", best_lr_f), ("rf_f", best_rf_f), ("svm_f", best_svm_f)],
    voting="soft", n_jobs=-1
)

# ---- 8. Voteur KNN + Poly sur all_cols ----
knn_poly = make_pipeline(
    ColumnSelector(all_cols),
    StandardScaler(),
    PolynomialFeatures(degree=2, include_bias=False),
    KNeighborsClassifier(n_neighbors=5)
)
vote_knn = VotingClassifier([("knn_p", knn_poly)], voting="soft", n_jobs=-1)

# ---- 9. Voteur GaussianNB sur mRMR ----
gnb_mrmr = Pipeline([("sel", ColumnSelector(mrmr_feats)), ("clf", GaussianNB())])
vote_gnb = VotingClassifier([("gnb", gnb_mrmr)], voting="soft", n_jobs=-1)

# ---- 10. Voteur ExtraTrees + RP sur no-shape ----
et_rp = make_pipeline(
    ColumnSelector(noshape_cols),
    GaussianRandomProjection(n_components=30, random_state=0),
    ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=0, n_jobs=-1)
)
vote_et = VotingClassifier([("et_rp", et_rp)], voting="soft", n_jobs=-1)

# ---- 12. Stacking final de tous les voteurs ----
final_stack = StackingClassifier(
    estimators=[
        ("vote_shape",   vote_shape),
        ("vote_noshape", vote_noshape),
        ("vote_mrmr",    vote_mrmr),
        ("vote_full",    vote_full),
        ("vote_knn",     vote_knn),
        ("vote_gnb",     vote_gnb),
        ("vote_et",      vote_et),
    ],
    final_estimator=LogisticRegression(class_weight="balanced", max_iter=1000, random_state=0),
    stack_method="predict_proba",
    cv=cv, n_jobs=-1
)

# ---- 13. Évaluation par CV ----
scores = cross_val_score(final_stack, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"Diversified stacking — Accuracy CV: {scores.mean():.3f} ± {scores.std():.3f}")

# ---- 14. Entraînement final et prédiction ----
final_stack.fit(X_train, y_train)
y_test_pred = final_stack.predict(X_test)

NameError: name 'rf' is not defined

In [151]:
# Predict on the test set
import os

submission_name = "submission_ensemble_stacking_v7.csv"
submission_dataframe = pd.DataFrame(columns=["Id","Category"])
submission_dataframe["Id"] = X_test.index + 101 

submission_dataframe["Category"] = y_test_pred
submission_dataframe.to_csv(os.path.join(os.getcwd(),submission_name),index=False)

print("File saved")

File saved
