In [1]:
import numpy as np
import os 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import (
    StratifiedKFold, RandomizedSearchCV, GridSearchCV, cross_val_score
)
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
# ---- Column selector to give each pipeline its own features ----
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only a fixed subset of columns.

    Parameters
    ----------
    cols : column key(s)
        Passed straight to the indexing operation in ``transform``: column
        labels for a DataFrame, integer positions (or a boolean mask) for a
        2-D NumPy array.
    """

    def __init__(self, cols):
        # Stored unmodified under the same name as the constructor argument.
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return only the configured columns of ``X``.

        A 2-D NumPy array is sliced along axis 1. Plain ``X[self.cols]`` on
        such an array would pick *rows*, which is never what a column
        selector wants (e.g. when it sits after a scaler that outputs arrays).
        Every other input keeps the original ``X[self.cols]`` behaviour.
        """
        if isinstance(X, np.ndarray) and X.ndim == 2:
            return X[:, self.cols]
        return X[self.cols]
# Notebook-wide warning / logging configuration.
import warnings
# Installs a catch-all "ignore" filter, so Python warnings raised after this
# point are hidden (including ones that may signal real problems).
warnings.filterwarnings('ignore')  # Suppress all warnings
# NOTE(review): presumably lowers the log level of XGBoost's native layer.
# xgboost has already been imported earlier in this cell, so confirm the
# variable is still honoured when set this late.
os.environ['DMLC_LOG_LEVEL'] = '3'

import xgboost as xgb
# Globally set XGBoost verbosity to zero (no INFO/WARNING)
xgb.set_config(verbosity=0)

In [2]:
# Name of the feature-set sub-folder (inside ./data) to load.
# This one holds the shape + texture features.
FEATURES_TO_ANALYSE = "shape_firstorder_glcm_glrlm_features"

# Every path is resolved from the current working directory.
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "data", FEATURES_TO_ANALYSE)

# CSV locations (despite the *_DIR suffix these are file paths, not folders):
# reduced training features, training labels, reduced test features.
TRAIN_DIR, LABEL_DIR, TEST_DIR = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in (
        "TrainningDataset_reduced.csv",
        "TrainningDatasetCategory.csv",
        "TestingDataset_reduced.csv",
    )
)


In [3]:
# Load the training features, the training labels and the test features.
# Original author's note: X contains basic features (volumes and borders of each
# segmentation, plus each person's weight and age).
# NOTE(review): confirm this description still matches the configured feature set.
X = pd.read_csv(TRAIN_DIR)
y = pd.read_csv(LABEL_DIR)
X_test = pd.read_csv(TEST_DIR)

# Normalise identifiers to zero-padded 3-character strings so that the feature
# table and the label table share one key format.
X["Id"] = X["Id"].astype(str).str.zfill(3)
y["Id"] = y["Id"].astype(str).str.zfill(3)

# A duplicated label Id would make the `.loc` lookup below return extra rows and
# silently misalign labels with features, so fail loudly instead.
assert y["Id"].is_unique, "duplicate Id values in the label file"

# Shuffle the training rows (fixed seed, so the order is reproducible).
X = X.sample(frac=1, random_state=42).reset_index(drop=True)

# Re-order the labels to follow the shuffled order of X exactly
# (raises KeyError if a training Id has no label).
y = y.set_index("Id").loc[X["Id"]].reset_index()
assert (y["Id"].to_numpy() == X["Id"].to_numpy()).all(), "labels are not aligned with features"
y_train = y["Category"]

# Feature matrices: the identifier is a key, not a feature.
X_train = X.drop(columns=["Id"])

# The test Id column is dropped without being used, so it is no longer
# zero-padded first (that formatting was discarded immediately).
X_test = X_test.drop(columns=["Id"])


In [None]:

# ---- Tunable constants (previously repeated inline) ----
N_SELECTED_FEATURES = 15  # features kept by every forward selection
CV_SEED = 42              # seed of the shared stratified folds
MODEL_SEED = 0            # seed of the estimators and of the randomized searches

# ---- Common cross-validation setup ----
# The same splitter is reused at every level: the outer evaluation, each
# hyper-parameter search, and the scoring inside each feature selector.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=CV_SEED)


def _forward_sfs(estimator):
    """Return a forward SequentialFeatureSelector driven by `estimator`.

    The selector keeps N_SELECTED_FEATURES features and evaluates candidate
    feature subsets with the shared `cv` splitter.
    """
    return SequentialFeatureSelector(
        estimator,
        n_features_to_select=N_SELECTED_FEATURES,
        direction="forward",
        cv=cv,
        n_jobs=-1,
    )


def _accuracy_search(pipeline, param_distributions, n_iter):
    """Wrap `pipeline` in an accuracy-scored RandomizedSearchCV.

    `n_iter` parameter settings are sampled from `param_distributions` and
    evaluated on the shared `cv` splitter.
    """
    return RandomizedSearchCV(
        pipeline, param_distributions, n_iter=n_iter, cv=cv,
        scoring='accuracy', n_jobs=-1, random_state=MODEL_SEED,
    )


# ---- Build pipelines with SFS selecting 15 features per model ----
# NOTE(review): each search refits its feature selector for every sampled
# candidate and fold although only `clf__*` parameters vary; this dominates the
# run time of the cell.
svc_pipe = Pipeline([
    ("scale", StandardScaler()),
    ("sfs",   _forward_sfs(SVC(kernel="linear", class_weight="balanced"))),
    # probability=True makes SVC shuffle data internally to fit its probability
    # estimates; without a seed the soft-voting probabilities change from run
    # to run, so the seed is now fixed like for the other models.
    ("clf",   SVC(kernel="rbf", probability=True, class_weight="balanced",
                  random_state=MODEL_SEED)),
])

rf_pipe = Pipeline([
    ("sfs", _forward_sfs(RandomForestClassifier(random_state=MODEL_SEED))),
    ("clf", RandomForestClassifier(
                random_state=MODEL_SEED, class_weight="balanced", n_jobs=-1
            )),
])

xgb_pipe = Pipeline([
    ("scale", StandardScaler()),
    ("sfs",   _forward_sfs(
                  XGBClassifier(eval_metric="mlogloss", verbosity=0,
                                random_state=MODEL_SEED, n_jobs=-1)
              )),
    ("clf",   XGBClassifier(
                  eval_metric="mlogloss", verbosity=0,
                  random_state=MODEL_SEED, n_jobs=-1
              )),
])

# ---- Parameter distributions for randomized search ----
# Only the final classifier ("clf" step) of each pipeline is tuned.
svc_param_dist = {
    'clf__C':     [0.1, 1, 10, 100],
    'clf__gamma': [1e-3, 1e-2, 1e-1, 1]
}
rf_param_dist = {
    'clf__n_estimators':    [100, 300, 500],
    'clf__max_depth':       [None, 5, 10],
    'clf__min_samples_leaf':[1, 2, 4]
}
xgb_param_dist = {
    'clf__n_estimators':  [100, 300, 500],
    'clf__max_depth':     [3, 4, 6],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__subsample':     [0.6, 0.8, 1.0]
}

# ---- Wrap pipelines in RandomizedSearchCV ----
svc_search = _accuracy_search(svc_pipe, svc_param_dist, n_iter=8)
rf_search  = _accuracy_search(rf_pipe,  rf_param_dist,  n_iter=8)
xgb_search = _accuracy_search(xgb_pipe, xgb_param_dist, n_iter=12)

# ---- Soft-voting ensemble with tuned estimators ----
# Each member is itself a search, so tuning is redone whenever the voter is fit.
voter = VotingClassifier(
    estimators=[("svm", svc_search),
                ("rf",  rf_search),
                ("xgb", xgb_search)],
    voting="soft", n_jobs=-1
)

# ---- CV evaluation ----
# Nested evaluation: the searches run inside every outer fold.
scores = cross_val_score(voter, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1)
print(f"Ensemble CV accuracy after RandomizedSearch: {scores.mean():.3f} ± {scores.std():.3f}")

# ---- Fit on full training set & predict on test set ----
voter.fit(X_train, y_train)
y_test_pred = voter.predict(X_test)



Exception ignored in: <function ResourceTracker.__del__ at 0x102beafc0>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x104eb2fc0>
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/opt/homebrew/Cellar/python@3.12/3.12.10/Frameworks/Python.framework/

In [5]:
# Write the ensemble's test-set predictions to a submission CSV in the
# current working directory.
import os

submission_name = "submission_ensemble_voting_v4.csv"
submission_path = os.path.join(os.getcwd(), submission_name)

# Ids are rebuilt from the row position of X_test, offset by 101.
# NOTE(review): assumes the test rows are ordered by Id and start at 101 —
# confirm against the test CSV.
FIRST_TEST_ID = 101
submission_dataframe = pd.DataFrame({
    "Id": X_test.index + FIRST_TEST_ID,
    "Category": y_test_pred,
})
submission_dataframe.to_csv(submission_path, index=False)

print("File saved")

File saved
