In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error

import os

# The `Sources` package and the relative `Data/...` paths used further down are
# resolved from the working directory, which presumably has to be the project
# root located two levels above this notebook.
# Only move up when `Sources` is not already reachable: an unconditional relative
# chdir is not idempotent, so re-running this cell would climb two more levels
# and break the import below as well as every later relative path.
if not os.path.isdir("Sources"):
    os.chdir("../../")
from Sources.tools import *

# Load data

In [2]:
# Read the extracted feature table (semicolon-separated CSV). The path is relative
# to the current working directory, which the first cell changes with os.chdir.
X = pd.read_csv(
    filepath_or_buffer="Data/Databases/exctracted_features_MF_and_CLAHE_and_USM.csv",
    sep=";",
)

In [3]:
# Show the feature table as the cell output (pandas elides the middle rows).
# Columns seen in the recorded output: autocorrelation, contrast,
# cluster_prominence, entropy, SRE, LRE, GLNU, SRLGE, LRLGE.
X

Unnamed: 0,autocorrelation,contrast,cluster_prominence,entropy,SRE,LRE,GLNU,SRLGE,LRLGE
0,1.125878e+10,241192840.5,-1.768294e+18,-3.329782e+07,0.911443,1120.071171,1374.867653,0.003045,10.189213
1,1.362137e+10,343482122.0,-1.968002e+18,-3.158177e+07,0.917400,817.363562,1577.522941,0.003414,7.658526
2,1.290517e+10,186401566.5,-3.314933e+18,-3.283380e+07,0.916115,1011.056380,1405.321843,0.002652,9.162723
3,1.461546e+10,205781680.0,-2.666998e+18,-3.158472e+07,0.923531,797.469737,1619.473333,0.002318,7.410873
4,1.897350e+10,638419545.0,3.837020e+18,-2.780947e+07,0.929546,390.509294,2434.721973,0.002755,3.910674
...,...,...,...,...,...,...,...,...,...
317,9.059579e+09,355988720.0,6.973620e+17,-3.659844e+07,0.893653,2131.067671,963.062051,0.006248,18.929129
318,1.906799e+10,314087220.0,4.023501e+18,-2.801951e+07,0.928443,403.672710,2261.097254,0.002172,3.711675
319,1.893995e+10,287514011.5,1.602595e+18,-2.882182e+07,0.922948,417.740070,1906.430157,0.002947,3.753486
320,1.506780e+10,486510862.5,1.386037e+18,-3.126636e+07,0.918118,738.986604,1668.789593,0.003209,6.945933


In [4]:
# Load the annotation table through the project helper `load_infos`
# (not imported by name, so presumably provided by `from Sources.tools import *`).
# The recorded output below shows the columns image_idx, tissue, abnormality,
# severity, x_coord, y_coord and radius.
infos = load_infos()

In [5]:
# Show the annotation table as the cell output (pandas elides the middle rows).
infos

Unnamed: 0,image_idx,tissue,abnormality,severity,x_coord,y_coord,radius
0,mdb001,G,CIRC,B,535,425,197.0
1,mdb002,G,CIRC,B,522,280,69.0
2,mdb003,D,NORM,0,0,0,0.0
3,mdb004,D,NORM,0,0,0,0.0
4,mdb005,F,CIRC,B,477,133,30.0
...,...,...,...,...,...,...,...
325,mdb318,D,NORM,0,0,0,0.0
326,mdb319,D,NORM,0,0,0,0.0
327,mdb320,D,NORM,0,0,0,0.0
328,mdb321,D,NORM,0,0,0,0.0


In [6]:
# Keep only the first annotation row for each image identifier.
infos = infos.drop_duplicates(subset="image_idx", keep="first")

# Binary target as a plain Python list: 1 when the image is annotated "NORM",
# 0 for every other abnormality code.
# NOTE(review): this makes NORM the positive class, so positive-class metrics
# reported later (sensitivity, PPV, ...) presumably describe the NORM images,
# not the abnormal ones - confirm this is the intended polarity.
# NOTE(review): X and y are paired purely by row position; nothing here checks
# that the feature file follows the same image order - verify upstream.
y = infos["abnormality"].eq("NORM").astype(int).tolist()

# Preprocessing

In [7]:
# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# shuffled split reproducible between runs.
# NOTE(review): the split is not stratified on y, so the class proportions of
# the two subsets may differ - consider `stratify=y` if results are re-generated.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Model

## Optuna optimization

In [8]:
# Hyperparameter search through the project helper, run on the training split
# only, with 500 Optuna trials. The returned mapping is later unpacked as
# keyword arguments of RandomForestClassifier; the recorded log shows the keys
# n_estimators, max_depth, min_samples_split, min_samples_leaf and max_features.
best_params = get_best_hyperparameters_optuna(
    X_train,
    y_train,
    n_trials=500,
)

[I 2025-02-25 15:29:25,620] A new study created in memory with name: no-name-b2acb7ed-abf2-4f06-9fcd-c85701020c61
[I 2025-02-25 15:29:25,716] Trial 0 finished with value: 0.6029411764705882 and parameters: {'n_estimators': 110, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6029411764705882.
[I 2025-02-25 15:29:25,892] Trial 1 finished with value: 0.6029411764705882 and parameters: {'n_estimators': 200, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6029411764705882.
[I 2025-02-25 15:29:26,013] Trial 2 finished with value: 0.6029411764705882 and parameters: {'n_estimators': 140, 'max_depth': 20, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6029411764705882.
[I 2025-02-25 15:29:26,083] Trial 3 finished with value: 0.5882352941176471 and parameters: {'n_estimators': 60, 'max_depth': 19, 'min_


Meilleurs hyperparamètres trouvés : {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Meilleure accuracy (LOO) : 0.6765


## Create and fit the model

In [9]:
# Random forest configured with the hyperparameters selected by the search;
# the seed is fixed so the fitted forest is reproducible.
model = RandomForestClassifier(random_state=42, **best_params)
# Fit on the training split only. The call is kept as the last expression so
# the cell output stays the fitted estimator.
model.fit(X_train, y_train)

# Results

In [10]:
# Hard class predictions on the rows the model was fitted on. These scores are
# optimistic by construction and mainly serve to gauge over-fitting against the
# held-out results.
y_pred_train = model.predict(X_train)

metrics_train = compute_classification_metrics(y_train, y_pred_train)

# The helper above only receives hard 0/1 labels, so the "AUC" it prints cannot
# be a score-based ROC AUC (in the recorded output it coincides with the
# balanced accuracy). Also report the threshold-free AUC computed from the
# predicted probability of class 1 (the "NORM" label).
positive_col_train = list(model.classes_).index(1)
y_proba_train = model.predict_proba(X_train)[:, positive_col_train]
auc_proba_train = roc_auc_score(y_train, y_proba_train)
print(f"AUC (predicted probabilities): {auc_proba_train:.3f}")

Accuracy: 0.938
Sensitivity (Recall): 0.993
Specificity: 0.835
PPV (Precision): 0.918
NPV: 0.985
F1-score: 0.954
AUC: 0.914
Balanced Accuracy: 0.914


In [11]:
# Hard class predictions on the held-out rows, which the model never saw
# during the hyperparameter search or the final fit.
y_pred = model.predict(X_test)

metrics_test = compute_classification_metrics(y_test, y_pred)

# The helper above only receives hard 0/1 labels, so the "AUC" it prints cannot
# be a score-based ROC AUC (in the recorded output it coincides with the
# balanced accuracy). Also report the threshold-free AUC computed from the
# predicted probability of class 1 (the "NORM" label).
positive_col_test = list(model.classes_).index(1)
y_proba_test = model.predict_proba(X_test)[:, positive_col_test]
auc_proba_test = roc_auc_score(y_test, y_proba_test)
print(f"AUC (predicted probabilities): {auc_proba_test:.3f}")

Accuracy: 0.649
Sensitivity (Recall): 0.841
Specificity: 0.294
PPV (Precision): 0.688
NPV: 0.500
F1-score: 0.757
AUC: 0.568
Balanced Accuracy: 0.568
