In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error

import os
os.chdir("../../")
from Sources.tools import *

# Load data

In [2]:
X = pd.read_csv("Data/Databases/exctracted_features_CLAHE.csv", sep=";")

In [3]:
X

Unnamed: 0,autocorrelation,contrast,cluster_prominence,entropy,SRE,LRE,GLNU,SRLGE,LRLGE
0,1.102093e+10,324484259.5,-3.635332e+18,-3.374980e+07,0.901026,1235.261820,1408.621151,0.000157,8.619439
1,1.320325e+10,420459688.0,-2.123014e+18,-3.207894e+07,0.903045,909.777325,1629.239484,0.000155,6.346289
2,1.267555e+10,194163010.5,7.374988e+17,-3.328780e+07,0.895406,1127.672494,1425.964160,0.000140,7.857125
3,1.431725e+10,239132287.0,-6.131754e+17,-3.206426e+07,0.904266,875.852207,1675.534250,0.000150,6.111674
4,1.828023e+10,935900315.5,-1.506827e+18,-2.818272e+07,0.897033,427.183943,2551.090559,0.000192,2.983285
...,...,...,...,...,...,...,...,...,...
317,8.584570e+09,407491043.5,-2.900451e+18,-3.713318e+07,0.865654,2418.572568,981.616462,0.000160,16.826272
318,1.874011e+10,435034556.5,2.913108e+18,-2.839721e+07,0.902870,435.376341,2341.483525,0.000148,3.038904
319,1.865391e+10,405805096.0,2.944321e+18,-2.931694e+07,0.892149,446.645046,1929.420127,0.000170,3.109180
320,1.424301e+10,573536596.0,7.009558e+17,-3.178009e+07,0.888044,843.837119,1761.498579,0.000163,5.885866


In [4]:
infos = load_infos()

In [5]:
infos

Unnamed: 0,image_idx,tissue,abnormality,severity,x_coord,y_coord,radius
0,mdb001,G,CIRC,B,535,425,197.0
1,mdb002,G,CIRC,B,522,280,69.0
2,mdb003,D,NORM,0,0,0,0.0
3,mdb004,D,NORM,0,0,0,0.0
4,mdb005,F,CIRC,B,477,133,30.0
...,...,...,...,...,...,...,...
325,mdb318,D,NORM,0,0,0,0.0
326,mdb319,D,NORM,0,0,0,0.0
327,mdb320,D,NORM,0,0,0,0.0
328,mdb321,D,NORM,0,0,0,0.0


In [6]:
infos = infos.drop_duplicates(subset=["image_idx"])
y = infos["abnormality"]
y = [1 if ab == "NORM" else 0 for ab in y]

# Pre process

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model

## Optuna optimization

In [8]:
best_params = get_best_hyperparameters_optuna(X_train, y_train, n_trials=500)

[I 2025-02-25 15:29:23,970] A new study created in memory with name: no-name-62b31ced-f43b-46a7-bb0e-d53616e43c24
[I 2025-02-25 15:29:24,102] Trial 0 finished with value: 0.6323529411764706 and parameters: {'n_estimators': 160, 'max_depth': 2, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6323529411764706.
[I 2025-02-25 15:29:24,148] Trial 1 finished with value: 0.6470588235294118 and parameters: {'n_estimators': 50, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.6470588235294118.
[I 2025-02-25 15:29:24,204] Trial 2 finished with value: 0.6176470588235294 and parameters: {'n_estimators': 60, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': None}. Best is trial 1 with value: 0.6470588235294118.
[I 2025-02-25 15:29:24,352] Trial 3 finished with value: 0.6764705882352942 and parameters: {'n_estimators': 200, 'max_depth': 13, 'min_sam


Meilleurs hyperparamètres trouvés : {'n_estimators': 170, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'log2'}
Meilleure accuracy (LOO) : 0.6912


## Create Model

In [9]:
model = RandomForestClassifier(**best_params, random_state=42)

model.fit(X_train, y_train)

# Results

In [10]:
y_pred_train = model.predict(X_train)

metrics_train = compute_classification_metrics(y_train, y_pred_train)

Accuracy: 0.933
Sensitivity (Recall): 1.000
Specificity: 0.810
PPV (Precision): 0.907
NPV: 1.000
F1-score: 0.951
AUC: 0.905
Balanced Accuracy: 0.905


In [11]:
y_pred = model.predict(X_test)

metrics_test = compute_classification_metrics(y_test, y_pred)

Accuracy: 0.598
Sensitivity (Recall): 0.905
Specificity: 0.029
PPV (Precision): 0.633
NPV: 0.143
F1-score: 0.745
AUC: 0.467
Balanced Accuracy: 0.467
