In [None]:
import pandas as pd

# Load the image index CSV into a DataFrame
df = pd.read_csv("../data/dataset_selection.csv")

# Preview the first rows of the dataset
print("Aper√ßu du dataset :")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line.
print("\nInformations g√©n√©rales :")
df.info()


In [None]:
# Report the dataset dimensions (rows, columns) of the loaded DataFrame.
n_images, n_columns = df.shape
print(f"Nombre total d'images : {n_images}")
print(f"Nombre de colonnes : {n_columns}")


In [None]:
# Absolute number of rows per label
label_column = df['label']
class_counts = label_column.value_counts()

print("\nR√©partition des labels :")
print(class_counts)

# Relative share of each label, expressed in percent
class_percent = label_column.value_counts(normalize=True).mul(100)
print("\nPourcentage par label :")
print(class_percent.round(2))


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of rows per label, drawn on an explicit Axes.
fig, ax = plt.subplots()
class_counts.plot(kind='bar', ax=ax, rot=0)  # rot=0 keeps the tick labels horizontal
ax.set_title("Distribution des labels")
ax.set_xlabel("label")
ax.set_ylabel("Nombre d'images")
fig.tight_layout()
plt.show()


In [None]:
# Print up to three example paths for every distinct label.
for classe in df['label'].unique():
    print(f"\nExemples pour la classe '{classe}' :")
    is_current_class = df['label'] == classe
    sample_paths = df.loc[is_current_class, 'path'].head(3).tolist()
    print(sample_paths)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows as a test set. The split is stratified on the
# label column and uses a fixed seed so that it is reproducible.
df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

print("Tailles des splits :")
for split_name, split_df in (("Train", df_train), ("Test", df_test)):
    print(f"{split_name} : {len(split_df)}")


In [None]:
def show_distribution(name, data):
    """Print the relative frequency of each label in `data`.

    Parameters
    ----------
    name : str
        Heading printed (after a blank line) above the distribution.
    data : pandas.DataFrame
        Frame with a 'label' column.

    The proportions are rounded to three decimals; nothing is returned.
    """
    print(f"\n{name}")
    proportions = data['label'].value_counts(normalize=True)
    print(proportions.round(3))

# Compare the label proportions of both splits.
for split_name, split_df in (("Train", df_train), ("Test", df_test)):
    show_distribution(split_name, split_df)


In [None]:
import cv2
import numpy as np
import os

def augment_image(img, angles=(-15, 15)):
    """Return augmented variants of an image: rotated copies plus a flipped copy.

    Parameters
    ----------
    img : numpy.ndarray
        Image array; its first two dimensions are read as (height, width).
    angles : iterable of numbers, optional
        Rotation angles passed to ``cv2.getRotationMatrix2D`` (degrees), one
        rotated copy per angle. Defaults to ``(-15, 15)``, the values that
        were previously hard-coded.

    Returns
    -------
    list
        One rotated image per entry of `angles` (same order, same output size
        as the input), followed by the flipped image. The input image itself
        is not included.
    """
    # The image size and rotation centre do not depend on the angle, so they
    # are computed once instead of on every loop iteration.
    h, w = img.shape[:2]
    center = (w // 2, h // 2)

    aug_images = []

    # Rotations about the image centre, scale factor 1
    for angle in angles:
        M = cv2.getRotationMatrix2D(center, angle, 1)
        aug_images.append(cv2.warpAffine(img, M, (w, h)))

    # Flip with flipCode=1 (around the vertical axis, i.e. left-right mirror)
    aug_images.append(cv2.flip(img, 1))

    return aug_images


In [None]:
from PIL import Image
import numpy as np

def load_image_gray(path):
    """Load an image file and return it as a grayscale NumPy array.

    Parameters
    ----------
    path : str or path-like
        Location of the image file, as accepted by ``PIL.Image.open``.

    Returns
    -------
    numpy.ndarray
        The image converted to Pillow mode "L" (8-bit, single channel).
    """
    # The context manager closes the underlying file deterministically instead
    # of leaving it to garbage collection; convert() produces an independent
    # image, so the array is fully built before the file is released.
    with Image.open(path) as img:
        return np.array(img.convert("L"))

Version avec extraction de caractéristiques (HOG)


In [None]:
from skimage.feature import hog

def extract_hog(image, size=(128, 128)):
    """Resize an image and compute its HOG feature vector.

    Parameters
    ----------
    image : numpy.ndarray
        Input image, passed to ``cv2.resize``.
        NOTE(review): callers in this notebook pass 2-D grayscale arrays.
    size : tuple of int, optional
        Target size handed to ``cv2.resize`` as ``dsize``. Defaults to
        ``(128, 128)``, the value that was previously hard-coded; this mirrors
        the ``size`` parameter of ``image_to_vector``.

    Returns
    -------
    The value returned by ``skimage.feature.hog`` for the resized image
    (9 orientations, 8x8-pixel cells, 2x2-cell blocks, L2-Hys block norm).
    """
    image = cv2.resize(image, size)
    features = hog(
        image,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm='L2-Hys'
    )
    return features


In [None]:
# Build the HOG training set: every training image contributes its own
# descriptor followed by one descriptor per augmented variant, all sharing
# the image's label.
X_train, y_train = [], []

for path, label in zip(df_train['path'], df_train['label']):
    img = load_image_gray(path)
    variants = [img] + augment_image(img)  # original first, then augmentations
    X_train.extend(extract_hog(variant) for variant in variants)
    y_train.extend([label] * len(variants))

X_train = np.array(X_train)
y_train = np.array(y_train)

print("Train :", X_train.shape, y_train.shape)


In [None]:
# Build the HOG test set. No augmentation here: each test image yields
# exactly one feature vector.
X_test = np.array(
    [extract_hog(load_image_gray(path)) for path in df_test['path']]
)
y_test = np.array(df_test['label'].tolist())

print("Test :", X_test.shape, y_test.shape)


In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler statistics are learnt on the training
# set only and then applied unchanged to the test set.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Hyper-parameter search for an L2-penalised logistic regression.
# Candidates are ranked by macro-averaged F1 over 5-fold cross-validation.
search_space = {
    "solver": ["lbfgs"],
    "penalty": ["l2"],
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "tol": [1e-4, 1e-3, 1e-2],
    "max_iter": [2000, 5000],
}
param_grid = [search_space]

# max_iter given here is only the estimator's default; the grid above sets
# max_iter explicitly for every candidate.
lr = LogisticRegression(random_state=42, max_iter=5000)

grid = GridSearchCV(
    lr,
    param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=1,
    verbose=2,
)

grid.fit(X_train, y_train)


In [None]:
# Keep the estimator selected by the grid search, report how it was chosen,
# then predict the test set with it.
best_lr = grid.best_estimator_
print("Meilleurs param√®tres :", grid.best_params_)
print("Meilleur score CV (F1-macro) :", grid.best_score_)
y_pred = best_lr.predict(X_test)


In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test set.
print("\n===== Logistic Regression (Best Model) =====")
report_text = classification_report(y_test, y_pred)
print(report_text)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Confusion matrix on the test set, with rows/columns ordered like the
# classifier's classes_ attribute. Rows are true labels, columns predictions.
class_names = best_lr.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Pr√©dit")
ax.set_ylabel("R√©el")
ax.set_title("Matrice de confusion - Logistic Regression (Best)")
fig.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Scalar test-set metrics. Precision, recall and F1 are reported with both
# macro and weighted averaging; zero_division=0 makes an undefined score
# count as 0 instead of raising a warning.
macro_opts = {"average": 'macro', "zero_division": 0}
weighted_opts = {"average": 'weighted', "zero_division": 0}

acc = accuracy_score(y_test, y_pred)

prec_macro = precision_score(y_test, y_pred, **macro_opts)
rec_macro = recall_score(y_test, y_pred, **macro_opts)
f1_macro = f1_score(y_test, y_pred, **macro_opts)

prec_weighted = precision_score(y_test, y_pred, **weighted_opts)
rec_weighted = recall_score(y_test, y_pred, **weighted_opts)
f1_weighted = f1_score(y_test, y_pred, **weighted_opts)

print(f"Accuracy: {acc:.4f}")
print(f"Precision macro: {prec_macro:.4f} | weighted: {prec_weighted:.4f}")
print(f"Recall macro:    {rec_macro:.4f} | weighted: {rec_weighted:.4f}")
print(f"F1 macro:        {f1_macro:.4f} | weighted: {f1_weighted:.4f}")


In [None]:
# One-vs-rest specificity (TN / (TN + FP)) for every class, derived from the
# confusion matrix `cm`, then averaged without weighting.
total_samples = cm.sum()
specificities = []

for idx, _ in enumerate(best_lr.classes_):
    true_pos = cm[idx, idx]
    false_pos = cm[:, idx].sum() - true_pos   # rest of the column
    false_neg = cm[idx, :].sum() - true_pos   # rest of the row
    true_neg = total_samples - (true_pos + false_pos + false_neg)

    negatives = true_neg + false_pos
    if negatives > 0:
        specificities.append(true_neg / negatives)
    else:
        # No negative sample for this class: report 0.0 rather than divide by zero
        specificities.append(0.0)

spec_macro = np.mean(specificities)

print(f"Specificity macro: {spec_macro:.4f}")
print("Specificity par classe:")
print(dict(zip(best_lr.classes_, specificities)))


In [None]:
# Row-normalise the confusion matrix so each row sums to 1 (each cell is the
# share of that true class predicted as the column class).
# A class with no test sample has a row sum of 0; dividing by it would give
# NaN and a RuntimeWarning, so such rows are left at 0 instead.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_norm,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=best_lr.classes_,
    yticklabels=best_lr.classes_
)
plt.xlabel("Pr√©dit")
plt.ylabel("R√©el")
plt.title("Matrice de confusion normalis√©e - Logistic Regression (Best)")
plt.tight_layout()
plt.show()


In [None]:
import joblib

# Persist the selected classifier together with the scaler it was trained
# with; both files are written to the current working directory.
artifacts = (
    (best_lr, "logistic_regression_best.pkl"),
    (scaler, "scaler.pkl"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("Meilleur mod√®le et scaler sauvegard√©s üíæ")


Seulement PCA

In [None]:
def image_to_vector(image, size=(128, 128)):
    """Resize an image and return its pixels as a flat 1-D array.

    Parameters
    ----------
    image : numpy.ndarray
        Input image, passed to ``cv2.resize``.
    size : tuple of int, optional
        Target size handed to ``cv2.resize``; defaults to ``(128, 128)``.

    Returns
    -------
    numpy.ndarray
        Flattened copy of the resized image.
    """
    resized = cv2.resize(image, size)
    pixel_vector = resized.flatten()
    return pixel_vector


In [None]:
# Build the raw-pixel training set: every training image contributes its own
# flattened pixels followed by those of each augmented variant, all sharing
# the image's label.
X_train, y_train = [], []

for path, label in zip(df_train["path"], df_train["label"]):
    img = load_image_gray(path)
    variants = [img] + augment_image(img)  # original first, then augmentations
    X_train.extend(image_to_vector(variant) for variant in variants)
    y_train.extend([label] * len(variants))

X_train = np.array(X_train)
y_train = np.array(y_train)

print("Train :", X_train.shape, y_train.shape)


In [None]:
# Build the raw-pixel test set. No augmentation here: each test image yields
# exactly one flattened vector.
X_test = np.array(
    [image_to_vector(load_image_gray(path)) for path in df_test["path"]]
)
y_test = np.array(df_test["label"].tolist())

print("Test :", X_test.shape, y_test.shape)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardisation: statistics are learnt on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# PCA with a fractional n_components: keep as many components as needed to
# reach 95 % of the explained variance. Fitted on the training set only.
pca = PCA(n_components=0.95, random_state=42)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

n_input_dims = X_train_scaled.shape[1]
n_kept_dims = X_train.shape[1]
explained_total = pca.explained_variance_ratio_.sum()

print("Pixels initiaux :", n_input_dims)
print("Dimensions apr√®s PCA :", n_kept_dims)
print("Variance expliqu√©e :", explained_total)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for an L2-penalised logistic regression on the
# PCA-reduced features. Candidates are ranked by macro-averaged F1 over
# 5-fold cross-validation.
search_space = {
    "solver": ["lbfgs"],
    "penalty": ["l2"],
    "C": [0.001, 0.01, 0.1, 1, 10],
    "max_iter": [2000, 5000],
}
param_grid = [search_space]

lr = LogisticRegression(random_state=42)

grid = GridSearchCV(lr, param_grid, scoring="f1_macro", cv=5, verbose=2)

grid.fit(X_train, y_train)


In [None]:
# Keep the estimator selected by the grid search, report how it was chosen,
# then predict the test set with it.
best_lr = grid.best_estimator_
print("Meilleurs param√®tres :", grid.best_params_)
print("Meilleur score CV (F1-macro) :", grid.best_score_)
y_pred = best_lr.predict(X_test)


In [None]:
# Per-class precision / recall / F1 on the test set.
print("\n===== Logistic Regression (Best Model) =====")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Confusion matrix on the test set, with rows/columns ordered like the
# classifier's classes_ attribute. Rows are true labels, columns predictions.
class_names = best_lr.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Pr√©dit")
ax.set_ylabel("R√©el")
ax.set_title("Matrice de confusion - Logistic Regression (Best)")
fig.tight_layout()
plt.show()


PCA + extraction de données

In [None]:
# Rebuild the HOG training set (X_train / y_train are reassigned from
# scratch here): each training image contributes its own descriptor followed
# by one descriptor per augmented variant, all sharing the image's label.
X_train, y_train = [], []

for path, label in zip(df_train["path"], df_train["label"]):
    img = load_image_gray(path)
    variants = [img] + augment_image(img)  # original first, then augmentations
    X_train.extend(extract_hog(variant) for variant in variants)
    y_train.extend([label] * len(variants))

X_train = np.array(X_train)
y_train = np.array(y_train)

print("Train :", X_train.shape, y_train.shape)


In [None]:
# Rebuild the HOG test set. No augmentation here: each test image yields
# exactly one feature vector.
X_test = np.array(
    [extract_hog(load_image_gray(path)) for path in df_test["path"]]
)
y_test = np.array(df_test["label"].tolist())

print("Test :", X_test.shape, y_test.shape)


In [None]:
# Standardisation: statistics are learnt on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# PCA with a fractional n_components: keep as many components as needed to
# reach 95 % of the explained variance. Fitted on the training set only.
pca = PCA(n_components=0.95, random_state=42)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

n_input_dims = X_train_scaled.shape[1]
n_kept_dims = X_train.shape[1]
explained_total = pca.explained_variance_ratio_.sum()

print("Dimensions HOG initiales :", n_input_dims)
print("Dimensions apr√®s PCA :", n_kept_dims)
print("Variance expliqu√©e :", explained_total)


In [None]:
# Hyper-parameter search for an L2-penalised logistic regression on the
# PCA-reduced HOG features. Candidates are ranked by macro-averaged F1 over
# 5-fold cross-validation.
search_space = {
    "solver": ["lbfgs"],
    "penalty": ["l2"],
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
    "tol": [1e-4, 1e-3, 1e-2],
    "max_iter": [2000, 5000],
}
param_grid = [search_space]

lr = LogisticRegression(random_state=42)

grid = GridSearchCV(lr, param_grid, scoring="f1_macro", cv=5, verbose=2)

grid.fit(X_train, y_train)


In [None]:
# Keep the estimator selected by the grid search, report how it was chosen,
# then predict the test set with it.
best_lr = grid.best_estimator_
print("Meilleurs param√®tres :", grid.best_params_)
print("Meilleur score CV (F1-macro) :", grid.best_score_)
y_pred = best_lr.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test set.
print("\n===== Logistic Regression (Best Model) =====")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
# Confusion matrix on the test set, with rows/columns ordered like the
# classifier's classes_ attribute. Rows are true labels, columns predictions.
class_names = best_lr.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Pr√©dit")
ax.set_ylabel("R√©el")
ax.set_title("Matrice de confusion - Logistic Regression (Best)")
fig.tight_layout()
plt.show()
