# **Model EfficientNet-B4-B5**

In [None]:
import os
import pandas as pd

# Load the annotation table (one row per cropped image).
df = pd.read_csv("/kaggle/input/data-final-augmente/df_final_augmente.csv")

# The CSV stores `/content/drive/MyDrive/...` paths; rewrite each known prefix
# to the matching `/kaggle/input/...` location (keys are treated as regexes).
path_prefix_map = {
    '/content/drive/MyDrive/data/cropped/': '/kaggle/input/dataset/cropped/cropped/',
    '/content/drive/MyDrive/data/augmentation/': '/kaggle/input/dataset/augmentation/augmentation/',
    '/content/drive/MyDrive/data/fond/': '/kaggle/input/fond-crees/fond/'
}
df['cropped_image_path'] = df['cropped_image_path'].replace(path_prefix_map, regex=True)

# Record, per row, whether the rewritten path exists on disk.
df['path_exists'] = df['cropped_image_path'].map(os.path.exists)

# Report the rows whose file is missing (nothing is dropped here).
invalid_paths = df.loc[~df['path_exists']]
if invalid_paths.empty:
    print("Tous les chemins sont valides !")
else:
    print("Les chemins suivants sont invalides :")
    print(invalid_paths[['cropped_image_path']])
df.head(3)

Tous les chemins sont valides !


Unnamed: 0,cropped_image_path,label_collembole,path_exists
0,/kaggle/input/dataset/cropped/cropped/0.001032...,AUTRE,True
1,/kaggle/input/dataset/cropped/cropped/0.001985...,AUTRE,True
2,/kaggle/input/dataset/cropped/cropped/0.005582...,AUTRE,True


In [None]:
# Keep only the two columns the models need. `.copy()` makes this an independent
# frame so that later cells can add columns (e.g. `label_idx`) without pandas
# raising SettingWithCopyWarning about writing into a slice.
df = df[['cropped_image_path', 'label_collembole']].copy()

In [None]:
# Number of images per class, to see how imbalanced the labels are.
df['label_collembole'].value_counts()

label_collembole
AUTRE      414
FOND       300
Cer        200
CRY_THE    150
PAR_NOT    150
LEP        150
MET_AFF    150
HYP_MAN    150
ISO_MIN    146
Name: count, dtype: int64

In [None]:
# Display the frame (pandas truncates the rendering to the first/last rows).
df

Unnamed: 0,cropped_image_path,label_collembole
0,/kaggle/input/dataset/cropped/cropped/0.001032...,AUTRE
1,/kaggle/input/dataset/cropped/cropped/0.001985...,AUTRE
2,/kaggle/input/dataset/cropped/cropped/0.005582...,AUTRE
3,/kaggle/input/dataset/cropped/cropped/0.005608...,ISO_MIN
4,/kaggle/input/dataset/cropped/cropped/0.005608...,AUTRE
...,...,...
1805,/kaggle/input/dataset/augmentation/augmentatio...,ISO_MIN
1806,/kaggle/input/dataset/augmentation/augmentatio...,ISO_MIN
1807,/kaggle/input/dataset/augmentation/augmentatio...,ISO_MIN
1808,/kaggle/input/dataset/augmentation/augmentatio...,ISO_MIN


# **EfficientNet-B4**

In [None]:
!pip install efficientnet-pytorch

Collecting efficientnet-pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16424 sha256=d2349be5fb451e0c6c36627d256bfac956200a3f4da03b9c0afc726786a52b8d
  Stored in directory: /root/.cache/pip/wheels/03/3f/e9/911b1bc46869644912bda90a56bcf7b960f20b5187feea3baf
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.1


In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms, models
from efficientnet_pytorch import EfficientNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from torchmetrics.classification import MulticlassF1Score
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:


# 1. Configuration initiale
class Config:
    """Hyper-parameters and run settings shared by the cells of this notebook."""
    SEED = 42                # random seed (Lightning seeding and train/val split)
    IMG_SIZE = 300           # increased to better capture fine details
    BATCH_SIZE = 32
    NUM_WORKERS = 4          # DataLoader worker processes
    NUM_EPOCHS = 50          # upper bound on epochs; early stopping may stop sooner
    LR = 1e-4                # AdamW learning rate
    WEIGHT_DECAY = 1e-5      # AdamW weight decay
    TTA_STEPS = 5            # Test Time Augmentation steps
    # Read by the EfficientNet-B5 Trainer cell; it was referenced there without
    # being defined, which raised AttributeError. True matches the B4 run,
    # which always trained with 16-bit precision.
    MIXED_PRECISION = True

# Fix the global random seed through Lightning so that runs are repeatable.
pl.seed_everything(Config.SEED)

# 2. Dataset et DataLoaders avec gestion améliorée du déséquilibre
class CollemboleDataset(Dataset):
    """Map-style dataset returning ``(image, label)`` pairs from a DataFrame.

    Each row of ``df`` must provide ``cropped_image_path`` (image file to open)
    and ``label_idx`` (encoded class). ``transform``, when given, is applied to
    the RGB PIL image before it is returned.
    """

    def __init__(self, df, transform=None):
        self.transform = transform
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it.
        row = self.df.iloc[idx]
        image = Image.open(row['cropped_image_path']).convert('RGB')
        label = row['label_idx']

        if not self.transform:
            return image, label
        return self.transform(image), label

# Augmentations plus robustes
# Channel-wise normalisation constants shared by both pipelines
# (the standard ImageNet mean / std).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: random crop / flips / rotation / colour jitter / blur,
# then conversion to a normalised tensor.
_train_steps = [
    transforms.RandomResizedCrop(Config.IMG_SIZE, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.GaussianBlur(kernel_size=3),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
train_transform = transforms.Compose(_train_steps)

# Validation pipeline: deterministic resize + centre crop, same normalisation.
_val_steps = [
    transforms.Resize(Config.IMG_SIZE + 50),
    transforms.CenterCrop(Config.IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
val_transform = transforms.Compose(_val_steps)

# 3. Modèle avec gestion avancée des classes déséquilibrées
class CollemboleModel(pl.LightningModule):
    """EfficientNet-B4 image classifier wrapped as a LightningModule.

    Args:
        num_classes: number of output classes of the classification head.
        class_weights: optional 1-D tensor of per-class weights passed to the
            cross-entropy loss.

    Logged metrics: ``train_loss`` and ``train_f1`` (per step and per epoch),
    ``val_loss`` and ``val_f1`` (per epoch). ``val_f1`` is the quantity
    monitored by the LR scheduler.
    """

    def __init__(self, num_classes, class_weights=None):
        super().__init__()
        self.model = EfficientNet.from_pretrained('efficientnet-b4', num_classes=num_classes)
        self.class_weights = class_weights
        self.criterion = nn.CrossEntropyLoss(weight=self.class_weights)
        self.train_f1 = MulticlassF1Score(num_classes=num_classes, average='macro')
        self.val_f1 = MulticlassF1Score(num_classes=num_classes, average='macro')

    def forward(self, x):
        """Return the raw class logits for a batch of images."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log loss / macro-F1."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        # Update the metric state; logging the Metric object (rather than the
        # batch value) lets Lightning report the true epoch-level macro-F1
        # instead of an average of per-batch macro-F1 scores.
        self.train_f1(preds, y)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_f1', self.train_f1, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute loss / predictions for one validation batch and log metrics."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        f1 = self.val_f1(preds, y)  # batch-level value, kept in the returned dict
        self.log('val_loss', loss, prog_bar=True)
        # Macro-F1 of a 32-sample batch is unreliable when some classes are
        # absent from the batch; log the Metric object so that `val_f1` is
        # computed once over the whole validation set at epoch end.
        self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True)
        return {'val_loss': loss, 'val_f1': f1, 'preds': preds, 'targets': y}

    def configure_optimizers(self):
        """AdamW plus ReduceLROnPlateau driven by the validation macro-F1."""
        optimizer = optim.AdamW(self.parameters(), lr=Config.LR, weight_decay=Config.WEIGHT_DECAY)
        # mode='max': the learning rate is cut by 10x when `val_f1` has not
        # improved for 3 epochs.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.1, patience=3, verbose=True
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_f1',
            }
        }

# 4. Pipeline complet
def main():
    """Train EfficientNet-B4 on the labelled crops and write a Kaggle submission.

    Steps: integer-encode the labels of the global ``df``, make a stratified
    80/20 train/validation split, train with early stopping and
    best-checkpoint saving, reload the best checkpoint, then predict every
    image of the test directory with test-time augmentation (TTA).

    Side effects:
        Adds a ``label_idx`` column to the global ``df`` and writes
        ``kaggle_submission_robust.csv`` in the working directory.

    Returns:
        dict: the objects later cells need, keyed by ``trainer``,
        ``best_model``, ``train_loader``, ``val_loader``, ``class_weights``,
        ``class_to_idx``, ``idx_to_class``, ``num_classes`` and ``submission``.
        (Previously nothing was returned, so names such as ``trainer`` were
        undefined outside this function.)
    """
    # Label encoding: class ids follow the order of first appearance in df
    classes = df['label_collembole'].unique()
    num_classes = len(classes)
    class_to_idx = {cls: i for i, cls in enumerate(classes)}
    idx_to_class = {i: cls for i, cls in enumerate(classes)}
    df['label_idx'] = df['label_collembole'].map(class_to_idx)

    # Inverse-frequency class weights (used by both the loss and the sampler)
    class_counts = df['label_idx'].value_counts().sort_index().values
    class_weights = 1. / class_counts
    class_weights = torch.tensor(class_weights, dtype=torch.float32)

    # Stratified train / validation split
    train_df, val_df = train_test_split(
        df, test_size=0.2, stratify=df['label_collembole'], random_state=Config.SEED
    )

    # Datasets
    train_dataset = CollemboleDataset(train_df, transform=train_transform)
    val_dataset = CollemboleDataset(val_df, transform=val_transform)

    # Weighted sampler: each training row is drawn with its class weight
    sample_weights = class_weights[train_df['label_idx'].values]
    sampler = WeightedRandomSampler(sample_weights, len(sample_weights))

    # DataLoaders (validation is not shuffled)
    train_loader = DataLoader(
        train_dataset, batch_size=Config.BATCH_SIZE, sampler=sampler,
        num_workers=Config.NUM_WORKERS, pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False,
        num_workers=Config.NUM_WORKERS, pin_memory=True
    )

    # Callbacks: stop after 7 epochs without val_f1 improvement, keep the best checkpoint
    early_stop = EarlyStopping(monitor="val_f1", patience=7, mode="max", verbose=True)
    checkpoint = ModelCheckpoint(
        monitor="val_f1", mode="max", save_top_k=1,
        filename="best_model-{epoch}-{val_f1:.2f}"
    )

    # Training
    model = CollemboleModel(num_classes=num_classes, class_weights=class_weights)
    trainer = pl.Trainer(
        max_epochs=Config.NUM_EPOCHS,
        callbacks=[early_stop, checkpoint],
        accelerator='auto',
        devices=1,
        # Mixed precision to speed up training; "16-mixed" is the spelling
        # Lightning asks for in place of the deprecated `precision=16`.
        precision="16-mixed",
        deterministic=True,
        # An epoch has fewer batches than the default logging interval (50),
        # which would otherwise skip the step-level training logs.
        log_every_n_steps=10,
    )
    trainer.fit(model, train_loader, val_loader)

    # Evaluation and predictions with TTA
    class TestDataset(Dataset):
        """Lists the image files of a directory (sorted) and loads them as RGB."""

        def __init__(self, test_dir, transform=None):
            self.test_dir = test_dir
            self.image_files = sorted([f for f in os.listdir(test_dir) if f.endswith(('.jpg', '.jpeg', '.png'))])
            self.transform = transform

        def __len__(self):
            return len(self.image_files)

        def __getitem__(self, idx):
            img_path = os.path.join(self.test_dir, self.image_files[idx])
            image = Image.open(img_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image, self.image_files[idx]

    # Reload the best checkpoint on the available device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    best_model = CollemboleModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path,
        num_classes=num_classes,
        class_weights=class_weights
    ).to(device)
    best_model.eval()

    # TTA pipelines: a forced horizontal flip and a random rotation (+/- 30 deg)
    tta_transforms = [
        transforms.Compose([
            transforms.Resize(Config.IMG_SIZE + 50),
            transforms.CenterCrop(Config.IMG_SIZE),
            transforms.RandomHorizontalFlip(p=1.0),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]),
        transforms.Compose([
            transforms.Resize(Config.IMG_SIZE + 50),
            transforms.CenterCrop(Config.IMG_SIZE),
            transforms.RandomRotation(30),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
    ]

    def predict_with_tta(model, image_path, n_tta=Config.TTA_STEPS):
        """Return the class index with the highest softmax probability averaged
        over one un-augmented pass and ``n_tta - 1`` randomly chosen TTA passes."""
        original_image = Image.open(image_path).convert('RGB')
        # Use the device of the model that was passed in (the original code
        # read it from the enclosing `best_model` instead).
        target_device = next(model.parameters()).device
        preds = []

        with torch.no_grad():
            # Un-augmented prediction
            img = val_transform(original_image).unsqueeze(0).to(target_device)
            preds.append(model(img).softmax(dim=1))

            # Augmented predictions
            for _ in range(n_tta - 1):
                tta_transform = tta_transforms[np.random.randint(0, len(tta_transforms))]
                img = tta_transform(original_image).unsqueeze(0).to(target_device)
                preds.append(model(img).softmax(dim=1))

        # Average the per-pass probabilities
        avg_probs = torch.mean(torch.cat(preds, dim=0), dim=0)
        return avg_probs.argmax().item()

    # Predictions for Kaggle
    test_dir = "/kaggle/input/datatest/datatest"
    test_dataset = TestDataset(test_dir)
    filenames = test_dataset.image_files

    predictions = []
    for filename in tqdm(filenames, desc="Processing Test Images"):
        img_path = os.path.join(test_dir, filename)
        pred_idx = predict_with_tta(best_model, img_path)
        predictions.append(idx_to_class[pred_idx])

    # Save the results
    submission = pd.DataFrame({
        'image_filename': filenames,
        'predicted_label': predictions
    })
    submission.to_csv('kaggle_submission_robust.csv', index=False)
    print("Soumission Kaggle générée avec succès!")

    return {
        'trainer': trainer,
        'best_model': best_model,
        'train_loader': train_loader,
        'val_loader': val_loader,
        'class_weights': class_weights,
        'class_to_idx': class_to_idx,
        'idx_to_class': idx_to_class,
        'num_classes': num_classes,
        'submission': submission,
    }

if __name__ == "__main__":
    artifacts = main()
    # Expose what the evaluation cell reads from the notebook namespace;
    # without this it fails with "NameError: name 'trainer' is not defined".
    trainer = artifacts['trainer']
    val_loader = artifacts['val_loader']
    idx_to_class = artifacts['idx_to_class']
    class_weights = artifacts['class_weights']
    num_classes = artifacts['num_classes']

Loaded pretrained weights for efficientnet-b4


/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:572: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (46) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Loaded pretrained weights for efficientnet-b4


Processing Test Images: 100%|██████████| 1344/1344 [03:57<00:00,  5.67it/s]

Soumission Kaggle générée avec succès!





In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from torchvision.utils import make_grid
import pandas as pd
import torch

def evaluate_model_performance(best_model, val_loader, idx_to_class):
    """Evaluate the model on the validation set and display diagnostics.

    Prints a classification report, plots the confusion matrix, shows up to
    nine misclassified images, plots the distribution of the maximum softmax
    probability (overall and correct vs. incorrect) and displays a per-class
    summary table.

    Args:
        best_model: trained model; must expose ``.device`` and return logits.
        val_loader: DataLoader yielding ``(images, labels)`` batches. It must
            not shuffle: misclassified samples are re-read from
            ``val_loader.dataset`` by their position in the prediction list.
        idx_to_class: dict mapping class index (0..n-1) to class name.

    Returns:
        dict with ``predictions``, ``targets`` (lists), ``probabilities``
        (array, one row per sample), ``incorrect_indices`` (list of ints) and
        ``class_metrics`` (DataFrame).
    """
    best_model.eval()
    all_preds = []
    all_targets = []
    all_probs = []

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(best_model.device)

            outputs = best_model(images)
            probs = torch.softmax(outputs, dim=1)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(labels.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    preds_arr = np.asarray(all_preds)
    targets_arr = np.asarray(all_targets)

    # 1. Classification report
    # Passing `labels` keeps the report / matrix aligned with `class_names`
    # even when a class is absent from the validation targets and predictions.
    class_ids = list(range(len(idx_to_class)))
    class_names = [idx_to_class[i] for i in class_ids]
    print("Classification Report:")
    print(classification_report(all_targets, all_preds, labels=class_ids, target_names=class_names))

    # 2. Confusion matrix
    plt.figure(figsize=(12, 10))
    cm = confusion_matrix(all_targets, all_preds, labels=class_ids)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Matrice de confusion')
    plt.xlabel('Prédictions')
    plt.ylabel('Vraies étiquettes')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.show()

    # 3. Misclassified samples
    val_dataset = val_loader.dataset
    incorrect_indices = np.flatnonzero(preds_arr != targets_arr).tolist()

    print(f"\nNombre total d'erreurs: {len(incorrect_indices)}/{len(val_dataset)}")
    print(f"Taux d'erreur: {len(incorrect_indices)/len(val_dataset):.2%}")

    # Show a few misclassified examples (at most a 3x3 grid)
    num_samples = min(9, len(incorrect_indices))
    if num_samples > 0:
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        plt.figure(figsize=(15, 15))
        for i, idx in enumerate(incorrect_indices[:num_samples]):
            image, label = val_dataset[idx]
            pred_label = int(all_preds[idx])

            plt.subplot(3, 3, i+1)
            # CHW tensor -> HWC array, then undo the normalisation for display
            image = image.numpy().transpose((1, 2, 0))
            image = std * image + mean
            image = np.clip(image, 0, 1)

            plt.imshow(image)
            plt.title(f"Vrai: {idx_to_class[int(label)]}\nPrédit: {idx_to_class[pred_label]}")
            plt.axis('off')
        plt.tight_layout()
        plt.show()

    # 4. Confidence analysis (maximum softmax probability per sample)
    all_probs = np.array(all_probs)
    max_probs = np.max(all_probs, axis=1)

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    sns.histplot(max_probs, bins=20, kde=True)
    plt.title('Distribution des probabilités maximales')
    plt.xlabel('Probabilité maximale')
    plt.ylabel('Fréquence')

    plt.subplot(1, 2, 2)
    correct = preds_arr == targets_arr
    # Label the groups from the data with a fixed order, so the ticks stay
    # right even when every prediction is correct (or every one is wrong).
    outcome = np.where(correct, 'Correct', 'Incorrect')
    sns.boxplot(x=outcome, y=max_probs, order=['Incorrect', 'Correct'])
    plt.title('Probabilités maximales: Correct vs Incorrect')
    plt.ylabel('Probabilité maximale')
    plt.tight_layout()
    plt.show()

    # 5. Per-class analysis
    # NOTE: the 'Précision' column holds the share of a class's true samples
    # that were predicted correctly (i.e. per-class recall).
    class_metrics = []
    for class_idx in class_ids:
        class_mask = targets_arr == class_idx
        n_examples = int(class_mask.sum())
        if n_examples > 0:
            accuracy = float(np.mean(preds_arr[class_mask] == class_idx))
            avg_prob = float(np.mean(max_probs[class_mask]))
        else:
            accuracy = 0
            avg_prob = 0

        class_metrics.append({
            'Classe': idx_to_class[class_idx],
            'Exemples': n_examples,
            'Précision': accuracy,
            'Probabilité moyenne': avg_prob
        })

    metrics_df = pd.DataFrame(class_metrics)
    print("\nMétriques par classe:")
    display(metrics_df.sort_values('Précision', ascending=False))

    return {
        'predictions': all_preds,
        'targets': all_targets,
        'probabilities': all_probs,
        'incorrect_indices': incorrect_indices,
        'class_metrics': metrics_df
    }

# Utilisation:
# Supposons que vous avez déjà:
# - best_model: votre modèle entraîné chargé
# - val_loader: le DataLoader pour l'ensemble de validation
# - idx_to_class: le mapping des indices aux noms de classes

# Charger le meilleur modèle (si ce n'est pas déjà fait)
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# NOTE: this cell reads `trainer`, `class_weights`, `val_loader` and
# `idx_to_class` from the notebook's global namespace; they must have been
# exposed by the training cell before this cell is run.
best_model = CollemboleModel.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path,
    num_classes=len(idx_to_class),
    class_weights=class_weights
)
best_model = best_model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Validation-set diagnostics
results = evaluate_model_performance(best_model, val_loader, idx_to_class)

# Extra analysis: one-vs-rest ROC curve per class
y_test = label_binarize(results['targets'], classes=range(len(idx_to_class)))
y_score = np.array(results['probabilities'])

fpr, tpr, roc_auc = {}, {}, {}
for i in range(len(idx_to_class)):
    false_pos, true_pos, _ = roc_curve(y_test[:, i], y_score[:, i])
    fpr[i] = false_pos
    tpr[i] = true_pos
    roc_auc[i] = auc(false_pos, true_pos)

# Draw every class curve on one figure, one colour per class
plt.figure(figsize=(10, 8))
colors = plt.cm.rainbow(np.linspace(0, 1, len(idx_to_class)))
for i, color in enumerate(colors):
    curve_label = '{0} (AUC = {1:0.2f})'.format(idx_to_class[i], roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

# Chance diagonal and axes
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Taux de faux positifs')
plt.ylabel('Taux de vrais positifs')
plt.title('Courbes ROC par classe')
plt.legend(loc="lower right")
plt.show()

NameError: name 'trainer' is not defined

In [None]:
# Reload the submission file written by the B4 training cell to inspect it.
s = pd.read_csv("/kaggle/working/kaggle_submission_robust.csv")
s

Unnamed: 0,image_filename,predicted_label
0,0.000287271854623405430.157444760850390830.996...,AUTRE
1,0.0010863036917236890.47680670826091410.132699...,HYP_MAN
2,0.00353289859414340770.76539580712550110.01220...,AUTRE
3,0.0042943058174115260.7384949814241050.9756010...,FOND
4,0.0046691375801363180.53378260284225820.730659...,FOND
...,...,...
1339,0.99716142613047390.197750336695433180.6459868...,Cer
1340,0.9973257779891610.138331744890855870.49351583...,FOND
1341,0.99770272541465340.482771084491531170.2954322...,PAR_NOT
1342,0.9981505977730490.27455536007485760.980027279...,AUTRE


In [None]:
# Number of test images assigned to each predicted class.
s['predicted_label'].value_counts()

predicted_label
FOND       513
AUTRE      262
LEP        161
Cer        127
CRY_THE     87
ISO_MIN     86
PAR_NOT     60
HYP_MAN     35
MET_AFF     13
Name: count, dtype: int64

In [None]:
# Rename the columns to `idx` / `gt` (presumably the names the competition
# submission format expects — confirm against the competition rules).
s = s.rename(columns={'image_filename': 'idx', 'predicted_label': 'gt'})

In [None]:
# Drop the image extension from the ids. Only .jpg/.jpeg/.png (the extensions
# the test files were selected by) are removed: the file names themselves
# contain dots, so a generic "last dot-suffix" pattern would eat part of the
# id every time this cell is re-run.
s['idx'] = s['idx'].str.replace(r'\.(?:jpe?g|png)$', '', regex=True)


In [None]:
# Integer code expected in the `gt` column for each class name.
label_mapping = {
    'AUTRE': 0,
    'Cer': 1,
    'HYP_MAN': 3,
    'ISO_MIN': 4,
    'MET_AFF': 6,
    'PAR_NOT': 7,
    'CRY_THE': 2,
    'LEP': 5,
    'FOND': 8
}

# `.map` silently turns unknown values into NaN (this also happens if the cell
# is re-run on already-encoded labels), so fail loudly instead of writing NaN
# into the submission.
_encoded_gt = s['gt'].map(label_mapping)
if _encoded_gt.isna().any():
    _unknown = s.loc[_encoded_gt.isna(), 'gt'].unique().tolist()
    raise ValueError(f"Labels without an entry in label_mapping: {_unknown}")
s['gt'] = _encoded_gt.astype(int)

In [None]:
# Save the current contents of `s` (without the index column).
s.to_csv('model1_submission.csv', index=False)


# **EfficientNet-B5**

In [None]:
def get_train_transforms():
    """Return the augmentation pipeline applied to training images.

    `A` / `ToTensorV2` are presumably albumentations objects; their import is
    not visible in this part of the notebook (TODO confirm).
    """
    side = Config.IMG_SIZE
    steps = [
        # Geometric augmentations
        A.RandomResizedCrop(side, side, scale=(0.7, 1.0)),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.3),
        A.Rotate(limit=30, p=0.5),
        # Photometric augmentations / noise
        A.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1, p=0.5),
        A.GaussianBlur(blur_limit=(3, 7)),
        A.GaussNoise(var_limit=(10, 50)),
        # Blank out up to 8 patches of at most 32x32 pixels
        A.CoarseDropout(max_holes=8, max_height=32, max_width=32, fill_value=0, p=0.3),
        # Normalise, then convert to a tensor
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(steps)

def get_val_transforms():
    """Return the deterministic pipeline used for validation / inference:
    resize to IMG_SIZE + 50, centre-crop to IMG_SIZE, normalise, to tensor."""
    crop_side = Config.IMG_SIZE
    resized_side = crop_side + 50
    steps = [
        A.Resize(resized_side, resized_side),
        A.CenterCrop(crop_side, crop_side),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(steps)

class CollemboleDataset(Dataset):
    """Map-style dataset returning ``(image, label)`` pairs from a DataFrame.

    Images are read with OpenCV and converted from BGR to RGB. ``transform``,
    when given, is called as ``transform(image=...)`` and its ``'image'``
    entry is returned.

    Args:
        df: DataFrame with ``cropped_image_path`` and ``label_idx`` columns.
        transform: optional callable following the ``image=`` keyword protocol.

    Raises:
        FileNotFoundError: from ``__getitem__`` when the image cannot be read.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]  # single row lookup for both fields
        img_path = row['cropped_image_path']
        image = cv2.imread(img_path)
        # cv2.imread does not raise on a missing / unreadable file, it returns
        # None; report the offending path instead of a cryptic cvtColor error.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = row['label_idx']

        if self.transform:
            image = self.transform(image=image)['image']

        return image, label

In [None]:
# NOTE(review): this cell re-defines the same helpers as the previous cell;
# one of the two cells can be removed.
def get_train_transforms():
    """Return the augmentation pipeline applied to training images.

    `A` / `ToTensorV2` are presumably albumentations objects; their import is
    not visible in this part of the notebook (TODO confirm).
    """
    side = Config.IMG_SIZE
    steps = [
        # Geometric augmentations
        A.RandomResizedCrop(side, side, scale=(0.7, 1.0)),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.3),
        A.Rotate(limit=30, p=0.5),
        # Photometric augmentations / noise
        A.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1, p=0.5),
        A.GaussianBlur(blur_limit=(3, 7)),
        A.GaussNoise(var_limit=(10, 50)),
        # Blank out up to 8 patches of at most 32x32 pixels
        A.CoarseDropout(max_holes=8, max_height=32, max_width=32, fill_value=0, p=0.3),
        # Normalise, then convert to a tensor
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(steps)

def get_val_transforms():
    """Return the deterministic pipeline used for validation / inference:
    resize to IMG_SIZE + 50, centre-crop to IMG_SIZE, normalise, to tensor."""
    crop_side = Config.IMG_SIZE
    resized_side = crop_side + 50
    steps = [
        A.Resize(resized_side, resized_side),
        A.CenterCrop(crop_side, crop_side),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(steps)

class CollemboleDataset(Dataset):
    """Map-style dataset returning ``(image, label)`` pairs from a DataFrame.

    Images are read with OpenCV and converted from BGR to RGB. ``transform``,
    when given, is called as ``transform(image=...)`` and its ``'image'``
    entry is returned.

    Args:
        df: DataFrame with ``cropped_image_path`` and ``label_idx`` columns.
        transform: optional callable following the ``image=`` keyword protocol.

    Raises:
        FileNotFoundError: from ``__getitem__`` when the image cannot be read.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]  # single row lookup for both fields
        img_path = row['cropped_image_path']
        image = cv2.imread(img_path)
        # cv2.imread does not raise on a missing / unreadable file, it returns
        # None; report the offending path instead of a cryptic cvtColor error.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = row['label_idx']

        if self.transform:
            image = self.transform(image=image)['image']

        return image, label



In [None]:
class CollemboleModel(pl.LightningModule):
    """EfficientNet-B5 image classifier wrapped as a LightningModule.

    Args:
        num_classes: number of output classes of the classification head.
        class_weights: optional 1-D tensor of per-class weights passed to the
            cross-entropy loss.

    Besides Lightning logging, the per-batch loss and macro-F1 of every
    training and validation step are appended to ``training_loss``,
    ``training_f1``, ``validation_loss`` and ``validation_f1`` for plotting.
    """

    def __init__(self, num_classes, class_weights=None):
        super().__init__()
        self.model = EfficientNet.from_pretrained('efficientnet-b5', num_classes=num_classes)
        self.class_weights = class_weights
        self.criterion = nn.CrossEntropyLoss(weight=self.class_weights)

        self.train_f1 = MulticlassF1Score(num_classes=num_classes, average='macro')
        self.val_f1 = MulticlassF1Score(num_classes=num_classes, average='macro')

        # Per-batch histories kept for visualisation
        self.training_loss = []
        self.validation_loss = []
        self.training_f1 = []
        self.validation_f1 = []

    def forward(self, x):
        """Return the raw class logits for a batch of images."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch; log and store loss / F1."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        f1 = self.train_f1(preds, y)

        self.log('train_loss', loss, prog_bar=True)
        self.log('train_f1', f1, prog_bar=True)

        # Stored for visualisation
        self.training_loss.append(loss.detach().cpu().numpy())
        self.training_f1.append(f1.detach().cpu().numpy())

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute loss / predictions for one validation batch; log and store metrics."""
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        f1 = self.val_f1(preds, y)  # batch-level value (stored / returned below)

        self.log('val_loss', loss, prog_bar=True)
        # Log the Metric object so that `val_f1` — the value monitored for
        # early stopping and checkpointing — is the macro-F1 of the whole
        # validation set rather than a mean of per-batch macro-F1 scores,
        # which is biased when a batch does not contain every class.
        self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True)

        self.validation_loss.append(loss.detach().cpu().numpy())
        self.validation_f1.append(f1.detach().cpu().numpy())

        return {'val_loss': loss, 'val_f1': f1, 'preds': preds, 'targets': y}

    def configure_optimizers(self):
        """AdamW with a cosine-annealed learning rate over ``Config.NUM_EPOCHS``."""
        optimizer = optim.AdamW(self.parameters(), lr=Config.LR, weight_decay=Config.WEIGHT_DECAY)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Config.NUM_EPOCHS, eta_min=1e-6)
        return [optimizer], [scheduler]

In [None]:
 #%% Cell 4: Entraînement du modèle

# Label encoding: class ids follow the order of first appearance in df
classes = df['label_collembole'].unique()
num_classes = len(classes)
class_to_idx = {cls: i for i, cls in enumerate(classes)}
idx_to_class = {i: cls for i, cls in enumerate(classes)}
df['label_idx'] = df['label_collembole'].map(class_to_idx)

# Class weights: inverse frequency, softened by a 0.75 exponent
class_counts = df['label_idx'].value_counts().sort_index().values
class_weights = 1 / (class_counts ** 0.75)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Stratified 85/15 split
train_df, val_df = train_test_split(
    df, test_size=0.15, stratify=df['label_collembole'], random_state=Config.SEED
)

# Datasets and DataLoaders
train_dataset = CollemboleDataset(train_df, transform=get_train_transforms())
val_dataset = CollemboleDataset(val_df, transform=get_val_transforms())

# Weighted sampler: each training row is drawn (with replacement) with its class weight
sample_weights = class_weights[train_df['label_idx'].values]
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

train_loader = DataLoader(
    train_dataset, batch_size=Config.BATCH_SIZE, sampler=sampler,
    num_workers=Config.NUM_WORKERS, pin_memory=True
)
val_loader = DataLoader(
    val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False,
    num_workers=Config.NUM_WORKERS, pin_memory=True
)

# Callbacks: stop after 8 epochs without val_f1 improvement, keep the 2 best checkpoints
early_stop = EarlyStopping(monitor="val_f1", patience=8, mode="max", verbose=True)
checkpoint = ModelCheckpoint(
    monitor="val_f1", mode="max", save_top_k=2,
    filename="best_model-{epoch}-{val_f1:.3f}"
)

# `Config` as defined in this notebook has no MIXED_PRECISION attribute, so a
# direct attribute access raised AttributeError; fall back to mixed precision,
# which is what the B4 run used.
use_mixed_precision = getattr(Config, 'MIXED_PRECISION', True)

# Training
model = CollemboleModel(num_classes=num_classes, class_weights=class_weights)
trainer = pl.Trainer(
    max_epochs=Config.NUM_EPOCHS,
    callbacks=[early_stop, checkpoint],
    accelerator='auto',
    devices=1,
    # "16-mixed" is the spelling Lightning asks for instead of the deprecated 16
    precision="16-mixed" if use_mixed_precision else 32,
    deterministic=True,
    gradient_clip_val=1.0,
    # An epoch has fewer batches than the default logging interval (50)
    log_every_n_steps=10,
)
trainer.fit(model, train_loader, val_loader)

# Save the weights of the final (last-epoch) model
torch.save(model.state_dict(), 'collembole_model_final.pth')

In [None]:
# %% Cell 6: Prédictions et soumission
# Reload the best checkpoint on whatever device is available
# (the device was hard-coded to 'cuda', which fails on a CPU-only session).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_model = CollemboleModel.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path,
    num_classes=num_classes,
    class_weights=class_weights
).eval().to(device)

# TTA pipelines: forced horizontal flip, random crop + rotation, Gaussian blur
tta_transforms = [
    A.Compose([
        A.Resize(Config.IMG_SIZE + 50, Config.IMG_SIZE + 50),
        A.CenterCrop(Config.IMG_SIZE, Config.IMG_SIZE),
        A.HorizontalFlip(p=1.0),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ]),
    A.Compose([
        A.Resize(Config.IMG_SIZE + 70, Config.IMG_SIZE + 70),
        A.RandomCrop(Config.IMG_SIZE, Config.IMG_SIZE),
        A.Rotate(limit=25, p=1.0),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ]),
    A.Compose([
        A.Resize(Config.IMG_SIZE, Config.IMG_SIZE),
        A.GaussianBlur(blur_limit=(5, 9), p=1.0),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])
]

def predict_tta(image_path, model, n_tta=Config.TTA_STEPS):
    """Predict one image with test-time augmentation.

    Averages the softmax output of one pass through the validation pipeline
    and ``n_tta - 1`` passes through randomly chosen TTA pipelines.

    Args:
        image_path: path of the image file to classify.
        model: trained classifier returning logits.
        n_tta: total number of forward passes.

    Returns:
        tuple: (index of the most probable class, averaged probabilities as a
        NumPy array).

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file is unreadable
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Run on the device the model lives on rather than assuming 'cuda'
    target_device = next(model.parameters()).device
    preds = []

    with torch.no_grad():
        # Base prediction
        aug = get_val_transforms()(image=image)['image'].unsqueeze(0).to(target_device)
        preds.append(model(aug).softmax(dim=1))

        # Random TTA passes
        for _ in range(n_tta - 1):
            tta = tta_transforms[np.random.randint(0, len(tta_transforms))]
            aug = tta(image=image)['image'].unsqueeze(0).to(target_device)
            preds.append(model(aug).softmax(dim=1))

    avg_probs = torch.mean(torch.cat(preds, dim=0), dim=0)
    return avg_probs.argmax().item(), avg_probs.cpu().numpy()

# Build the submission
test_dir = "/kaggle/input/dataset/datatest/datatest"
# Same extension list as the B4 cell ('.jpeg' was missing here), so both
# models predict the same set of test files.
test_files = sorted([f for f in os.listdir(test_dir) if f.endswith(('.jpg', '.jpeg', '.png'))])

predictions = []
all_probs = []
for file in tqdm(test_files, desc="Generating TTA Predictions"):
    img_path = os.path.join(test_dir, file)
    pred_idx, probs = predict_tta(img_path, best_model)
    predictions.append(idx_to_class[pred_idx])
    all_probs.append(probs)

submission = pd.DataFrame({
    'image_filename': test_files,
    'predicted_label': predictions
})
submission.to_csv('merci_final.csv', index=False)
print("Soumission Kaggle générée avec succès!")
