# Soft label

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Keras / TensorFlow
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, Callback
from sklearn.metrics import f1_score



In [None]:
import pandas as pd
import numpy as np

# Parameters
NUM_CLASSES = 9    # class indices 0..8

csv_path = "/kaggle/input/annotations-all-soft/annotations_all.csv"
df = pd.read_csv(csv_path)

# Expected layout of the raw annotations (one row per image crop):
#      id                                   votes
# 0    0.5093551905..._box0                 "2_8_8_2"
# 1    0.5093551905..._background0          "8"
# 2    0.5833849204..._box0                 "6_6_6_6"
# ...


def votes_to_distribution(vote_str, num_classes=NUM_CLASSES):
    """Convert an underscore-separated vote string into a class distribution.

    Args:
        vote_str: Votes such as "2_8_8_2" or "8". The value is passed through
            str() first, so a bare integer (e.g. produced by CSV type
            inference on a column of single votes) is accepted as well.
        num_classes: Number of classes; valid votes are 0 .. num_classes - 1.

    Returns:
        A float NumPy array of length ``num_classes`` holding the fraction of
        votes received by each class (the entries sum to 1).

    Raises:
        ValueError: If a token is not an integer, or if a vote is negative or
            greater than or equal to ``num_classes``.
    """
    votes = np.array([int(tok) for tok in str(vote_str).split("_")], dtype=np.int64)
    # Reject out-of-range votes explicitly: a negative index would otherwise
    # be counted silently for the wrong class.
    if votes.min() < 0 or votes.max() >= num_classes:
        raise ValueError(
            f"vote out of range 0..{num_classes - 1} in {vote_str!r}"
        )
    return np.bincount(votes, minlength=num_classes) / votes.size


# 1) Build the "filename" column as <id>.jpg (use ".png" if the images are PNG files)
df["filename"] = df["id"].astype(str) + ".jpg"

# 2) + 3) Parse every "votes" entry and build the "class_0" .. "class_8"
#         columns in one go instead of writing the frame cell by cell.
class_columns = [f"class_{c}" for c in range(NUM_CLASSES)]
soft_labels = np.array(
    [votes_to_distribution(v) for v in df["votes"]], dtype=float
).reshape(-1, NUM_CLASSES)  # reshape keeps a (0, NUM_CLASSES) shape for an empty frame
soft_label_frame = pd.DataFrame(soft_labels, columns=class_columns, index=df.index)

# 4) Drop the raw "votes" column and append the distribution columns
df = pd.concat([df.drop(columns=["votes"]), soft_label_frame], axis=1)

# 5) Save the new CSV to a writable directory
output_csv_path = "/kaggle/working/annotations_all_soft.csv"
df.to_csv(output_csv_path, index=False)

print(f"CSV sauvegardé dans : {output_csv_path}")

# df now has the structure:
#   id                        filename                      class_0  class_1  ... class_8
#   "0.5093..._box0"          "0.5093..._box0.jpg"          0.0      0.0          0.5
#   "0.5093..._background0"   "0.5093..._background0.jpg"   ...               ...
#   ...


CSV sauvegardé dans : /kaggle/working/annotations_all_soft.csv


In [5]:
# Inspect the frame built above; pandas elides the middle rows of a long frame in the rendered output.
df

Unnamed: 0,id,filename,class_0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8
0,0.50935519057888360.493314924113490540.9354201...,0.50935519057888360.493314924113490540.9354201...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5
1,0.50935519057888360.493314924113490540.9354201...,0.50935519057888360.493314924113490540.9354201...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.58338492048237630.8085755055351820.176967644...,0.58338492048237630.8085755055351820.176967644...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.58338492048237630.8085755055351820.176967644...,0.58338492048237630.8085755055351820.176967644...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0146642344262495340.0082065942776055060.4910...,0.0146642344262495340.0082065942776055060.4910...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1491,0.0056082525335631180.072100167826147480.09500...,0.0056082525335631180.072100167826147480.09500...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1492,0.0056082525335631180.072100167826147480.09500...,0.0056082525335631180.072100167826147480.09500...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1493,0.82490590543815430.147464726755823340.0742258...,0.82490590543815430.147464726755823340.0742258...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1494,0.82490590543815430.147464726755823340.0742258...,0.82490590543815430.147464726755823340.0742258...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.model_selection import train_test_split

# `df` is expected to provide the columns
# "filename", "class_0", "class_1", ..., "class_8".

# 1) Pseudo-label used only for stratification: for each row, the NAME of the
#    soft-label column with the largest probability ("class_0", "class_1", ...).
soft_label_columns = [f"class_{i}" for i in range(NUM_CLASSES)]
majority_class = df[soft_label_columns].idxmax(axis=1)

# 2) 80/20 train/validation split, stratified on that majority class.
# NOTE(review): ids sharing a prefix (e.g. "..._box0" / "..._background0") look
# like crops of the same source image; a row-level split may place them on both
# sides of the split. TODO confirm whether a grouped split is needed.
train_df, val_df = train_test_split(
    df,
    stratify=majority_class,
    test_size=0.2,
    random_state=42,
)

print("Taille train :", len(train_df), "Taille val :", len(val_df))


Taille train : 1196 Taille val : 300


In [9]:
##############################################################################
# 3) Préparation des ImageDataGenerator et random crop
##############################################################################
import tensorflow as tf

from tensorflow.keras.preprocessing.image import ImageDataGenerator

data_dir = "/kaggle/input/data-train-decoupes-all-images"

# Spatial size of the images produced by the generators and fed to the network.
target_size = (224,224)


def random_crop_fn(image, crop_height=200, crop_width=200):
    """Take a random crop of ``image`` and resize it back to ``target_size``.

    Intended as an ``ImageDataGenerator`` ``preprocessing_function``: it
    receives one image and must hand back one image of the same shape.

    Args:
        image: A single rank-3 image (height, width, channels). When used with
            the generators below it has already been resized to ``target_size``.
        crop_height: Height of the random crop, in pixels.
        crop_width: Width of the random crop, in pixels.

    Returns:
        A NumPy array of spatial size ``target_size`` with the same number of
        channels as the input.
    """
    # Use the input's own channel count instead of assuming 3.
    channels = image.shape[-1]
    cropped = tf.image.random_crop(image, size=(crop_height, crop_width, channels))
    resized = tf.image.resize(cropped, target_size)
    # ImageDataGenerator documents that the preprocessing function returns a
    # NumPy tensor, so convert the eager tensor before handing it back.
    return resized.numpy()

# Training-time augmentation.
# No `rescale` here on purpose: Keras EfficientNet models contain their own
# input rescaling/normalization and expect float pixel values in [0, 255].
# Dividing by 255 beforehand would feed the pretrained backbone inputs in a
# range it was not trained on.
train_datagen = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    brightness_range=(0.8, 1.2),
    zoom_range=(1.0, 1.2),
    preprocessing_function=random_crop_fn
)

# Validation: no augmentation and, for the same reason as above, no rescaling.
val_datagen = ImageDataGenerator()

# IMPORTANT: y_col must be a LIST of columns, one per entry of the soft-label
#            vector, and class_mode="raw" returns that vector untouched
#            (Keras does not one-hot encode anything; each target is the
#            9-value distribution itself).
label_cols = [f"class_{i}" for i in range(NUM_CLASSES)]
batch_size = 32

# Settings shared by the training and validation generators.
flow_kwargs = dict(
    directory=data_dir,         # folder containing the images
    x_col="filename",           # column holding the image file name
    y_col=label_cols,           # the 9 distribution columns
    target_size=target_size,
    batch_size=batch_size,
    class_mode="raw",           # targets are float vectors (soft labels)
)

# Training batches are shuffled; validation batches keep the frame order.
train_generator = train_datagen.flow_from_dataframe(train_df, shuffle=True, **flow_kwargs)
val_generator = val_datagen.flow_from_dataframe(val_df, shuffle=False, **flow_kwargs)

# Sanity check: pull one training batch and look at the array shapes.
x_sample, y_sample = next(train_generator)
print("Shape X :", x_sample.shape)  # (batch_size, 224,224,3)
print("Shape Y :", y_sample.shape)  # (batch_size, 9)



Found 1196 validated image filenames.
Found 300 validated image filenames.
Shape X : (32, 224, 224, 3)
Shape Y : (32, 9)


## Construction du modèle

In [11]:
##############################################################################
# 4) Construction du modèle (EfficientNetB0) et compilation
##############################################################################
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.models import Model


# ImageNet-pretrained EfficientNetB0 backbone without its classification head.
# NOTE: Keras EfficientNet models embed their own input rescaling and expect
# raw pixel values in [0, 255].
base_model = EfficientNetB0(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3),
)

# Classification head: global pooling -> dropout -> softmax over NUM_CLASSES.
x = GlobalAveragePooling2D()(base_model.output)
x = Dropout(0.6)(x)
predictions = Dense(NUM_CLASSES, activation="softmax")(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Mark every backbone layer as trainable (full fine-tuning).
for layer in base_model.layers:
    layer.trainable = True

# categorical_crossentropy accepts a probability distribution as the target,
# which is what the soft labels provide.
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["accuracy"],
)

model.summary()



Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
##############################################################################
# 5) Callback F1 (macro) sur la validation
##############################################################################

class F1MacroCallback(Callback):
    """Report the macro-averaged F1 score on a validation generator after each epoch.

    Soft labels and predicted distributions are both reduced to a single class
    with argmax before scoring. The score is printed and, when Keras supplies a
    ``logs`` dict, stored under the key ``"val_f1_macro"``.

    Args:
        val_generator: Indexable batch generator yielding ``(x, y)`` pairs whose
            ``y`` is an (n, num_classes) array. It must not shuffle, because
            predictions and labels are matched by position.

    Raises:
        ValueError: If ``val_generator`` exposes a truthy ``shuffle`` attribute.
    """

    def __init__(self, val_generator):
        super().__init__()
        # A shuffling generator would pair predictions with the wrong labels
        # and silently produce a meaningless score.
        if getattr(val_generator, "shuffle", False):
            raise ValueError(
                "F1MacroCallback requires a validation generator created with shuffle=False"
            )
        self.val_generator = val_generator
        self._val_true = None  # argmax of the soft labels, filled on first use

    def _true_labels(self, steps):
        """Return the hard validation labels, reading the generator only once."""
        if self._val_true is None:
            y_true_soft = np.concatenate(
                [self.val_generator[i][1] for i in range(steps)], axis=0
            )  # (N, num_classes)
            self._val_true = np.argmax(y_true_soft, axis=1)  # (N,)
        return self._val_true

    def on_epoch_end(self, epoch, logs=None):
        """Compute, print and log the validation macro F1 for this epoch."""
        steps = len(self.val_generator)
        preds = self.model.predict(self.val_generator, steps=steps)
        # preds has one row of class probabilities per validation sample
        val_predict = np.argmax(preds, axis=1)
        val_true = self._true_labels(steps)

        f1 = f1_score(val_true, val_predict, average='macro')
        print(f"\nF1 Score Macro (val) = {f1:.4f}")
        # `logs` defaults to None; only record the value when a dict is given.
        if logs is not None:
            logs["val_f1_macro"] = f1


f1_callback = F1MacroCallback(val_generator)

# Stop once val_loss has not improved for 10 epochs and restore the best weights.
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

callbacks = [early_stopping, f1_callback]



In [None]:
##############################################################################
# 6) Entraînement
##############################################################################

# Upper bound on the number of epochs; EarlyStopping in `callbacks` may end training sooner.
num_epochs = 50

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=num_epochs,
    callbacks=callbacks,
)




In [None]:
# history = model.fit(...)  # déjà effectué

import matplotlib.pyplot as plt

# Loss and accuracy curves (training vs. validation), one figure per metric.
# The accuracy figure needs the 'accuracy' and 'val_accuracy' keys, which exist
# when 'accuracy' was listed in the compile metrics.
curve_specs = [
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'Évolution de la Loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy', 'Évolution de la Précision', 'Accuracy'),
]
for train_key, val_key, train_label, val_label, title, y_label in curve_specs:
    plt.figure()
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Époque')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

# Macro F1 curve, only when the callback stored it in the history.
if 'val_f1_macro' not in history.history:
    print("La clé 'val_f1_macro' n'est pas présente dans history.history. "
          "Vérifiez si elle est bien enregistrée par le callback.")
else:
    plt.figure()
    plt.plot(history.history['val_f1_macro'], label='Val F1 Macro')
    plt.title('Évolution du F1 Score Macro (Validation)')
    plt.xlabel('Époque')
    plt.ylabel('F1 Macro')
    plt.legend()
    plt.show()


## Soumission

In [None]:
##############################################################################
# 7) Prédiction (exemple)
##############################################################################

# Sur votre jeu de test Kaggle (supposons un DataFrame test_df avec filename),
# vous feriez un flow_from_dataframe similaire (class_mode=None, pas de labels),
# puis un model.predict(...) et np.argmax(...) pour obtenir un label unique 0..8.

# test_preds = model.predict(test_generator, verbose=1)
# predicted_labels = np.argmax(test_preds, axis=1)
# etc.

##############################################################################
# FIN
##############################################################################