In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
import pickle
from tqdm import tqdm

import pickle
import pandas as pd
import numpy as np
import os

2025-11-18 11:51:25.577262: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-18 11:51:25.618222: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-18 11:51:26.503898: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# ============================================================================
# CONFIGURATION - CORRECTED
# ============================================================================

# Input locations, relative to the notebook's working directory.
DATA_DIR = "data/"
CAPTIONS_FILE = "data/captions.txt"
IMAGES_DIR = "data/Images"  # note the capital "I" in the folder name

# Image size constant (the preprocessing functions below default to the same value).
IMG_SIZE = (224, 224)

# Share of the unique images assigned to each split.
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.8, 0.1, 0.1

# Destination of the generated splits; created up front so later cells can write to it.
OUTPUT_DIR = "split_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

PermissionError: [Errno 13] Permission denied: '/content'

In [None]:
# ============================================================================
# 1. LOADING THE CAPTIONS
# ============================================================================

def load_captions(captions_file):
    """
    Read the captions file into a DataFrame.

    The file is parsed as CSV and is expected to have the header
    ``image,caption`` (one row per caption).

    Args:
        captions_file: path of the captions CSV file.

    Returns:
        The parsed DataFrame, with surrounding whitespace removed from the
        column names.
    """
    print("üìñ Chargement des captions...")

    # Basic cleanup: header names may carry stray whitespace around them.
    df = pd.read_csv(captions_file)
    df = df.rename(columns=str.strip)

    print(f"‚úÖ {len(df)} captions charg√©es pour {df['image'].nunique()} images")
    print(f"Exemple:\n{df.head()}\n")

    return df

In [None]:

# ============================================================================
# 2. TRAIN/VAL/TEST SPLIT
# ============================================================================

def split_dataset(df, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, random_state=42):
    """
    Split the dataset at image level (not at caption level).

    An image can have several captions, so all captions of a given image are
    kept together in the same split.

    Args:
        df: DataFrame with an 'image' column (one row per caption).
        train_ratio: relative size of the training split.
        val_ratio: relative size of the validation split.
        test_ratio: relative size of the test split.
            If the three ratios do not sum to 1 they are treated as weights
            and rescaled, so that train_ratio is honoured too.
        random_state: seed forwarded to both train_test_split calls.

    Returns:
        Tuple (train_df, val_df, test_df); each frame has a fresh 0-based index.

    Raises:
        ValueError: if any of the three ratios is not strictly positive.
    """
    ratios = {'train_ratio': train_ratio, 'val_ratio': val_ratio, 'test_ratio': test_ratio}
    if any(value <= 0 for value in ratios.values()):
        raise ValueError(f"Split ratios must all be > 0, got {ratios}")

    # Share of the images that leaves the training set (val + test together).
    holdout_ratio = val_ratio + test_ratio
    total_ratio = train_ratio + holdout_ratio
    if abs(total_ratio - 1.0) > 1e-9:
        # train_ratio used to be ignored silently; rescale so the requested
        # proportions between the three splits are actually respected.
        print(f"Split ratios sum to {total_ratio:g} instead of 1; rescaling them.")
        holdout_share = holdout_ratio / total_ratio
    else:
        # Ratios already sum to 1: keep the exact value for identical splits.
        holdout_share = holdout_ratio

    print("‚úÇÔ∏è  Split du dataset...")

    # Unique image names, in order of first appearance
    unique_images = df['image'].unique()
    print(f"Nombre d'images uniques: {len(unique_images)}")

    # First cut: train vs. everything else
    train_imgs, temp_imgs = train_test_split(
        unique_images,
        test_size=holdout_share,
        random_state=random_state
    )

    # Second cut: divide the remainder between val and test
    val_imgs, test_imgs = train_test_split(
        temp_imgs,
        test_size=test_ratio/(val_ratio + test_ratio),
        random_state=random_state
    )

    # Build the per-split caption tables
    train_df = df[df['image'].isin(train_imgs)].reset_index(drop=True)
    val_df = df[df['image'].isin(val_imgs)].reset_index(drop=True)
    test_df = df[df['image'].isin(test_imgs)].reset_index(drop=True)

    print(f"‚úÖ Train: {len(train_imgs)} images, {len(train_df)} captions")
    print(f"‚úÖ Val:   {len(val_imgs)} images, {len(val_df)} captions")
    print(f"‚úÖ Test:  {len(test_imgs)} images, {len(test_df)} captions\n")

    return train_df, val_df, test_df

In [None]:
# ============================================================================
# 3. IMAGE PREPROCESSING
# ============================================================================

def preprocess_image(img_path, target_size=(224, 224)):
    """
    Load and preprocess ONE image for VGG16.

    Steps: open the file and convert it to RGB, resize it to ``target_size``
    with Lanczos resampling, convert it to an array with ``img_to_array`` and
    apply the VGG16 ``preprocess_input`` normalisation.

    Args:
        img_path: path of the image file.
        target_size: size passed to ``Image.resize``.

    Returns:
        The preprocessed array, or None if any step raised (the error is
        printed so that one bad file does not abort a whole batch).
    """
    try:
        # Open inside a context manager so the file handle is released even
        # when decoding fails; convert() returns an image that no longer
        # depends on the open file.
        with Image.open(img_path) as source:
            img = source.convert('RGB')

        # Resize
        img = img.resize(target_size, Image.LANCZOS)

        # Convert to array
        img_array = img_to_array(img)

        # VGG16 normalisation
        img_array = preprocess_input(img_array)

        return img_array

    except Exception as e:
        # Deliberate best effort: report the failure and let the caller skip it.
        print(f"‚ùå Erreur avec {img_path}: {e}")
        return None


In [None]:
def preprocess_images_batch(df, images_dir, target_size=(224, 224)):
    """
    Preprocess every distinct image referenced by a DataFrame.

    Args:
        df: DataFrame with an 'image' column holding file names.
        images_dir: directory the file names are resolved against.
        target_size: forwarded to preprocess_image.

    Returns:
        Dictionary {image_name: preprocessed_array}. Files that are missing
        on disk, or for which preprocess_image returned None, are left out.
    """
    print(f"üñºÔ∏è  Pr√©traitement de {df['image'].nunique()} images...")

    processed_images = {}

    for img_name in tqdm(df['image'].unique(), desc="Processing images"):
        img_path = os.path.join(images_dir, img_name)

        if os.path.exists(img_path):
            img_array = preprocess_image(img_path, target_size)
            # None signals a failure that preprocess_image already reported.
            if img_array is not None:
                processed_images[img_name] = img_array
        else:
            print(f"‚ö†Ô∏è  Image non trouv√©e: {img_name}")

    print(f"‚úÖ {len(processed_images)} images pr√©trait√©es\n")

    return processed_images

In [None]:
# ============================================================================
# 4. SAVING
# ============================================================================

def save_split(split_name, df, images_dict, output_dir):
    """
    Persist one split (train/val/test) under ``output_dir/split_name``.

    Two files are written into that directory (created if needed):
    - ``captions.csv``: the captions DataFrame, without its index
    - ``images_preprocessed.pkl``: the pickled dictionary of images

    Args:
        split_name: name of the split, used as sub-directory name.
        df: captions DataFrame of the split.
        images_dict: dictionary of preprocessed images of the split.
        output_dir: parent directory of all splits.
    """
    print(f"üíæ Sauvegarde du split {split_name}...")

    split_dir = os.path.join(output_dir, split_name)
    os.makedirs(split_dir, exist_ok=True)

    # Captions go to a plain CSV file.
    df.to_csv(os.path.join(split_dir, "captions.csv"), index=False)

    # Images are pickled in one go.
    with open(os.path.join(split_dir, "images_preprocessed.pkl"), 'wb') as f:
        pickle.dump(images_dict, f)

    print(f"‚úÖ Sauvegard√© dans {split_dir}")
    print(f"   - Captions: {len(df)} lignes")
    print(f"   - Images: {len(images_dict)} fichiers\n")

In [None]:
def save_all_splits(train_df, val_df, test_df,
                    train_imgs, val_imgs, test_imgs,
                    output_dir):
    """
    Save the train, val and test splits, in that order.

    Each split is written by save_split into its own sub-directory of
    ``output_dir`` ("train", "val" and "test").
    """
    splits = (
        ("train", train_df, train_imgs),
        ("val", val_df, val_imgs),
        ("test", test_df, test_imgs),
    )
    for split_name, captions, images in splits:
        save_split(split_name, captions, images, output_dir)

    print("üéâ Tous les splits sauvegard√©s avec succ√®s!")


In [None]:

# ============================================================================
# 5. MAIN PIPELINE
# ============================================================================

def preprocess_flickr8k(data_dir, images_dir, captions_file, output_dir,
                        train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Run the complete preprocessing pipeline.

    Loads the captions, splits them at image level, preprocesses the images
    of each split (with the default target size of preprocess_images_batch)
    and saves everything under ``output_dir``.

    Args:
        data_dir: accepted for interface compatibility; not used by the body.
        images_dir: directory containing the image files.
        captions_file: path of the captions CSV file.
        output_dir: directory the splits are saved into.
        train_ratio, val_ratio, test_ratio: forwarded to split_dataset.

    Returns:
        Dictionary with keys 'train', 'val' and 'test'; each value is the
        tuple (captions DataFrame, preprocessed images dictionary).
    """
    banner = "="*70
    print(banner)
    print("üöÄ PR√âTRAITEMENT FLICKR8K POUR VGG16")
    print(banner + "\n")

    # Step 1: captions table
    df = load_captions(captions_file)

    # Step 2: image-level train/val/test split
    train_df, val_df, test_df = split_dataset(
        df, train_ratio, val_ratio, test_ratio
    )
    frames = {'train': train_df, 'val': val_df, 'test': test_df}

    # Step 3: preprocess the images split by split (train, then val, then test)
    print("üîÑ Pr√©traitement des images pour chaque split...\n")
    arrays = {}
    for name, frame in frames.items():
        arrays[name] = preprocess_images_batch(frame, images_dir)

    # Step 4: write everything to disk
    save_all_splits(frames['train'], frames['val'], frames['test'],
                    arrays['train'], arrays['val'], arrays['test'],
                    output_dir)

    print(banner)
    print("‚ú® PR√âTRAITEMENT TERMIN√â")
    print(banner)

    return {name: (frames[name], arrays[name]) for name in frames}

In [None]:
# ============================================================================
# 6. LOADING FUNCTION (FOR LATER USE)
# ============================================================================

def load_preprocessed_split(split_name, output_dir):
    """
    Load a split previously written under ``output_dir/split_name``.

    Args:
        split_name: name of the split sub-directory.
        output_dir: parent directory of all splits.

    Returns:
        Tuple (df, images_dict): the captions read from ``captions.csv`` and
        the object unpickled from ``images_preprocessed.pkl``.
    """
    split_dir = os.path.join(output_dir, split_name)

    # Captions table
    df = pd.read_csv(os.path.join(split_dir, "captions.csv"))

    # Preprocessed images.
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this pipeline, never files from an untrusted source.
    with open(os.path.join(split_dir, "images_preprocessed.pkl"), 'rb') as f:
        images_dict = pickle.load(f)

    print(f"‚úÖ Charg√© {split_name}: {len(df)} captions, {len(images_dict)} images")

    return df, images_dict


In [None]:
# ============================================================================
# RUNNING THE FLICKR8K PREPROCESSING
# ============================================================================

# Run the whole pipeline with the values from the configuration cell.
# `data` maps 'train'/'val'/'test' to (captions DataFrame, images dictionary).
data = preprocess_flickr8k(
    data_dir=DATA_DIR,
    images_dir=IMAGES_DIR,
    captions_file=CAPTIONS_FILE,
    output_dir=OUTPUT_DIR,
    train_ratio=TRAIN_RATIO,
    val_ratio=VAL_RATIO,
    test_ratio=TEST_RATIO
)

# Final summary: number of captions and of preprocessed images per split.
print("\n" + "="*70)
print("üìä R√âSUM√â FINAL")
print("="*70)
print(f"‚úÖ Train: {len(data['train'][0])} captions, {len(data['train'][1])} images")
print(f"‚úÖ Val:   {len(data['val'][0])} captions, {len(data['val'][1])} images")
print(f"‚úÖ Test:  {len(data['test'][0])} captions, {len(data['test'][1])} images")
print("="*70)

# Check that everything was saved: list each split directory with file sizes.
print("\nüîç V√©rification des fichiers sauvegard√©s:")
for split in ['train', 'val', 'test']:
    split_dir = os.path.join(OUTPUT_DIR, split)
    print(f"\n{split.upper()}:")
    print(f"  üìÅ {split_dir}")
    for file in os.listdir(split_dir):
        file_path = os.path.join(split_dir, file)
        # Size in bytes converted to mebibytes for display.
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"    ‚îú‚îÄ‚îÄ {file} ({size_mb:.2f} MB)")

print("\n‚ú® Pr√©traitement termin√© ! Pr√™t pour l'√©tape suivante.")

In [None]:
# Directory the preprocessed splits are read from in this cell.
# NOTE(review): this is an absolute Colab/Google Drive path, whereas the
# pipeline cells of this notebook write their splits under OUTPUT_DIR
# ("split_data/"). Presumably the splits were copied to Drive separately —
# confirm, or point this at OUTPUT_DIR when running outside Colab.
output_dir = "/content/drive/MyDrive/Fliker8K_processed"

# Load the preprocessed images of one split; split_name is "train", "test" or "val".
def load_images_preprocessed(split_name, output_dir):
    """
    Return the object unpickled from
    ``output_dir/split_name/images_preprocessed.pkl``.

    NOTE: pickle.load can execute arbitrary code; only load files produced
    by this pipeline, never files from an untrusted source.
    """
    pkl_path = os.path.join(output_dir, split_name, "images_preprocessed.pkl")
    with open(pkl_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

# For each split, load its preprocessed images and pass them to save_features.
# NOTE(review): neither save_features nor vgg_model is defined in the visible
# part of this notebook — confirm they are defined or imported before this
# cell runs; otherwise it raises NameError on a fresh kernel.
for split in ['train', 'val', 'test']:
    images_dict = load_images_preprocessed(split, output_dir)
    save_features(split, output_dir, images_dict, vgg_model)
