In [None]:
import os
import random
import numpy as np 
import pandas as pd 
from PIL import Image

# Maps each detailed genre folder name to the simplified genre folder it feeds.
_SIMPLIFIED_GENRE_BY_DETAILED = {
    # 1. Classical & Pre-Modern Eras (pre-19th century art)
    "Early_Renaissance": "01_Classical_Pre_Modern",
    "High_Renaissance": "01_Classical_Pre_Modern",
    "Northern_Renaissance": "01_Classical_Pre_Modern",
    "Mannerism_Late_Renaissance": "01_Classical_Pre_Modern",
    "Baroque": "01_Classical_Pre_Modern",
    "Rococo": "01_Classical_Pre_Modern",

    # 2. 19th Century Art (Romanticism, Realism, Impressionism, etc.)
    "Romanticism": "02_19th_Century_Art",
    "Realism": "02_19th_Century_Art",
    "Impressionism": "02_19th_Century_Art",
    "Post_Impressionism": "02_19th_Century_Art",
    "Pointillism": "02_19th_Century_Art",
    "Symbolism": "02_19th_Century_Art",

    # 3. Early 20th Century Modernism (Figurative/Expressive)
    "Fauvism": "03_Early_20th_C_Modernism",
    "Expressionism": "03_Early_20th_C_Modernism",
    "Art_Nouveau_Modern": "03_Early_20th_C_Modernism",

    # 4. Cubism & Related Geometric Abstraction
    "Cubism": "04_Cubism_Geometric_Abstraction",
    "Analytical_Cubism": "04_Cubism_Geometric_Abstraction",
    "Synthetic_Cubism": "04_Cubism_Geometric_Abstraction",

    # 5. Mid-20th Century Abstraction (Abstract Expressionism, Color Field)
    "Action_painting": "05_Mid_20th_C_Abstraction",
    "Color_Field_Painting": "05_Mid_20th_C_Abstraction",
    "Abstract_Expressionism": "05_Mid_20th_C_Abstraction",

    # 6. Pop Art
    "Pop_Art": "06_Pop_Art",

    # 7. Minimalism
    "Minimalism": "07_Minimalism",

    # 8. Contemporary Realism (Art after Mid-20th C, distinct from Pop Art)
    "New_Realism": "08_Contemporary_Realism",
    "Contemporary_Realism": "08_Contemporary_Realism",

    # 9. Naive and Primitivism
    "Naive_Art_Primitivism": "09_Naive_Primitivism",

    # 10. Asian / Other Cultural Art
    "Ukiyo_e": "10_Asian_Cultural_Art",
}

# File name suffixes (compared case-insensitively) that count as images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')


def _list_images(folder):
    """Return the full paths of the image files directly inside *folder*, sorted by name."""
    # Sorting makes a seeded run independent of the order os.listdir happens to return.
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.lower().endswith(_IMAGE_EXTENSIONS)
    ]


def _sample_evenly(images_by_genre, target, rng):
    """Pick up to *target* paths, sharing the quota evenly between the genre folders.

    Each round offers every folder that still has images an equal share of the
    remaining quota; whatever a small folder cannot supply is redistributed to
    the others in the next round. The result has ``min(target, total)`` paths.
    """
    # Shuffle each folder once and consume from the front: this is a uniform
    # sample without replacement and avoids any membership test against the
    # growing selection.
    pools = {}
    for genre_name, paths in images_by_genre.items():
        if paths:
            pool = list(paths)
            rng.shuffle(pool)
            pools[genre_name] = pool

    selected = []
    remaining = target
    active = list(pools)
    while remaining > 0 and active:
        share = max(1, remaining // len(active))
        still_active = []
        for genre_name in active:
            if remaining <= 0:
                break
            pool = pools[genre_name]
            take = min(share, len(pool), remaining)
            selected.extend(pool[:take])
            del pool[:take]
            remaining -= take
            if pool:
                still_active.append(genre_name)
        active = still_active
    return selected


def _resize_and_save(img_path, output_path, target_image_size):
    """Load one image, convert it to RGB, resize it and write it to *output_path*."""
    # The context manager releases the source file handle even if conversion fails.
    with Image.open(img_path) as source:
        resized = source.convert("RGB").resize(target_image_size, Image.Resampling.LANCZOS)
    resized.save(output_path)


def process_images_by_simplified_genre(
    main_image_dir,
    output_base_dir,
    target_images_per_simplified_genre=2000,
    target_image_size=(224, 224),
    *,
    seed=None
):
    """
    Groups detailed genre folders into simplified genres, randomly samples images
    for each simplified genre, resizes them, and saves them to new directories.

    The per-genre quota is shared evenly between the detailed folders that feed a
    simplified genre; when a folder has fewer images than its share, the shortfall
    is taken from the remaining folders.

    Args:
        main_image_dir (str): The path to the directory containing all the detailed
                              genre folders (e.g., 'Abstract_Expressionism', 'Baroque', etc.).
        output_base_dir (str): The path where the new simplified genre folders
                               and processed images will be saved.
        target_images_per_simplified_genre (int): The maximum number of images to
                                                  select for each simplified genre.
        target_image_size (tuple): A tuple (width, height) for the desired
                                   output image size.
        seed (int | None): Optional seed for a private random generator, making the
                           selection reproducible. When None, the module-level
                           ``random`` state is used.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    rng = random if seed is None else random.Random(seed)

    os.makedirs(output_base_dir, exist_ok=True)
    print(f"Output directory '{output_base_dir}' ensured.")

    # Reverse lookup: simplified genre -> list of detailed genres feeding it.
    simplified_to_detailed_map = {}
    for detailed, simplified in _SIMPLIFIED_GENRE_BY_DETAILED.items():
        simplified_to_detailed_map.setdefault(simplified, []).append(detailed)

    for simplified_genre, detailed_genres in simplified_to_detailed_map.items():
        print(f"\nProcessing Simplified Genre: {simplified_genre}")
        output_simplified_dir = os.path.join(output_base_dir, simplified_genre)
        os.makedirs(output_simplified_dir, exist_ok=True)

        # Collect the candidate image paths of every contributing folder.
        images_by_genre = {}
        for detailed_genre_name in detailed_genres:
            detailed_genre_path = os.path.join(main_image_dir, detailed_genre_name)
            if not os.path.isdir(detailed_genre_path):
                print(f"  Warning: Detailed genre folder '{detailed_genre_path}' not found. Skipping.")
                continue
            images_by_genre[detailed_genre_name] = _list_images(detailed_genre_path)

        num_total_available = sum(len(paths) for paths in images_by_genre.values())
        if num_total_available == 0:
            print(f"  No images found for simplified genre '{simplified_genre}'. Skipping.")
            continue

        effective_target_for_category = min(target_images_per_simplified_genre, num_total_available)
        print(f"  Total images available: {num_total_available}. Target for category: {effective_target_for_category}")

        selected_images_paths = _sample_evenly(images_by_genre, effective_target_for_category, rng)
        print(f"  Selected {len(selected_images_paths)} images for {simplified_genre}.")

        # Process and save the selected images; one bad file must not abort the run.
        for i, img_path in enumerate(selected_images_paths):
            filename_base, file_extension = os.path.splitext(os.path.basename(img_path))
            # The index suffix keeps names unique when two folders hold the same file name.
            output_path = os.path.join(output_simplified_dir, f"{filename_base}_{i}{file_extension}")
            try:
                _resize_and_save(img_path, output_path, target_image_size)
            except FileNotFoundError:
                print(f"    Error: Image file not found: {img_path}")
            except Image.UnidentifiedImageError:
                print(f"    Error: Cannot identify image file (corrupted or unsupported format): {img_path}")
            except Exception as e:
                print(f"    Error processing {img_path}: {e}")

    print("\nImage processing complete.")


# Root folder that holds one sub-folder per detailed genre.
main_genres_directory = '/kaggle/input/wikiart'

# Root folder under which the simplified genre folders are created.
output_standardized_images_dir = '/kaggle/working/'

# Kick off the processing run: at most 2000 images per simplified genre,
# each standardized to 224x224 pixels.
process_images_by_simplified_genre(
    main_image_dir=main_genres_directory,
    output_base_dir=output_standardized_images_dir,
    target_images_per_simplified_genre=2000,
    target_image_size=(224, 224),
)

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import (
    Dense,
    GlobalAveragePooling2D,
    Input,
    RandomFlip,
    RandomRotation,
    RandomZoom,
    RandomContrast
)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import os
import matplotlib.pyplot as plt
import math


# --- Set Global Policy for Mixed Precision Training ---
# Best effort: training still works in float32 if the policy cannot be set.
try:
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    print("Mixed precision training enabled.")
except Exception as e:
    print(f"Could not enable mixed precision: {e}")

# --- Define Paths and Constants ---
PROCESSED_IMAGES_DIR = '/kaggle/input/imageclassification-v1'
MODEL_SAVE_PATH = '/kaggle/working/efficient_art_classifier_final.keras'

# Image dimensions and training parameters
IMG_HEIGHT, IMG_WIDTH = 224, 224
EPOCHS = 150
BATCH_SIZE = 128
AUTOTUNE = tf.data.AUTOTUNE
SPLIT_SEED = 123
# Suffixes (lower-case) of the files that are treated as images.
IMAGE_EXTENSIONS = ('.bmp', '.gif', '.jpeg', '.jpg', '.png')

# --- Create 3-Way Data Split ---
# Imported here because only this split needs them.
import pathlib
from sklearn.model_selection import train_test_split

print("\nSetting up 3-way data split (70% train, 15% val, 15% test)...")

# The split is done on file paths, not on an already-shuffled tf.data pipeline.
# take()/skip() on a dataset that reshuffles on every iteration hands different
# images to train/val/test in each epoch, so held-out images leak into training.
# The seed and the stratified 70/15/15 recipe are the same as in the evaluation
# script of this notebook.
data_dir = pathlib.Path(PROCESSED_IMAGES_DIR)
class_names = sorted(item.name for item in data_dir.glob('*') if item.is_dir())
num_classes = len(class_names)
print(f"Discovered {num_classes} classes: {class_names}")

label_to_index = {name: index for index, name in enumerate(class_names)}
all_paths = [
    str(p) for p in data_dir.glob('*/*')
    if p.suffix.lower() in IMAGE_EXTENSIONS
]
all_labels = [label_to_index[pathlib.Path(p).parent.name] for p in all_paths]
if not all_paths:
    raise ValueError(f"No images found under '{PROCESSED_IMAGES_DIR}'.")

# First split: 70% train, 30% held out.
train_paths, holdout_paths, train_labels, holdout_labels = train_test_split(
    all_paths, all_labels, test_size=0.30, random_state=SPLIT_SEED, stratify=all_labels
)
# Second split: the held-out 30% is halved into validation and test.
val_paths, test_paths, val_labels, test_labels = train_test_split(
    holdout_paths, holdout_labels, test_size=0.50, random_state=SPLIT_SEED, stratify=holdout_labels
)


def _load_image(path, label):
    """Read one image file and return it as a float (IMG_HEIGHT, IMG_WIDTH, 3) tensor with its label."""
    raw = tf.io.read_file(path)
    # expand_animations=False keeps GIFs as a single 3-D frame.
    image = tf.io.decode_image(raw, channels=3, expand_animations=False)
    image = tf.image.resize(image, [IMG_HEIGHT, IMG_WIDTH])
    image.set_shape((IMG_HEIGHT, IMG_WIDTH, 3))
    return image, label


def _make_dataset(paths, labels, shuffle=False):
    """Build a batched, prefetched dataset from parallel lists of paths and labels."""
    dataset = tf.data.Dataset.from_tensor_slices((paths, labels))
    if shuffle:
        # Shuffling path strings is cheap, so the buffer can cover the whole split.
        dataset = dataset.shuffle(len(paths), seed=SPLIT_SEED, reshuffle_each_iteration=True)
    dataset = dataset.map(_load_image, num_parallel_calls=AUTOTUNE)
    return dataset.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)


# Only the training split is reshuffled per epoch; val/test stay in a fixed order.
train_ds = _make_dataset(train_paths, train_labels, shuffle=True)
val_ds = _make_dataset(val_paths, val_labels)
test_ds = _make_dataset(test_paths, test_labels)

print(f"Training set size: {len(train_paths)} images")
print(f"Validation set size: {len(val_paths)} images")
print(f"Test set size: {len(test_paths)} images")

# --- Set up Multi-GPU Strategy ---
strategy = tf.distribute.MirroredStrategy()
print(f"\nTraining with {strategy.num_replicas_in_sync} GPUs.")

# --- Build and Compile Model within Strategy Scope ---
# Variables must be created inside the scope so the strategy can mirror them.
with strategy.scope():
    print("Building EfficientNetB0 model for fine-tuning")

    # Random augmentations applied to the input batch inside the model.
    data_augmentation = Sequential([
        RandomFlip("horizontal"),
        RandomRotation(0.2),
        RandomZoom(0.2),
        RandomContrast(0.2),
    ], name='data_augmentation')

    inputs = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))
    x = data_augmentation(inputs)

    base_model = EfficientNetB0(
        input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
        include_top=False,
        weights='imagenet'
    )

    # Freeze everything except the last `fine_tune_at` layers of the backbone.
    base_model.trainable = True
    fine_tune_at = 30 # Number of unfrozen layers 
    print(f"Unfreezing the top {fine_tune_at} layers of the base model.")
    for layer in base_model.layers[:-fine_tune_at]:
        layer.trainable = False

    # training=False keeps the backbone in inference mode. A hard-coded
    # training=True would stay in force during evaluate()/predict() as well,
    # so the unfrozen BatchNormalization layers would use per-batch statistics
    # and predictions would depend on batch composition.
    x = base_model(x, training=False)
    x = GlobalAveragePooling2D()(x)
    x = Dense(128, activation='relu')(x)
    # The output layer is kept in float32 even under the mixed precision policy.
    predictions = Dense(num_classes, activation='softmax', dtype='float32')(x)
    model = Model(inputs, predictions)

    # Compile the model with gradient clipping for stability
    optimizer = Adam(learning_rate=1e-5, clipnorm=1.0)
    print(f"Compiling model with Adam optimizer (lr={optimizer.learning_rate.numpy()}, clipnorm=1.0).")
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

print("\n--- Model Summary ---")
model.summary()

# --- Configure Callbacks ---
# Stop once validation loss has not improved by at least 0.001 for 10 epochs,
# then roll back to the best weights seen.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# --- Train the Model ---
print(f"\nStarting model fine-tuning for up to {EPOCHS} epochs...")
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[early_stopping_callback],
)
print("\nModel training completed.")

# --- Final Evaluation on the Independent Test Set ---
print("\n Evaluating final model on the independent test set...")
test_loss, test_accuracy = model.evaluate(test_ds)
print(f"Test Set Loss: {test_loss:.4f}")
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# --- Save Model ---
model.save(MODEL_SAVE_PATH)
print(f"\nModel saved to: {MODEL_SAVE_PATH}")

# --- Plot Training History ---
print("\nPlotting training history...")
plt.figure(figsize=(12, 6))

# One panel per metric: the training curve next to its validation counterpart.
for panel, (metric_key, metric_label) in enumerate(
    (('accuracy', 'Accuracy'), ('loss', 'Loss')), start=1
):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric_key], label=f'Training {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'Training and Validation {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import pathlib

# --- Configuration ---
# Adjust these paths to match your project setup
MODEL_PATH = '/kaggle/working/efficient_art_classifier_final.keras'
PROCESSED_IMAGES_DIR = '/kaggle/input/imageclassification-v1'

# Image and batch settings (must match the training script)
IMG_HEIGHT, IMG_WIDTH = 224, 224
BATCH_SIZE = 128
AUTOTUNE = tf.data.AUTOTUNE
# Suffixes (lower-case) of the files that are treated as images.
IMAGE_EXTENSIONS = ('.bmp', '.gif', '.jpeg', '.jpg', '.png')

print("--- Loading Model and Data ---")

# Load the trained model
model = tf.keras.models.load_model(MODEL_PATH)
print(f"Model loaded from {MODEL_PATH}")

# Create a Stratified 3-Way Data Split
print("\n--- Creating Stratified Data Splits (70/15/15) ---")
data_dir = pathlib.Path(PROCESSED_IMAGES_DIR)

# Collect the image files one level below the data directory; the parent
# folder name is the class. Entries with other suffixes are skipped so a stray
# file cannot break decoding later on.
all_paths = [
    str(p) for p in data_dir.glob('*/*')
    if p.suffix.lower() in IMAGE_EXTENSIONS
]
class_names = sorted(item.name for item in data_dir.glob('*') if item.is_dir())
label_to_index = {name: index for index, name in enumerate(class_names)}
all_labels = [label_to_index[pathlib.Path(p).parent.name] for p in all_paths]
if not all_paths:
    raise ValueError(f"No images found under '{PROCESSED_IMAGES_DIR}'.")

print(f"Found {len(all_paths)} images belonging to {len(class_names)} classes.")

# --- Seeded, stratified split ---
# NOTE(review): the resulting test set is only truly unseen if the training
# script held out the same files (same listing, seed and 70/15/15 stratified
# recipe). Confirm against the training code before trusting these numbers.
# First split 70% train, 30% temp
train_paths, temp_paths, train_labels, temp_labels = train_test_split(
    all_paths, all_labels, test_size=0.30, random_state=123, stratify=all_labels
)

# Second split 15% validation, 15% test (split the 30% temp set in half)
val_paths, test_paths, val_labels, test_labels = train_test_split(
    temp_paths, temp_labels, test_size=0.50, random_state=123, stratify=temp_labels
)

print(f"Training set size: {len(train_paths)}")
print(f"Validation set size: {len(val_paths)}")
print(f"Test set size: {len(test_paths)}")

# Function to load and process images
def load_and_preprocess_image(path, label):
    """Read the image file at *path* and resize it to the model's input size.

    Args:
        path: Scalar string tensor holding the image file path.
        label: Label paired with the image; passed through unchanged.

    Returns:
        A tuple ``(image, label)`` where ``image`` is a float tensor of shape
        (IMG_HEIGHT, IMG_WIDTH, 3).
    """
    image = tf.io.read_file(path)
    # decode_image detects the file format itself, so the loader is not tied to
    # JPEG; expand_animations=False keeps GIFs as a single 3-D frame.
    image = tf.io.decode_image(image, channels=3, expand_animations=False)
    image = tf.image.resize(image, [IMG_HEIGHT, IMG_WIDTH])
    # Pin the static shape so batching downstream sees fully defined dimensions.
    image.set_shape((IMG_HEIGHT, IMG_WIDTH, 3))
    return image, label

# Build the evaluation pipeline straight from the held-out file paths:
# load each image, batch, and prefetch. No shuffling, so the prediction order
# follows the order of test_paths / test_labels.
test_ds = (
    tf.data.Dataset.from_tensor_slices((test_paths, test_labels))
    .map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=AUTOTUNE)
)

print("Test dataset created successfully with stratification.")


#  Get Predictions and True Labels
print("\n--- Generating Predictions on Test Set ---")
predictions_raw = model.predict(test_ds)
# The predicted class is the index of the highest score in each row.
predicted_labels = np.argmax(predictions_raw, axis=1)

# Ground truth comes directly from the split, in the same order as test_paths.
true_labels = np.array(test_labels)

# Sanity checks: the test set must be non-empty and line up with the predictions.
num_true = len(true_labels)
num_predicted = len(predicted_labels)
if num_true == 0:
    raise ValueError("The test dataset is empty. Check the data splitting logic.")
if num_true != num_predicted:
    print(f"Warning: Mismatch in number of true labels ({num_true}) and predicted labels ({num_predicted}).")
    true_labels = true_labels[:num_predicted]

print("Predictions and true labels extracted.")

# --- Quantitative Evaluation ---

#  Top-k Accuracy
print("\n--- Top-k Accuracy ---")
# Top-1 Accuracy is the standard accuracy
top1_accuracy = np.mean(predicted_labels == true_labels)
print(f"Top-1 Accuracy (Standard Accuracy): {top1_accuracy:.4f}")

# Top-3 Accuracy
# argsort is ascending, so the last three columns hold the three best classes.
top3_predictions_indices = np.argsort(predictions_raw, axis=1)[:, -3:]

# A sample counts as correct when its true label appears among its top 3.
top3_correct = (top3_predictions_indices == true_labels[:, None]).any(axis=1)
top3_accuracy = np.mean(top3_correct)
print(f"Top-3 Accuracy: {top3_accuracy:.4f}")


# Passing the full label set keeps the report rows and matrix axes aligned with
# class_names even if some class is absent from both labels and predictions.
class_indices = np.arange(len(class_names))

#  Classification Report (Precision, Recall, F1-Score)
print("\n--- Classification Report ---")
report = classification_report(
    true_labels, predicted_labels, labels=class_indices, target_names=class_names
)
print(report)

#  Confusion Matrix
print("\n--- Confusion Matrix ---")
cm = confusion_matrix(true_labels, predicted_labels, labels=class_indices)

plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

# --- Qualitative Evaluation ---

#  Visualize Misclassified Images
print("\n--- Analyzing Misclassified Images ---")
misclassified_indices = np.where(predicted_labels != true_labels)[0]
num_images_to_show = 12
# Show a random subset when there are more mistakes than fit in the gallery.
if len(misclassified_indices) > num_images_to_show:
    selected_indices = np.random.choice(misclassified_indices, size=num_images_to_show, replace=False)
else:
    selected_indices = misclassified_indices

if len(selected_indices) == 0:
    # Nothing to draw; avoid opening an empty figure.
    print("No misclassified images to display.")
else:
    grid_cols = 4
    # Ceiling division: only as many rows as the selected images need.
    grid_rows = -(-len(selected_indices) // grid_cols)

    plt.figure(figsize=(15, 3.75 * grid_rows))
    for i, index in enumerate(selected_indices):
        path = test_paths[index]
        image = tf.io.read_file(path)
        # decode_image detects the file format itself; expand_animations=False
        # keeps GIFs as a single 3-D frame that imshow can draw.
        image = tf.io.decode_image(image, channels=3, expand_animations=False)

        true_label_name = class_names[true_labels[index]]
        pred_label_name = class_names[predicted_labels[index]]

        plt.subplot(grid_rows, grid_cols, i + 1)
        plt.imshow(image.numpy())
        plt.title(f"True: {true_label_name}\nPred: {pred_label_name}")
        plt.axis('off')

    plt.tight_layout()
    plt.show()
