In [None]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import argparse
import os
from PIL import Image, UnidentifiedImageError
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Function to load and verify image

def load_image(file_path):
    """
    Load an image from disk as an RGB image after an integrity check.

    The file is opened twice on purpose: ``Image.verify()`` leaves the image
    object unusable for further reading, so the pixel data is decoded from a
    fresh handle. Both handles are closed before returning; the returned
    image is the in-memory result of ``convert("RGB")`` and does not depend
    on the file staying open.

    Args:
        file_path (str): Path to the image file.

    Returns:
        Image object in RGB mode, or None if the image is invalid (cannot be
        opened, fails verification, or cannot be decoded). A warning is
        printed in that case.
    """
    try:
        # First pass: structural check only; the handle is closed right away.
        with Image.open(file_path) as probe:
            probe.verify()
        # Second pass: actually decode the pixels and normalise the mode.
        with Image.open(file_path) as img:
            return img.convert("RGB")
    except (
        UnidentifiedImageError,
        OSError,
        AttributeError,
        # verify()/decoding may also signal a broken file with these.
        SyntaxError,
        ValueError,
    ) as e:
        print(f"[WARNING] Skipped corrupted file: {file_path} ({e})")
        return None

# Function for EDA visualization

def visualize_dataset(data_dir, english_classes):
    """
    Visualize the dataset by displaying class distribution and sample images.

    Class folders are the sub-directories of ``data_dir`` in sorted order;
    ``english_classes`` is paired with them positionally, so it must list the
    English names in that same sorted-folder order.

    Prints the per-class image counts, shows a bar plot of them, then shows
    one sample image per class (the first file, in sorted name order, that
    ``load_image`` accepts).

    Args:
        data_dir (str): Path to the dataset directory.
        english_classes (list): English class names corresponding to dataset
            folders, one per folder, in sorted folder order.

    Raises:
        ValueError: If the number of English names differs from the number
            of class folders found in ``data_dir``.
    """
    image_exts = (".jpg", ".jpeg", ".png")
    class_dirs = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )
    english_classes = list(english_classes)

    # Fail early with a readable message instead of pandas' generic
    # "arrays must be of the same length" error.
    if len(english_classes) != len(class_dirs):
        raise ValueError(
            f"Expected {len(class_dirs)} English class names (one per folder in "
            f"{data_dir}), got {len(english_classes)}."
        )
    if not class_dirs:
        print(f"No class folders found in {data_dir}; nothing to visualize.")
        return

    # List every class folder once; the result feeds both the counts and the
    # sample-image search. Sorting makes the chosen sample deterministic.
    files_by_class = {}
    for cls in class_dirs:
        cls_path = os.path.join(data_dir, cls)
        files_by_class[cls] = sorted(
            f for f in os.listdir(cls_path) if f.lower().endswith(image_exts)
        )

    df_counts = pd.DataFrame(
        {
            "Italian_Class": class_dirs,
            "Count": [len(files_by_class[cls]) for cls in class_dirs],
            "English_Class": english_classes,
        }
    )
    print("Class distribution (Italian -> English):")
    print(df_counts)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=df_counts, x="English_Class", y="Count")
    plt.title("Image Count per Animal Class (Animals-10 Dataset)")
    plt.ylabel("Number of Images")
    plt.xticks(rotation=45)
    plt.show()

    print("\nDisplaying one sample image per class:")
    # Keep the original 2x5 grid for up to 10 classes and add rows beyond
    # that, so the subplot index is always valid.
    n_cols = 5
    n_rows = max(2, (len(class_dirs) + n_cols - 1) // n_cols)
    plt.figure(figsize=(15, 10))
    for idx, (italian_class, english_class) in enumerate(zip(class_dirs, english_classes), 1):
        class_path = os.path.join(data_dir, italian_class)
        # Walk the files until one loads; corrupted files are skipped.
        for file in files_by_class[italian_class]:
            img = load_image(os.path.join(class_path, file))
            if img is not None:
                plt.subplot(n_rows, n_cols, idx)
                plt.imshow(img)
                plt.axis('off')
                plt.title(f"{english_class}")
                break
    plt.tight_layout()
    plt.show()

# Main EDA execution

def main():
    """
    Main function to perform EDA on the dataset.

    Checks that the dataset directory exists, builds the English label list
    in the same sorted-folder order that ``visualize_dataset`` uses when it
    pairs labels with folders, and runs the visualization. Prints a hint and
    returns if the directory is missing.
    """
    data_dir = "/kaggle/input/animals10/raw-img/"
    # Italian folder name -> English label.
    # NOTE(review): folder names assumed from the Kaggle Animals-10 dataset;
    # confirm against the actual directory listing.
    italian_to_english = {
        "cane": "dog",
        "cavallo": "horse",
        "elefante": "elephant",
        "farfalla": "butterfly",
        "gallina": "chicken",
        "gatto": "cat",
        "mucca": "cow",
        "pecora": "sheep",
        "ragno": "spider",
        "scoiattolo": "squirrel",
    }

    if not os.path.exists(data_dir):
        print(f"Dataset not found at {data_dir}. Check path or add dataset via Kaggle 'Add Data'.")
        return

    print(f"Dataset verified at: {data_dir}")
    # Labels must follow the sorted folder order, because visualize_dataset
    # zips them positionally with sorted(os.listdir(...)). A fixed list in any
    # other order attaches the wrong English name to most classes.
    class_dirs = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )
    # Unknown folders fall back to their own name rather than failing.
    english_classes = [italian_to_english.get(d, d) for d in class_dirs]
    visualize_dataset(data_dir, english_classes)

# Entry point: run the EDA only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
