# Data EDA - Customized Image Generation

**Nguyễn Khang Hy (2352662)**

EDA cho dự án Style Transfer với Stable Diffusion + LoRA.


In [None]:
# Clone the project repository from GitHub, then make the checkout the
# current working directory.
!git clone https://github.com/HyIsNoob/customized-image-generation.git
%cd customized-image-generation

## 1. Setup Environment


In [None]:
# Import libraries
import sys
sys.path.append('/kaggle/working/customized-image-generation')

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from collections import Counter
import cv2

# Plot styling applied to the figures created in this notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Seed NumPy's global RNG. Later cells draw random image and pixel samples
# with np.random.choice; without a seed every Restart & Run All would report
# different statistics and show different sample images.
SEED = 42
np.random.seed(SEED)


## 2. Dataset Paths


In [None]:
# Locations of the datasets under the Kaggle input mount.
COCO_TRAIN_DIR = Path('/kaggle/input/coco-2017-dataset/coco2017/train2017')
COCO_VAL_DIR = Path('/kaggle/input/coco-2017-dataset/coco2017/val2017')
WIKIART_DIR = Path('/kaggle/input/wikiart/wikiart')

# Report, one line per dataset, whether its directory is present.
for label, directory in (
    ("COCO train", COCO_TRAIN_DIR),
    ("COCO val", COCO_VAL_DIR),
    ("WikiArt", WIKIART_DIR),
):
    print(f"{label} directory exists: {directory.exists()}")


## 3. COCO Dataset Analysis


In [None]:
# Get all COCO images.
# The listings are sorted because glob results come back in a
# filesystem-dependent order; a stable order keeps downstream random sampling
# repeatable. Both splits are guarded the same way, so a missing directory
# yields an empty list.
coco_train_images = sorted(COCO_TRAIN_DIR.glob('*.jpg')) if COCO_TRAIN_DIR.exists() else []
coco_val_images = sorted(COCO_VAL_DIR.glob('*.jpg')) if COCO_VAL_DIR.exists() else []

print(f"Total COCO train images: {len(coco_train_images)}")
print(f"Total COCO val images: {len(coco_val_images)}")
print(f"Total COCO images: {len(coco_train_images) + len(coco_val_images)}")

# Only the train split is analysed in the cells below.
coco_images = coco_train_images


In [None]:
# Analyze image sizes
def get_image_size(image_path):
    """Return the ``(width, height)`` of an image, or ``None`` if it cannot be read.

    Args:
        image_path: Location of the image file (string or path-like).

    Returns:
        The ``size`` attribute PIL reports for the opened image, or ``None``
        when opening the file fails (for example missing or corrupt files).
    """
    try:
        # The context manager closes the file handle as soon as the size has
        # been read instead of leaving it to garbage collection.
        with Image.open(image_path) as img:
            return img.size
    except Exception:
        # Best-effort: callers skip unreadable files. Unlike a bare ``except``
        # this still lets KeyboardInterrupt / SystemExit propagate.
        return None

# Draw up to 1000 distinct COCO images at random.
sample_size = min(1000, len(coco_images))
sample_images = np.random.choice(coco_images, sample_size, replace=False)

# Measure each sampled image; unreadable ones come back as None and are dropped.
sizes = [dims for dims in map(get_image_size, sample_images) if dims]

# Split the (width, height) pairs into per-dimension lists and derive the ratio.
widths = [w for w, _ in sizes]
heights = [h for _, h in sizes]
aspect_ratios = [w / h for w, h in sizes]


In [None]:
# Visualize COCO statistics: three histograms plus a width/height scatter plot.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, data, colour, title, x-axis label) for each histogram panel.
histogram_panels = [
    (axes[0, 0], widths, 'blue', 'Width Distribution', 'Width (px)'),
    (axes[0, 1], heights, 'green', 'Height Distribution', 'Height (px)'),
    (axes[1, 0], aspect_ratios, 'red', 'Aspect Ratio Distribution', 'Aspect Ratio'),
]
for panel, values, color, title, xlabel in histogram_panels:
    panel.hist(values, bins=50, alpha=0.7, color=color)
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Frequency')

# Bottom-right panel: every sampled image as a (width, height) point.
scatter_panel = axes[1, 1]
scatter_panel.scatter(widths, heights, alpha=0.3, s=10)
scatter_panel.set_title('Width vs Height')
scatter_panel.set_xlabel('Width (px)')
scatter_panel.set_ylabel('Height (px)')

plt.tight_layout()
plt.savefig('/kaggle/working/coco_stats.png', dpi=150)
plt.show()

# Headline numbers for the sampled images.
print(f"Mean width: {np.mean(widths):.0f}px")
print(f"Mean height: {np.mean(heights):.0f}px")
print(f"Mean aspect ratio: {np.mean(aspect_ratios):.2f}")


### 3.1. File Size Analysis


In [None]:
# Analyze on-disk file sizes (in MB) for the first 100 sampled COCO images.
BYTES_PER_MB = 1024 * 1024
file_sizes = []
for img_path in sample_images[:100]:
    try:
        file_sizes.append(os.path.getsize(img_path) / BYTES_PER_MB)
    except OSError:
        # The file is missing or inaccessible: skip it. Only OSError is
        # caught so that unrelated errors and interrupts are not hidden.
        continue

# Plot and summarise only when at least one size could be read.
if file_sizes:
    plt.figure(figsize=(10, 5))
    plt.hist(file_sizes, bins=50, alpha=0.7, color='purple')
    plt.xlabel('File Size (MB)')
    plt.ylabel('Frequency')
    plt.title('COCO Image File Size Distribution')
    plt.tight_layout()
    plt.savefig('/kaggle/working/coco_file_sizes.png', dpi=150)
    plt.show()

    print(f"Mean file size: {np.mean(file_sizes):.2f} MB")
    print(f"Median file size: {np.median(file_sizes):.2f} MB")


In [None]:
# Show six randomly chosen COCO images in a 2x3 grid, titled with their pixel size.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
sample_imgs = np.random.choice(coco_images, 6, replace=False)

for idx, img_path in enumerate(sample_imgs):
    ax = axes.flat[idx]
    img = Image.open(img_path)
    width, height = img.size
    ax.imshow(img)
    ax.set_title(f"{width}x{height}")
    ax.axis('off')

plt.tight_layout()
plt.savefig('/kaggle/working/coco_samples.png', dpi=150)
plt.show()


## 4. WikiArt Dataset Analysis


In [None]:
# List the immediate sub-directories of the WikiArt root; each one is treated
# as a style in the cells below.
wikiart_subdirs = []
for entry in WIKIART_DIR.iterdir():
    if entry.is_dir():
        wikiart_subdirs.append(entry)

# Show the total and the names of the first ten directories.
first_style_names = [d.name for d in wikiart_subdirs[:10]]
print(f"Number of style directories: {len(wikiart_subdirs)}")
print(f"Style directories: {first_style_names}")


In [None]:
# Count the .jpg and .png files directly inside each style directory.
style_counts = {
    style_dir.name: len(list(style_dir.glob('*.jpg'))) + len(list(style_dir.glob('*.png')))
    for style_dir in wikiart_subdirs
}

# One row per style, largest styles first.
style_df = (
    pd.DataFrame(list(style_counts.items()), columns=['Style', 'Count'])
    .sort_values('Count', ascending=False)
)

print(style_df.head(20))


### 4.1. WikiArt Image Size Analysis


In [None]:
# Measure up to 20 .jpg files from each of the first five style directories.
wikiart_sizes = []
for style_dir in wikiart_subdirs[:5]:
    candidate_paths = list(style_dir.glob('*.jpg'))[:20]
    measured = (get_image_size(path) for path in candidate_paths)
    # Unreadable images come back as None and are left out.
    wikiart_sizes.extend(dims for dims in measured if dims)

if wikiart_sizes:
    # Split the (width, height) pairs into two parallel lists.
    wikiart_widths, wikiart_heights = map(list, zip(*wikiart_sizes))

    print(f"WikiArt mean width: {np.mean(wikiart_widths):.0f}px")
    print(f"WikiArt mean height: {np.mean(wikiart_heights):.0f}px")

    plt.figure(figsize=(10, 5))
    plt.scatter(wikiart_widths, wikiart_heights, alpha=0.5, s=20)
    plt.xlabel('Width (px)')
    plt.ylabel('Height (px)')
    plt.title('WikiArt Image Sizes')
    plt.tight_layout()
    plt.savefig('/kaggle/working/wikiart_sizes.png', dpi=150)
    plt.show()


In [None]:
# Horizontal bar chart of the 15 styles with the most images.
top_styles = style_df.head(15)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_styles['Style'], top_styles['Count'])
ax.set_xlabel('Number of Images')
ax.set_title('Top 15 Styles by Image Count')
# Flip the y-axis so the first row of top_styles is drawn at the top.
ax.invert_yaxis()

fig.tight_layout()
fig.savefig('/kaggle/working/wikiart_style_distribution.png', dpi=150)
plt.show()


In [None]:
# Select styles for training: keep at most five styles whose image count is
# between 50 and 200 (both bounds inclusive).
count_in_range = style_df['Count'].between(50, 200)
selected_styles = style_df[count_in_range].head(5)

print("Selected styles for training:")
print(selected_styles)


### 4.2. Color Analysis (COCO sample)


In [None]:
# Analyze color distribution (HSV) - optional, can be skipped if it takes too long to run
def get_color_stats(image_path, n_samples=1000):
    """Return mean HSV channel values over a random sample of an image's pixels.

    The image is converted to RGB, resized to 256x256, converted to HSV with
    OpenCV, and up to ``n_samples`` pixels are drawn without replacement.

    Args:
        image_path: Location of the image file (string or path-like).
        n_samples: Maximum number of pixels to sample; capped at the number
            of pixels in the resized image.

    Returns:
        A dict with the keys ``'hue_mean'``, ``'saturation_mean'`` and
        ``'value_mean'``, or ``None`` if the image cannot be processed.

    Note:
        For 8-bit input OpenCV stores hue on a 0-179 scale rather than in
        degrees, so ``hue_mean`` is on that scale.
    """
    try:
        with Image.open(image_path) as img:
            # Force three RGB channels. Grayscale or palette images would
            # otherwise give a single-channel array that the RGB->HSV
            # conversion rejects, silently dropping the image from the stats.
            rgb_array = np.array(img.convert('RGB').resize((256, 256)))
        hsv = cv2.cvtColor(rgb_array, cv2.COLOR_RGB2HSV)

        # Flatten to one row per pixel, then sample rows without replacement.
        pixels = hsv.reshape(-1, 3)
        n_drawn = min(n_samples, len(pixels))
        sample_pixels = pixels[np.random.choice(len(pixels), n_drawn, replace=False)]

        return {
            'hue_mean': np.mean(sample_pixels[:, 0]),
            'saturation_mean': np.mean(sample_pixels[:, 1]),
            'value_mean': np.mean(sample_pixels[:, 2])
        }
    except Exception:
        # Best-effort: callers skip images that fail. Unlike a bare ``except``
        # this still lets KeyboardInterrupt / SystemExit propagate.
        return None

# Compute colour statistics for 20 random COCO images; images for which the
# helper returns None are dropped.
coco_sample = np.random.choice(coco_images, 20, replace=False)
coco_colors = [stats for stats in map(get_color_stats, coco_sample) if stats]

# Summarise the per-image means only when at least one image succeeded.
if coco_colors:
    coco_color_df = pd.DataFrame(coco_colors)
    print("COCO Color Statistics:")
    print(coco_color_df.describe())


In [None]:
# Display up to three sample .jpg images for each of the first three selected styles.
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
selected_style_dirs = [WIKIART_DIR / style for style in selected_styles['Style'].head(3).tolist()]

# Hide every axis up front. When fewer than three styles are selected, or a
# style has fewer than three .jpg files, the unused grid cells would otherwise
# be drawn as empty framed plots.
for ax in axes.flat:
    ax.axis('off')

for row, style_dir in enumerate(selected_style_dirs):
    images = list(style_dir.glob('*.jpg'))[:3]
    for col, img_path in enumerate(images):
        # Close each image file once it has been handed to imshow.
        with Image.open(img_path) as img:
            axes[row, col].imshow(img)
        axes[row, col].set_title(f"{style_dir.name}")

plt.tight_layout()
plt.savefig('/kaggle/working/wikiart_samples.png', dpi=150)
plt.show()


## 5. Summary Report


In [None]:
# Build the Markdown summary report from the statistics computed in the cells above.
report = f"""
# Data EDA Report

## COCO Dataset
- Train images: {len(coco_train_images)} (dùng cho training)
- Val images: {len(coco_val_images)} (dùng cho validation/testing)
- Mean size: {np.mean(widths):.0f}x{np.mean(heights):.0f}px
- Mean aspect ratio: {np.mean(aspect_ratios):.2f}

## WikiArt Dataset
- Total styles: {len(wikiart_subdirs)}
- Selected styles for training: {len(selected_styles)}
- Styles: {', '.join(selected_styles['Style'].head(5).tolist())}

## Recommendations
- Resize all images to 512x512 for training
- Use train2017 ({len(coco_train_images)} images) cho training LoRA
- Use val2017 ({len(coco_val_images)} images) cho validation/testing sau khi train
- Use selected styles with 50-200 images each
- Consider data augmentation for style images
"""

print(report)

# Save the report. The encoding is set explicitly because the text contains
# non-ASCII (Vietnamese) characters and the platform default encoding is not
# guaranteed to be able to represent them.
with open('/kaggle/working/eda_report.md', 'w', encoding='utf-8') as f:
    f.write(report)
